1
- utils :: globalVariables(c(" all_of" , " corr" , " DI" , " C1" ," id_oa" ," RP" ," UN" ," AU_ID" ," corresponding_author_ids" ))
1
+ utils :: globalVariables(c(" all_of" , " corr" , " DI" , " C1" ," id_oa" ," RP" ," UN" ," AU_ID" ," corresponding_author_ids" , " References " ))
2
2
3
3
csvOA2df <- function (file ){
4
4
options(readr.num_columns = 0 )
@@ -28,7 +28,7 @@ csvOA2df <- function(file){
28
28
# recode as numeric
29
29
DATA $ TC <- as.numeric(DATA $ TC )
30
30
DATA $ PY <- as.numeric(DATA $ PY )
31
- DATA $ relevance_score <- as.numeric(DATA $ relevance_score )
31
+ # DATA$relevance_score <- as.numeric(DATA$relevance_score)
32
32
33
33
# replace | with ;
34
34
DATA <- DATA %> %
@@ -48,21 +48,35 @@ csvOA2df <- function(file){
48
48
AFF <- DATA %> %
49
49
select(id_oa , starts_with(" authorships_raw_affiliation_strings_" ))
50
50
51
- colId <- c(- 1 ,parse_number(colnames(AFF )[- 1 ]))
52
-
53
- DATA <- AFF [order(colId )] %> %
54
- unite(. , C1 , starts_with(" authorships_raw_affiliation_strings_" ), sep = " ;" ) %> %
55
- mutate(C1 = gsub(" NA" ," " ,C1 ),
56
- C1 = TrimMult(C1 ,char = " ;" )) %> %
57
- bind_cols(DATA %> %
58
- select(- " id_oa" , - starts_with(" authorships_raw_affiliation_strings_" )))
59
-
51
+ if (ncol(AFF )> 1 ){
52
+ colId <- c(- 1 ,parse_number(colnames(AFF )[- 1 ]))
53
+
54
+ DATA <- AFF [order(colId )] %> %
55
+ unite(. , C1 , starts_with(" authorships_raw_affiliation_strings_" ), sep = " ;" ) %> %
56
+ mutate(C1 = gsub(" NA" ," " ,C1 ),
57
+ C1 = TrimMult(C1 ,char = " ;" )) %> %
58
+ bind_cols(DATA %> %
59
+ select(- " id_oa" , - starts_with(" authorships_raw_affiliation_strings_" )))
60
+ } else {
61
+ AFF <- lapply(stri_extract_all_regex(DATA $ authorships.raw_affiliation_strings , " \\ [([^\\ ]]+)\\ ]" ), function (l ){
62
+ gsub(" \\ ['|'\\ ]" ," " ,l )
63
+ })
64
+
65
+ AFF <- data.frame (id_oa = rep(DATA $ id_oa , lengths(AFF )), C1 = unlist(AFF )) %> %
66
+ group_by(id_oa ) %> %
67
+ summarize(C1 = paste(C1 ,collapse = " ;" ))
68
+ DATA <- DATA %> %
69
+ left_join(AFF , by = " id_oa" )
70
+ DATA $ C1 [is.na(DATA $ C1 )] <- " "
71
+ }
72
+
60
73
DATA $ C1 <- gsub(" https://" , " " , DATA $ C1 )
61
74
62
75
# country string
63
76
CO <- DATA %> %
64
77
select(id_oa , starts_with(" authorships_countries_" ))
65
78
79
+ if (ncol(CO )> 1 ){
66
80
colId <- c(- 1 ,parse_number(colnames(CO )[- 1 ]))
67
81
68
82
DATA <- CO [order(colId )] %> %
@@ -71,12 +85,32 @@ csvOA2df <- function(file){
71
85
AU_CO = TrimMult(AU_CO ,char = " ;" )) %> %
72
86
bind_cols(DATA %> %
73
87
select(- " id_oa" , - starts_with(" authorships_countries_" )))
88
+ } else {
89
+ CO <- lapply(stri_extract_all_regex(DATA $ authorships.countries , " \\ [([^\\ ]]+)\\ ]" ), function (l ){
90
+ gsub(" \\ ['|'\\ ]" ," " ,l )
91
+ })
92
+
93
+ CO <- data.frame (id_oa = rep(DATA $ id_oa , lengths(CO )), AU_CO = unlist(CO )) %> %
94
+ group_by(id_oa ) %> %
95
+ summarize(AU_CO = gsub(" '" ," " ,paste(AU_CO ,collapse = " ;" )))
96
+ DATA <- DATA %> %
97
+ left_join(CO , by = " id_oa" )
98
+ DATA $ AU_CO [is.na(DATA $ AU_CO )] <- " "
99
+ }
100
+
74
101
75
102
# # corresponding author
76
103
DATA <- DATA %> %
77
104
mutate(AU1_ID = gsub(" ;.*" , " " , corresponding_author_ids ))
78
105
UN <- strsplit(DATA $ C1 ," ;" )
79
- corresp <- strsplit(DATA $ authorships_is_corresponding ," ;" )
106
+ if (" authorships_is_corresponding" %in% names(DATA )){
107
+ corresp <- strsplit(tolower(DATA $ authorships_is_corresponding )," ;" )
108
+ } else {
109
+ corresp <- strsplit(tolower(DATA $ authorships.is_corresponding )," ;" )
110
+ }
111
+
112
+
113
+
80
114
df_UN <- data.frame (UN = unlist(UN ), id_oa = rep(DATA $ id_oa ,lengths(UN ))) %> %
81
115
group_by(id_oa ) %> %
82
116
mutate(n = row_number())
@@ -120,6 +154,7 @@ csvOA2df <- function(file){
120
154
mutate(across(all_of(label ), toupper ),
121
155
DI = gsub(" https://doi.org/" ," " ,DI ),
122
156
DI = ifelse(DI == " null" ,NA ,DI ))
157
+ DATA $ SO <- toupper(DATA $ SO )
123
158
124
159
return (DATA )
125
160
}
@@ -130,14 +165,18 @@ relabelling_OA <- function(DATA){
130
165
label [label %in% " id" ] <- " id_oa"
131
166
label [label %in% " display_name" ] <- " TI"
132
167
label [label %in% " primary_location_display_name" ] <- " SO"
168
+ label [label %in% " locations.source.display_name" ] <- " SO"
133
169
label [label %in% " primary_location_id" ] <- " SO_ID"
170
+ label [label %in% " locations.source.id" ] <- " SO_ID"
134
171
label [label %in% " primary_location_host_organization" ] <- " PU"
135
172
label [label %in% " primary_location_issns" ] <- " ISSN"
136
173
label [label %in% " primary_location_issn_l" ] <- " ISSN_I"
137
174
label [label %in% " primary_location_landing_page_url" ] <- " URL"
138
175
label [label %in% " primary_location_pdf_url" ] <- " URL_PDF"
139
176
label [label %in% " author_ids" ] <- " AU_ID"
177
+ label [label %in% " authorships.author.id" ] <- " AU_ID"
140
178
label [label %in% " author_names" ] <- " AU"
179
+ label [label %in% " authorships.author.display_name" ] <- " AU"
141
180
label [label %in% " author_orcids" ] <- " OI"
142
181
label [label %in% " author_institution_names" ] <- " C3"
143
182
label [label %in% " cited_by_count" ] <- " TC"
@@ -147,6 +186,7 @@ relabelling_OA <- function(DATA){
147
186
label [label %in% " biblio_volume" ] <- " VL"
148
187
label [label %in% " referenced_works" ] <- " CR"
149
188
label [label %in% " keywords_display_name" ] <- " DE"
189
+ label [label %in% " keywords.display_name" ] <- " DE"
150
190
label [label %in% " abstract" ] <- " AB"
151
191
label [label %in% " concepts_display_name" ] <- " CONCEPTS"
152
192
label [label %in% " topics_display_name" ] <- " TOPICS"
@@ -165,4 +205,4 @@ relabelling_OA <- function(DATA){
165
205
# TrimMult: tidy up a delimiter inside the strings of `x` with a single
# Perl-compatible gsub() call.
#
# Arguments:
#   x    - passed straight through as the `x` argument of gsub().
#   char - delimiter text that is pasted into the regular expression.
#
# Returns the result of gsub() on `x`.
#
# The pattern built by paste0() has three alternatives:
#   1. `^` followed by `char` and `*`   : `char` repeated at the start of the string
#   2. `(?<=char)char`                  : a `char` immediately preceded by another
#                                         `char` (look-behind, hence perl mode)
#   3. `char` followed by `*` and `$`   : `char` repeated at the end of the string
# Each match is replaced by the second argument of gsub().
#
# `char` is spliced in without escaping or grouping, so a regex metacharacter
# in `char` changes the pattern, and for a multi-character `char` the `*`
# quantifier applies only to its last character.
#
# NOTE(review): in this listing every string literal carries a space right
# after its opening quote (pattern pieces, replacement and the `char` default);
# this looks like an extraction artifact -- confirm the exact literals against
# the repository before relying on them.
# NOTE(review): `perl = T` relies on `T` not having been reassigned; `TRUE`
# would be the safer spelling.
TrimMult <- function (x , char = " " ) {
166
206
return (gsub(paste0(" ^" , char , " *|(?<=" , char , " )" , char , " |" , char , " *$" ),
167
207
" " , x , perl = T ))
168
- }
208
+ }
0 commit comments