-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsciwheel_stats_process.R
340 lines (261 loc) · 8.73 KB
/
sciwheel_stats_process.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
## ---------------------------
## Author: Jan D. Lanzer
##
## Date Created: 2021-12-09
##
## Copyright (c) Jan D. Lanzer, 2021
## Email: [email protected]
##
## Purpose of script:
##
## process that data dump of SciWheel statistics for plotting
##
## Main steps:
## 1) Subset data dump to the last year (optional)
## 2) the listed data structure is transformed to tidy table format for
## i) paper counts, ii) notes counts and iii) tags counts.
## 3) simple bar plots are generated and saved at the end
library(tidyverse)
data= readRDS("Sciwheel_update/data_dump.rds")
# data object explanation
# for paper info = data-> list of clubs -> contents -> list of papers -> list of features
# for note info = data-> list of clubs -> notes -> list of papers -> list of notes -> list of features
# note: club refers to SciWheel folder (and slack channel), not necessary a club
# data prep -----------------------------------------------------------------------------------
# prepare data:
clubs= names(data)
# rename object
all_paper_ids= map(clubs, function(x){
map(data[[x]]$content, function(y){
return(y$id)
})%>% unlist()
})
names(all_paper_ids)= clubs
for(i in clubs){
names(data[[i]]$content)= all_paper_ids[[i]]
}
#subset data to one year ----------------------------------------------------------------------------------
# translate milisec to year.
mil_to_year= function(mil){
mil/1000/60/60/24/365
}
# translate year to millisec
year_to_mil = function(year){
mil= year*1000*60*60*24*365
}
# subset function,
# will remove papers from the data object that have not been posted within the
# last year ( defined by the timepoint of most recent paper minus nyear)
subset_data_to_last_year= function(data, n_years= 1){
per.user= map(clubs, function(x){
map(data[[x]]$content, function(y){
y$f1000AddedDate
})%>% unlist()
})
names(per.user)= clubs
#transform to table format:
per.user.df= lapply(names(per.user),function(x) (enframe(per.user[[x]]) %>% mutate(folder = x)))%>%
do.call(rbind,. )
## Define here the interval (in milli secs) that youre intersted in
# we will select most recent paper and subtract a year
# calculate the most recent paper minus the interval of interest:
mil_sec_cut_high= max(per.user.df$value)
mil_sec_cut_low= max(per.user.df$value)- year_to_mil(n_years)
## now that we have the interval subset the data object
# get paper ids of paper in interval
paper_ids= map(clubs, function(x){
map(data[[x]]$content, function(y){
if(y$f1000AddedDate<= mil_sec_cut_high &
y$f1000AddedDate>mil_sec_cut_low ){
return(y$id)}
return(NULL)
})%>% unlist()
})
names(paper_ids)= clubs
#subset obj
data_sub=data
## subset the content
for(i in names(data_sub)){
#print(i)
for(j in names(data_sub[[i]]$content)){
#print(j)
if(!j %in% unlist(paper_ids)){
data_sub[[i]]$content= data_sub[[i]]$content[names(data_sub[[i]]$content)!= j]
}
}
}
##subset the notes:
for(i in names(data_sub)){
#print(i)
for(j in names(data_sub[[i]]$notes)){
#print(j)
if(!j %in% unlist(paper_ids)){
data_sub[[i]]$notes= data_sub[[i]]$notes[names(data_sub[[i]]$notes)!= j]
}
}
}
return(data_sub)
}
# run this if you want to subset the data to the last year:
data= subset_data_to_last_year(data, n_years = 1)
# number of papers ---------------------------------------------------
#### paper per club:
club.papers= map(clubs, function(x){
length(data[[x]]$content)
}) %>% unlist()
names(club.papers)= clubs
#number of papers per club:
club.papers
#total papers posted:
total.papers= sum(club.papers)
#### paper per user
per.user =
map(clubs, function(x){
map(data[[x]]$content, function(y){
y$f1000AddedBy
})%>% unlist()%>% table
})
names(per.user)= clubs
#transform to table format:
per.user.df= lapply(names(per.user),function(x) (enframe(per.user[[x]]) %>% mutate(folder = x)))%>%
do.call(rbind,. )%>% mutate(value= as.integer(value))
#simple plot
ggplot(per.user.df, aes(x= name, y= folder, fill = value))+
geom_tile() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
# Notes --------------------------------------------------------------------------------
# there are multiple notes per paper, also text highlights are saved as notes.
# this will get per club per paper per user the sum of lengths of all notes
#### note length
notes.per.user =
map(clubs, function(club){
map(data[[club]]$notes, function(paper){
notelengths= map(paper, function(notes){
note.l= nchar(notes$comment)
if(note.l<0){return(NULL)}
names(note.l)= notes$user
note.l
})%>% unlist()
# mutliple users per paper commented,summ up per user the note length
if(is.null(notelengths)){return(tibble(name= "nobody", "value"= 0))}
summed=
map(unique(names(notelengths)), function(user){
sum(notelengths[user])
})
names(summed)= unique(names(notelengths))
enframe(summed)%>% mutate(value=unlist(value))
# sum the length of all comments made on a paper
}) %>% do.call(rbind, .)
})
names(notes.per.user)= clubs
# bring in tidy format:
notes.per.user.df= lapply(names(notes.per.user),function(x){
if(!is.null(dim(notes.per.user[[x]])[1])){
notes.per.user[[x]] %>% mutate(folder = x)}
}
)%>% do.call(rbind, .)
# this df now contains for each paper a row, telling us who commented how many characters and from which folder was this paper
## plot number of characters per paper per user
notes.per.user.df %>%
group_by(name) %>%
ggplot(., aes(x= name,y= value ))+
geom_boxplot()+
geom_jitter(alpha= 0.3)+
coord_flip()
# plot number of notes per user:
notes.per.user.df %>%
group_by(name) %>%
count %>%
ggplot(., aes(x= name,y= n))+
geom_col()+
coord_flip()
# Tags ----------------------------------------------------------------------------------------
#### tags per club
tags.per.club =
map(clubs, function(x){
map(data[[x]]$content, function(y){
y$f1000Tags
})%>% unlist()%>% table
})
names(tags.per.club)= clubs
tags.per.club.df= lapply(names(tags.per.club),function(x) (enframe(tags.per.club[[x]]) %>% mutate(folder = x)))%>%
do.call(rbind,. )%>% mutate(value= as.integer(value))
#simple plot
ggplot(tags.per.club.df, aes(x= name, y= folder, fill = value))+
geom_tile() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
#### tags per user
tags.per.user =
map(clubs, function(x){
map(data[[x]]$content, function(y){
if(length(y$f1000Tags)==0){
t= "noTag"
names(t)= y$f1000AddedBy
t= enframe(t)%>% mutate(value= unlist(value))
return(t)
}
t= y$f1000Tags
u= y$f1000AddedBy
names(t)= rep(u, length(t))
enframe(t)%>% mutate(value= unlist(value))
})
})
names(tags.per.user)= clubs
tags.per.user.df= lapply(names(tags.per.user),function(x) (do.call(rbind,tags.per.user[[x]]))) %>%
do.call(rbind,. )%>% group_by(name, value)%>%count
#simple plot
ggplot(tags.per.user.df, aes(x= name, y= value, fill = n))+
geom_tile() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
# nice plots ----------------------------------------------------------------------------------
# we will generate a serious of informative overview plots
unify_axis= function(p){
p+theme(axis.text = element_text(size=12, color ="black"),
axis.title = element_blank())
}
#### number of papers by user and folder
p1= per.user.df%>%
group_by(name)%>%
summarize(value2= sum(value))%>%
ggplot(., aes(x= reorder(name, value2), y = value2))+
geom_col()+
coord_flip()+
theme_bw()+
ggtitle(paste0("paper per user (total:" , total.papers, ")"))
unify_axis(p1)
#papers per club
p2= per.user.df%>%
group_by(folder)%>%
summarize(value2= sum(value))%>%
ggplot(., aes(x= reorder(folder, value2), y = value2))+
geom_col()+
coord_flip()+
theme_bw()+
ggtitle(paste0("paper per club (total:" , total.papers, ")"))
unify_axis(p2)
#### tags usage
#tag frequency
p3= tags.per.user.df %>%
group_by(value)%>%
summarize(n2= sum(n))%>%
ggplot(., aes(x= reorder(value, n2), y = n2))+
geom_col()+
coord_flip()+
theme_bw()+
ggtitle(paste0("paper per tag (total:" , total.papers, ")"))
unify_axis(p3)
### commenting papers
notes.per.user.df
p4= tags.per.user.df %>%
group_by(name)%>%
summarize(n2= sum(n))%>%
ggplot(., aes(x= reorder(value, n2), y = n2))+
geom_col()+
coord_flip()+
theme_bw()+
ggtitle(paste0("paper per user (total:" , total.papers, ")"))
unify_axis(p3)
pdf("overviewplots2024.pdf",
width= 6,
height= 5.5)
unify_axis(p1)
unify_axis(p2)
unify_axis(p3)
dev.off()