-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge7T.R
executable file
·582 lines (516 loc) · 26.1 KB
/
merge7T.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
#!/usr/bin/env Rscript
##
# read various sources of information into a too-many-columns file.
# timepoint per row (beh, eeg, and MR on one line)
#
# see Makefile for individual file source
#
# modelled after mergePet.R
# /Volumes/Phillips/mMR_PETDA/scripts/mergePet_functions.R
##
#
# 20230518WF - init
# 20230531WF - update MRS to use gamadj
# 20240216WF - annotate resting state MRI "lost" scans
#
setwd("/Volumes/Hera/Projects/7TBrainMech/scripts/")
suppressPackageStartupMessages({
library(dplyr); library(tidyr)})
library(glue)
source('merge_funcs.R') # addcolprefix, lunadatemerge, check_datecol
source("mrsi/calc_asymmetry_and_ratio.R") # gaba_glu_asymmetry
# eeg file names have date typo. eeg.date extracted from that. corrected here
rewrite_date<-function(d,l, datecol='eeg.date'){
# data.frame(lunaid=c(1,2,3,1,3),eeg.date=c(1,1,1,2,2)) %>% rewrite_date(list("1_1"="10", "3_2"=20))
id_date <- names(l)
all_ld8 <- paste(sep="_",d$lunaid,d[[datecol]])
for(i in seq_along(id_date)){
d[all_ld8 %in% id_date[i], datecol] <- l[[i]]
}
return(d)
}
eeg_newdates <- list('11632_20190101'='20191001',
'11678_20180116'='20181016',
"11668_20170710"="20180710",
"11670_20210823"="20210824",
"11672_20220617"="20220615")
# 20240122 - 1 person 2 lunaids! twice.
# 11390 and 11665 both have the same data repeated
# list name is replaced by list value
SAME_LUNA <- list('11748'='11515', '11665'='11390')
SAME_LUNA_df <- data.frame(lunaid=unname(unlist(SAME_LUNA)), alt.lunaid=names(SAME_LUNA))
RM_LUNA <- c('11467') # one visit sneaking in on eeg data. not a 7T participant
# these are dropped after disclosing/discovering exclusion criteria
RM_LUNA <- c(RM_LUNA, "11646", "11659", "11800", "11653", "11690", "11812")
rewrite_luna<-function(d, id_col='lunaid', same_luna=SAME_LUNA){
d <- as.data.frame(d)
d$lunaid <- gsub('11748; 11515','11515', d$lunaid)
bad_luna_row<-d[,id_col] %in% RM_LUNA
if(any(bad_luna_row)){
cat("WARNING: non-7T LunaID removed\n")
print(d[bad_luna_row,])
d <- d[!bad_luna_row,]
}
# write lunaid from alt to used
# NB. d[,id_col] BAD SYNTAX. collapses xlsx dataframe column into single value?!
df_ids <- d[[id_col]]
for(bad_id in names(same_luna)){
has_bad_id <- df_ids %in% bad_id
if(!any(has_bad_id)) next
good_id <- same_luna[[bad_id]]
print(glue("WARNING: change id {bad_id} to {good_id}"))
d[has_bad_id, id_col] <- good_id
}
dedupped <- unique(d)
if(nrow(dedupped)!=nrow(d)){
cat("# alt id rewrite weirdness? (reporting that both ids exist in dataset -- dups will be removed. could be okay)\n")
# TODO: assuming lunaid is id_col (as of 20240219, it always is)
print(d[d$lunaid %in% unlist(SAME_LUNA),] %>%
mutate(is_dup=duplicated(.)) %>%
merge(SAME_LUNA_df,by='lunaid',all.x=T))
cat("---\n")
}
check_date_dups(dedupped)
return(dedupped)
}
# for anything built on duplicated preproc visit folders
# stochastic warp and despike (tat2, fd) mean slightly different data.
# not removed with simple unique
rm_dup_lunaid <- function(d) filter(d, !lunaid %in% SAME_LUNA_df$alt.lunaid)
#
# all file paths we need to merge
# also see Makefile for what scripts might make these
files <- list(
gdoc="txt/7T_packet.xlsx",
sess="txt/sessions_db.txt",
# see mri/MRSI_roi/gam_adjust/Makefile
mrsi="mri/MRSI_roi/gam_adjust/out/gamadj_wide.csv",
# see mri/tat2/Makefile
tat2="mri/tat2/maskave.csv",
# see eeg/Shane/python/fooof/runFooof.py
#fooof="eeg/Shane/fooof/Results/allSubjectsFooofMeasures_20230516.csv", # channel no region
fooof="eeg/Shane/Results/FOOOF/Results/allSubjectsDLPFCfooofMeasures_20230523.csv", # region no channel
# eeg/Shane/Rscripts/spectral_events_wide.R # 20230623
# Gamma eeg file renamed, updated 20231117
eegspec="eeg/Shane/Results/Spectral_Analysis/Spectral_events_analysis/Gamma/Gamma_DLPFCs_spectralEvents_wide.csv",
# 20231009 - switch to python for hurst and dfa
# see mri/hurst/hurst_nolds.py
hurst="mri/hurst/stats/dfa_mean-roi_atlas-mrsi13GM_prefix-brnasw.csv",
#hurst_brns="mri/hurst/stats/MRSI_pfc13_brnsdkm_hurst_rs.csv",
#hurst_ns="mri/hurst/stats/MRSI_pfc13_nsdkm_hurst_rs.csv",
#dfa_brns="mri/hurst/stats/MRSI_pfc13_brnsdkm_dfa.csv",
#dfa_ns="mri/hurst/stats/MRSI_pfc13_nsdkm_dfa.csv",
#hurst="mri/hurst/stats/MRSI_pfc13_H.csv", #remove matlab hurst see mri/hurst/hurst.m
#mgs_eog="eeg/eog_cal/eye_scored_mgs_eog.csv" # 20230612
mgs_eog="eeg/eog_cal/eye_scored_mgs_eog_cleanvisit.csv", # 20230616. new cleaned version
sr="behave/txt/SR.csv", # 20230620. pulled from db from RA matained sheets
ssp="behave/txt/SSP.csv", # 20230717. from behave/SSP_Cantab_spatial.R
sex="txt/db_sex.csv", # 20230718 all sex in DB
adi="/Volumes/Hera/Projects/Maria/Census/parguard_luna_visit_adi.csv", # 20230807, updated 20240118
fd="mri/txt/rest_fd.csv", # 20230912 (but generated long ago)
antiET="behave/txt/anti_scored.csv", # 20231101 (for MP)
# eeg_dlpfc_snr="eeg/Shane/Results/SNR/allSubjectDLPFC_SNRMeasures_20231113.csv" # 20231113
eeg_dlpfc_snr="eeg/Shane/Results/SNR/allSubjectDLPFC_SNRMeasures.csv", # 20231204
eeg_pc1_snr="eeg/Shane/Results/SNR/SNRmeasures_PC1_allStim.csv", # 20240214 #eeg/Shane/Rscripts/SNR/createImputed_PCAdataframes.R
lost_rest="txt/WF_rest_file_QC.tsv", # 20240216 - hand annotated rest data notes
ders="behave/txt/ders.csv", # 20240314
dts="behave/txt/dts.csv", # 20240314
rt18="behave/txt/rt18.csv", # 20240314
mp2rage="/Volumes/Hera/Projects/corticalmyelin_development/sample_info/7T_MP2RAGE_curation_QC.csv" # 20240422
)
## 7T "Packet". RA organized "ground truth" for study visits
# is in longer, row per visit type. Want wider row per visit (across all types)
overview_raw <- readxl::read_excel(files$gdoc, sheet="Overview") %>% mutate(ID=as.character(ID)) %>% filter(!is.na(ID))
topsheet_raw <- readxl::read_excel(files$gdoc, sheet="Top")
# top sheet was abandoned by MM. Overview sheet is up to date
# but top sheet has some drop status that is interesting. so we keep it
topsheet_overview <- merge(overview_raw,topsheet_raw,
all.x=T, by=c("7TROWID","VisitType","Date")) %>%
# 20240213: 59 missing visit year in overview fixed by MP+AF.
# cf. Top has 114 is.na(VisitYear.y)
mutate(VisitYear=ifelse(!is.na(VisitYear.x),VisitYear.x,VisitYear.y))
# clean IDs (remove double ID ; separator), rename columns, match visit type names to DB/session
topsheet_long <- topsheet_overview %>%
transmute(lunaid=gsub('\\.0$','',as.character(ID)), # lunaid column not labeled. comes in like 11702.0
date=format(Date,"%Y%m%d"), # match ymd "d8" format
vscore=Rating_0_5,
vdrop=VisitDropped,
sdrop=SubDropped,
dropreason=ReasonDropped,
# need these and would be lost otherwise (transmute) so renaming while it's free
visitno=VisitYear, vtype=VisitType, notes) %>% # top.demo=Demographics,
filter(!is.na(vtype)) %>%
mutate(vtype=case_when(vtype== "EEG"~"eeg",
vtype=="Behavioral"~"behave",
vtype=="MRI"~"mri")) # rest could be sipfc or hpc?
# widen to row per visit. per type columns for date,score,notes, and drops
topsheet <- topsheet_long %>%
pivot_wider(names_from=c("vtype"),
id_cols=c("lunaid","visitno"),
values_from=c("date","notes","vscore", "vdrop","sdrop","dropreason"),
names_glue="top.{vtype}.{.value}",
values_fn=\(x) paste0(x,collapse=";")) %>%
select(-matches('(eeg|behave).notes')) %>%
filter(!is.na(visitno))
## DB session info
# "pull_from_sheets" elsewhere uses google calendar, 7T scan files, and p participation flow to get sessions
# hard coding some missing data while that is being debugged (20240219). unique after for when they are added
sess <- read.table(files$sess, sep="\t", header=T) %>%
rename(lunaid=`id`) %>%
rbind(c(11904,"1",'Scan',20230727,15.68,"M",2.5,"")) %>% # TODO: this shoud be in DB. not hard coded here
rbind(c(11695,"1",'Scan',20201106,15.84,"F",1,"")) %>% # TODO: this shoud be in DB. not hard coded here
rbind(c(11945,"1",'eeg' ,20230829, 12.5,"F",5,"")) %>% # TODO: this shoud be in DB. not hard coded here
mutate(across(c(lunaid,visitno,vdate,age,vscore), as.numeric)) %>%
unique
sex <- read.table(files$sex,col.names=c("lunaid","sex"))
behave <- sess %>%
filter(vtype=="Behavioral",!is.na(visitno)) %>%
select(lunaid,visitno,behave.date=vdate, behave.age=age) %>%
filter(! paste0(lunaid,"_",behave.date) %in% c("11786_20210310")) %>% # beh visit with no imaging data. redone in 2022
merge(topsheet %>% select(lunaid,visitno,top.behave.date) %>% filter(!is.na(visitno)),
all=T, by=c("lunaid","visitno")) %>%
mutate(behave.date=ifelse(is.na(behave.date),top.behave.date,behave.date)) %>% select(-top.behave.date)
mrsi <- read.csv(files$mrsi)
tat2 <- read.csv(files$tat2)
fooof <- read.csv(files$fooof)
fd <- read.table(files$fd,header=T) %>%
rename(lunaid=id, date=d8) %>%
addcolprefix('rest') %>%
rm_dup_lunaid
antiET <- read.csv(files$antiET,header=T) %>%
filter(! ld8 %in% c("11786_20210310")) %>% # beh visit with no imaging data. redone in 2022
separate(ld8,c('lunaid', 'behave.date')) %>%
rename_with(\(x) gsub('^AS','',x)) %>%
select(-age,-sex,-visitno) %>%
addcolprefix('antiET', preserve=c("lunaid", "behave.date"))
mgs_eog_visit <- read.csv(files$mgs_eog)
sr <- read.csv(files$sr) %>% addcolprefix('sr') %>%
rename(screen.date=sr.date.screening, visitno=sr.visitno)
# cols like "eeg.Delay_LDLPFC_GammaEventDuration_mean"
eegspec <- read.csv(files$eegspec) %>% addcolprefix('eeg')
ssp <- read.csv(files$ssp) %>%
separate(ld8,c('lunaid','behave.date')) %>%
addcolprefix('ssp',sep="_", preserve=c("lunaid", "behave.date")) %>%
addcolprefix('cantab', preserve=c("lunaid", "behave.date"))
# removing city, state, zip, FIP. TODO: keep all columns?
adi <- read.csv(files$adi) %>%
select(lunaid, visitno=visit, ADI_NATRANK, ADI_STATERNK)
## tat2 - roi, subj (luan_date), event, beta
# use to get rest date
tat2_wide <- tat2 %>%
separate(subj,c('lunaid','rest.date')) %>%
# is all rest. no other events
#pivot_wider(names_from=c("roi","event"), values_from="beta")
select(-event) %>%
pivot_wider(names_from=c("roi"), values_from="beta") %>%
# add tat2 columns, but undo for ids
addcolprefix('tat2') %>%
rename(rest.date=tat2.rest.date) %>%
merge(sess %>%
filter(vtype=="Scan") %>%
select(lunaid,visitno,vdate,rest.age=age,rest.vscore=vscore),
all.x=T, by.x=c("lunaid","rest.date"), by.y=c("lunaid","vdate")) %>%
rm_dup_lunaid %>% # motion correction is stocastic. same visit different tat2
# fill missing rest.date. TODO: don't use tat2 to fill rest.date?
bind_rows(data.frame(lunaid='11745', visitno=1, rest.date='20190322')) %>% unique
## mrsi -- subset to just GABA|Glu + CR|SD
# and ditch anything without an roi label
# TODO: might want version with regresssion applied
mrsi_mrg <- mrsi %>%
separate(ld8, c("lunaid","date")) %>%
# same visit in duplicate lunaid, different placement. remove alt.id, keep "good"
rm_dup_lunaid %>%
addcolprefix('sipfc')
mrsi_mrg <- merge(mrsi_mrg,
sess %>% filter(vtype=="Scan") %>%
select(lunaid, vdate, visitno, sipfc.age=age, sipfc.vscore=vscore),
all.x=T, by.x=c("lunaid","sipfc.date"),by.y=c("lunaid","vdate"))
# fooof - Subject, Channel, Offset, Exponent, Condition (closed/open)
# 20230609 - from shane
if("Channel" %in% names(fooof)) {
eeg_lookup <- c("F4"="RDLPFC","F6"="RDLPFC","F8"="RDLPFC",
"F3"="LDLPFC","F4"="LDLPFC","F7"="LDLPFC")
fooof_dlpfc <- fooof %>% select(-X) %>%
filter(Channel %in% names(eeg_lookup)) %>%
mutate(Region=eeg_lookup[Channel]) %>%
select(-Channel) %>%
group_by(Subject, Region, Condition) %>%
summarise_all(mean) # across(c("Offset","Exponent"), mean)
} else {
fooof_dlpfc <- fooof %>% select(Subject,Condition,Offset,Exponent,Region)
}
fooof_wide <- fooof_dlpfc %>%
pivot_wider(
names_from=c("Condition", "Region"),
values_from = c("Offset","Exponent"),
names_glue = "{Condition}_{Region}_{.value}") %>%
separate(Subject, c("lunaid","date")) %>%
addcolprefix('eeg')
# 11668_20170710 first visit no in DB? only one missing currently. quick fix
# should be 20180710
fooof_wide$eeg.date[fooof_wide$lunaid == "11668"&fooof_wide$eeg.date == "20170710"] <- "20180710"
#cat("have",nrow(fooof_wide),"eeg rows\n")
#fooof_mia <- fooof_wide %>% filter(is.na(visitno))
#if(nrow(fooof_mia) >0L) {
# cat("fooof with missing visit number\n")
# fooof_mia %>% select(lunaid,eeg.date) %>% print
#}
## Hurst will use tat2's rest.date to merge
# dont need session info -- should already have. but just incase
#hursts <- lapply(list("hurst_brns","hurst_ns","dfa_brns","dfa_ns"),
# \(x) read.csv(files[[x]]) %>% addcolprefix(x, preserve='ld8')) %>%
# Reduce(f=\(x,y) merge(x,y,by='ld8'))
# 20240325WF/MP - only dfa on mrsi placed native space.
hursts <- read.csv(files$hurst) %>%
rename_with(function(x)gsub('^X[0-9]*_','',x), .cols=matches("^X")) %>%
addcolprefix("rest.hurst", preserve='ld8')
hurst_ses <- hursts %>%
separate(ld8,c('lunaid','vdate')) %>%
rm_dup_lunaid %>%
rename(rest.date=vdate) %>%
# TODO: maybe a function for all rest like inputs
merge(sess %>%
filter(vtype=="Scan") %>%
select(lunaid,visitno,vdate,rest.age=age,rest.vscore=vscore),
all.x=T, by.x=c("lunaid","rest.date"), by.y=c("lunaid","vdate"))
#hurst_ses %>% filter(is.na(visitno))
noX <- function(d) { if(names(d)[1] == "X") return(d[-1,]); return(d); }
#ignoreSNR=TRUE # 20231117 ignore eeg SNR per shane
#ignoreSNR=FALSE # 20231129 back up
ignoreSNR=!file.exists(files$eeg_dlpfc_snr) # 20240118 down again
#
if(!ignoreSNR){
# "","Subject","luna","vdate","Region","ERSP","ITC","BaselinePower","Induced","Evoked","InducedDB","EvokedDB","age","visitno","Freq"
eeg_dlpfc_snr <- read.csv(files$eeg_dlpfc_snr) %>% noX %>%
select(-Subject, -age, -vdate) %>%
rename(lunaid=luna) %>%
pivot_wider(id_cols=c("lunaid","visitno"),
names_from=c("Freq","Region"),
values_from=c("ERSP","ITC","BaselinePower",
"Induced","Evoked","InducedDB","EvokedDB")) %>%
addcolprefix("eeg.snr", preserve=c("lunaid","visitno"))
} else {
eeg_dlpfc_snr <- NULL
}
mgs_eog <- mgs_eog_visit %>%
rename(lunaid=LunaID,date=ScanDate) %>%
addcolprefix('eeg') %>%
#wrong: 11670_20210823, 11672_2022061 folder and filename disagree. db matches filename
rewrite_date(eeg_newdates) %>%
merge(sess %>%
filter(vtype=="eeg") %>%
select(lunaid,visitno,eeg.date=vdate,eeg.age=age,eeg.vscore=vscore),
all.x=T, by.x=c("lunaid","eeg.date"), by.y=c("lunaid","eeg.date"))
missing_eog <- filter(mgs_eog,is.na(eeg.age))
if(nrow(missing_eog)>0L){
cat("#",nrow(missing_eog),"MISSING EEG (EOG) visit in DB\n")
missing_eog %>% select(lunaid,eeg.date) %>% print
}
## EEG spectrum (gamma)
# hard code some date adjustments
eegspec <- eegspec %>% rewrite_date(eeg_newdates)
# add session b/c at least one visit isn't in fooof
#eegspec_ses <- eegspec %>% merge(sess %>%
# filter(vtype=="eeg") %>%
# select(lunaid,visitno,vdate,eeg.age=age,eeg.vscore=vscore),
# all.x=T, by.x=c("lunaid","eeg.date"), by.y=c("lunaid","vdate"))
eeg_date <- sess %>%
filter(vtype=="eeg") %>%
select(lunaid,visitno,eeg.date=vdate,eeg.age=age,eeg.vscore=vscore)
# 20240314. qualtircs for AO
DERS_names <- c("1_clearAboutFeelings", "1_REVERSESCORE", "2_payAttentionFeel", "2_REVERSESCORE", "3_emoOverwhelming", "4_noIdeaFeeling", "5_difficultySenseOfFeelings", "6_attentiveToFeelings", "6_REVERSESCORE", "7_knowHowFeeling", "7_REVERSESCORE", "8_careAboutFeeling", "8_REVERSESCORE", "9_confusedHowFeel", "10_acknowledgeEmo", "10_REVERSESCORE", "11_becomeAngry", "12_becomeEmbarassed", "13_difficultyWorking", "14_outOfControl", "15_believeLongTime", "16_believeEndUpDepressed", "17_believeFeelingsValid", "17_REVERSESCORE", "18_difficultyFocusing", "19_feelOutOfControl", "20_getThingsDone", "20_REVERSESCORE", "21_feelAshamed", "22_findWayFeelBetter", "22_REVERSESCORE", "23_feelWeak", "24_feelInControl", "24_REVERSESCORE", "25_feelGuilty", "26_difficultyConcentrating", "27_difficultyControllingBehav", "28_believeNothingFeelBetter", "29_becomeIrritated", "30_feelBad", "31_believeWallowing", "32_loseControlBehav", "33_difficultyThinking", "34_takeTime", "34_REVERSESCORE", "35_takesLongTimeFeelBetter", "36_emoFeelOverwhelming")
# 20240422 UNIT1 from VS
mp2rage <- read.csv(files$mp2rage,sep="\t") %>%
# BIDS -> merge7T (remove sub- and ses-, rename columns)
mutate(across(c(subject_id, session_id), \(x) gsub('^(sub|ses)-','',x))) %>%
rename(lunaid=subject_id, date=session_id) %>%
# done by merge_and check
#rewrite_luna() %>% # 11748 to 11515; removes excluded: 11653_20180608, 11690_20180920, 11812_20200227
# 'mp2rage.' to all columns, remove column name prefix already given to most
rename_with(\(x) gsub('MP2RAGE_','',x)) %>%
addcolprefix('mp2rage',preserve=c("lunaid")) %>%
# add visitno
merge(sess %>%
filter(vtype=="Scan") %>%
select(lunaid,visitno,vdate),
all.x=T, by.x=c("lunaid","mp2rage.date"), by.y=c("lunaid","vdate"))
# remove if lower QC or farther from EEG date
# https://github.com/LabNeuroCogDevel/corticalmyelin_maturation/tree/main/sample_construction
mp2rage_lowqc_repeat_visit <- c("11668_20180702", "11716_20221118", "11818_20221117", "11822_20210412", "11681_20181012")
mp2rage_filtered <- mp2rage %>%
# hard coded drop repeated QC
filter(!paste0(lunaid,"_",mp2rage.date) %in% mp2rage_lowqc_repeat_visit) %>%
# drop if more than 2 but failed QC
group_by(lunaid,visitno) %>% mutate(n=n()) %>%
filter(n==1|mp2rage.collected==1) %>% #mp2rage.QC_passfail==1) %>%
filter(mp2rage.QC_rating == max(mp2rage.QC_rating)) %>% select(-n)
#####
sessid <- function(d)
apply(d,1,function(x)
paste0(collapse="_",x[c("lunaid","visitno")])) %>%
gsub(' ','',.)
check_date_dups <- function(d){
d_datesub <- d %>% select(lunaid,matches('\\.date$|^visitno$'))
dups <- d[duplicated(d),]
if(nrow(dups)>0L) {
cat("WARNING: dups in ", df_varname,"\n")
print(dups)
}
return(dups)
}
merge_and_check <- function(big, d, ...) {
if(is.null(d)) return(big)
df_varname <- substitute(d)
# replace duplicate ids by picking one
if('lunaid' %in% names(d)) d <- rewrite_luna(d)
# did 2 IDs come in? maybe b/c rewrite_luna
check_date_dups(d)
# 20240411 -- stricter check for duplicated visit rows
if('lunaid' %in% names(d) & 'visitno' %in% names(d)) {
all_visits <- paste0(d$lunaid,"_",d$visitno)
dup_visits <- duplicated(all_visits)
if(any(dup_visits)) stop("ERROR: duplicated visits: ", all_visits[dup_visits])
}
big.new <- merge(big, d, ...)
big.new.uniq <- big.new %>% unique # 11832 is repeated 2 twice?
if(nrow(big.new)!=nrow(big.new.uniq)){
cat("WARNING: duplicated rows added to merged dataset!","\n")
}
big.new <- big.new.uniq
check_date_dups(big.new)
bs <- sessid(big)
ds <- sessid(d)
mia_big <- setdiff(bs,ds)
mia_d <- setdiff(ds,bs)
# debugging bad merge
#print(substitute(d))
#names(big) %>% grep(pattern='lunaid|visitno|\\.date$',value=T) %>% print()
#print(nrow(big))
#names(big.new) %>% grep(pattern='lunaid|visitno|\\.date$',value=T) %>% print()
#print(nrow(big.new))
new_visits <- anti_join(big.new,
big %>% select(lunaid,visitno),
by = c("lunaid", "visitno"))
# what column do we care about for merging?
args <- list(...)
if(is.null(args[['by.y']])) args[['by.y']] <- args[['by']]
new_ids <- apply(new_visits[args[['by.y']]], 1, paste0, collapse="_")
cat("#",
glue(
"{df_varname}({nrow(d)}x{ncol(d)}) adding ",
"to {nrow(big)}x{ncol(big)}; ",
"{length(mia_big)} visits w/visitno ({nrow(new_visits)} new) / ",
"{length(mia_d)} not in {df_varname}"),
"\n")
cat(" missing: ", paste(collapse=", ",head(mia_big)), "\n")
cat(" new: ", paste(collapse=", ",head(new_ids)), "\n")
cat(" total rows now",nrow(big.new),"\n")
return(big.new)
}
# hard coded visits known missing
missing_visitnums <-
setNames(data.frame(rbind(c(11695,1,20201106),
c(11745,1,20190322))),
c("lunaid","visitno","rest.lost.date"))
rest_ses <- sess %>% filter(vtype=='Scan') %>%
select(lunaid, visitno, rest.lost.date=vdate) %>%
# missing data also missing in sess
rbind(missing_visitnums) %>%
filter(!(lunaid=='11745'&rest.lost.date=='20190318')) %>%
unique # 11695_20201106 and 11745_20190322 maybe back into db?
# 2 rest days 11745_20180318, 11745_20180322
lost_rest <-
read.table(files$lost_rest,header=T,sep="\t") %>%
select(ld8,rest.lost=lost,rest.lost.note=WFnote) %>%
filter(rest.lost>0) %>%
separate(ld8,c("lunaid","rest.lost.date")) %>%
merge(rest_ses, all.x=T, by=c("lunaid","rest.lost.date"))
lost_rest_to_merge <- lost_rest %>% select(-rest.lost.date)
cat("mergeing all, writing merge file\n")
merged <-
topsheet %>% rewrite_luna %>%
merge_and_check(tat2_wide, by=c("lunaid","visitno"), all=T) %>%
merge_and_check(behave, by=c("lunaid","visitno"), all=T) %>%
merge_and_check(eeg_date, by=c("lunaid","visitno"), all=T) %>%
merge_and_check(mrsi_mrg, by=c("lunaid","visitno"), all=T) %>%
merge_and_check(fooof_wide, by=c("lunaid","eeg.date"), all=T) %>%
merge_and_check(hurst_ses, by=c("lunaid","visitno","rest.date","rest.age","rest.vscore"), all=T) %>%
merge_and_check(mgs_eog, by=c("lunaid","visitno","eeg.date","eeg.age","eeg.vscore"), all=T) %>%
merge_and_check(sr, by=c("lunaid","visitno"), all.x=T, all.y=F) %>%
merge_and_check(eegspec, by=c("lunaid","eeg.date"), all=T) %>%
# only all.x b/c might have dropped after behv visit
merge_and_check(ssp, by=c("lunaid","behave.date"),all.x=T) %>%
merge_and_check(sex, by=c("lunaid"), all.x=T) %>%
merge_and_check(adi, by=c("lunaid","visitno"), all.x=T) %>%
# 20240227 - misisng future vists assigned ADI at last known
group_by(lunaid) %>% arrange(visitno) %>%
fill(matches('^ADI_.*RANK'),.direction="down") %>% ungroup() %>%
merge_and_check(fd, by=c("lunaid","rest.date"), all.x=T) %>%
merge_and_check(antiET, by=c("lunaid","behave.date"),all.x=T) %>%
merge_and_check(eeg_dlpfc_snr, by=c("lunaid","visitno"),all.x=T) %>%
merge_and_check(lost_rest_to_merge, by=c("lunaid","visitno"),all.x=T) %>%
# 20240422 UNIT1
merge_and_check(mp2rage_filtered, by=c("lunaid","visitno"),all.x=T) %>%
# beh visit with no imaging data. redone in 2022
filter(! paste0(lunaid,'_',behave.date) %in% c("11786_20210310")) %>%
# remove behave in only overview sheet (42. additonal 17 have only behave.date)
filter(!(is.na(rest.date) & is.na(eeg.date) &is.na(top.eeg.date) & is.na(top.mri.date) & is.na(behave.date))) %>%
merge(SAME_LUNA_df, all.x=T, by='lunaid') %>%
# TODO: where does this bad eeg date come from!? also 11665 without session number w/eeg.date
# 20240422 -- was data.table instead of dataframe. fixed?
# lunaid alt.lunaid visitno behave.date rest.date eeg.date top.behave.date top.mri.date top.eeg.date sipfc.date screen.date
# 1 11748 <NA> NA <NA> <NA> 20190429 <NA> <NA> <NA> <NA> <NA>
filter(!lunaid %in% c(11748)) %>%
unique # 11832 is repeated 2 twice? (as of 20240213, this is no longer a problem)
cat(glue("# merged: {nrow(merged)} rows with {ncol(merged)} columns"),"\n")
# often have ages missing, want a single column to use for generic age @ session
# NB. 'sess.age' will match grepl. dont rerun these lines multiple times w/o reruning merge above
all_age_cols <- merged[,grepl('\\.age$',names(merged))] # eeg.age rest.age behave.age sipfc.age
merged$sess.age <- apply(all_age_cols, 1, mean,na.rm=T)
merged$sess.age_range <- apply(all_age_cols, 1, \(x) diff(range(x,na.rm=T)))
# 20240530 - mrsi gaba glu ratio on already clean data
mrsi_asym_ratio <- gaba_glu_asymmetry(merged)
merged <- merge_and_check(merged, mrsi_asym_ratio, by=c("lunaid","visitno"),all.x=T)
write.csv(merged, 'txt/merged_7t.csv', quote=T, row.names=F)
cat("merged with missing visit number\n")
merged %>% filter(is.na(visitno)) %>% select(lunaid,matches('\\.date$')) %>% print
cat("ashley flagged ids")
merged %>% filter(lunaid %in% c(11737 ,11744,11763,11824,11890,11950)) %>% select(lunaid,visitno, matches('\\.date$')) %>% print
# merged these in so this doesn't happen
#cat("bad behave.date")
#merged %>% filter(is.na(behave.date)&!is.na(top.behave.date)) %>% select(lunaid,matches('\\.date$')) %>% print
cat("missing rest")
merged %>%
merge(read.table('txt/MP_rest_file_qc.tsv',sep="\t",header=T), all=T,by="lunaid") %>%
filter(is.na(rest.date),!is.na(top.mri.date)) %>%
transmute(ld8=paste(lunaid,top.mri.date,sep="_") %>%gsub(';','\n',.),
vs=top.mri.vscore,
vdrop=top.mri.vdrop,
notes=top.mri.notes,
WFnote=rest.lost.note,
rest.lost,
MPnote=MPrestNote) %>%
filter(vdrop!="TRUE"|is.na(vdrop), vs>0) %>%
select(ld8,notes,vs,MPnote,WFnote, rest.lost) %>%
arrange(-!is.na(MPnote),-!is.na(WFnote)) %>%
#clipr::write_clip()
pander::pandoc.table(split.table = Inf)
cat("Visit type counts\n")
merged %>%
transmute(lunaid,visitno,across(matches('\\.date$'), \(x) !is.na(x))) %>%
count(behave.date,rest.date,eeg.date) %>%
arrange(-n) %>%
pander::pandoc.table()
# behave.date rest.date eeg.date n
# TRUE TRUE TRUE 291
# TRUE FALSE FALSE 43
# TRUE FALSE TRUE 41
# TRUE TRUE FALSE 4
# FALSE FALSE FALSE 1
cat("repeat visit numbers?!\n")
merged %>% count(lunaid,visitno) %>% filter(n>1)
# lunaid visitno n
# 11390 1 4
# 11515 1 4
# 11515 2 2
# 11745 1 2