Commit c8f471d (parent 7b2c4a2)

Draft curation code.

15 files changed: +461, -1509 lines

.github/workflows/lint_r.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

.vscode/settings.json

Lines changed: 6 additions & 2 deletions
@@ -1,4 +1,8 @@
 {
-    "python.linting.flake8Enabled": true,
-    "python.linting.enabled": true
+    "[python]": {
+        "editor.rulers": [99]
+    },
+    "flake8.args": [
+        "--max-line-length=100"
+    ],
 }

00_download_data.sh

Lines changed: 18 additions & 0 deletions
#!/bin/bash

# Flywheel project name
project="bbl/MEBOLD"

# List any subjects you want to download here
subjects="ID1 ID2"

# Include a path to your flywheel API token here
token=$(</cbica/home/salot/tokens/flywheel.txt)
fw login "$token"

# Navigate to the folder to which you want to download the data
cd /cbica/projects/mebold/sourcedata || exit

for subject in $subjects; do
    fw download --yes --zip "fw://${project}/${subject}"
done

01_unzip_dicoms.py

Lines changed: 13 additions & 0 deletions
"""Expand dicom zip files in order to run heudiconv."""

import os
import zipfile
from glob import glob

if __name__ == "__main__":
    zip_files = sorted(glob("/cbica/projects/mebold/sourcedata/*_*/*/*/*.dicom.zip"))
    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(os.path.dirname(zip_file))

        os.remove(zip_file)

02_run_heudiconv.sh

Lines changed: 16 additions & 0 deletions
#!/bin/bash
# Loop over subjects and run heudiconv on each.
# Make sure to activate the conda environment with heudiconv installed before running this.

declare -a subjects=("ID1" "ID2")
for sub in "${subjects[@]}"
do
    echo "$sub"
    heudiconv \
        -f reproin \
        -o /cbica/projects/mebold/dset \
        -d "/cbica/projects/mebold/sourcedata/{subject}_{session}/*/*/*/*.dcm" \
        -s "$sub" \
        -ss 1 \
        --bids
done
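
Before launching the conversion, it can help to confirm that the DICOM template heudiconv will expand actually matches files for each subject. A minimal pre-flight check (a sketch, not part of the pipeline; it assumes session "1", matching the -ss flag above, and the same path template passed to -d):

    from glob import glob

    # Count the DICOM files the heudiconv -d template would match for each subject/session.
    for sub in ("ID1", "ID2"):
        pattern = f"/cbica/projects/mebold/sourcedata/{sub}_1/*/*/*/*.dcm"
        print(sub, len(glob(pattern)), "DICOM files found")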

03_fix_bids.py

Lines changed: 145 additions & 0 deletions
"""Fix BIDS files after heudiconv conversion.

This script handles steps 2-7 below; step 1 (duplicates) is not handled here.

The necessary steps are:

1. Deal with duplicates.
2. Rename multi-echo magnitude BOLD files to part-mag_bold.
3. Rename phase files to part-phase_bold.
4. Split out noRF noise scans from multi-echo BOLD scans.
   - Also copy the JSON.
5. Copy first echo of each multi-echo field map without echo entity.
6. Update filenames in the scans.tsv files.
7. Remove events files.
"""

import os
import shutil
from glob import glob

import nibabel as nb
import pandas as pd

# Number of EPI noise scans to split out of the BOLD scans.
N_NOISE_VOLS = 3

FULL_RUN_LENGTHS = (240, 204, 200)


if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset/"
    subject_dirs = sorted(glob(os.path.join(dset_dir, "sub-*")))
    for subject_dir in subject_dirs:
        sub_id = os.path.basename(subject_dir)
        session_dirs = sorted(glob(os.path.join(subject_dir, "ses-*")))
        for session_dir in session_dirs:
            ses_id = os.path.basename(session_dir)
            anat_dir = os.path.join(session_dir, "anat")
            fmap_dir = os.path.join(session_dir, "fmap")
            func_dir = os.path.join(session_dir, "func")

            # Remove empty events files created by heudiconv
            events_files = sorted(glob(os.path.join(func_dir, "*_events.tsv")))
            for events_file in events_files:
                os.remove(events_file)

            # Load scans file
            scans_file = os.path.join(session_dir, f"{sub_id}_{ses_id}_scans.tsv")
            assert os.path.isfile(scans_file), f"Scans file DNE: {scans_file}"
            scans_df = pd.read_table(scans_file)

            # Heudiconv's reproin heuristic currently (as of v1.2.0) names magnitude and phase
            # files as _bold and _phase, respectively.
            # The better way to do it is to call them part-mag_bold and part-phase_bold.
            mag_files = sorted(glob(os.path.join(func_dir, "*echo-*_bold.*")))
            for mag_file in mag_files:
                if "part-" in mag_file:
                    print(f"Skipping {mag_file}")
                    continue

                new_mag_file = mag_file.replace("_bold.", "_part-mag_bold.")
                os.rename(mag_file, new_mag_file)

                mag_filename = os.path.join("func", os.path.basename(mag_file))
                new_mag_filename = os.path.join("func", os.path.basename(new_mag_file))

                # Replace the filename in the scans.tsv file
                scans_df = scans_df.replace({"filename": {mag_filename: new_mag_filename}})

            # Rename phase files from _phase to _part-phase_bold.
            phase_files = sorted(glob(os.path.join(func_dir, "*_phase.*")))
            for phase_file in phase_files:
                new_phase_file = phase_file.replace("_phase.", "_part-phase_bold.")
                os.rename(phase_file, new_phase_file)

                phase_filename = os.path.join("func", os.path.basename(phase_file))
                new_phase_filename = os.path.join("func", os.path.basename(new_phase_file))

                # Replace the filename in the scans.tsv file
                scans_df = scans_df.replace({"filename": {phase_filename: new_phase_filename}})

            # Split out noise scans from all multi-echo BOLD files.
            # There is no metadata to distinguish noise scans from BOLD scans,
            # so we need to hardcode the number of noise scans to split out.
            # In order to handle partial scans where the last N volumes aren't noise scans,
            # we also need to hardcode valid scan lengths.
            me_bolds = sorted(glob(os.path.join(func_dir, "*acq-MBME*_bold.nii.gz")))
            for me_bold in me_bolds:
                noise_scan = me_bold.replace("_bold.nii.gz", "_noRF.nii.gz")
                if os.path.isfile(noise_scan):
                    print(f"File exists: {os.path.basename(noise_scan)}")
                    continue

                img = nb.load(me_bold)
                n_vols = img.shape[-1]
                if n_vols not in FULL_RUN_LENGTHS:
                    print(f"File is a partial scan: {os.path.basename(me_bold)}")
                    continue

                noise_img = img.slicer[..., -N_NOISE_VOLS:]
                bold_img = img.slicer[..., :-N_NOISE_VOLS]

                # Overwrite the BOLD scan
                os.remove(me_bold)
                bold_img.to_filename(me_bold)
                noise_img.to_filename(noise_scan)

                # Copy the JSON as well
                shutil.copyfile(
                    me_bold.replace(".nii.gz", ".json"),
                    noise_scan.replace(".nii.gz", ".json"),
                )

                # Add noise scans to scans DataFrame
                i_row = len(scans_df.index)
                me_bold_fname = os.path.join("func", os.path.basename(me_bold))
                noise_fname = os.path.join("func", os.path.basename(noise_scan))
                scans_df.loc[i_row] = scans_df.loc[scans_df["filename"] == me_bold_fname].iloc[0]
                scans_df.loc[i_row, "filename"] = noise_fname

            # In this protocol, we have multi-echo field maps.
            # In practice, multi-echo field maps aren't useful, so we just grab the first echo's
            # data and label it as a single-echo field map.
            # Copy first echo's sbref of multi-echo field maps without echo entity.
            me_fmaps = sorted(glob(os.path.join(fmap_dir, "*_acq-ME*_echo-1_sbref.*")))
            for me_fmap in me_fmaps:
                out_fmap = me_fmap.replace("_echo-1_", "_").replace("_sbref", "_epi")
                if os.path.isfile(out_fmap):
                    print(f"File exists: {os.path.basename(out_fmap)}")
                    continue

                me_fmap_fname = os.path.join("fmap", os.path.basename(me_fmap))
                out_fmap_fname = os.path.join("fmap", os.path.basename(out_fmap))
                shutil.copyfile(me_fmap, out_fmap)
                if me_fmap.endswith(".nii.gz"):
                    i_row = len(scans_df.index)
                    scans_df.loc[i_row] = scans_df.loc[
                        scans_df["filename"] == me_fmap_fname
                    ].iloc[0]
                    scans_df.loc[i_row, "filename"] = out_fmap_fname

            # Save out the modified scans.tsv file.
            scans_df = scans_df.sort_values(by=["acq_time", "filename"])
            os.remove(scans_file)
            scans_df.to_csv(scans_file, sep="\t", na_rep="n/a", index=False)
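
As an illustration of the noise-scan split above, here is a minimal, self-contained sketch on a synthetic image (the 2x2x2 grid is hypothetical; 240 volumes matches one of the FULL_RUN_LENGTHS, and 3 matches N_NOISE_VOLS): the last volumes become the _noRF file and the rest stay in the _bold file.

    import nibabel as nb
    import numpy as np

    # Synthetic 4D image standing in for a 240-volume multi-echo BOLD run.
    img = nb.Nifti1Image(np.zeros((2, 2, 2, 240), dtype=np.float32), affine=np.eye(4))

    bold_img = img.slicer[..., :-3]   # volumes 1-237 -> *_part-mag_bold.nii.gz
    noise_img = img.slicer[..., -3:]  # volumes 238-240 -> *_noRF.nii.gz
    print(bold_img.shape, noise_img.shape)  # (2, 2, 2, 237) (2, 2, 2, 3)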

04_reface_t1ws.sh

Lines changed: 15 additions & 0 deletions
#!/bin/bash
# Reface T1w images using afni_refacer_run.
module load afni/2022_05_03

t1w_files=$(find /cbica/projects/mebold/dset/sub-*/ses-*/anat/*T1w.nii.gz)
for t1w_file in $t1w_files
do
    echo "$t1w_file"
    @afni_refacer_run \
        -input "${t1w_file}" \
        -mode_reface \
        -no_images \
        -overwrite \
        -prefix "${t1w_file}"
done

05_anonymize_acqtimes.py

Lines changed: 55 additions & 0 deletions
"""Anonymize acquisition datetimes for a dataset.

Anonymize acquisition datetimes for a dataset. Works for both longitudinal
and cross-sectional studies. The time of day is preserved, but the first
scan is set to January 1st, 1800. In a longitudinal study, each session is
anonymized relative to the first session, so that time between sessions is
preserved.

Overwrites scan tsv files in dataset. Only run this *after* data collection
is complete for the study, especially if it's longitudinal.
"""

import os
from glob import glob

import pandas as pd
from dateutil import parser

if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset"

    bl_dt = parser.parse("1800-01-01")

    subject_dirs = sorted(glob(os.path.join(dset_dir, "sub-*")))
    for subject_dir in subject_dirs:
        sub_id = os.path.basename(subject_dir)
        print(f"Processing {sub_id}")

        scans_files = sorted(glob(os.path.join(subject_dir, "ses-*/*_scans.tsv")))

        for i_ses, scans_file in enumerate(scans_files):
            ses_dir = os.path.dirname(scans_file)
            ses_name = os.path.basename(ses_dir)
            print(f"\t{ses_name}")

            df = pd.read_table(scans_file)
            if i_ses == 0:
                # Anonymize in terms of first scan for subject.
                first_scan = df["acq_time"].min()
                first_dt = parser.parse(first_scan.split("T")[0])
                diff = first_dt - bl_dt

            acq_times = df["acq_time"].apply(parser.parse)
            acq_times = (acq_times - diff).astype(str)
            df["acq_time"] = acq_times
            df["acq_time"] = df["acq_time"].str.replace(" ", "T")

            os.remove(scans_file)
            df.to_csv(
                scans_file,
                sep="\t",
                lineterminator="\n",
                na_rep="n/a",
                index=False,
            )
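
A small worked example (hypothetical dates) of the shift applied above: the calendar date of the subject's first scan is mapped to 1800-01-01, while times of day and the spacing between sessions are preserved.

    from dateutil import parser

    bl_dt = parser.parse("1800-01-01")
    first_dt = parser.parse("2023-05-10")  # date of the subject's first scan (hypothetical)
    diff = first_dt - bl_dt

    print(parser.parse("2023-05-10T09:15:00") - diff)  # 1800-01-01 09:15:00
    print(parser.parse("2023-06-09T14:30:00") - diff)  # 1800-01-31 14:30:00 (session 30 days later)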

06_clean_jsons.py

Lines changed: 28 additions & 0 deletions
"""Remove unneeded fields from bottom-level JSON files."""

import json
import os
from glob import glob

if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset/"
    drop_keys = [
        "AcquisitionTime",
        "CogAtlasID",
        "InstitutionAddress",
        "TaskName",
        "ImageComments",
    ]

    json_files = sorted(glob(os.path.join(dset_dir, "sub-*/ses-*/*/*.json")))
    for json_file in json_files:
        with open(json_file, "r") as fo:
            json_data = json.load(fo)

        for drop_key in drop_keys:
            if drop_key in json_data.keys():
                json_data.pop(drop_key)

        os.remove(json_file)
        with open(json_file, "w") as fo:
            json.dump(json_data, fo, indent=4, sort_keys=True)
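
For illustration, a minimal sketch (hypothetical sidecar contents; EchoTime stands in for any field that should be kept) of what the key-dropping loop does to a single sidecar:

    # Hypothetical sidecar dictionary before cleaning.
    before = {"EchoTime": 0.03, "AcquisitionTime": "09:15:00.000", "TaskName": "rest"}
    drop_keys = ["AcquisitionTime", "TaskName"]
    after = {key: value for key, value in before.items() if key not in drop_keys}
    print(after)  # {'EchoTime': 0.03}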
