
Commit 978f10f

Merge pull request #483 from broadinstitute/subsampler-dp
Subsampler and augur workflow
2 parents: 752cdbf + 0fba1bc

8 files changed: +392 additions, -39 deletions

.dockstore.yml

Lines changed: 11 additions & 1 deletion
@@ -50,6 +50,11 @@ workflows:
     primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa.wdl
     testParameterFiles:
       - empty.json
+  - name: augur_from_msa_with_subsampler
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl
+    testParameterFiles:
+      - empty.json
   - name: bams_multiqc
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/bams_multiqc.wdl
@@ -324,6 +329,11 @@ workflows:
     primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl
     testParameterFiles:
       - empty.json
+  - name: subsample_by_casecounts
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl
+    testParameterFiles:
+      - empty.json
   - name: subsample_by_metadata
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_metadata.wdl
@@ -358,4 +368,4 @@ workflows:
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/bam_to_qiime.wdl
     testParameterFiles:
-      - empty.json
+      - empty.json
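
Each entry above registers a workflow with Dockstore, and its testParameterFiles list points at a JSON file of workflow inputs used for test launches (empty.json is presumably just an empty object). Purely as an illustration, a populated parameter file for the new subsample_by_casecounts workflow might look like the sketch below; the fully-qualified input names and file paths are assumptions inferred from the task inputs added later in this diff, not taken from the commit itself.

    {
      "subsample_by_casecounts.metadata":   "gs://example-bucket/metadata.tsv.gz",
      "subsample_by_casecounts.case_data":  "gs://example-bucket/case_counts.tsv",
      "subsample_by_casecounts.id_column":  "strain",
      "subsample_by_casecounts.geo_column": "division"
    }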

pipes/WDL/tasks/tasks_interhost.wdl

Lines changed: 151 additions & 0 deletions
@@ -1,5 +1,156 @@
 version 1.0
 
+task subsample_by_cases {
+    meta {
+        description: "Run subsampler to get downsampled dataset and metadata proportional to epidemiological case counts."
+    }
+    input {
+        File metadata
+        File case_data
+
+        String id_column
+        String geo_column
+        String date_column = "date"
+        String unit = "week"
+
+        File? keep_file
+        File? remove_file
+        File? filter_file
+        Float baseline = 0.0001
+        Int? seed_num
+        String? start_date
+        String? end_date
+
+        String docker = "quay.io/broadinstitute/subsampler"
+        Int machine_mem_gb = 30
+    }
+    command <<<
+        set -e -o pipefail
+        mkdir -p data outputs
+
+        # decompress if compressed
+        echo "staging and decompressing input data files"
+        if [[ ~{metadata} == *.gz ]]; then
+            cat "~{metadata}" | pigz -d > data/metadata.tsv
+        elif [[ ~{metadata} == *.zst ]]; then
+            cat "~{metadata}" | zstd -d > data/metadata.tsv
+        else
+            ln -s "~{metadata}" data/metadata.tsv
+        fi
+        if [[ ~{case_data} == *.gz ]]; then
+            cat "~{case_data}" | pigz -d > data/case_data.tsv
+        elif [[ ~{case_data} == *.zst ]]; then
+            cat "~{case_data}" | zstd -d > data/case_data.tsv
+        else
+            ln -s "~{case_data}" data/case_data.tsv
+        fi
+
+        ## replicate snakemake DAG manually
+        # rule genome_matrix
+        # Generate matrix of genome counts per day, for each element in column ~{geo_column}
+        echo "getting genome matrix"
+        python3 /opt/subsampler/scripts/get_genome_matrix.py \
+            --metadata data/metadata.tsv \
+            --index-column ~{geo_column} \
+            --date-column ~{date_column} \
+            ~{"--start-date " + start_date} \
+            ~{"--end-date " + end_date} \
+            --output outputs/genome_matrix_days.tsv
+        date;uptime;free
+
+        # rule unit_conversion
+        # Generate matrix of genome and case counts per epiweek
+        echo "converting matrices to epiweeks"
+        python3 /opt/subsampler/scripts/aggregator.py \
+            --input outputs/genome_matrix_days.tsv \
+            --unit ~{unit} \
+            --format integer \
+            --output outputs/matrix_genomes_unit.tsv
+        python3 /opt/subsampler/scripts/aggregator.py \
+            --input data/case_data.tsv \
+            --unit ~{unit} \
+            --format integer \
+            ~{"--start-date " + start_date} \
+            ~{"--end-date " + end_date} \
+            --output outputs/matrix_cases_unit.tsv
+        date;uptime;free
+
+        # rule correct_bias
+        # Correct under- and oversampling genome counts based on epidemiological data
+        echo "create bias-correction matrix"
+        python3 /opt/subsampler/scripts/correct_bias.py \
+            --genome-matrix outputs/matrix_genomes_unit.tsv \
+            --case-matrix outputs/matrix_cases_unit.tsv \
+            --index-column code \
+            ~{"--baseline " + baseline} \
+            --output1 outputs/weekly_sampling_proportions.tsv \
+            --output2 outputs/weekly_sampling_bias.tsv \
+            --output3 outputs/matrix_genomes_unit_corrected.tsv
+        date;uptime;free
+
+        # rule subsample
+        # Sample genomes and metadata according to the corrected genome matrix
+        echo "subsample data according to bias-correction"
+        # subsampler_timeseries says --keep is optional but actually fails if you don't specify one
+        cp /dev/null data/keep.txt
+        ~{"cp " + keep_file + " data/keep.txt"}
+        python3 /opt/subsampler/scripts/subsampler_timeseries.py \
+            --metadata data/metadata.tsv \
+            --genome-matrix outputs/matrix_genomes_unit_corrected.tsv \
+            --index-column ~{id_column} \
+            --geo-column ~{geo_column} \
+            --date-column ~{date_column} \
+            --time-unit ~{unit} \
+            --keep data/keep.txt \
+            ~{"--remove " + remove_file} \
+            ~{"--filter-file " + filter_file} \
+            ~{"--seed " + seed_num} \
+            ~{"--start-date " + start_date} \
+            ~{"--end-date " + end_date} \
+            --weekasdate no \
+            --sampled-sequences outputs/selected_sequences.txt \
+            --sampled-metadata outputs/selected_metadata.tsv \
+            --report outputs/sampling_stats.txt
+        echo '# Sampling proportion: ~{baseline}' | cat - outputs/sampling_stats.txt > temp && mv temp outputs/sampling_stats.txt
+        date;uptime;free
+
+        # copy outputs from container's temp dir to host-accessible working dir for delocalization
+        echo "wrap up"
+        mv outputs/* .
+        # get counts
+        cat selected_sequences.txt | wc -l | tee NUM_OUT
+        # get hardware utilization
+        set +o pipefail
+        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
+        cat /proc/loadavg > CPU_LOAD
+        { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES
+
+    >>>
+    runtime {
+        docker: docker
+        memory: machine_mem_gb + " GB"
+        cpu: 2
+        disks: "local-disk 200 HDD"
+        disk: "200 GB"
+        dx_instance_type: "mem3_ssd1_v2_x4"
+    }
+    output {
+        File genome_matrix_days = "genome_matrix_days.tsv"
+        File matrix_genomes_unit = "matrix_genomes_unit.tsv"
+        File matrix_cases_unit = "matrix_cases_unit.tsv"
+        File weekly_sampling_proportions = "weekly_sampling_proportions.tsv"
+        File weekly_sampling_bias = "weekly_sampling_bias.tsv"
+        File matrix_genomes_unit_corrected = "matrix_genomes_unit_corrected.tsv"
+        File selected_sequences = "selected_sequences.txt"
+        File selected_metadata = "selected_metadata.tsv"
+        File sampling_stats = "sampling_stats.txt"
+        Int num_selected = read_int("NUM_OUT")
+        Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+        Int runtime_sec = ceil(read_float("UPTIME_SEC"))
+        String cpu_load = read_string("CPU_LOAD")
+    }
+}
+
 task multi_align_mafft_ref {
     input {
         File reference_fasta
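
This commit also registers two new workflow wrappers in .dockstore.yml (augur_from_msa_with_subsampler.wdl and subsample_by_casecounts.wdl) whose contents are not shown in this excerpt. Purely as a hypothetical sketch (the workflow name, wiring, and exposed outputs below are assumptions, not the code added by this PR), a minimal standalone wrapper around the new task could look like:

    version 1.0

    import "../tasks/tasks_interhost.wdl" as interhost

    # Hypothetical minimal wrapper; the actual subsample_by_casecounts.wdl
    # added in this commit may expose more inputs and outputs.
    workflow subsample_by_casecounts_sketch {
        input {
            File   metadata
            File   case_data
            String id_column
            String geo_column
        }
        call interhost.subsample_by_cases {
            input:
                metadata   = metadata,
                case_data  = case_data,
                id_column  = id_column,
                geo_column = geo_column
        }
        output {
            File selected_sequences = subsample_by_cases.selected_sequences
            File selected_metadata  = subsample_by_cases.selected_metadata
            File sampling_stats     = subsample_by_cases.sampling_stats
        }
    }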
