|
1 | 1 | version 1.0
|
2 | 2 |
|
| 3 | +task subsample_by_cases { |
| 4 | + meta { |
| 5 | + description: "Run subsampler to get downsampled dataset and metadata proportional to epidemiological case counts." |
| 6 | + } |
| 7 | + input { |
| 8 | + File metadata |
| 9 | + File case_data |
| 10 | + |
| 11 | + String id_column |
| 12 | + String geo_column |
| 13 | + String date_column = "date" |
| 14 | + String unit = "week" |
| 15 | + |
| 16 | + File? keep_file |
| 17 | + File? remove_file |
| 18 | + File? filter_file |
| 19 | + Float baseline = 0.0001 |
| 20 | + Int? seed_num |
| 21 | + String? start_date |
| 22 | + String? end_date |
| 23 | + |
| 24 | + String docker = "quay.io/broadinstitute/subsampler" |
| 25 | + Int machine_mem_gb = 30 |
| 26 | + } |
| 27 | + command <<< |
| 28 | + set -e -o pipefail |
| 29 | + mkdir -p data outputs |
| 30 | + |
| 31 | + # decompress if compressed |
| 32 | + echo "staging and decompressing input data files" |
| 33 | + if [[ ~{metadata} == *.gz ]]; then |
| 34 | + cat "~{metadata}" | pigz -d > data/metadata.tsv |
| 35 | + elif [[ ~{metadata} == *.zst ]]; then |
| 36 | + cat "~{metadata}" | zstd -d > data/metadata.tsv |
| 37 | + else |
| 38 | + ln -s "~{metadata}" data/metadata.tsv |
| 39 | + fi |
| 40 | + if [[ ~{case_data} == *.gz ]]; then |
| 41 | + cat "~{case_data}" | pigz -d > data/case_data.tsv |
| 42 | + elif [[ ~{case_data} == *.zst ]]; then |
| 43 | + cat "~{case_data}" | zstd -d > data/case_data.tsv |
| 44 | + else |
| 45 | + ln -s "~{case_data}" data/case_data.tsv |
| 46 | + fi |
| 47 | + |
| 48 | + ## replicate snakemake DAG manually |
| 49 | + # rule genome_matrix |
| 50 | + # Generate matrix of genome counts per day, for each element in column ~{geo_column} |
| 51 | + echo "getting genome matrix" |
| 52 | + python3 /opt/subsampler/scripts/get_genome_matrix.py \ |
| 53 | + --metadata data/metadata.tsv \ |
| 54 | + --index-column ~{geo_column} \ |
| 55 | + --date-column ~{date_column} \ |
| 56 | + ~{"--start-date " + start_date} \ |
| 57 | + ~{"--end-date " + end_date} \ |
| 58 | + --output outputs/genome_matrix_days.tsv |
| 59 | + date;uptime;free |
| 60 | + |
| 61 | + # rule unit_conversion |
| 62 | + # Generate matrix of genome and case counts per epiweek |
| 63 | + echo "converting matricies to epiweeks" |
| 64 | + python3 /opt/subsampler/scripts/aggregator.py \ |
| 65 | + --input outputs/genome_matrix_days.tsv \ |
| 66 | + --unit ~{unit} \ |
| 67 | + --format integer \ |
| 68 | + --output outputs/matrix_genomes_unit.tsv |
| 69 | + python3 /opt/subsampler/scripts/aggregator.py \ |
| 70 | + --input data/case_data.tsv \ |
| 71 | + --unit ~{unit} \ |
| 72 | + --format integer \ |
| 73 | + ~{"--start-date " + start_date} \ |
| 74 | + ~{"--end-date " + end_date} \ |
| 75 | + --output outputs/matrix_cases_unit.tsv |
| 76 | + date;uptime;free |
| 77 | + |
| 78 | + # rule correct_bias |
| 79 | + # Correct under- and oversampling genome counts based on epidemiological data |
| 80 | + echo "create bias-correction matrix" |
| 81 | + python3 /opt/subsampler/scripts/correct_bias.py \ |
| 82 | + --genome-matrix outputs/matrix_genomes_unit.tsv \ |
| 83 | + --case-matrix outputs/matrix_cases_unit.tsv \ |
| 84 | + --index-column code \ |
| 85 | + ~{"--baseline " + baseline} \ |
| 86 | + --output1 outputs/weekly_sampling_proportions.tsv \ |
| 87 | + --output2 outputs/weekly_sampling_bias.tsv \ |
| 88 | + --output3 outputs/matrix_genomes_unit_corrected.tsv |
| 89 | + date;uptime;free |
| 90 | + |
| 91 | + # rule subsample |
| 92 | + # Sample genomes and metadata according to the corrected genome matrix |
| 93 | + echo "subsample data according to bias-correction" |
| 94 | + # subsampler_timeseries says --keep is optional but actually fails if you don't specify one |
| 95 | + cp /dev/null data/keep.txt |
| 96 | + ~{"cp " + keep_file + " data/keep.txt"} |
| 97 | + python3 /opt/subsampler/scripts/subsampler_timeseries.py \ |
| 98 | + --metadata data/metadata.tsv \ |
| 99 | + --genome-matrix outputs/matrix_genomes_unit_corrected.tsv \ |
| 100 | + --index-column ~{id_column} \ |
| 101 | + --geo-column ~{geo_column} \ |
| 102 | + --date-column ~{date_column} \ |
| 103 | + --time-unit ~{unit} \ |
| 104 | + --keep data/keep.txt \ |
| 105 | + ~{"--remove " + remove_file} \ |
| 106 | + ~{"--filter-file " + filter_file} \ |
| 107 | + ~{"--seed " + seed_num} \ |
| 108 | + ~{"--start-date " + start_date} \ |
| 109 | + ~{"--end-date " + end_date} \ |
| 110 | + --weekasdate no \ |
| 111 | + --sampled-sequences outputs/selected_sequences.txt \ |
| 112 | + --sampled-metadata outputs/selected_metadata.tsv \ |
| 113 | + --report outputs/sampling_stats.txt |
| 114 | + echo '# Sampling proportion: ~{baseline}' | cat - outputs/sampling_stats.txt > temp && mv temp outputs/sampling_stats.txt |
| 115 | + date;uptime;free |
| 116 | + |
| 117 | + # copy outputs from container's temp dir to host-accessible working dir for delocalization |
| 118 | + echo "wrap up" |
| 119 | + mv outputs/* . |
| 120 | + # get counts |
| 121 | + cat selected_sequences.txt | wc -l | tee NUM_OUT |
| 122 | + # get hardware utilization |
| 123 | + set +o pipefail |
| 124 | + cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC |
| 125 | + cat /proc/loadavg > CPU_LOAD |
| 126 | + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES |
| 127 | + |
| 128 | + >>> |
| 129 | + runtime { |
| 130 | + docker: docker |
| 131 | + memory: machine_mem_gb + " GB" |
| 132 | + cpu: 2 |
| 133 | + disks: "local-disk 200 HDD" |
| 134 | + disk: "200 GB" |
| 135 | + dx_instance_type: "mem3_ssd1_v2_x4" |
| 136 | + } |
| 137 | + output { |
| 138 | + File genome_matrix_days = "genome_matrix_days.tsv" |
| 139 | + File matrix_genomes_unit = "matrix_genomes_unit.tsv" |
| 140 | + File matrix_cases_unit = "matrix_cases_unit.tsv" |
| 141 | + File weekly_sampling_proportions = "weekly_sampling_proportions.tsv" |
| 142 | + File weekly_sampling_bias = "weekly_sampling_bias.tsv" |
| 143 | + File matrix_genomes_unit_corrected = "matrix_genomes_unit_corrected.tsv" |
| 144 | + File selected_sequences = "selected_sequences.txt" |
| 145 | + File selected_metadata = "selected_metadata.tsv" |
| 146 | + File sampling_stats = "sampling_stats.txt" |
| 147 | + Int num_selected = read_int("NUM_OUT") |
| 148 | + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) |
| 149 | + Int runtime_sec = ceil(read_float("UPTIME_SEC")) |
| 150 | + String cpu_load = read_string("CPU_LOAD") |
| 151 | + } |
| 152 | +} |
| 153 | + |
3 | 154 | task multi_align_mafft_ref {
|
4 | 155 | input {
|
5 | 156 | File reference_fasta
|
|
0 commit comments