# .github/workflows/variant-calling-test-run.yml
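# Smoke-tests variantCalling/variantCalling.wdl on pull requests: builds a synthetic
# chr20 reference, known-sites VCFs, and an unmapped test BAM, then runs the workflow
# with Cromwell 86 and checks that the expected outputs are produced.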
name: Variant Calling Test Run

on:
  pull_request:
    paths:
      - 'variantCalling/variantCalling.wdl'
      - '.github/workflows/variant-calling-test-run.yml'

jobs:
  test:
    runs-on: ubuntu-latest
    permissions: write-all
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set Up Java
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '21'
      - name: Install required tools
        run: |
          # Install system packages
          sudo apt-get update
          sudo apt-get install -y wget unzip samtools bwa tabix
          # Download and install GATK
          wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip
          unzip gatk-4.4.0.0.zip
          sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk
      - name: Pull Cromwell Jarfile
        run: wget -q https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar
      - name: Setup Test Data
        run: |
          mkdir -p test/data
          cd test/data
          # Download reference chromosome 20 from NCBI
          wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
          gunzip chr20.fa.gz
          # Add "chr" prefix to sequence name
          sed 's/^>.*/>chr20/' chr20.fa > ref.fasta
          rm chr20.fa
          # Create sequence dictionary and index
          samtools faidx ref.fasta
          gatk CreateSequenceDictionary -R ref.fasta -O ref.dict
          # Debug: Check the content of our files
          echo "=== Reference FASTA header ==="
          head -n 1 ref.fasta
          echo "=== Reference FAI content ==="
          cat ref.fasta.fai
          echo "=== Dictionary content ==="
          head -n 5 ref.dict
          # Create and sort test region bed file
          echo -e "chr20\t1000000\t1100000" > test.bed
          sort -k1,1 -k2,2n test.bed > sorted.bed
          mv sorted.bed test.bed
          # Debug: Verify bed file content
          echo "=== BED file content ==="
          cat test.bed
          # Test bed to interval_list conversion directly
          echo "=== Testing bed to interval_list conversion ==="
          gatk BedToIntervalList \
            -I test.bed \
            -O test.interval_list \
            -SD ref.dict
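          # BedToIntervalList checks the BED contigs against the sequence dictionary, so a
          # contig-name or dictionary mismatch fails here rather than inside the workflow run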
          # Generate BWA indexes
          bwa index ref.fasta
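          # bwa index writes ref.fasta.{amb,ann,bwt,pac,sa}, which the inputs JSON below
          # passes to the workflow as the BWA reference bundle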
          # Function to extract reference base(s) at a position; strip the FASTA header and
          # join wrapped sequence lines so multi-base lookups are not cut off at a line break
          echo "# Create function to get reference base at position"
          get_ref_base() {
            local pos=$1
            local length=$2
            samtools faidx ref.fasta "chr20:${pos}-$((pos+length-1))" | grep -v '^>' | tr -d '\n'
          }
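          # The synthetic VCFs below take their REF alleles from the actual reference so
          # that gatk ValidateVariants (run at the end of this step) sees no REF mismatches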
          # Create a minimal synthetic dbSNP VCF for testing
          echo '##fileformat=VCFv4.2' > dbsnp.vcf
          echo '##reference=GRCh38' >> dbsnp.vcf
          echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf
          # Get actual reference bases and create VCF records
          ref1=$(get_ref_base 1000100 1)
          ref2=$(get_ref_base 1000200 1)
          ref3=$(get_ref_base 1000300 1)
          ref4=$(get_ref_base 1000400 1)
          ref5=$(get_ref_base 1000500 2)
          printf "chr20\t1000100\trs1234567\t${ref1}\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf
          printf "chr20\t1000200\trs2345678\t${ref2}\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf
          printf "chr20\t1000300\trs3456789\t${ref3}\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf
          printf "chr20\t1000400\trs4567890\t${ref4}\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf
          printf "chr20\t1000500\trs5678901\t${ref5}\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf
          # Index the VCF
          gatk IndexFeatureFile -I dbsnp.vcf
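          # IndexFeatureFile writes dbsnp.vcf.idx for the plain-text VCF; the bgzipped
          # VCFs below are indexed with tabix (.tbi) instead, matching the inputs JSON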
          # Create synthetic Mills and 1000G indels VCF similarly
          echo '##fileformat=VCFv4.2' > mills_1000G.vcf
          echo '##reference=GRCh38' >> mills_1000G.vcf
          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf
          echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf
          ref6=$(get_ref_base 1000150 2)
          ref7=$(get_ref_base 1000250 1)
          ref8=$(get_ref_base 1000350 3)
          ref9=$(get_ref_base 1000450 1)
          printf "chr20\t1000150\tMILL1\t${ref6}\t${ref6:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf
          printf "chr20\t1000250\tMILL2\t${ref7}\t${ref7}TT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf
          printf "chr20\t1000350\tG1000_1\t${ref8}\t${ref8:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf
          printf "chr20\t1000450\tG1000_2\t${ref9}\t${ref9}AGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf
          bgzip mills_1000G.vcf
          tabix -p vcf mills_1000G.vcf.gz
          # Create synthetic known indels VCF
          echo '##fileformat=VCFv4.2' > known_indels.vcf
          echo '##reference=GRCh38' >> known_indels.vcf
          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf
          ref10=$(get_ref_base 1000550 2)
          ref11=$(get_ref_base 1000650 1)
          ref12=$(get_ref_base 1000750 3)
          printf "chr20\t1000550\tindel1\t${ref10}\t${ref10:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
          printf "chr20\t1000650\tindel2\t${ref11}\t${ref11}TT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf
          printf "chr20\t1000750\tindel3\t${ref12}\t${ref12:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
          bgzip known_indels.vcf
          tabix -p vcf known_indels.vcf.gz
          # Create test JSON with all reference files
          cat << EOF > ../test-inputs.json
          {
            "PanelBwaGatk4Annovar.sample_batch": [
              {
                "sample_name": "test_sample",
                "bam_file": "test/data/test.unmapped.bam",
                "bed_file": "test/data/test.bed"
              }
            ],
            "PanelBwaGatk4Annovar.reference_genome": {
              "ref_name": "hg38",
              "ref_fasta": "test/data/ref.fasta",
              "ref_fasta_index": "test/data/ref.fasta.fai",
              "ref_dict": "test/data/ref.dict",
              "ref_pac": "test/data/ref.fasta.pac",
              "ref_sa": "test/data/ref.fasta.sa",
              "ref_amb": "test/data/ref.fasta.amb",
              "ref_ann": "test/data/ref.fasta.ann",
              "ref_bwt": "test/data/ref.fasta.bwt",
              "dbSNP_vcf": "test/data/dbsnp.vcf",
              "dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
              "known_indels_sites_VCFs": [
                "test/data/mills_1000G.vcf.gz",
                "test/data/known_indels.vcf.gz"
              ],
              "known_indels_sites_indices": [
                "test/data/mills_1000G.vcf.gz.tbi",
                "test/data/known_indels.vcf.gz.tbi"
              ],
              "annovar_protocols": "refGene",
              "annovar_operation": "g"
            }
          }
          EOF
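          # Paths in the inputs JSON are relative to the repository root, which is where
          # Cromwell is launched from in the "Run test workflow" step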
          # Add debugging for BQSR inputs
          echo "=== Checking BQSR input files ==="
          echo "dbSNP VCF:"
          head -n 5 dbsnp.vcf
          echo "Mills & 1000G VCF:"
          zcat mills_1000G.vcf.gz | head -n 5
          echo "Known indels VCF:"
          zcat known_indels.vcf.gz | head -n 5
          # Validate VCF files
          echo "=== Validating VCF files ==="
          gatk ValidateVariants -V dbsnp.vcf -R ref.fasta
          gatk ValidateVariants -V mills_1000G.vcf.gz -R ref.fasta
          gatk ValidateVariants -V known_indels.vcf.gz -R ref.fasta
          # Navigate back to the original directory
          cd ../../

      - name: Generate test BAM
        run: |
          # Create a small structured test SAM with multiple read pairs.
          # SAM fields must be tab-separated, so the records are written with printf, and
          # the @SQ length is taken from the reference index so it matches ref.dict.
          CHR20_LEN=$(cut -f2 test/data/ref.fasta.fai)
          {
            printf '@HD\tVN:1.6\tSO:queryname\n'
            printf '@SQ\tSN:chr20\tLN:%s\n' "$CHR20_LEN"
            printf '@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\tLB:lib1\tPU:unit1\n'
            for read in read1 read2 read3; do
              printf '%s\t77\t*\t0\t0\t*\t*\t0\t0\tACTGACTGACTGACTG\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read"
              printf '%s\t141\t*\t0\t0\t*\t*\t0\t0\tCGTACGTACGTACGTA\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read"
            done
          } > test/data/test.sam
          # Convert SAM to BAM and sort by query name (unmapped BAM convention)
          samtools view -b test/data/test.sam > test/data/test.unsorted.bam
          samtools sort -n -o test/data/test.unmapped.bam test/data/test.unsorted.bam
          samtools index test/data/test.unmapped.bam
          # Debug: Check the BAM
          echo "=== Checking final BAM structure ==="
          samtools view -H test/data/test.unmapped.bam
          echo "=== First few reads ==="
          samtools view test/data/test.unmapped.bam | head -n 2
          # Clean up intermediate files
          rm test/data/test.sam test/data/test.unsorted.bam
          # Validate BAM file
          echo "=== Validating BAM file ==="
          gatk ValidateSamFile -I test/data/test.unmapped.bam

      - name: Run test workflow
        run: |
          # Create a directory for workflow logs
          mkdir -p workflow_logs
          # Run workflow with more verbose logging.
          # The WDL source is a positional argument to "cromwell run" (Cromwell's -v flag
          # means --type-version, not the workflow file), and pipefail is needed so the
          # failure check sees java's exit status rather than tee's.
          set -o pipefail
          if ! java -Dbackend.providers.Local.config.root="workflow_logs" \
            -Xmx4g \
            -jar cromwell-86.jar run \
            variantCalling/variantCalling.wdl \
            --inputs test/test-inputs.json \
            --metadata-output workflow_logs/metadata.json \
            2>&1 | tee workflow_logs/workflow.log; then
            echo "=== Workflow failed, printing debug information ==="
            echo "=== Last 50 lines of workflow log ==="
            tail -n 50 workflow_logs/workflow.log
            echo "=== Execution directory contents ==="
            ls -R workflow_logs
            # Print content of stderr from the failed task if it exists
            find workflow_logs -name "stderr" -exec sh -c 'echo "=== Content of {}" && cat {}' \;
            # Exit with failure
            exit 1
          fi

      # - name: Run test workflow
      #   run: |
      #     java -Xmx4g -jar cromwell-86.jar run variantCalling/variantCalling.wdl --inputs test/test-inputs.json

      - name: Check outputs
        run: |
          # Basic output existence checks. "test -f $(find ...)" passes even when find
          # returns nothing, so fail explicitly; search both possible execution roots.
          check_output() {
            find workflow_logs cromwell-executions -name "$1" 2>/dev/null | grep -q . \
              || { echo "Missing expected output: $1"; exit 1; }
          }
          check_output "*.recal.bam"
          check_output "*.GATK.vcf"
          check_output "*_multianno.txt"