Adding GitHub Action for Variant Calling Test Run #16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Variant Calling Test Run | |
on: | |
pull_request: | |
paths: | |
- 'variantCalling/variantCalling.wdl' | |
- '.github/workflows/variant-calling-test-run.yml' | |
jobs: | |
test: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Install required tools | |
run: | | |
# Install system packages | |
sudo apt-get update | |
sudo apt-get install -y wget curl unzip default-jre samtools bwa tabix | |
# Download and install GATK | |
wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip | |
unzip gatk-4.4.0.0.zip | |
# Make gatk executable available | |
sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk | |
- name: Install Cromwell | |
run: | | |
wget https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar | |
mv cromwell-86.jar cromwell.jar | |
- name: Setup Test Data | |
run: | | |
mkdir -p test/data | |
cd test/data | |
# Download reference chromosome 20 from NCBI | |
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz | |
gunzip chr20.fa.gz | |
# Add "chr" prefix to sequence name | |
sed 's/^>.*/>chr20/' chr20.fa > ref.fasta | |
rm chr20.fa | |
# Create sequence dictionary and index | |
samtools faidx ref.fasta | |
gatk CreateSequenceDictionary -R ref.fasta -O ref.fasta.dict | |
# Debug: Check the content of our files | |
echo "=== Reference FASTA header ===" | |
head -n 1 ref.fasta | |
echo "=== Reference FAI content ===" | |
cat ref.fasta.fai | |
echo "=== Dictionary content ===" | |
head -n 5 ref.fasta.dict | |
# Create and sort test region bed file | |
echo -e "chr20\t1000000\t1100000" > test.bed | |
sort -k1,1 -k2,2n test.bed > sorted.bed | |
mv sorted.bed test.bed | |
# Debug: Verify bed file content | |
echo "=== BED file content ===" | |
cat test.bed | |
# Test bed to interval_list conversion directly | |
echo "=== Testing bed to interval_list conversion ===" | |
gatk BedToIntervalList \ | |
-I test.bed \ | |
-O test.interval_list \ | |
-SD ref.fasta.dict | |
# Generate BWA indexes | |
bwa index ref.fasta | |
# Create a minimal synthetic dbSNP VCF for testing | |
# Write header lines | |
echo '##fileformat=VCFv4.2' > dbsnp.vcf | |
echo '##reference=GRCh38' >> dbsnp.vcf | |
echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf | |
printf "chr20\t1000100\trs1234567\tA\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf | |
printf "chr20\t1000200\trs2345678\tT\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf | |
printf "chr20\t1000300\trs3456789\tG\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf | |
printf "chr20\t1000400\trs4567890\tC\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf | |
printf "chr20\t1000500\trs5678901\tAG\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf | |
gatk IndexFeatureFile -I dbsnp.vcf | |
# Create synthetic Mills and 1000G indels VCF | |
echo '##fileformat=VCFv4.2' > mills_1000G.vcf | |
echo '##reference=GRCh38' >> mills_1000G.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf | |
echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf | |
printf "chr20\t1000150\tMILL1\tAT\tA\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000250\tMILL2\tG\tGTT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000350\tG1000_1\tCTA\tC\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf | |
printf "chr20\t1000450\tG1000_2\tT\tTAGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf | |
bgzip mills_1000G.vcf | |
tabix -p vcf mills_1000G.vcf.gz | |
# Create synthetic known indels VCF | |
echo '##fileformat=VCFv4.2' > known_indels.vcf | |
echo '##reference=GRCh38' >> known_indels.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf | |
printf "chr20\t1000550\tindel1\tAT\tA\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
printf "chr20\t1000650\tindel2\tG\tGTT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf | |
printf "chr20\t1000750\tindel3\tCTA\tC\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
bgzip known_indels.vcf | |
tabix -p vcf known_indels.vcf.gz | |
# Create test JSON with all reference files | |
cat << EOF > ../test-inputs.json | |
{ | |
"PanelBwaGatk4Annovar.sample_batch": [ | |
{ | |
"sample_name": "test_sample", | |
"bam_file": "test/data/test.unmapped.bam", | |
"bed_file": "test/data/test.bed" | |
} | |
], | |
"PanelBwaGatk4Annovar.reference_genome": { | |
"ref_name": "hg38", | |
"ref_fasta": "test/data/ref.fasta", | |
"ref_fasta_index": "test/data/ref.fasta.fai", | |
"ref_dict": "test/data/ref.fasta.dict", | |
"ref_pac": "test/data/ref.fasta.pac", | |
"ref_sa": "test/data/ref.fasta.sa", | |
"ref_amb": "test/data/ref.fasta.amb", | |
"ref_ann": "test/data/ref.fasta.ann", | |
"ref_bwt": "test/data/ref.fasta.bwt", | |
"dbSNP_vcf": "test/data/dbsnp.vcf", | |
"dbSNP_vcf_index": "test/data/dbsnp.vcf.idx", | |
"known_indels_sites_VCFs": [ | |
"test/data/mills_1000G.vcf.gz", | |
"test/data/known_indels.vcf.gz" | |
], | |
"known_indels_sites_indices": [ | |
"test/data/mills_1000G.vcf.gz.tbi", | |
"test/data/known_indels.vcf.gz.tbi" | |
], | |
"annovar_protocols": "refGene", | |
"annovar_operation": "g" | |
} | |
} | |
EOF | |
- name: Generate test BAM | |
run: | | |
samtools view -b -o test/data/test.unmapped.bam \ | |
<(echo -e "@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\n@SQ\tSN:chr20\tLN:63025520\nread1\t4\t*\t0\t0\t*\t*\t0\t0\tACTG\tFFFF\tRG:Z:test") | |
- name: Run test workflow | |
run: | | |
java -jar cromwell.jar run variantCalling/variantCalling.wdl -i test/test-inputs.json | |
- name: Check outputs | |
run: | | |
# Basic output existence checks | |
test -f $(find cromwell-executions -name "*.recal.bam") | |
test -f $(find cromwell-executions -name "*.GATK.vcf") | |
test -f $(find cromwell-executions -name "*_multianno.txt") |