Adding GitHub Action for Variant Calling Test Run #14
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity and trigger.
name: Variant Calling Test Run
on:
  pull_request:
    # Run only for pull requests that touch the WDL under test or this
    # workflow definition itself.
    paths:
      - "variantCalling/variantCalling.wdl"
      - ".github/workflows/variant-calling-test-run.yml"
jobs:
  # Single job named "test", executed on the ubuntu-latest runner image.
  test:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so later steps can read
      # variantCalling/variantCalling.wdl from the workspace.
      - uses: actions/checkout@v4
- name: Install required tools
  run: |
    # Install the command-line tools used by the following steps
    # (wget, unzip, a Java runtime, samtools, bwa, tabix).
    # NOTE(review): bgzip is called later but not installed by name here;
    # presumably it ships with the tabix package - confirm.
    sudo apt-get update
    sudo apt-get install -y wget curl unzip default-jre samtools bwa tabix

    # Download and unpack the GATK release into the current directory.
    # The version is named once so the URL, archive and directory agree.
    gatk_version=4.4.0.0
    wget "https://github.com/broadinstitute/gatk/releases/download/${gatk_version}/gatk-${gatk_version}.zip"
    unzip "gatk-${gatk_version}.zip"

    # Expose the gatk launcher on PATH. The target is quoted so a working
    # directory containing spaces cannot split it, and -f replaces a link
    # left behind by a previous run instead of failing.
    sudo ln -sf "$PWD/gatk-${gatk_version}/gatk" /usr/local/bin/gatk
- name: Install Cromwell
  run: |
    # Fetch the Cromwell release jar and store it in the working directory
    # under the version-free name cromwell.jar.
    cromwell_release=86
    wget "https://github.com/broadinstitute/cromwell/releases/download/${cromwell_release}/cromwell-${cromwell_release}.jar"
    mv "cromwell-${cromwell_release}.jar" cromwell.jar
- name: Setup Test Data
  run: |
    # Builds everything the workflow inputs JSON points at, inside test/data:
    #   - ref.fasta plus its .fai, .dict and BWA index files
    #   - three small synthetic known-sites VCFs with indexes
    #   - test.bed, the region to analyse
    # and writes the inputs JSON itself to test/test-inputs.json.
    mkdir -p test/data
    cd test/data

    # Download reference chromosome 20 from NCBI
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
    gunzip chr20.fa.gz
    # Add "chr" prefix to sequence name
    sed 's/^>20/>chr20/' chr20.fa > ref.fasta
    rm chr20.fa

    # Create sequence index, sequence dictionary and BWA indexes
    samtools faidx ref.fasta
    gatk CreateSequenceDictionary -R ref.fasta -O ref.fasta.dict
    bwa index ref.fasta

    # Look up the chr20 length once and stop if it is missing. Otherwise
    # every VCF below would silently get an empty "length=" in its contig
    # header, which only shows up much later as a confusing tool error.
    chr20_len=$(awk -F'\t' '$1 == "chr20" { print $2; exit }' ref.fasta.fai)
    if [[ -z "$chr20_len" ]]; then
      echo "chr20 not found in ref.fasta.fai; check the FASTA header rewrite" >&2
      exit 1
    fi
    contig_header="##contig=<ID=chr20,length=${chr20_len}>"

    # Print the #CHROM column header, then one tab-separated record for
    # every group of eight arguments (printf reuses its format per group).
    vcf_body() {
      printf '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
      printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$@"
    }

    # Create a minimal synthetic dbSNP VCF for testing (plain text + .idx)
    {
      printf '%s\n' \
        '##fileformat=VCFv4.2' \
        '##reference=GRCh38' \
        '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' \
        "$contig_header"
      vcf_body \
        chr20 1000100 rs1234567 A G . PASS RS=1234567 \
        chr20 1000200 rs2345678 T C . PASS RS=2345678 \
        chr20 1000300 rs3456789 G A . PASS RS=3456789 \
        chr20 1000400 rs4567890 C T . PASS RS=4567890 \
        chr20 1000500 rs5678901 AG A . PASS RS=5678901
    } > dbsnp.vcf
    gatk IndexFeatureFile -I dbsnp.vcf

    # Create synthetic Mills and 1000G indels VCF (bgzip + tabix index)
    {
      printf '%s\n' \
        '##fileformat=VCFv4.2' \
        '##reference=GRCh38' \
        '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' \
        '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' \
        "$contig_header"
      vcf_body \
        chr20 1000150 MILL1 AT A . PASS 'TYPE=deletion;SOURCE=MILLS' \
        chr20 1000250 MILL2 G GTT . PASS 'TYPE=insertion;SOURCE=MILLS' \
        chr20 1000350 G1000_1 CTA C . PASS 'TYPE=deletion;SOURCE=1000G' \
        chr20 1000450 G1000_2 T TAGC . PASS 'TYPE=insertion;SOURCE=1000G'
    } > mills_1000G.vcf
    bgzip mills_1000G.vcf
    tabix -p vcf mills_1000G.vcf.gz

    # Create synthetic known indels VCF (bgzip + tabix index)
    {
      printf '%s\n' \
        '##fileformat=VCFv4.2' \
        '##reference=GRCh38' \
        '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' \
        "$contig_header"
      vcf_body \
        chr20 1000550 indel1 AT A . PASS TYPE=deletion \
        chr20 1000650 indel2 G GTT . PASS TYPE=insertion \
        chr20 1000750 indel3 CTA C . PASS TYPE=deletion
    } > known_indels.vcf
    bgzip known_indels.vcf
    tabix -p vcf known_indels.vcf.gz

    # Create minimal test region
    printf 'chr20\t1000000\t1100000\n' > test.bed

    # Create test JSON with all reference files. The working directory is
    # test/data, so ../test-inputs.json lands at test/test-inputs.json.
    # The delimiter is quoted so nothing in the JSON is ever shell-expanded.
    cat << 'EOF' > ../test-inputs.json
    {
      "PanelBwaGatk4Annovar.sample_batch": [
        {
          "sample_name": "test_sample",
          "bam_file": "test/data/test.unmapped.bam",
          "bed_file": "test/data/test.bed"
        }
      ],
      "PanelBwaGatk4Annovar.reference_genome": {
        "ref_name": "hg38",
        "ref_fasta": "test/data/ref.fasta",
        "ref_fasta_index": "test/data/ref.fasta.fai",
        "ref_dict": "test/data/ref.fasta.dict",
        "ref_pac": "test/data/ref.fasta.pac",
        "ref_sa": "test/data/ref.fasta.sa",
        "ref_amb": "test/data/ref.fasta.amb",
        "ref_ann": "test/data/ref.fasta.ann",
        "ref_bwt": "test/data/ref.fasta.bwt",
        "dbSNP_vcf": "test/data/dbsnp.vcf",
        "dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
        "known_indels_sites_VCFs": [
          "test/data/mills_1000G.vcf.gz",
          "test/data/known_indels.vcf.gz"
        ],
        "known_indels_sites_indices": [
          "test/data/mills_1000G.vcf.gz.tbi",
          "test/data/known_indels.vcf.gz.tbi"
        ],
        "annovar_protocols": "refGene",
        "annovar_operation": "g"
      }
    }
    EOF
- name: Generate test BAM
  run: |
    # Build a tiny unmapped BAM (one read, one read group) to use as the
    # sample input named in test/test-inputs.json.
    #
    # The @SQ length is read from the reference index written by the setup
    # step, so the BAM header agrees with the reference actually in use
    # instead of relying on a hard-coded number.
    chr20_len=$(awk -F'\t' '$1 == "chr20" { print $2; exit }' test/data/ref.fasta.fai)
    if [[ -z "$chr20_len" ]]; then
      echo "chr20 not found in test/data/ref.fasta.fai" >&2
      exit 1
    fi

    # The SAM text is fed to samtools through process substitution; the read
    # has flag 4 (unmapped) and carries the RG tag declared in the header.
    samtools view -b -o test/data/test.unmapped.bam \
      <(printf '@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\n@SQ\tSN:chr20\tLN:%s\nread1\t4\t*\t0\t0\t*\t*\t0\t0\tACTG\tFFFF\tRG:Z:test\n' "$chr20_len")
- name: Run test workflow
  run: |
    # Execute the WDL with the Cromwell jar from the install step, using the
    # inputs JSON written during test-data setup.
    java -jar cromwell.jar run \
      variantCalling/variantCalling.wdl \
      -i test/test-inputs.json
- name: Check outputs
  run: |
    # Basic output existence checks: at least one regular file matching each
    # pattern must exist somewhere under cromwell-executions.
    #
    # The find result is tested for emptiness explicitly. Passing it
    # unquoted to `test -f` is unreliable: with no match the command
    # collapses to `test -f`, which succeeds, and with several matches it
    # fails with "too many arguments".
    missing=0
    for pattern in '*.recal.bam' '*.GATK.vcf' '*_multianno.txt'; do
      # -print -quit stops at the first hit; one match is all we need.
      first_match=$(find cromwell-executions -type f -name "$pattern" -print -quit)
      if [[ -z "$first_match" ]]; then
        echo "No output matching ${pattern} found under cromwell-executions" >&2
        missing=1
      fi
    done
    # Report every missing pattern before failing the step.
    exit "$missing"