# Adding GitHub Action for Variant Calling Test Run (PR #33)
name: Variant Calling Test Run

# Runs a small end-to-end smoke test of the variant-calling WDL workflow on
# synthetic chr20 data whenever the workflow or this action changes.
on:
  pull_request:
    paths:
      - 'variantCalling/variantCalling.wdl'
      - '.github/workflows/variant-calling-test-run.yml'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install required tools
        run: |
          # Install system packages
          sudo apt-get update
          sudo apt-get install -y wget curl unzip default-jre samtools bwa tabix
          # Download and install GATK. The host GATK (4.4.0.0) is used only for
          # test-data preparation; the workflow tasks run GATK inside docker.
          # NOTE(review): docker images below pin 4.3.0.0 — confirm the version
          # skew is intentional.
          wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip
          unzip gatk-4.4.0.0.zip
          sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk

      - name: Install Cromwell
        run: |
          wget https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar
          mv cromwell-86.jar cromwell.jar
          # Create Cromwell config file.
          # The heredoc delimiter is quoted ('EOF') so that Cromwell's
          # ${cwd}/${docker_cwd}/${docker}/${script} placeholders reach the
          # config literally instead of being expanded (to empty) by the shell.
          cat << 'EOF' > cromwell.conf
          include required(classpath("application"))
          backend {
            default = "LocalExample"
            providers {
              LocalExample {
                actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
                config {
                  runtime-attributes = """
                  Int? runtime_minutes = 10
                  Int? cpu = 1
                  Int? memory_mb = 4096
                  String? docker
                  """
                  # Plain (non-docker) submit: run the task script directly on
                  # the host. Tasks that declare a docker runtime attribute use
                  # submit-docker below instead.
                  submit = """
                  /bin/bash ${script}
                  """
                  submit-docker = """
                  docker run \
                    --rm \
                    -v ${cwd}:${docker_cwd} \
                    -i \
                    ${docker} \
                    /bin/bash ${script}
                  """
                  root = "cromwell-executions"
                  dockerRoot = "cromwell-executions"
                  # Map between runtime attributes and docker runtime flags
                  # (attribute name matches the memory_mb declared above)
                  runtime-attributes-mapping {
                    cpu: "-c"
                    memory_mb: "-m"
                  }
                  concurrent-job-limit = 2
                  # Mount paths for docker
                  docker {
                    hash-lookup {
                      enabled = false
                    }
                    cwd = "${docker_cwd}"
                  }
                }
              }
            }
          }
          system {
            job-rate-control {
              jobs = 1
              per = 1 second
            }
            abort-jobs-on-terminate = true
            workflow-heartbeat {
              ttl = "10 minutes"
              heartbeat-interval = "2 minutes"
              write-batch-size = 10000
              write-threshold = 10000
            }
          }
          database {
            profile = "slick.jdbc.HsqldbProfile$"
            db {
              driver = "org.hsqldb.jdbcDriver"
              url = "jdbc:hsqldb:mem:cromwell;shutdown=false;hsqldb.tx=mvcc"
              connectionTimeout = 3000
            }
          }
          EOF

      - name: Setup Test Data
        run: |
          mkdir -p test/data
          cd test/data
          # Download reference chromosome 20 from NCBI
          # NOTE(review): this FTP path points at an archived GenBank layout —
          # verify the URL is still live.
          wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
          gunzip chr20.fa.gz
          # Add "chr" prefix to sequence name
          sed 's/^>.*/>chr20/' chr20.fa > ref.fasta
          rm chr20.fa
          # Create sequence dictionary and index
          samtools faidx ref.fasta
          gatk CreateSequenceDictionary -R ref.fasta -O ref.fasta.dict
          # Debug: Check the content of our files
          echo "=== Reference FASTA header ==="
          head -n 1 ref.fasta
          echo "=== Reference FAI content ==="
          cat ref.fasta.fai
          echo "=== Dictionary content ==="
          head -n 5 ref.fasta.dict
          # Create and sort test region bed file (printf guarantees real tabs)
          printf 'chr20\t1000000\t1100000\n' > test.bed
          sort -k1,1 -k2,2n test.bed > sorted.bed
          mv sorted.bed test.bed
          # Debug: Verify bed file content
          echo "=== BED file content ==="
          cat test.bed
          # Test bed to interval_list conversion directly
          echo "=== Testing bed to interval_list conversion ==="
          gatk BedToIntervalList \
            -I test.bed \
            -O test.interval_list \
            -SD ref.fasta.dict
          # Generate BWA indexes
          bwa index ref.fasta
          # Create a minimal synthetic dbSNP VCF for testing
          # Write header lines ($(...) below intentionally expands to the
          # chr20 length taken from the .fai index)
          echo '##fileformat=VCFv4.2' > dbsnp.vcf
          echo '##reference=GRCh38' >> dbsnp.vcf
          echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf
          printf "chr20\t1000100\trs1234567\tA\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf
          printf "chr20\t1000200\trs2345678\tT\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf
          printf "chr20\t1000300\trs3456789\tG\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf
          printf "chr20\t1000400\trs4567890\tC\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf
          printf "chr20\t1000500\trs5678901\tAG\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf
          gatk IndexFeatureFile -I dbsnp.vcf
          # Create synthetic Mills and 1000G indels VCF
          echo '##fileformat=VCFv4.2' > mills_1000G.vcf
          echo '##reference=GRCh38' >> mills_1000G.vcf
          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf
          echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf
          printf "chr20\t1000150\tMILL1\tAT\tA\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf
          printf "chr20\t1000250\tMILL2\tG\tGTT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf
          printf "chr20\t1000350\tG1000_1\tCTA\tC\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf
          printf "chr20\t1000450\tG1000_2\tT\tTAGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf
          bgzip mills_1000G.vcf
          tabix -p vcf mills_1000G.vcf.gz
          # Create synthetic known indels VCF
          echo '##fileformat=VCFv4.2' > known_indels.vcf
          echo '##reference=GRCh38' >> known_indels.vcf
          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf
          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf
          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf
          printf "chr20\t1000550\tindel1\tAT\tA\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
          printf "chr20\t1000650\tindel2\tG\tGTT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf
          printf "chr20\t1000750\tindel3\tCTA\tC\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
          bgzip known_indels.vcf
          tabix -p vcf known_indels.vcf.gz
          # Create test JSON with all reference files (quoted 'EOF': the JSON
          # must be written verbatim, with no shell expansion)
          cat << 'EOF' > ../test-inputs.json
          {
            "PanelBwaGatk4Annovar.sample_batch": [
              {
                "sample_name": "test_sample",
                "bam_file": "test/data/test.unmapped.bam",
                "bed_file": "test/data/test.bed"
              }
            ],
            "PanelBwaGatk4Annovar.reference_genome": {
              "ref_name": "hg38",
              "ref_fasta": "test/data/ref.fasta",
              "ref_fasta_index": "test/data/ref.fasta.fai",
              "ref_dict": "test/data/ref.fasta.dict",
              "ref_pac": "test/data/ref.fasta.pac",
              "ref_sa": "test/data/ref.fasta.sa",
              "ref_amb": "test/data/ref.fasta.amb",
              "ref_ann": "test/data/ref.fasta.ann",
              "ref_bwt": "test/data/ref.fasta.bwt",
              "dbSNP_vcf": "test/data/dbsnp.vcf",
              "dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
              "known_indels_sites_VCFs": [
                "test/data/mills_1000G.vcf.gz",
                "test/data/known_indels.vcf.gz"
              ],
              "known_indels_sites_indices": [
                "test/data/mills_1000G.vcf.gz.tbi",
                "test/data/known_indels.vcf.gz.tbi"
              ],
              "annovar_protocols": "refGene",
              "annovar_operation": "g"
            }
          }
          EOF

      - name: Generate test BAM
        run: |
          # Build a small unmapped paired-end test SAM (3 read pairs, flags
          # 77/141 = paired+unmapped first/second of pair). SAM columns must be
          # tab-separated, so printf with explicit \t is used rather than a
          # heredoc, where tabs are easily lost.
          {
            printf '@HD\tVN:1.6\tSO:queryname\n'
            printf '@SQ\tSN:chr20\tLN:63025520\n'
            printf '@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\tLB:lib1\tPU:unit1\n'
            for r in read1 read2 read3; do
              printf '%s\t77\t*\t0\t0\t*\t*\t0\t0\tACTGACTGACTGACTG\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$r"
              printf '%s\t141\t*\t0\t0\t*\t*\t0\t0\tCGTACGTACGTACGTA\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$r"
            done
          } > test/data/test.sam
          # Convert SAM to BAM with queryname sorting (unmapped input BAM)
          samtools view -b test/data/test.sam > test/data/test.unsorted.bam
          samtools sort -n test/data/test.unsorted.bam > test/data/test.unmapped.bam
          # NOTE(review): no `samtools index` here — BAI indexing requires a
          # coordinate-sorted BAM, this one is queryname-sorted/all-unmapped,
          # and no later step reads a .bai.
          # Debug: Check the BAM
          echo "=== Checking final BAM structure ==="
          samtools view -H test/data/test.unmapped.bam
          echo "=== First few reads ==="
          samtools view test/data/test.unmapped.bam | head -n 2
          # Clean up intermediate files
          rm test/data/test.sam test/data/test.unsorted.bam

      - name: Verify test data
        run: |
          echo "=== Verifying Reference Files ==="
          ls -l test/data/ref*
          echo "=== Verifying BAM File ==="
          samtools view -H test/data/test.unmapped.bam
          echo "=== Verifying VCF Files ==="
          ls -l test/data/*.vcf*
          echo "=== Verifying Bed Files ==="
          cat test/data/test.bed

      - name: Verify reference files in detail
        run: |
          echo "=== Reference FASTA dict content ==="
          cat test/data/ref.fasta.dict
          echo "=== Reference FAI content ==="
          cat test/data/ref.fasta.fai
          echo "=== Reference ANN content ==="
          cat test/data/ref.fasta.ann
          # Check if the reference FASTA header matches what's in the dict
          echo "=== First line of reference FASTA ==="
          head -n 1 test/data/ref.fasta

      - name: Verify GATK installation
        run: |
          echo "=== GATK Installation ==="
          ls -l /usr/local/bin/gatk || echo "GATK not found in /usr/local/bin"
          which gatk || echo "GATK not in PATH"
          gatk --version || echo "Cannot get GATK version"
          echo "=== Java Version ==="
          java -version

      - name: Debug Docker setup
        run: |
          echo "=== Docker Version ==="
          docker --version
          echo "=== Docker Info ==="
          docker info
          echo "=== Available Docker Images ==="
          docker images
          echo "=== Testing Docker Pull ==="
          docker pull getwilds/gatk:4.3.0.0
          docker pull getwilds/bwa:0.7.17
          docker pull getwilds/annovar:hg38

      - name: Verify Docker volumes
        run: |
          echo "=== Testing Docker Mount ==="
          docker run --rm \
            -v $(pwd)/test:/test \
            getwilds/gatk:4.3.0.0 \
            ls -la /test

      - name: Run test workflow
        run: |
          echo "=== Pre-execution Directory Check ==="
          ls -la
          echo "=== Running Workflow ==="
          java -Dconfig.file=cromwell.conf \
            -jar cromwell.jar run \
            variantCalling/variantCalling.wdl \
            -i test/test-inputs.json
          echo "=== Post-execution Directory Check ==="
          ls -la cromwell-executions/

      - name: Debug script execution
        if: always() # Run even if previous step failed
        run: |
          if [ -d "cromwell-executions" ]; then
            echo "=== SortBed Script ==="
            find cromwell-executions -name "script" -type f | while read script; do
              echo "=== Content of $script ==="
              cat "$script"
            done
            echo "=== Directory Structure ==="
            find cromwell-executions -type d | while read dir; do
              echo "=== Contents of $dir ==="
              ls -la "$dir" || echo "Cannot access directory"
            done
          else
            echo "No cromwell-executions directory found"
          fi

      - name: Check outputs
        run: |
          # Fail when any expected workflow output is missing. The naive
          # `test -f $(find ...)` is wrong: with no match the substitution is
          # empty and single-argument `test -f` evaluates TRUE, so the check
          # could never fail. Verify each pattern explicitly instead.
          status=0
          for pattern in '*.recal.bam' '*.GATK.vcf' '*_multianno.txt'; do
            found=$(find cromwell-executions -name "$pattern" -type f | head -n 1)
            if [ -z "$found" ]; then
              echo "Missing expected output: $pattern"
              status=1
            else
              echo "Found $pattern: $found"
            fi
          done
          exit $status