# Adding GitHub Action for Variant Calling Test Run (#33)

# Test run of the variant-calling WDL. Triggered only by pull requests that
# touch the WDL itself or this workflow definition.
name: Variant Calling Test Run
on:
  pull_request:
    paths:
      - "variantCalling/variantCalling.wdl"
      - ".github/workflows/variant-calling-test-run.yml"
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
# Installs the command-line tools the later steps invoke directly on the
# runner (wget, unzip, Java, samtools, bwa, tabix) plus a GATK release.
- name: Install required tools
  run: |
    # Single place to change the GATK release used on the runner
    GATK_VERSION=4.4.0.0

    # Install system packages
    sudo apt-get update
    sudo apt-get install -y wget curl unzip default-jre samtools bwa tabix

    # Download and install GATK
    wget "https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip"
    unzip "gatk-${GATK_VERSION}.zip"
    # Quote the path so it survives whitespace; -f keeps re-runs from failing
    # when the link already exists.
    sudo ln -sf "$PWD/gatk-${GATK_VERSION}/gatk" /usr/local/bin/gatk
# Downloads the Cromwell 86 jar and writes cromwell.conf, which defines a
# Docker-based "LocalExample" backend and an in-memory HSQLDB database.
- name: Install Cromwell
  run: |
    wget https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar
    mv cromwell-86.jar cromwell.jar

    # Create Cromwell config file.
    # The heredoc delimiter is quoted on purpose: ${cwd}, ${docker_cwd},
    # ${docker} and ${script} are placeholders that Cromwell fills in at
    # submit time. With an unquoted delimiter the shell would expand them
    # (to empty strings) and join the backslash-continued lines before the
    # file is written.
    # NOTE(review): runtime-attributes-mapping refers to memory_gb while the
    # declared runtime attribute is memory_mb - confirm which is intended.
    cat << 'EOF' > cromwell.conf
    include required(classpath("application"))
    backend {
      default = "LocalExample"
      providers {
        LocalExample {
          actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
          config {
            runtime-attributes = """
            Int? runtime_minutes = 10
            Int? cpu = 1
            Int? memory_mb = 4096
            String? docker
            """
            submit = """
            docker run \
              --rm \
              -v ${cwd}:${docker_cwd} \
              -i \
              ${docker} \
              /bin/bash ${script}
            """
            submit-docker = """
            docker run \
              --rm \
              -v ${cwd}:${docker_cwd} \
              -i \
              ${docker} \
              /bin/bash ${script}
            """
            root = "cromwell-executions"
            dockerRoot = "cromwell-executions"
            # Map between runtime attributes and docker runtime flags
            runtime-attributes-mapping {
              cpu: "-c"
              memory_gb: "-m"
            }
            concurrent-job-limit = 2
            # Mount paths for docker
            docker {
              hash-lookup {
                enabled = false
              }
              cwd = "${docker_cwd}"
            }
          }
        }
      }
    }
    system {
      job-rate-control {
        jobs = 1
        per = 1 second
      }
      abort-jobs-on-terminate = true
      workflow-heartbeat {
        ttl = "10 minutes"
        heartbeat-interval = "2 minutes"
        write-batch-size = 10000
        write-threshold = 10000
      }
    }
    database {
      profile = "slick.jdbc.HsqldbProfile$"
      db {
        driver = "org.hsqldb.jdbcDriver"
        url = "jdbc:hsqldb:mem:cromwell;shutdown=false;hsqldb.tx=mvcc"
        connectionTimeout = 3000
      }
    }
    EOF
# Builds everything under test/data that the workflow inputs point at: a
# chr20-only reference with its indexes, a one-interval BED file, three small
# synthetic known-sites VCFs, and test/test-inputs.json.
- name: Setup Test Data
  run: |
    mkdir -p test/data
    cd test/data

    # Fetch chromosome 20 of the reference from NCBI and unpack it
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
    gunzip chr20.fa.gz
    # Replace the FASTA header line so the contig is named "chr20"
    sed 's/^>.*/>chr20/' chr20.fa > ref.fasta
    rm chr20.fa

    # FASTA index and sequence dictionary
    samtools faidx ref.fasta
    gatk CreateSequenceDictionary -R ref.fasta -O ref.fasta.dict

    # Debug: dump what was just produced
    echo "=== Reference FASTA header ==="
    head -n 1 ref.fasta
    echo "=== Reference FAI content ==="
    cat ref.fasta.fai
    echo "=== Dictionary content ==="
    head -n 5 ref.fasta.dict

    # Single test interval, sorted in place by contig then start
    echo -e "chr20\t1000000\t1100000" > test.bed
    sort -k1,1 -k2,2n -o test.bed test.bed

    # Debug: show the BED file
    echo "=== BED file content ==="
    cat test.bed

    # Exercise the BED -> interval_list conversion directly
    echo "=== Testing bed to interval_list conversion ==="
    gatk BedToIntervalList -I test.bed -O test.interval_list -SD ref.fasta.dict

    # BWA index files (.amb/.ann/.bwt/.pac/.sa)
    bwa index ref.fasta

    # The same ##contig header line is shared by all three synthetic VCFs;
    # its length is read from the FASTA index.
    contig_header="##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>"

    # Minimal synthetic dbSNP VCF (plain text, indexed by GATK)
    {
      echo '##fileformat=VCFv4.2'
      echo '##reference=GRCh38'
      echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">'
      echo "$contig_header"
      printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
      printf "chr20\t1000100\trs1234567\tA\tG\t.\tPASS\tRS=1234567\n"
      printf "chr20\t1000200\trs2345678\tT\tC\t.\tPASS\tRS=2345678\n"
      printf "chr20\t1000300\trs3456789\tG\tA\t.\tPASS\tRS=3456789\n"
      printf "chr20\t1000400\trs4567890\tC\tT\t.\tPASS\tRS=4567890\n"
      printf "chr20\t1000500\trs5678901\tAG\tA\t.\tPASS\tRS=5678901\n"
    } > dbsnp.vcf
    gatk IndexFeatureFile -I dbsnp.vcf

    # Synthetic Mills and 1000G indels VCF (bgzipped, tabix-indexed)
    {
      echo '##fileformat=VCFv4.2'
      echo '##reference=GRCh38'
      echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">'
      echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">'
      echo "$contig_header"
      printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
      printf "chr20\t1000150\tMILL1\tAT\tA\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n"
      printf "chr20\t1000250\tMILL2\tG\tGTT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n"
      printf "chr20\t1000350\tG1000_1\tCTA\tC\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n"
      printf "chr20\t1000450\tG1000_2\tT\tTAGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n"
    } > mills_1000G.vcf
    bgzip mills_1000G.vcf
    tabix -p vcf mills_1000G.vcf.gz

    # Synthetic known indels VCF (bgzipped, tabix-indexed)
    {
      echo '##fileformat=VCFv4.2'
      echo '##reference=GRCh38'
      echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">'
      echo "$contig_header"
      printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
      printf "chr20\t1000550\tindel1\tAT\tA\t.\tPASS\tTYPE=deletion\n"
      printf "chr20\t1000650\tindel2\tG\tGTT\t.\tPASS\tTYPE=insertion\n"
      printf "chr20\t1000750\tindel3\tCTA\tC\t.\tPASS\tTYPE=deletion\n"
    } > known_indels.vcf
    bgzip known_indels.vcf
    tabix -p vcf known_indels.vcf.gz

    # Workflow inputs; written one level up, i.e. to test/test-inputs.json
    cat > ../test-inputs.json << EOF
    {
      "PanelBwaGatk4Annovar.sample_batch": [
        {
          "sample_name": "test_sample",
          "bam_file": "test/data/test.unmapped.bam",
          "bed_file": "test/data/test.bed"
        }
      ],
      "PanelBwaGatk4Annovar.reference_genome": {
        "ref_name": "hg38",
        "ref_fasta": "test/data/ref.fasta",
        "ref_fasta_index": "test/data/ref.fasta.fai",
        "ref_dict": "test/data/ref.fasta.dict",
        "ref_pac": "test/data/ref.fasta.pac",
        "ref_sa": "test/data/ref.fasta.sa",
        "ref_amb": "test/data/ref.fasta.amb",
        "ref_ann": "test/data/ref.fasta.ann",
        "ref_bwt": "test/data/ref.fasta.bwt",
        "dbSNP_vcf": "test/data/dbsnp.vcf",
        "dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
        "known_indels_sites_VCFs": [
          "test/data/mills_1000G.vcf.gz",
          "test/data/known_indels.vcf.gz"
        ],
        "known_indels_sites_indices": [
          "test/data/mills_1000G.vcf.gz.tbi",
          "test/data/known_indels.vcf.gz.tbi"
        ],
        "annovar_protocols": "refGene",
        "annovar_operation": "g"
      }
    }
    EOF
# Writes a tiny SAM file of three unmapped read pairs and converts it into
# the queryname-sorted BAM referenced by test/test-inputs.json.
- name: Generate test BAM
  run: |
    # Take the chr20 length from the FASTA index instead of hard-coding it,
    # so the @SQ header always agrees with the reference built earlier.
    chr20_len=$(grep "^chr20" test/data/ref.fasta.fai | cut -f2)

    # SAM header and alignment fields must be TAB-separated; printf makes
    # the tabs explicit so they cannot be lost to editor/whitespace changes.
    {
      printf '@HD\tVN:1.6\tSO:queryname\n'
      printf '@SQ\tSN:chr20\tLN:%s\n' "$chr20_len"
      printf '@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\tLB:lib1\tPU:unit1\n'
      # Flags 77 / 141: paired, read and mate unmapped, first / second in pair
      for read_name in read1 read2 read3; do
        printf '%s\t77\t*\t0\t0\t*\t*\t0\t0\tACTGACTGACTGACTG\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read_name"
        printf '%s\t141\t*\t0\t0\t*\t*\t0\t0\tCGTACGTACGTACGTA\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read_name"
      done
    } > test/data/test.sam

    # Convert SAM to BAM, then sort by read name
    samtools view -b test/data/test.sam > test/data/test.unsorted.bam
    samtools sort -n test/data/test.unsorted.bam > test/data/test.unmapped.bam
    samtools index test/data/test.unmapped.bam

    # Debug: Check the BAM
    echo "=== Checking final BAM structure ==="
    samtools view -H test/data/test.unmapped.bam
    echo "=== First few reads ==="
    samtools view test/data/test.unmapped.bam | head -n 2

    # Clean up intermediate files
    rm test/data/test.sam test/data/test.unsorted.bam
# Lists the generated reference, BAM, VCF and BED artifacts so a missing
# file shows up in the job log before the workflow is launched.
- name: Verify test data
  run: |
    # Print a "=== title ===" section marker
    banner() { echo "=== $1 ==="; }

    banner "Verifying Reference Files"
    ls -l test/data/ref*

    banner "Verifying BAM File"
    samtools view -H test/data/test.unmapped.bam

    banner "Verifying VCF Files"
    ls -l test/data/*.vcf*

    banner "Verifying Bed Files"
    cat test/data/test.bed
# Dumps the sequence dictionary, FASTA index and BWA .ann file, then the
# FASTA header, so contig naming can be compared across them in the log.
- name: Verify reference files in detail
  run: |
    # Print a section marker followed by the full content of one file
    dump() {
      echo "=== $1 ==="
      cat "$2"
    }

    dump "Reference FASTA dict content" test/data/ref.fasta.dict
    dump "Reference FAI content" test/data/ref.fasta.fai
    dump "Reference ANN content" test/data/ref.fasta.ann

    # Check if the reference FASTA header matches what's in the dict
    echo "=== First line of reference FASTA ==="
    head -n 1 test/data/ref.fasta
# Reports where GATK is installed and which Java is on the runner. The GATK
# probes only print a message on failure; they do not fail the step.
- name: Verify GATK installation
  run: |
    echo "=== GATK Installation ==="
    if ! ls -l /usr/local/bin/gatk; then
      echo "GATK not found in /usr/local/bin"
    fi
    if ! which gatk; then
      echo "GATK not in PATH"
    fi
    if ! gatk --version; then
      echo "Cannot get GATK version"
    fi

    echo "=== Java Version ==="
    java -version
# Prints Docker diagnostics and pulls the three getwilds images up front.
- name: Debug Docker setup
  run: |
    echo "=== Docker Version ==="
    docker --version
    echo "=== Docker Info ==="
    docker info
    echo "=== Available Docker Images ==="
    docker images

    echo "=== Testing Docker Pull ==="
    for image in getwilds/gatk:4.3.0.0 getwilds/bwa:0.7.17 getwilds/annovar:hg38; do
      docker pull "$image"
    done
# Bind-mounts the local test directory into a GATK container and lists it,
# confirming that host files are visible from inside Docker.
- name: Verify Docker volumes
  run: |
    echo "=== Testing Docker Mount ==="
    docker run --rm --volume $(pwd)/test:/test getwilds/gatk:4.3.0.0 ls -la /test
# Runs the WDL through Cromwell with the generated config and inputs, listing
# the working directory before and cromwell-executions/ afterwards.
- name: Run test workflow
  run: |
    echo "=== Pre-execution Directory Check ==="
    ls -la

    echo "=== Running Workflow ==="
    # Assemble the Cromwell invocation as an array, then execute it
    cromwell_cmd=(
      java -Dconfig.file=cromwell.conf
      -jar cromwell.jar run
      variantCalling/variantCalling.wdl
      -i test/test-inputs.json
    )
    "${cromwell_cmd[@]}"

    echo "=== Post-execution Directory Check ==="
    ls -la cromwell-executions/
# Post-mortem aid: prints every generated Cromwell task script and the
# listing of every directory under cromwell-executions.
- name: Debug script execution
  if: always()  # Run even if previous step failed
  run: |
    if [ ! -d "cromwell-executions" ]; then
      echo "No cromwell-executions directory found"
      exit 0
    fi

    echo "=== SortBed Script ==="
    # IFS= and -r keep each path exactly as find printed it (no backslash
    # processing, no trimming of leading/trailing whitespace).
    find cromwell-executions -name "script" -type f | while IFS= read -r script; do
      echo "=== Content of $script ==="
      cat "$script"
    done

    echo "=== Directory Structure ==="
    find cromwell-executions -type d | while IFS= read -r dir; do
      echo "=== Contents of $dir ==="
      ls -la "$dir" || echo "Cannot access directory"
    done
# Fails the job unless each expected output type exists somewhere under
# cromwell-executions.
- name: Check outputs
  run: |
    # Succeeds when at least one regular file matches the glob.
    # `test -f $(find ...)` is not used: with no match it collapses to
    # `test -f`, which is true (a one-argument test only checks that the
    # string is non-empty), and with several matches it is a syntax error.
    require_output() {
      local pattern=$1
      local match
      match=$(find cromwell-executions -type f -name "$pattern" -print -quit)
      if [ -z "$match" ]; then
        echo "Missing expected output matching '$pattern'" >&2
        return 1
      fi
      echo "Found output: $match"
    }

    # Basic output existence checks
    require_output "*.recal.bam"
    require_output "*.GATK.vcf"
    require_output "*_multianno.txt"