Adding GitHub Action for Variant Calling Test Run #33

Summary
Jobs
- test
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/variant-calling-test-run.yml at 2a77fde

	name: Variant Calling Test Run

	# Smoke test for variantCalling/variantCalling.wdl.
	#
	# The single `test` job installs the command-line tools, writes a Cromwell
	# config that runs every task through `docker run`, builds a small chr20
	# reference plus synthetic known-sites VCFs and an unmapped BAM, runs the
	# WDL with Cromwell, and finally checks that the expected outputs exist.
	#
	# Runs only for pull requests that touch the WDL or this workflow file.

	on:
	  pull_request:
	    paths:
	      - 'variantCalling/variantCalling.wdl'
	      - '.github/workflows/variant-calling-test-run.yml'

	jobs:
	  test:
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4

	      - name: Install required tools
	        run: |
	          # Install system packages
	          sudo apt-get update
	          sudo apt-get install -y wget curl unzip default-jre samtools bwa tabix

	          # Download and install GATK
	          wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip
	          unzip gatk-4.4.0.0.zip
	          sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk

	      - name: Install Cromwell
	        run: |
	          wget https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar
	          mv cromwell-86.jar cromwell.jar

	          # Create Cromwell config file.
	          # The heredoc delimiter is quoted so the shell leaves ${cwd},
	          # ${docker_cwd}, ${docker} and ${script} untouched; they are
	          # placeholders that Cromwell fills in, not shell variables.
	          cat << 'EOF' > cromwell.conf
	          include required(classpath("application"))

	          backend {
	            default = "LocalExample"
	            providers {
	              LocalExample {
	                actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
	                config {
	                  runtime-attributes = """
	                  Int? runtime_minutes = 10
	                  Int? cpu = 1
	                  Int? memory_mb = 4096
	                  String? docker
	                  """

	                  submit = """
	                  docker run \
	                  --rm \
	                  -v ${cwd}:${docker_cwd} \
	                  -i \
	                  ${docker} \
	                  /bin/bash ${script}
	                  """

	                  submit-docker = """
	                  docker run \
	                  --rm \
	                  -v ${cwd}:${docker_cwd} \
	                  -i \
	                  ${docker} \
	                  /bin/bash ${script}
	                  """

	                  root = "cromwell-executions"
	                  dockerRoot = "cromwell-executions"

	                  # Map between runtime attributes and docker runtime flags
	                  runtime-attributes-mapping {
	                    cpu: "-c"
	                    memory_gb: "-m"
	                  }

	                  concurrent-job-limit = 2

	                  # Mount paths for docker
	                  docker {
	                    hash-lookup {
	                      enabled = false
	                    }
	                    cwd = "${docker_cwd}"
	                  }
	                }
	              }
	            }
	          }

	          system {
	            job-rate-control {
	              jobs = 1
	              per = 1 second
	            }

	            abort-jobs-on-terminate = true

	            workflow-heartbeat {
	              ttl = "10 minutes"
	              heartbeat-interval = "2 minutes"
	              write-batch-size = 10000
	              write-threshold = 10000
	            }
	          }

	          database {
	            profile = "slick.jdbc.HsqldbProfile$"
	            db {
	              driver = "org.hsqldb.jdbcDriver"
	              url = "jdbc:hsqldb:mem:cromwell;shutdown=false;hsqldb.tx=mvcc"
	              connectionTimeout = 3000
	            }
	          }
	          EOF

	      - name: Setup Test Data
	        run: |
	          mkdir -p test/data
	          cd test/data

	          # Download reference chromosome 20 from NCBI
	          wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
	          gunzip chr20.fa.gz
	          # Replace the whole FASTA header line with ">chr20" so the contig
	          # name matches the BED file and the synthetic VCFs below
	          sed 's/^>.*/>chr20/' chr20.fa > ref.fasta
	          rm chr20.fa

	          # Create sequence dictionary and index
	          samtools faidx ref.fasta
	          gatk CreateSequenceDictionary -R ref.fasta -O ref.fasta.dict

	          # Debug: Check the content of our files
	          echo "=== Reference FASTA header ==="
	          head -n 1 ref.fasta
	          echo "=== Reference FAI content ==="
	          cat ref.fasta.fai
	          echo "=== Dictionary content ==="
	          head -n 5 ref.fasta.dict

	          # Create and sort test region bed file
	          echo -e "chr20\t1000000\t1100000" > test.bed
	          sort -k1,1 -k2,2n test.bed > sorted.bed
	          mv sorted.bed test.bed

	          # Debug: Verify bed file content
	          echo "=== BED file content ==="
	          cat test.bed

	          # Test bed to interval_list conversion directly
	          echo "=== Testing bed to interval_list conversion ==="
	          gatk BedToIntervalList \
	            -I test.bed \
	            -O test.interval_list \
	            -SD ref.fasta.dict

	          # Generate BWA indexes
	          bwa index ref.fasta

	          # Create a minimal synthetic dbSNP VCF for testing
	          # Write header lines
	          echo '##fileformat=VCFv4.2' > dbsnp.vcf
	          echo '##reference=GRCh38' >> dbsnp.vcf
	          echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf
	          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf
	          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf
	          printf "chr20\t1000100\trs1234567\tA\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf
	          printf "chr20\t1000200\trs2345678\tT\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf
	          printf "chr20\t1000300\trs3456789\tG\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf
	          printf "chr20\t1000400\trs4567890\tC\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf
	          printf "chr20\t1000500\trs5678901\tAG\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf

	          gatk IndexFeatureFile -I dbsnp.vcf

	          # Create synthetic Mills and 1000G indels VCF
	          echo '##fileformat=VCFv4.2' > mills_1000G.vcf
	          echo '##reference=GRCh38' >> mills_1000G.vcf
	          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf
	          echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf
	          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf
	          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf
	          printf "chr20\t1000150\tMILL1\tAT\tA\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf
	          printf "chr20\t1000250\tMILL2\tG\tGTT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf
	          printf "chr20\t1000350\tG1000_1\tCTA\tC\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf
	          printf "chr20\t1000450\tG1000_2\tT\tTAGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf

	          bgzip mills_1000G.vcf
	          tabix -p vcf mills_1000G.vcf.gz

	          # Create synthetic known indels VCF
	          echo '##fileformat=VCFv4.2' > known_indels.vcf
	          echo '##reference=GRCh38' >> known_indels.vcf
	          echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf
	          echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf
	          printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf
	          printf "chr20\t1000550\tindel1\tAT\tA\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
	          printf "chr20\t1000650\tindel2\tG\tGTT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf
	          printf "chr20\t1000750\tindel3\tCTA\tC\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf

	          bgzip known_indels.vcf
	          tabix -p vcf known_indels.vcf.gz

	          # Create test JSON with all reference files.
	          # Written one level up (test/test-inputs.json) because the
	          # current directory is test/data; paths inside are relative to
	          # the repository root, where Cromwell is launched.
	          cat << EOF > ../test-inputs.json
	          {
	            "PanelBwaGatk4Annovar.sample_batch": [
	              {
	                "sample_name": "test_sample",
	                "bam_file": "test/data/test.unmapped.bam",
	                "bed_file": "test/data/test.bed"
	              }
	            ],
	            "PanelBwaGatk4Annovar.reference_genome": {
	              "ref_name": "hg38",
	              "ref_fasta": "test/data/ref.fasta",
	              "ref_fasta_index": "test/data/ref.fasta.fai",
	              "ref_dict": "test/data/ref.fasta.dict",
	              "ref_pac": "test/data/ref.fasta.pac",
	              "ref_sa": "test/data/ref.fasta.sa",
	              "ref_amb": "test/data/ref.fasta.amb",
	              "ref_ann": "test/data/ref.fasta.ann",
	              "ref_bwt": "test/data/ref.fasta.bwt",
	              "dbSNP_vcf": "test/data/dbsnp.vcf",
	              "dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
	              "known_indels_sites_VCFs": [
	                "test/data/mills_1000G.vcf.gz",
	                "test/data/known_indels.vcf.gz"
	              ],
	              "known_indels_sites_indices": [
	                "test/data/mills_1000G.vcf.gz.tbi",
	                "test/data/known_indels.vcf.gz.tbi"
	              ],
	              "annovar_protocols": "refGene",
	              "annovar_operation": "g"
	            }
	          }
	          EOF

	      - name: Generate test BAM
	        run: |
	          # Create a more structured test BAM with multiple reads.
	          # SAM fields must be TAB-separated, so every record is emitted
	          # with printf and explicit \t escapes rather than relying on
	          # literal whitespace surviving inside the YAML block.
	          # The @SQ length is read from the .fai so the header always
	          # agrees with the reference that was actually downloaded.
	          chr20_len="$(grep "^chr20" test/data/ref.fasta.fai | cut -f2)"
	          {
	            printf '@HD\tVN:1.6\tSO:queryname\n'
	            printf '@SQ\tSN:chr20\tLN:%s\n' "$chr20_len"
	            printf '@RG\tID:test\tSM:test_sample\tPL:ILLUMINA\tLB:lib1\tPU:unit1\n'
	            # Flags 77 / 141 = unmapped first / second mate of a pair
	            for read_name in read1 read2 read3; do
	              printf '%s\t77\t*\t0\t0\t*\t*\t0\t0\tACTGACTGACTGACTG\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read_name"
	              printf '%s\t141\t*\t0\t0\t*\t*\t0\t0\tCGTACGTACGTACGTA\tFFFFFFFFFFFFFFFF\tRG:Z:test\n' "$read_name"
	            done
	          } > test/data/test.sam

	          # Convert SAM to BAM and proper sorting
	          samtools view -b test/data/test.sam > test/data/test.unsorted.bam
	          samtools sort -n test/data/test.unsorted.bam > test/data/test.unmapped.bam
	          samtools index test/data/test.unmapped.bam

	          # Debug: Check the BAM
	          echo "=== Checking final BAM structure ==="
	          samtools view -H test/data/test.unmapped.bam
	          echo "=== First few reads ==="
	          samtools view test/data/test.unmapped.bam | head -n 2

	          # Clean up intermediate files
	          rm test/data/test.sam test/data/test.unsorted.bam

	      - name: Verify test data
	        run: |
	          echo "=== Verifying Reference Files ==="
	          ls -l test/data/ref*

	          echo "=== Verifying BAM File ==="
	          samtools view -H test/data/test.unmapped.bam

	          # Glob covers dbsnp.vcf, its .idx, and the bgzipped VCFs + .tbi
	          echo "=== Verifying VCF Files ==="
	          ls -l test/data/*.vcf*

	          echo "=== Verifying Bed Files ==="
	          cat test/data/test.bed

	      - name: Verify reference files in detail
	        run: |
	          echo "=== Reference FASTA dict content ==="
	          cat test/data/ref.fasta.dict

	          echo "=== Reference FAI content ==="
	          cat test/data/ref.fasta.fai

	          echo "=== Reference ANN content ==="
	          cat test/data/ref.fasta.ann

	          # Check if the reference FASTA header matches what's in the dict
	          echo "=== First line of reference FASTA ==="
	          head -n 1 test/data/ref.fasta

	      - name: Verify GATK installation
	        run: |
	          echo "=== GATK Installation ==="
	          ls -l /usr/local/bin/gatk || echo "GATK not found in /usr/local/bin"
	          which gatk || echo "GATK not in PATH"
	          gatk --version || echo "Cannot get GATK version"

	          echo "=== Java Version ==="
	          java -version

	      - name: Debug Docker setup
	        run: |
	          echo "=== Docker Version ==="
	          docker --version

	          echo "=== Docker Info ==="
	          docker info

	          echo "=== Available Docker Images ==="
	          docker images

	          echo "=== Testing Docker Pull ==="
	          docker pull getwilds/gatk:4.3.0.0
	          docker pull getwilds/bwa:0.7.17
	          docker pull getwilds/annovar:hg38

	      - name: Verify Docker volumes
	        run: |
	          echo "=== Testing Docker Mount ==="
	          docker run --rm \
	            -v $(pwd)/test:/test \
	            getwilds/gatk:4.3.0.0 \
	            ls -la /test

	      - name: Run test workflow
	        run: |
	          echo "=== Pre-execution Directory Check ==="
	          ls -la

	          echo "=== Running Workflow ==="
	          java -Dconfig.file=cromwell.conf \
	            -jar cromwell.jar run \
	            variantCalling/variantCalling.wdl \
	            -i test/test-inputs.json

	          echo "=== Post-execution Directory Check ==="
	          ls -la cromwell-executions/

	      - name: Debug script execution
	        if: always()  # Run even if previous step failed
	        run: |
	          if [ -d "cromwell-executions" ]; then
	            echo "=== SortBed Script ==="
	            find cromwell-executions -name "script" -type f | while read script; do
	              echo "=== Content of $script ==="
	              cat "$script"
	            done

	            echo "=== Directory Structure ==="
	            find cromwell-executions -type d | while read dir; do
	              echo "=== Contents of $dir ==="
	              ls -la "$dir" || echo "Cannot access directory"
	            done
	          else
	            echo "No cromwell-executions directory found"
	          fi

	      - name: Check outputs
	        run: |
	          # Basic output existence checks.
	          # `test -f $(find ...)` is not used here: when find matches
	          # nothing it collapses to `test -f`, which is true, so a missing
	          # output would go unnoticed. Require an actual match instead.
	          require_output() {
	            local pattern="$1"
	            local match
	            match="$(find cromwell-executions -name "$pattern" -print -quit)"
	            if [ ! -f "$match" ]; then
	              echo "ERROR: no output file matching '$pattern' under cromwell-executions" >&2
	              return 1
	            fi
	            echo "Found output: $match"
	          }

	          require_output "*.recal.bam"
	          require_output "*.GATK.vcf"
	          require_output "*_multianno.txt"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Adding GitHub Action for Variant Calling Test Run #33

Workflow file

Adding GitHub Action for Variant Calling Test Run #33

Jobs

Run details

Workflow file for this run