Skip to content

Commit

Permalink
Updated short read variant calling and joint calling pipelines to run…
Browse files Browse the repository at this point in the history
… at scale (#450)

Single-sample variant calling:

* Fixied critical typo in HaplotypeCaller disk spec.
* Fixed wdl-computed divide by zero error in `SRFlowcell` outputs.
* Updated `MergeVCFs` to have an option to name an output as a GVCF.
* Removed deprecated GC logging flags from GATK commands.
* Fixed issue in `FastQC` if no read qualities are in the file.
* Added missing annotation groups to ReblockGVCFs.
* HaplotypeCaller WDL now returns the reblocked GVCF.

Joint Calling:
* Added option to use gnarly genotyper
* Added het inputs to joint genotyping
* Fixed java memory allocation in joint genotyping to be based on memory of the VM, not hard-coded
* Fixed name of outdir in `ConvertToZarrStore` to be correct for this workflow.
* Updated the zarr conversion to use parallel Dask processes and to log to stdout.
* Upped default zarr conversion memory to 32gb.

VETS:
* Added stack trace logging for errors in `ExtractVariantAnnotations`,
  `TrainVariantAnnotationsModel`, and `ScoreVariantAnnotations`.
* Removed `HAPCOMP`, `HAPDOM`, and `HEC` from default annotations for SNP and INDEL VETS filtration on joint-called VCF files.  Need to do more testing / debugging to include these in joint calling.

Misc:
* Fixed the name of the workflow in ExpandedDrugResistanceMarkerAggregation to reflect the actual name.
* Miscellaneous updates for debugging
  • Loading branch information
jonn-smith authored May 8, 2024
1 parent 8656736 commit fe32d91
Show file tree
Hide file tree
Showing 10 changed files with 257 additions and 53 deletions.
14 changes: 10 additions & 4 deletions wdl/pipelines/ILMN/Alignment/SRFlowcell.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,12 @@ workflow SRFlowcell {
File fqboup = unaligned_reads_dir + "/" + basename(select_first([DecontaminateSample.decontaminated_unpaired, t_003_Bam2Fastq.fq_unpaired]))
}

# Prep some output values before the output block:
Float raw_est_fold_cov_value = if t_013_ComputeGenomeLength.length != 0 then t_014_ComputeBamStats.results['bases']/t_013_ComputeGenomeLength.length else 0.0
Float aligned_frac_bases_value = if t_011_SamStats.stats_map['total_length'] != 0 then t_011_SamStats.stats_map['bases_mapped']/t_011_SamStats.stats_map['total_length'] else 0.0
Float aligned_est_fold_cov_value = if t_013_ComputeGenomeLength.length != 0 then t_011_SamStats.stats_map['bases_mapped']/t_013_ComputeGenomeLength.length else 0.0
Float average_identity_value = if t_011_SamStats.stats_map['bases_mapped'] != 0 then 100.0 - (100.0*t_011_SamStats.stats_map['mismatches']/t_011_SamStats.stats_map['bases_mapped']) else 0.0

############################################
# ___ _ _
# / _ \ _ _| |_ _ __ _ _| |_
Expand Down Expand Up @@ -388,7 +394,7 @@ workflow SRFlowcell {
# Unaligned read stats
Float num_reads = t_014_ComputeBamStats.results['reads']
Float num_bases = t_014_ComputeBamStats.results['bases']
Float raw_est_fold_cov = t_014_ComputeBamStats.results['bases']/t_013_ComputeGenomeLength.length
Float raw_est_fold_cov = raw_est_fold_cov_value

Float read_length = t_014_ComputeBamStats.results['read_mean']

Expand All @@ -404,16 +410,16 @@ workflow SRFlowcell {
# Aligned read stats
Float aligned_num_reads = t_012_FastQC.stats_map['number_of_reads']
Float aligned_num_bases = t_011_SamStats.stats_map['bases_mapped']
Float aligned_frac_bases = t_011_SamStats.stats_map['bases_mapped']/t_011_SamStats.stats_map['total_length']
Float aligned_est_fold_cov = t_011_SamStats.stats_map['bases_mapped']/t_013_ComputeGenomeLength.length
Float aligned_frac_bases = aligned_frac_bases_value
Float aligned_est_fold_cov = aligned_est_fold_cov_value

Float aligned_read_length = t_012_FastQC.stats_map['read_length']

Float insert_size_average = t_011_SamStats.stats_map['insert_size_average']
Float insert_size_standard_deviation = t_011_SamStats.stats_map['insert_size_standard_deviation']
Float pct_properly_paired_reads = t_011_SamStats.stats_map['percentage_of_properly_paired_reads_%']

Float average_identity = 100.0 - (100.0*t_011_SamStats.stats_map['mismatches']/t_011_SamStats.stats_map['bases_mapped'])
Float average_identity = average_identity_value

File fastqc_report = t_027_FinalizeFastQCReport.gcs_path
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version 1.0
import "../../../structs/Structs.wdl"
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ExpandedDrugResistanceMarkerExtraction {
workflow ExpandedDrugResistanceMarkerAggregation {

meta {
author: "Jonn Smith"
Expand Down
2 changes: 1 addition & 1 deletion wdl/pipelines/TechAgnostic/Utility/ConvertToZarrStore.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ workflow ConvertToZarrStore {
String gcs_out_root_dir
}

String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/ConvertToZarrStore/~{prefix}"

# Gather across multiple input gVCFs
call SGKit.ConvertToZarrStore as PerformZarrStoreConversion {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ workflow SRJointCallGVCFsWithGenomicsDB {
gvcf_indices: "Array of gvcf index files for `gvcfs`. Order should correspond to that in `gvcfs`."
ref_map_file: "Reference map file indicating reference sequence and auxillary file locations"

heterozygosity: "Joint Genotyping Parameter - Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept"
heterozygosity_stdev: "Joint Genotyping Parameter - Standard deviation of heterozygosity for SNP and indel calling."
indel_heterozygosity: "Joint Genotyping Parameter - Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept"

snp_calibration_sensitivity: "VETS (ScoreVariantAnnotations) parameter - score below which SNP variants will be filtered."
snp_max_unlabeled_variants: "VETS (ExtractVariantAnnotations) parameter - maximum number of unlabeled SNP variants/alleles to randomly sample with reservoir sampling. If nonzero, annotations will also be extracted from unlabeled sites."
snp_recalibration_annotation_values: "VETS (ScoreSnpVariantAnnotations/ScoreVariantAnnotations) parameter - Array of annotation names to use to create the SNP variant scoring model and over which to score SNP variants."
Expand All @@ -43,6 +47,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {
annotation_bed_file_indexes: "Array of bed indexes for `annotation_bed_files`. Order should correspond to `annotation_bed_files`."
annotation_bed_file_annotation_names: "Array of names/FILTER column entries to use for each given file in `annotation_bed_files`. Order should correspond to `annotation_bed_files`."

use_gnarly_genotyper: "If true, the `GnarlyGenotyper` will be used, greatly speeding up joint genotyping (at the cost of potentially lower accuracy). Setting this to true is recommended for large callsets. If false, `GenotypeGVCFs` will be used to generate the final VCF. Default is false."

prefix: "Prefix to use for output files."
gcs_out_root_dir: "GCS Bucket into which to finalize outputs."
}
Expand All @@ -55,9 +61,15 @@ workflow SRJointCallGVCFsWithGenomicsDB {

File interval_list

Float heterozygosity = 0.001
Float heterozygosity_stdev = 0.01
Float indel_heterozygosity = 0.000125

Float snp_calibration_sensitivity = 0.99
Int snp_max_unlabeled_variants = 0
Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
# TODO: Fix the annotations here to include the missing ones. Must debug.
# Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]

Array[File] snp_known_reference_variants
Array[File] snp_known_reference_variants_index
Expand All @@ -67,7 +79,9 @@ workflow SRJointCallGVCFsWithGenomicsDB {

Float indel_calibration_sensitivity = 0.99
Int indel_max_unlabeled_variants = 0
Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
# TODO: Fix the annotations here to include the missing ones. Must debug.
# Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]

Array[File] indel_known_reference_variants
Array[File] indel_known_reference_variants_index
Expand All @@ -81,6 +95,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {

File? snpeff_db

Boolean use_gnarly_genotyper = false

String prefix

String gcs_out_root_dir
Expand Down Expand Up @@ -125,23 +141,47 @@ workflow SRJointCallGVCFsWithGenomicsDB {
}

# Joint call
call SRJOINT.GenotypeGVCFs as JointCallGVCFs {
input:
input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
interval_list = contig_interval_list,
ref_fasta = ref_map['fasta'],
ref_fasta_fai = ref_map['fai'],
ref_dict = ref_map['dict'],
dbsnp_vcf = ref_map["known_sites_vcf"],
prefix = prefix + "." + contig + ".raw",
runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype.
if (use_gnarly_genotyper) {
call SRJOINT.GnarlyGenotypeGVCFs as GnarlyJointCallGVCFs {
input:
input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
interval_list = contig_interval_list,
ref_fasta = ref_map['fasta'],
ref_fasta_fai = ref_map['fai'],
ref_dict = ref_map['dict'],
dbsnp_vcf = ref_map["known_sites_vcf"],
prefix = prefix + "." + contig + ".gnarly_genotyper.raw",
heterozygosity = heterozygosity,
heterozygosity_stdev = heterozygosity_stdev,
indel_heterozygosity = indel_heterozygosity,
runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype.
}
}
if (!use_gnarly_genotyper) {
call SRJOINT.GenotypeGVCFs as JointCallGVCFs {
input:
input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
interval_list = contig_interval_list,
ref_fasta = ref_map['fasta'],
ref_fasta_fai = ref_map['fai'],
ref_dict = ref_map['dict'],
dbsnp_vcf = ref_map["known_sites_vcf"],
prefix = prefix + "." + contig + ".genotype_gvcfs.raw",
heterozygosity = heterozygosity,
heterozygosity_stdev = heterozygosity_stdev,
indel_heterozygosity = indel_heterozygosity,
runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype.
}
}
# Select the VCF + index for the raw joint called file:
File joint_vcf = select_first([GnarlyJointCallGVCFs.output_vcf, JointCallGVCFs.output_vcf])
File joint_vcf_index = select_first([GnarlyJointCallGVCFs.output_vcf_index, JointCallGVCFs.output_vcf_index])

# First make a sites-only VCF for recal (smaller file, easier to work with):
call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyVCF {
input:
vcf = JointCallGVCFs.output_vcf,
vcf_index = JointCallGVCFs.output_vcf_index,
vcf = joint_vcf,
vcf_index = joint_vcf_index,
prefix = prefix + "." + contig + ".sites_only"
}
}
Expand Down Expand Up @@ -209,11 +249,11 @@ workflow SRJointCallGVCFsWithGenomicsDB {
}

# Shard by contig for speed:
scatter (idx_2 in range(length(JointCallGVCFs.output_vcf))) {
scatter (idx_2 in range(length(joint_vcf))) {

String contig_2 = MakeChrIntervalList.chrs[idx_2][0]
File joint_called_vcf = JointCallGVCFs.output_vcf[idx_2]
File joint_called_vcf_index = JointCallGVCFs.output_vcf_index[idx_2]
File joint_called_vcf = joint_vcf[idx_2]
File joint_called_vcf_index = joint_vcf_index[idx_2]

call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations {
input:
Expand Down Expand Up @@ -303,8 +343,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {
# Consolidate files:
call VARUTIL.GatherVcfs as GatherRawVcfs {
input:
input_vcfs = JointCallGVCFs.output_vcf,
input_vcf_indices = JointCallGVCFs.output_vcf_index,
input_vcfs = joint_vcf,
input_vcf_indices = joint_vcf_index,
prefix = prefix + ".raw.combined"
}

Expand Down
42 changes: 31 additions & 11 deletions wdl/tasks/QC/FastQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,42 @@ task FastQC {
echo $read_length | awk '{ print "read_length\t" $1 }' >> map.txt
echo $number_of_reads $read_length | awk '{ print "number_of_bases\t" $1*$2 }' >> map.txt
mean_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
grep -v '^#' | \
grep -v '>>' | \
awk '{ print $2 }' | \
awk '{x+=$1; next} END{print x/NR}')
echo "Calculating number of base qualities:"
echo -n "Should be: "
sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | grep -v '^#' | grep -v '>>' | wc -l
echo $mean_qual | awk '{ print "mean_qual\t" $1 }' >> map.txt
# Need to unset `pipefail` for these commands:
set -euxo
median_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
# Get the mean and median base quality scores:
num_base_qualities=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
grep -v '^#' | \
grep -v '>>' | \
awk '{ print $2 }' | \
awk '{x+=$1; next} END{print x/NR}' | \
sort -n | \
awk '{ a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')
wc -l)
echo "Num Base Qualities: ${num_base_qualities}"
if [[ ${num_base_qualities} -eq 0 ]]; then
echo "WARNING: No base quality information found in FastQC report. Setting mean_qual and median_qual to 0."
mean_qual=0
median_qual=0
else
mean_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
grep -v '^#' | \
grep -v '>>' | \
awk '{ print $2 }' | \
awk '{x+=$1; next} END{print x/NR}')
median_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
grep -v '^#' | \
grep -v '>>' | \
awk '{ print $2 }' | \
awk '{x+=$1; next} END{print x/NR}' | \
sort -n | \
awk '{ a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')
fi
echo $mean_qual | awk '{ print "mean_qual\t" $1 }' >> map.txt
echo $median_qual | awk '{ print "median_qual\t" $1 }' >> map.txt
>>>
Expand Down
12 changes: 11 additions & 1 deletion wdl/tasks/Utility/SGKit.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,23 @@ task ConvertToZarrStore {
command <<<
set -x
# According to the sgkit documentation, running the conversion in parallel using Dask
# is significantly faster than running it serially.
# This can be easily done by importing the Dask library and setting the client info.
# For more info see: https://sgkit-dev.github.io/sgkit/latest/vcf.html#example-converting-1000-genomes-vcf-to-zarr
# Set up number of worker processes for Dask:
num_processors=$(lscpu | grep '^CPU(s):' | awk '{print $NF}')
num_workers=$((num_processors - 1))
python3 <<EOF
from sgkit.io.vcf import vcf_to_zarr
vcfs = ["~{vcf}"]
target = "~{prefix}.zarr"
# Log our task status to a file we can inspect later:
vcf_to_zarr(vcfs, target, tempdir="tmp")
EOF
Expand All @@ -49,7 +59,7 @@ task ConvertToZarrStore {
#########################
RuntimeAttr default_attr = object {
cpu_cores: num_cpus,
mem_gb: 16,
mem_gb: 32,
disk_gb: disk_size,
boot_disk_gb: 10,
preemptible_tries: 0,
Expand Down
17 changes: 9 additions & 8 deletions wdl/tasks/Utility/SRUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,7 @@ task BaseRecalibrator {

command {

gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \
-XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \
-Xloggc:gc_log.log -Xms5000m -Xmx5500m" \
gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xms5000m" \
BaseRecalibrator \
-R ~{ref_fasta} \
-I ~{input_bam} \
Expand Down Expand Up @@ -492,8 +490,7 @@ task ApplyBQSR {

command <<<

gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \
-XX:+PrintGCDetails -Xloggc:gc_log.log \
gatk --java-options "-XX:+PrintFlagsFinal \
-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms8192m -Xmx~{java_memory_size_mb}m" \
ApplyBQSR \
--create-output-bam-md5 \
Expand Down Expand Up @@ -661,21 +658,25 @@ task MergeVCFs {
Array[File] input_vcfs_indexes
String prefix

Boolean is_gvcf = false

RuntimeAttr? runtime_attr_override
}
Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10
String gvcf_decorator = if is_gvcf then ".g" else ""
command <<<
java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \
MergeVcfs \
INPUT=~{sep=' INPUT=' input_vcfs} \
OUTPUT=~{prefix}.vcf.gz
OUTPUT=~{prefix}~{gvcf_decorator}.vcf.gz
>>>
output {
File output_vcf = "~{prefix}.vcf.gz"
File output_vcf_index = "~{prefix}.vcf.gz.tbi"
File output_vcf = "~{prefix}~{gvcf_decorator}.vcf.gz"
File output_vcf_index = "~{prefix}~{gvcf_decorator}.vcf.gz.tbi"
}

#########################
Expand Down
6 changes: 3 additions & 3 deletions wdl/tasks/Utility/VariantUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -1696,7 +1696,7 @@ task ExtractVariantAnnotations {
let mem_start=${mem_available}-2
let mem_max=${mem_available}-2

gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
ExtractVariantAnnotations \
--verbosity DEBUG \
-V ~{vcf} \
Expand Down Expand Up @@ -1774,7 +1774,7 @@ task TrainVariantAnnotationsModel {
let mem_start=${mem_available}-2
let mem_max=${mem_available}-2

gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
TrainVariantAnnotationsModel \
--verbosity DEBUG \
--annotations-hdf5 ~{annotation_hdf5} \
Expand Down Expand Up @@ -1907,7 +1907,7 @@ task ScoreVariantAnnotations {
# Debugging:
find .

gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
ScoreVariantAnnotations \
--verbosity DEBUG \
-V ~{vcf} \
Expand Down
Loading

0 comments on commit fe32d91

Please sign in to comment.