Updated short read variant calling and joint calling pipelines to run…

… at scale (#450) Single-sample variant calling: * Fixied critical typo in HaplotypeCaller disk spec. * Fixed wdl-computed divide by zero error in `SRFlowcell` outputs. * Updated `MergeVCFs` to have an option to name an output as a GVCF. * Removed deprecated GC logging flags from GATK commands. * Fixed issue in `FastQC` if no read qualities are in the file. * Added missing annotation groups to ReblockGVCFs. * HaplotypeCaller WDL now returns the reblocked GVCF. Joint Calling: * Added option to use gnarly genotyper * Added het inputs to joint genotyping * Fixed java memory allocation in joint genotyping to be based on memory of the VM, not hard-coded * Fixed name of outdir in `ConvertToZarrStore` to be correct for this workflow. * Updated the zarr conversion to use parallel Dask processes and to log to stdout. * Upped default zarr conversion memory to 32gb. VETS: * Added stack trace logging for errors in `ExtractVariantAnnotations`, `TrainVariantAnnotationsModel`, and `ScoreVariantAnnotations`. * Removed `HAPCOMP`, `HAPDOM`, and `HEC` from default annotations for SNP and INDEL VETS filtration on joint-called VCF files. Need to do more testing / debugging to include these in joint calling. Misc: * Fixed the name of the workflow in ExpandedDrugResistanceMarkerAggregation to reflect the actual name. * Miscellaneous updates for debugging
broadinstitute · May 8, 2024 · fe32d91 · fe32d91
1 parent 8656736
commit fe32d91
Show file tree

Hide file tree

Showing 10 changed files with 257 additions and 53 deletions.
diff --git a/wdl/pipelines/ILMN/Alignment/SRFlowcell.wdl b/wdl/pipelines/ILMN/Alignment/SRFlowcell.wdl
@@ -360,6 +360,12 @@ workflow SRFlowcell {
         File fqboup = unaligned_reads_dir + "/" + basename(select_first([DecontaminateSample.decontaminated_unpaired, t_003_Bam2Fastq.fq_unpaired]))
     }
 
+    # Prep some output values before the output block:
+    Float raw_est_fold_cov_value = if t_013_ComputeGenomeLength.length != 0 then t_014_ComputeBamStats.results['bases']/t_013_ComputeGenomeLength.length else 0.0
+    Float aligned_frac_bases_value = if t_011_SamStats.stats_map['total_length'] != 0 then t_011_SamStats.stats_map['bases_mapped']/t_011_SamStats.stats_map['total_length'] else 0.0
+    Float aligned_est_fold_cov_value = if t_013_ComputeGenomeLength.length != 0 then t_011_SamStats.stats_map['bases_mapped']/t_013_ComputeGenomeLength.length else 0.0
+    Float average_identity_value = if t_011_SamStats.stats_map['bases_mapped'] != 0 then 100.0 - (100.0*t_011_SamStats.stats_map['mismatches']/t_011_SamStats.stats_map['bases_mapped']) else 0.0
+
     ############################################
     #      ___        _               _
     #     / _ \ _   _| |_ _ __  _   _| |_
@@ -388,7 +394,7 @@ workflow SRFlowcell {
         # Unaligned read stats
         Float num_reads = t_014_ComputeBamStats.results['reads']
         Float num_bases = t_014_ComputeBamStats.results['bases']
-        Float raw_est_fold_cov = t_014_ComputeBamStats.results['bases']/t_013_ComputeGenomeLength.length
+        Float raw_est_fold_cov = raw_est_fold_cov_value
 
         Float read_length = t_014_ComputeBamStats.results['read_mean']
 
@@ -404,16 +410,16 @@ workflow SRFlowcell {
         # Aligned read stats
         Float aligned_num_reads = t_012_FastQC.stats_map['number_of_reads']
         Float aligned_num_bases = t_011_SamStats.stats_map['bases_mapped']
-        Float aligned_frac_bases = t_011_SamStats.stats_map['bases_mapped']/t_011_SamStats.stats_map['total_length']
-        Float aligned_est_fold_cov = t_011_SamStats.stats_map['bases_mapped']/t_013_ComputeGenomeLength.length
+        Float aligned_frac_bases = aligned_frac_bases_value
+        Float aligned_est_fold_cov = aligned_est_fold_cov_value
 
         Float aligned_read_length = t_012_FastQC.stats_map['read_length']
 
         Float insert_size_average = t_011_SamStats.stats_map['insert_size_average']
         Float insert_size_standard_deviation = t_011_SamStats.stats_map['insert_size_standard_deviation']
         Float pct_properly_paired_reads = t_011_SamStats.stats_map['percentage_of_properly_paired_reads_%']
 
-        Float average_identity = 100.0 - (100.0*t_011_SamStats.stats_map['mismatches']/t_011_SamStats.stats_map['bases_mapped'])
+        Float average_identity = average_identity_value
 
         File fastqc_report = t_027_FinalizeFastQCReport.gcs_path
     }

diff --git a/wdl/pipelines/TechAgnostic/TertiaryAnalysis/ExpandedDrugResistanceMarkerAggregation.wdl b/wdl/pipelines/TechAgnostic/TertiaryAnalysis/ExpandedDrugResistanceMarkerAggregation.wdl
@@ -3,7 +3,7 @@ version 1.0
 import "../../../structs/Structs.wdl"
 import "../../../tasks/Utility/Finalize.wdl" as FF
 
-workflow ExpandedDrugResistanceMarkerExtraction {
+workflow ExpandedDrugResistanceMarkerAggregation {
 
     meta {
         author: "Jonn Smith"

diff --git a/wdl/pipelines/TechAgnostic/Utility/ConvertToZarrStore.wdl b/wdl/pipelines/TechAgnostic/Utility/ConvertToZarrStore.wdl
@@ -24,7 +24,7 @@ workflow ConvertToZarrStore {
         String gcs_out_root_dir
     }
 
-    String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}"
+    String outdir = sub(gcs_out_root_dir, "/$", "") + "/ConvertToZarrStore/~{prefix}"
 
     # Gather across multiple input gVCFs
     call SGKit.ConvertToZarrStore as PerformZarrStoreConversion {

diff --git a/wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDB.wdl b/wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDB.wdl
@@ -19,6 +19,10 @@ workflow SRJointCallGVCFsWithGenomicsDB {
         gvcf_indices:   "Array of gvcf index files for `gvcfs`.  Order should correspond to that in `gvcfs`."
         ref_map_file:  "Reference map file indicating reference sequence and auxillary file locations"
 
+        heterozygosity: "Joint Genotyping Parameter - Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept"
+        heterozygosity_stdev: "Joint Genotyping Parameter - Standard deviation of heterozygosity for SNP and indel calling."
+        indel_heterozygosity: "Joint Genotyping Parameter - Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept"
+
         snp_calibration_sensitivity:    "VETS (ScoreVariantAnnotations) parameter - score below which SNP variants will be filtered."
         snp_max_unlabeled_variants: "VETS (ExtractVariantAnnotations) parameter - maximum number of unlabeled SNP variants/alleles to randomly sample with reservoir sampling.  If nonzero, annotations will also be extracted from unlabeled sites."
         snp_recalibration_annotation_values:    "VETS (ScoreSnpVariantAnnotations/ScoreVariantAnnotations) parameter - Array of annotation names to use to create the SNP variant scoring model and over which to score SNP variants."
@@ -43,6 +47,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {
         annotation_bed_file_indexes:    "Array of bed indexes for `annotation_bed_files`.  Order should correspond to `annotation_bed_files`."
         annotation_bed_file_annotation_names:   "Array of names/FILTER column entries to use for each given file in `annotation_bed_files`.  Order should correspond to `annotation_bed_files`."
 
+        use_gnarly_genotyper: "If true, the `GnarlyGenotyper` will be used, greatly speeding up joint genotyping (at the cost of potentially lower accuracy).  Setting this to true is recommended for large callsets.  If false, `GenotypeGVCFs` will be used to generate the final VCF.  Default is false."
+
         prefix: "Prefix to use for output files."
         gcs_out_root_dir:    "GCS Bucket into which to finalize outputs."
     }
@@ -55,9 +61,15 @@ workflow SRJointCallGVCFsWithGenomicsDB {
 
         File interval_list
 
+        Float heterozygosity = 0.001
+        Float heterozygosity_stdev = 0.01
+        Float indel_heterozygosity = 0.000125
+
         Float snp_calibration_sensitivity = 0.99
         Int snp_max_unlabeled_variants = 0
-        Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
+        # TODO: Fix the annotations here to include the missing ones.  Must debug.
+#        Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
+        Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
 
         Array[File] snp_known_reference_variants
         Array[File] snp_known_reference_variants_index
@@ -67,7 +79,9 @@ workflow SRJointCallGVCFsWithGenomicsDB {
 
         Float indel_calibration_sensitivity = 0.99
         Int indel_max_unlabeled_variants = 0
-        Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
+        # TODO: Fix the annotations here to include the missing ones.  Must debug.
+#        Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
+        Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ]
 
         Array[File] indel_known_reference_variants
         Array[File] indel_known_reference_variants_index
@@ -81,6 +95,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {
 
         File? snpeff_db
 
+        Boolean use_gnarly_genotyper = false
+
         String prefix
 
         String gcs_out_root_dir
@@ -125,23 +141,47 @@ workflow SRJointCallGVCFsWithGenomicsDB {
         }
 
         # Joint call
-        call SRJOINT.GenotypeGVCFs as JointCallGVCFs {
-            input:
-                input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
-                interval_list   = contig_interval_list,
-                ref_fasta       = ref_map['fasta'],
-                ref_fasta_fai   = ref_map['fai'],
-                ref_dict        = ref_map['dict'],
-                dbsnp_vcf       = ref_map["known_sites_vcf"],
-                prefix          = prefix + "." + contig + ".raw",
-                runtime_attr_override = object {preemptible_tries: 0},  # Disable preemption for prototype.
+        if (use_gnarly_genotyper) {
+            call SRJOINT.GnarlyGenotypeGVCFs as GnarlyJointCallGVCFs {
+                input:
+                    input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
+                    interval_list   = contig_interval_list,
+                    ref_fasta       = ref_map['fasta'],
+                    ref_fasta_fai   = ref_map['fai'],
+                    ref_dict        = ref_map['dict'],
+                    dbsnp_vcf       = ref_map["known_sites_vcf"],
+                    prefix          = prefix + "." + contig + ".gnarly_genotyper.raw",
+                    heterozygosity = heterozygosity,
+                    heterozygosity_stdev = heterozygosity_stdev,
+                    indel_heterozygosity = indel_heterozygosity,
+                    runtime_attr_override = object {preemptible_tries: 0},  # Disable preemption for prototype.
+            }
+        }
+        if (!use_gnarly_genotyper) {
+            call SRJOINT.GenotypeGVCFs as JointCallGVCFs {
+                input:
+                    input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb,
+                    interval_list   = contig_interval_list,
+                    ref_fasta       = ref_map['fasta'],
+                    ref_fasta_fai   = ref_map['fai'],
+                    ref_dict        = ref_map['dict'],
+                    dbsnp_vcf       = ref_map["known_sites_vcf"],
+                    prefix          = prefix + "." + contig + ".genotype_gvcfs.raw",
+                    heterozygosity = heterozygosity,
+                    heterozygosity_stdev = heterozygosity_stdev,
+                    indel_heterozygosity = indel_heterozygosity,
+                    runtime_attr_override = object {preemptible_tries: 0},  # Disable preemption for prototype.
+            }
         }
+        # Select the VCF + index for the raw joint called file:
+        File joint_vcf = select_first([GnarlyJointCallGVCFs.output_vcf, JointCallGVCFs.output_vcf])
+        File joint_vcf_index = select_first([GnarlyJointCallGVCFs.output_vcf_index, JointCallGVCFs.output_vcf_index])
 
         # First make a sites-only VCF for recal (smaller file, easier to work with):
         call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyVCF {
             input:
-                vcf = JointCallGVCFs.output_vcf,
-                vcf_index = JointCallGVCFs.output_vcf_index,
+                vcf = joint_vcf,
+                vcf_index = joint_vcf_index,
                 prefix = prefix + "." + contig + ".sites_only"
         }
     }
@@ -209,11 +249,11 @@ workflow SRJointCallGVCFsWithGenomicsDB {
     }
 
     # Shard by contig for speed:
-    scatter (idx_2 in range(length(JointCallGVCFs.output_vcf))) {
+    scatter (idx_2 in range(length(joint_vcf))) {
 
         String contig_2 = MakeChrIntervalList.chrs[idx_2][0]
-        File joint_called_vcf = JointCallGVCFs.output_vcf[idx_2]
-        File joint_called_vcf_index = JointCallGVCFs.output_vcf_index[idx_2]
+        File joint_called_vcf = joint_vcf[idx_2]
+        File joint_called_vcf_index = joint_vcf_index[idx_2]
 
         call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations {
             input:
@@ -303,8 +343,8 @@ workflow SRJointCallGVCFsWithGenomicsDB {
     # Consolidate files:
     call VARUTIL.GatherVcfs as GatherRawVcfs {
         input:
-            input_vcfs = JointCallGVCFs.output_vcf,
-            input_vcf_indices = JointCallGVCFs.output_vcf_index,
+            input_vcfs = joint_vcf,
+            input_vcf_indices = joint_vcf_index,
             prefix = prefix + ".raw.combined"
     }
 

diff --git a/wdl/tasks/QC/FastQC.wdl b/wdl/tasks/QC/FastQC.wdl
@@ -31,22 +31,42 @@ task FastQC {
         echo $read_length | awk '{ print "read_length\t" $1 }' >> map.txt
         echo $number_of_reads $read_length | awk '{ print "number_of_bases\t" $1*$2 }' >> map.txt
 
-        mean_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
-            grep -v '^#' | \
-            grep -v '>>' | \
-            awk '{ print $2 }' | \
-            awk '{x+=$1; next} END{print x/NR}')
+        echo "Calculating number of base qualities:"
+        echo -n "Should be: "
+        sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | grep -v '^#' | grep -v '>>' | wc -l
 
-        echo $mean_qual | awk '{ print "mean_qual\t" $1 }' >> map.txt
+        # Need to unset `pipefail` for these commands:
+        set -euxo
 
-        median_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
+        # Get the mean and median base quality scores:
+        num_base_qualities=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
             grep -v '^#' | \
             grep -v '>>' | \
-            awk '{ print $2 }' | \
-            awk '{x+=$1; next} END{print x/NR}' | \
-            sort -n | \
-            awk '{ a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')
+            wc -l)
+
+        echo "Num Base Qualities: ${num_base_qualities}"
+
+        if [[ ${num_base_qualities} -eq 0 ]]; then
+            echo "WARNING: No base quality information found in FastQC report.  Setting mean_qual and median_qual to 0."
+            mean_qual=0
+            median_qual=0
+        else
+            mean_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
+                grep -v '^#' | \
+                grep -v '>>' | \
+                awk '{ print $2 }' | \
+                awk '{x+=$1; next} END{print x/NR}')
+
+            median_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \
+                grep -v '^#' | \
+                grep -v '>>' | \
+                awk '{ print $2 }' | \
+                awk '{x+=$1; next} END{print x/NR}' | \
+                sort -n | \
+                awk '{ a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')
+        fi
 
+        echo $mean_qual | awk '{ print "mean_qual\t" $1 }' >> map.txt
         echo $median_qual | awk '{ print "median_qual\t" $1 }' >> map.txt
     >>>
 

diff --git a/wdl/tasks/Utility/SGKit.wdl b/wdl/tasks/Utility/SGKit.wdl
@@ -28,13 +28,23 @@ task ConvertToZarrStore {
     command <<<
         set -x
 
+        # According to the sgkit documentation, running the conversion in parallel using Dask
+        # is significantly faster than running it serially.
+        # This can be easily done by importing the Dask library and setting the client info.
+        # For more info see: https://sgkit-dev.github.io/sgkit/latest/vcf.html#example-converting-1000-genomes-vcf-to-zarr
+
+        # Set up number of worker processes for Dask:
+        num_processors=$(lscpu | grep '^CPU(s):' | awk '{print $NF}')
+        num_workers=$((num_processors - 1))
+
         python3 <<EOF
 
         from sgkit.io.vcf import vcf_to_zarr
 
         vcfs = ["~{vcf}"]
         target = "~{prefix}.zarr"
 
+        # Log our task status to a file we can inspect later:
         vcf_to_zarr(vcfs, target, tempdir="tmp")
 
         EOF
@@ -49,7 +59,7 @@ task ConvertToZarrStore {
     #########################
     RuntimeAttr default_attr = object {
         cpu_cores:          num_cpus,
-        mem_gb:             16,
+        mem_gb:             32,
         disk_gb:            disk_size,
         boot_disk_gb:       10,
         preemptible_tries:  0,

diff --git a/wdl/tasks/Utility/SRUtils.wdl b/wdl/tasks/Utility/SRUtils.wdl
@@ -419,9 +419,7 @@ task BaseRecalibrator {
 
     command {
 
-        gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \
-            -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \
-            -Xloggc:gc_log.log -Xms5000m -Xmx5500m" \
+        gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xms5000m" \
             BaseRecalibrator \
             -R ~{ref_fasta} \
             -I ~{input_bam} \
@@ -492,8 +490,7 @@ task ApplyBQSR {
 
     command <<<
 
-        gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \
-            -XX:+PrintGCDetails -Xloggc:gc_log.log \
+        gatk --java-options "-XX:+PrintFlagsFinal \
             -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms8192m -Xmx~{java_memory_size_mb}m" \
             ApplyBQSR \
             --create-output-bam-md5 \
@@ -661,21 +658,25 @@ task MergeVCFs {
         Array[File] input_vcfs_indexes
         String prefix
 
+        Boolean is_gvcf = false
+
         RuntimeAttr? runtime_attr_override
     }
 
     Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10
 
+    String gvcf_decorator = if is_gvcf then ".g" else ""
+
     command <<<
         java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \
           MergeVcfs \
           INPUT=~{sep=' INPUT=' input_vcfs} \
-          OUTPUT=~{prefix}.vcf.gz
+          OUTPUT=~{prefix}~{gvcf_decorator}.vcf.gz
     >>>
 
     output {
-        File output_vcf = "~{prefix}.vcf.gz"
-        File output_vcf_index = "~{prefix}.vcf.gz.tbi"
+        File output_vcf = "~{prefix}~{gvcf_decorator}.vcf.gz"
+        File output_vcf_index = "~{prefix}~{gvcf_decorator}.vcf.gz.tbi"
     }
 
     #########################

diff --git a/wdl/tasks/Utility/VariantUtils.wdl b/wdl/tasks/Utility/VariantUtils.wdl
@@ -1696,7 +1696,7 @@ task ExtractVariantAnnotations {
         let mem_start=${mem_available}-2
         let mem_max=${mem_available}-2
 
-        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
+        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
           ExtractVariantAnnotations \
             --verbosity DEBUG \
             -V ~{vcf} \
@@ -1774,7 +1774,7 @@ task TrainVariantAnnotationsModel {
         let mem_start=${mem_available}-2
         let mem_max=${mem_available}-2
 
-        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
+        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
             TrainVariantAnnotationsModel \
                 --verbosity DEBUG \
                 --annotations-hdf5 ~{annotation_hdf5} \
@@ -1907,7 +1907,7 @@ task ScoreVariantAnnotations {
         # Debugging:
         find .
 
-        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
+        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
             ScoreVariantAnnotations \
                 --verbosity DEBUG \
                 -V ~{vcf} \