Chipseq updates #111 (Open)

Wants to merge 7 commits into base: main
104 changes: 92 additions & 12 deletions tools/samtools.wdl
@@ -431,7 +431,7 @@ task collate {

task bam_to_fastq {
meta {
description: "This WDL task runs `samtools fastq` on the input BAM file. Splits the BAM into FASTQ files. Assumes either a name sorted or collated BAM. For splitting a position sorted BAM see `collate_to_fastq`."
description: "This WDL task runs `samtools fastq` on the input BAM file. Converts the BAM into FASTQ files. If paired-end = false, then all reads in the BAM will be output to a single FASTQ file. Use filtering arguments to remove any unwanted reads. Assumes either a name sorted or collated BAM. For splitting a position sorted BAM see `collate_to_fastq`."
outputs: {
read_one_fastq_gz: "Gzipped FASTQ file with 1st reads in pair"
read_two_fastq_gz: "Gzipped FASTQ file with 2nd reads in pair"
@@ -446,9 +446,10 @@ task bam_to_fastq {
f: "Only output alignments with all bits set in INT present in the FLAG field. INT can be specified in hex by beginning with `0x` (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0` (i.e. /^0[0-7]+/)."
F: "Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified in hex by beginning with `0x` (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0` (i.e. /^0[0-7]+/). This defaults to 0x900 representing filtering of secondary and supplementary alignments."
G: "Only EXCLUDE reads with all of the bits set in INT present in the FLAG field. INT can be specified in hex by beginning with `0x` (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0` (i.e. /^0[0-7]+/)."
paired_end: "Is the data paired-end?"
append_read_number: "Append /1 and /2 suffixes to read names"
interleaved: "Create an interleaved FASTQ file from paired-end data?"
output_singletons: "Output singleton reads as their own FASTQ?"
paired_end: "Is the data paired-end?"
ncpu: "Number of cores to allocate for task"
memory_gb: "RAM to allocate for task, specified in GB"
modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
@@ -462,9 +463,10 @@ task bam_to_fastq {
String f = "0"
String F = "0x900"
String G = "0"
Boolean paired_end = true
Boolean append_read_number = true
Boolean interleaved = false
Boolean output_singletons = false
Boolean paired_end = true
Comment on lines -465 to +469
Member:
Just curious, did you reorder these to be alphabetical? The style-guide leaves it up to the author how to order parameters of the same type. My process for ordering things is typically:

  1. follow the order of the underlying tool's documentation
  2. what feels most important for a user to see should be towards the top
    • obviously incredibly subjective here. But with this I'd say paired_end should be first in this block.

I'm fine with this being alphabetical, but I don't think any other inputs are. This will likely be the odd one out if left alphabetical, but we intentionally punted the question of "order once type order is satisfied", so I can't complain about anything you do here.

Member Author:
I did choose alphabetical. That said, in this case I think it makes sense, as paired_end is an input for the task, whereas the other arguments more closely correspond to inputs of the underlying tool.

Member:
Ok. The style-guide leaves it up to individual authors, so if you want to write tasks using alphabetic sort at this level, that's fine with me. But I'm going to continue using my (highly subjective) method of sorting inputs, so there will be some degree of inconsistency. I think that's fine. The fine-grained sorting here is fairly inconsequential.

Boolean use_all_cores = false
Int ncpu = 1
Int memory_gb = 4
@@ -488,22 +490,35 @@ task bam_to_fastq {
-f ~{f} \
-F ~{F} \
-G ~{G} \
-1 ~{if interleaved
then prefix + ".fastq.gz"
else prefix + "_R1.fastq.gz"
~{if append_read_number
then "-N"
else "-n"
} \
-1 ~{
if paired_end then (
if interleaved then prefix + ".fastq.gz" else prefix + "_R1.fastq.gz"
)
else prefix + ".fastq.gz"
} \
-2 ~{
if paired_end then (
if interleaved then prefix + ".fastq.gz" else prefix + "_R2.fastq.gz"
)
else prefix + ".fastq.gz"
} \
~{
if paired_end then (
if output_singletons
then "-s " + prefix+".singleton.fastq.gz"
else "-s junk.singleton.fastq.gz"
)
else ""
} \
-s ~{
if output_singletons
then prefix+".singleton.fastq.gz"
else "/dev/null"
-0 ~{
if paired_end
then "junk.unknown_bit_setting.fastq.gz"
else prefix + ".fastq.gz"
} \
-0 /dev/null \
~{bam}
>>>
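For orientation, a minimal sketch of calling this task in each mode (the samtools import alias and the collated_pe_bam / collated_se_bam declarations are assumptions for illustration, not part of this diff):

    call samtools.bam_to_fastq as pe_fastqs { input:
        bam=collated_pe_bam,
        paired_end=true    # default; writes <prefix>_R1.fastq.gz and <prefix>_R2.fastq.gz
    }
    call samtools.bam_to_fastq as se_fastqs { input:
        bam=collated_se_bam,
        paired_end=false   # all reads end up in a single <prefix>.fastq.gz
    }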

@@ -637,3 +652,68 @@ task collate_to_fastq {
maxRetries: max_retries
}
}

task fixmate {
meta {
description: "This WDL task runs Samtools fixmate on the input BAM file. This fills in mate coordinates and insert size fields."
}

parameter_meta {
bam: "Input BAM format file to add mate information. Must be name-sorted or name-collated."
prefix: "Prefix for the output file. The extension specified with the `extension` parameter will be added."
extension: {
description: "File format extension to use for output file.",
choices: [
".sam",
".bam",
".cram"
]
}
add_cigar: "Add template cigar ct tag"
add_mate_score: "Add mate score tags. These are used by markdup to select the best reads to keep."
disable_proper_pair_check: "Disable proper pair check [ensure one forward and one reverse read in each pair]"
remove_unaligned_and_secondary: "Remove unmapped and secondary reads"
ncpu: "Number of cores to allocate for task"
memory_gb: "RAM to allocate for task, specified in GB"
modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
max_retries: "Number of times to retry in case of failure"
}

input {
File bam
String prefix = basename(bam, ".bam") + ".fixmate"
String extension = ".bam"
Boolean add_cigar = true
Boolean add_mate_score = true
Boolean disable_proper_pair_check = false
Boolean remove_unaligned_and_secondary = false
Int ncpu = 1
Int memory_gb = 4
Int modify_disk_size_gb = 0
Int max_retries = 1
}

Float bam_size = size(bam, "GiB")
Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb

command <<<
samtools fixmate \
~{if remove_unaligned_and_secondary then "-r" else ""} \
~{if disable_proper_pair_check then "-p" else ""} \
~{if add_cigar then "-c" else ""} \
~{if add_mate_score then "-m" else ""} \
~{bam} ~{prefix}~{extension}
>>>

output {
File fixmate_bam = "~{prefix}~{extension}"
}

runtime {
cpu: ncpu
memory: "~{memory_gb} GB"
disk: "~{disk_size_gb} GB"
docker: 'quay.io/biocontainers/samtools:1.16.1--h6899075_1'
maxRetries: max_retries
}
}
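A minimal usage sketch for the new task (the samtools import alias and the name_collated_bam declaration are assumptions for illustration):

    call samtools.fixmate { input:
        bam=name_collated_bam,     # must be name-sorted or name-collated
        prefix="sample.fixmate",   # output file will be sample.fixmate.bam
        add_mate_score=true        # default; mate score tags are what markdup uses to pick the best reads
    }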
5 changes: 4 additions & 1 deletion tools/util.wdl
@@ -322,7 +322,10 @@ task add_to_bam_header {
String outfile_name = prefix + ".bam"

command <<<
samtools view -H ~{bam} > header.sam
set -euo pipefail
# Remove trailing tab characters that some tools (bowtie) add
# to header records. Also skip adding a @PG record for this operation.
samtools view --no-PG -H ~{bam} | sed 's/\t$//' > header.sam
echo "~{additional_header}" >> header.sam
samtools reheader -P header.sam ~{bam} > ~{outfile_name}
>>>
43 changes: 28 additions & 15 deletions workflows/chipseq/chipseq-standard.wdl
@@ -36,14 +36,15 @@ import "../../tools/picard.wdl"
import "../../tools/samtools.wdl"
import "../../tools/util.wdl"
import "../general/bam-to-fastqs.wdl" as b2fq
import "https://raw.githubusercontent.com/stjude/seaseq/2.3/workflows/workflows/mapping.wdl" as seaseq_map
import "https://raw.githubusercontent.com/stjude/seaseq/3.0/workflows/tasks/samtools.wdl" as seaseq_samtools
import "https://raw.githubusercontent.com/stjude/seaseq/3.0/workflows/tasks/seaseq_util.wdl" as seaseq_util
import "https://raw.githubusercontent.com/stjude/seaseq/3.1/workflows/workflows/mapping.wdl" as seaseq_map
import "https://raw.githubusercontent.com/stjude/seaseq/3.1/workflows/tasks/samtools.wdl" as seaseq_samtools
import "https://raw.githubusercontent.com/stjude/seaseq/3.1/workflows/tasks/seaseq_util.wdl" as seaseq_util

workflow chipseq_standard {
parameter_meta {
bam: "Input BAM format file to realign with bowtie"
bowtie_indexes: "Database of v1 reference files for the bowtie aligner. Can be generated with https://github.com/stjude/seaseq/blob/master/workflows/tasks/bowtie.wdl. [*.ebwt]"
paired_end: "Is the data paired-end (true) or single-end (false)?"
excludelist: "Optional list of regions that will be excluded after reference alignment"
prefix: "Prefix for output files"
validate_input: "Run Picard ValidateSamFile on the input BAM"
Expand All @@ -55,6 +56,7 @@ workflow chipseq_standard {
input {
File bam
Array[File] bowtie_indexes
Boolean paired_end = false
File? excludelist
String prefix = basename(bam, ".bam")
Boolean validate_input = true
@@ -89,7 +91,7 @@

call b2fq.bam_to_fastqs { input:
bam=selected_bam,
paired_end=false,
paired_end=paired_end,
use_all_cores=use_all_cores,
max_retries=max_retries
}
@@ -105,29 +107,31 @@
max_retries=max_retries
}

scatter (pair in zip(bam_to_fastqs.read1s, read_groups)){
scatter (tuple in zip(zip(bam_to_fastqs.read1s, bam_to_fastqs.read2s), read_groups)){
Member:
Are you positive this works for all combinations of inputs? I thought WDL enforced "every array in a zip must be of equal length". I think this line would break under certain circumstances. Or maybe my understanding of zip is wrong.

Member Author:
I need to run some additional test data through, but for both single-end and paired-end data so far, that argument works.

Member:
Can you point me to an input JSON you're using for both PE and SE? I want to walk myself through some concrete examples, because I'm confused how this zip here is working. I'm sure once I can see the "filled in" logic it'll be clear.

Member Author:
Sure. I just moved these from my home directory. I did a sed to update the paths. Let me know if I missed one.

/research_jude/rgs01_jude/groups/zhanggrp/projects/SJCloud/common/athrashe/chipseq_workflow/chipseq_input.json
/research_jude/rgs01_jude/groups/zhanggrp/projects/SJCloud/common/athrashe/chipseq_workflow/chipseq_paired_input.json
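For anyone following along, a self-contained sketch of the zip() behavior under discussion (hypothetical values; assumes WDL 1.1, where zip() requires equal-length arrays and None placeholders are permitted in an Array[File?]):

    version 1.1

    workflow zip_demo {
        # Two shards of single-end data: the R2 slots are None, but the array still has
        # the same length as read1s, so both zip() calls below are legal.
        Array[File] read1s = ["shard0_R1.fastq.gz", "shard1_R1.fastq.gz"]
        Array[File?] read2s = [None, None]
        Array[String] read_groups = ["@RG\tID:shard0", "@RG\tID:shard1"]

        scatter (tuple in zip(zip(read1s, read2s), read_groups)) {
            File r1 = tuple.left.left      # R1 of the nested pair
            File? r2 = tuple.left.right    # R2, or None for single-end data
            String rg = tuple.right
        }
    }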

call seaseq_util.basicfastqstats as basic_stats { input:
fastqfile=pair.left
fastqfile=tuple.left.left
}
call seaseq_map.mapping as bowtie_single_end_mapping { input:
fastqfile=pair.left,
call seaseq_map.mapping as bowtie_mapping { input:
fastqfile=tuple.left.left, # the FASTQ pair is the left element of the outer pair; within that nested pair, R1 is the left and R2 is the right
fastqfile_R2=tuple.left.right,
index_files=bowtie_indexes,
metricsfile=basic_stats.metrics_out,
blacklist=excludelist
blacklist=excludelist,
paired_end=paired_end
}
File chosen_bam = select_first(
[
bowtie_single_end_mapping.bklist_bam,
bowtie_single_end_mapping.mkdup_bam,
bowtie_single_end_mapping.sorted_bam
bowtie_mapping.bklist_bam,
bowtie_mapping.mkdup_bam,
bowtie_mapping.sorted_bam
]
)
call util.add_to_bam_header { input:
bam=chosen_bam,
additional_header=pair.right,
additional_header=tuple.right,
max_retries=max_retries
}
String rg_id_field = sub(sub(pair.right, ".*ID:", "ID:"), "\t.*", "")
String rg_id_field = sub(sub(tuple.right, ".*ID:", "ID:"), "\t.*", "")
String rg_id = sub(rg_id_field, "ID:", "")
call samtools.addreplacerg as single_end { input:
Member Author:
Rename single_end to something else

bam=add_to_bam_header.reheadered_bam,
@@ -159,7 +163,16 @@
use_all_cores=use_all_cores,
max_retries=max_retries
}
call picard.validate_bam { input: bam=markdup.mkdupbam, max_retries=max_retries }
call picard.validate_bam { input:
bam=markdup.mkdupbam,
ignore_list=["MISSING_PLATFORM_VALUE",
"INVALID_PLATFORM_VALUE",
"INVALID_MAPPING_QUALITY",
"MATES_ARE_SAME_END",
"MISMATCH_FLAG_MATE_NEG_STRAND",
"MISMATCH_MATE_ALIGNMENT_START"],
Comment on lines +170 to +173
Member:
These highlighted 4 feel like things that shouldn't be ignored. Are BAMs really coming out of SEAseq with all these problems? Should that be addressed instead of ignored here?

Member Author:
It's an issue with bowtie. It looks like bowtie is emitting multiple alignments that have equal alignment scores. Instead of picking one to be primary, it just outputs all of them without marking any as secondary. Then Picard fails to match the read records properly. It should be able to use the mate start position information to find the proper mate, but it doesn't. So you get these errors.

Member:
Interesting. Is this a known and logged bug in bowtie? Is the SEAseq team aware this is happening? And is this why you started implementing samtools fixmate? Would that fix these errors? (sorry for the bombardment of questions)

I'd much rather fix these errors than simply ignore them. If I'm understanding correctly, this update (and possibly the current version) outputs BAMs which don't follow the SPEC. I'm not a fan of passing the buck off to bowtie and saying "not a problem with our code". These BAMs will likely fail downstream. I don't think I'm comfortable just glossing over these errors.

My two cents: we either need an update/rollback on bowtie so this isn't a problem in the first place, or we need a tool like samtools fixmate to address the bowtie problems before we can call these BAMs "harmonized".

Member Author:
I believe the solution is to use bowtie2. I don't think there is any support or development of bowtie at this point. I started implementing samtools fixmate for a different issue, so I didn't try it here.

max_retries=max_retries
}

call md5sum.compute_checksum { input:
file=markdup.mkdupbam,
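Not part of this changeset, but to make the suggestion in the thread above concrete: if samtools fixmate does repair the mate records that bowtie leaves inconsistent, it could be wired in on a name-collated BAM ahead of duplicate marking and validation instead of growing ignore_list. The wiring point and the collate output name are hypothetical:

    call samtools.collate as collate_for_fixmate { input:
        bam=chosen_bam,
        max_retries=max_retries
    }
    call samtools.fixmate { input:
        bam=collate_for_fixmate.collated_bam,   # fixmate requires name order; output name is hypothetical
        max_retries=max_retries
    }
    # ...coordinate-sort the fixmate output, then run markdup and validate_bam on the repaired BAM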
43 changes: 32 additions & 11 deletions workflows/general/bam-to-fastqs.wdl
@@ -54,18 +54,39 @@ workflow bam_to_fastqs {

call samtools.quickcheck { input: bam=bam, max_retries=max_retries }
call samtools.split { input: bam=bam, use_all_cores=use_all_cores, max_retries=max_retries }
scatter (split_bam in split.split_bams) {
call samtools.collate_to_fastq as bam_to_fastq { input:
bam=split_bam,
paired_end=paired_end,
interleaved=false, # matches default but prevents user from overriding
use_all_cores=use_all_cores,
max_retries=max_retries

if (paired_end){
scatter (split_bam in split.split_bams) {
call samtools.collate_to_fastq as bam_to_fastq { input:
bam=split_bam,
paired_end=paired_end,
interleaved=false, # matches default but prevents user from overriding
use_all_cores=use_all_cores,
max_retries=max_retries
}
}
}

scatter (reads in
zip(bam_to_fastq.read_one_fastq_gz, bam_to_fastq.read_two_fastq_gz)
if (!paired_end){
scatter (split_bam in split.split_bams) {
call samtools.bam_to_fastq as bam_to_fastq_se { input:
bam=split_bam,
paired_end=paired_end,
interleaved=false, # matches default but prevents user from overriding
output_singletons=true,
use_all_cores=use_all_cores,
max_retries=max_retries
}
}
}

Array[File?] r1 = select_first([bam_to_fastq.read_one_fastq_gz, bam_to_fastq_se.interleaved_reads_fastq_gz])
Array[File] read1s_ = select_all(r1)

Array[File?] read2s_ = select_first([bam_to_fastq.read_two_fastq_gz, bam_to_fastq_se.read_two_fastq_gz])

scatter (reads in
zip(read1s_, read2s_)
Member:
Same question about zip
https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#arraypairxy-ziparrayx-arrayy

Seems to me this would break in SE mode

Member Author:
It was already using a zip. I just changed the parameters. I think the underlying bam-to-fastq logic emits an Array[File?]. The array would have the correct length, it would just be full of empty elements.

) {
call fq.fqlint { input:
read_one_fastq=select_first([reads.left, "undefined"]),
@@ -75,7 +96,7 @@
}

output {
Array[File] read1s = select_all(bam_to_fastq.read_one_fastq_gz)
Array[File?] read2s = bam_to_fastq.read_two_fastq_gz
Array[File] read1s = read1s_
Array[File?] read2s = read2s_
}
}
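As a final illustration of how the optional arrays above resolve, assume a paired-end run with two split BAMs (values are hypothetical; the single-end scatter never executes, so its outputs are undefined):

    # bam_to_fastq.read_one_fastq_gz              -> ["shard0_R1.fastq.gz", "shard1_R1.fastq.gz"]
    # bam_to_fastq_se.interleaved_reads_fastq_gz  -> None  (the if (!paired_end) block did not run)
    Array[File?] r1 = select_first([bam_to_fastq.read_one_fastq_gz, bam_to_fastq_se.interleaved_reads_fastq_gz])
    #   -> ["shard0_R1.fastq.gz", "shard1_R1.fastq.gz"]   (first defined value wins)
    Array[File] read1s_ = select_all(r1)
    #   -> the same array; select_all only drops elements that are None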