refactor: make STAR task more user friendly (#205)

* WIP * Update star_alignment.json * Update star.wdl * Update star.wdl * Update sort_star_input.py * chore: point to branch for allignment image * Update star_alignment.json * Create rnaseq-standard-fastq.json * chore: update changelogs * chore: update param_meta * docs: changelog headers * chore: bump star image revision num * chore: point to future image * chore: bump version
stjudecloud · Jan 31, 2025 · afe5018 · afe5018
1 parent 3a04b6e
commit afe5018
Show file tree

Hide file tree

Showing 16 changed files with 85 additions and 37 deletions.
diff --git a/docker/star/package.json b/docker/star/package.json
@@ -1,5 +1,5 @@
 {
     "name": "star",
     "version": "2.7.11b",
-    "revision": "4"
+    "revision": "5"
 }
diff --git a/scripts/CHANGELOG.md b/scripts/CHANGELOG.md
@@ -8,12 +8,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 Any change to the `scripts/` directory should be accompanied by version increases in the `docker/` directory! If you are editing this file, please ensure these changes propagate!
 
-## Unreleased
+## 2025 January
 
 ### Added
 
-- Added `black` formatting and `pyright` validation for Python scripts [#201](https://github.com/stjudecloud/workflows/pull/201)
+- Added `black` formatting and `pyright` validation for Python scripts [#201](https://github.com/stjudecloud/workflows/pull/201).
 
 ### Fixed
 
-- Fixed processing of STAR inputs [#202](https://github.com/stjudecloud/workflows/pull/202)
+- Fixed processing of STAR inputs [#202](https://github.com/stjudecloud/workflows/pull/202).
+- _Actually_ fixed processing of STAR inputs [#205](https://github.com/stjudecloud/workflows/pull/205).
diff --git a/scripts/star/sort_star_input.py b/scripts/star/sort_star_input.py
@@ -53,7 +53,7 @@ def sort_read_groups(
     for rg in read_groups:
         if "ID" not in rg:
             raise SystemExit("Read group information must contain 'ID' key")
-        for flag in rg.split(","):
+        for flag in rg.split(" "):
             if "ID" in flag:
                 rgids.append(flag.split(":")[1])
 

diff --git a/tests/tools/input_json/star_alignment.json b/tests/tools/input_json/star_alignment.json
@@ -3,5 +3,5 @@
     "read_two_fastqs_gz": ["tests/input/test_R2.fq.gz"],
     "star_db_tar_gz": "tests/input/star_db.chrY_chrM.tar.gz",
     "prefix": "test",
-    "read_groups": "ID:test"
+    "read_groups": ["ID:test SM:test PL:ILLUMINA"]
 }
diff --git a/tests/tools/input_json/util_get_read_groups.json b/tests/tools/input_json/util_get_read_groups.json
@@ -1,4 +1,3 @@
 {
-    "bam": "https://github.com/stjude/CICERO/raw/master/test/data/input/test.bam",
-    "format_for_star": false
+    "bam": "https://github.com/stjude/CICERO/raw/master/test/data/input/test.bam"
 }
diff --git a/tests/tools/test_util.yaml b/tests/tools/test_util.yaml
@@ -13,19 +13,33 @@
       contains:
         - "MIT License"
 
-- name: get_read_groups
+- name: get_read_groups_without_clean
   tags:
     - miniwdl
     - util
   command: >-
-    miniwdl run --verbose -d test-output/. -i tests/tools/input_json/util_get_read_groups.json --task get_read_groups tools/util.wdl
+    miniwdl run --verbose -d test-output/. -i tests/tools/input_json/util_get_read_groups.json --task get_read_groups tools/util.wdl clean=false
   stdout:
       contains:
         - "@RG\\tID:1"
         - "@RG\\tID:2"
       must_not_contain:
         - "@RG\\tID:3"
 
+- name: get_read_groups_with_clean
+  tags:
+    - miniwdl
+    - util
+  command: >-
+    miniwdl run --verbose -d test-output/. -i tests/tools/input_json/util_get_read_groups.json --task get_read_groups tools/util.wdl clean=true
+  stdout:
+      contains:
+        - "ID:1"
+        - "ID:2"
+      must_not_contain:
+        - "@RG"
+
+
 - name: split_string
   tags:
     - miniwdl

diff --git a/tests/workflows/input_json/rnaseq-standard-fastq.json b/tests/workflows/input_json/rnaseq-standard-fastq.json
@@ -0,0 +1,15 @@
+{
+    "rnaseq_standard_fastq.read_one_fastqs_gz": ["tests/input/test_R1.fq.gz"],
+    "rnaseq_standard_fastq.read_two_fastqs_gz": ["tests/input/test_R2.fq.gz"],
+    "rnaseq_standard_fastq.read_groups": [{
+        "ID": "test",
+        "PI": 150,
+        "PL": "ILLUMINA",
+        "SM": "Sample",
+        "LB": "Sample"
+    }],
+    "rnaseq_standard_fastq.prefix": "test",
+    "rnaseq_standard_fastq.gtf": "tests/input/gencode.v31.chrY_chrM.gtf.gz",
+    "rnaseq_standard_fastq.star_db": "tests/input/star_db.chrY_chrM.tar.gz",
+    "rnaseq_standard_fastq.strandedness": "Unstranded"
+}
diff --git a/tools/CHANGELOG.md b/tools/CHANGELOG.md
@@ -4,4 +4,9 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
-## Unreleased
+## 2025 January
+
+### Changed
+
+- `util.get_read_groups` had the param `format_for_star` reworked to a more generic `clean` parameter [#205](https://github.com/stjudecloud/workflows/pull/205).
+- `star.alignment` now takes an `Array[String]` for `read_groups` instead of a single `String` [#205](https://github.com/stjudecloud/workflows/pull/205).
diff --git a/tools/star.wdl b/tools/star.wdl
@@ -127,7 +127,7 @@ task build_star_db {
         cpu: ncpu
         memory: "~{memory_gb} GB"
         disks: "~{disk_size_gb} GB"
-        container: "ghcr.io/stjudecloud/star:2.7.11b-4"
+        container: "ghcr.io/stjudecloud/star:2.7.11b-5"
         maxRetries: 1
     }
 }
@@ -148,7 +148,11 @@ task alignment {
         read_one_fastqs_gz: "An array of gzipped FASTQ files containing read one information"
         star_db_tar_gz: "A gzipped TAR file containing the STAR reference files. The name of the root directory which was archived must match the archive's filename without the `.tar.gz` extension."
         prefix: "Prefix for the BAM and other STAR files. The extensions `.Aligned.out.bam`, `.Log.final.out`, `.SJ.out.tab`, and `.Chimeric.out.junction` will be added."
-        read_groups: "A string containing the read group information to output in the BAM file. If including multiple read group fields per-read group, they should be space delimited. Read groups should be comma separated, with a space on each side (i.e. ' , '). The ID field must come first for each read group and must be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. Example: `ID:rg1 PU:flowcell1.lane1 SM:sample1 PL:illumina LB:sample1_lib1 , ID:rg2 PU:flowcell1.lane2 SM:sample1 PL:illumina LB:sample1_lib1`. These two read groups could be associated with the following four FASTQs: `sample1.rg1.R1.fastq,sample1.rg2.R1.fastq` and `sample1.rg1.R2.fastq,sample1.rg2.R2.fastq`"
+        read_groups: {
+            description: "An array of `String`s where each `String` corresponds to one read group.",
+            help: "Each read group string should start with the `ID` field followed by any other read group fields, where fields are delimited by a space. See `../data_structures/read_group.wdl` for information about possible fields and utility tasks for constructing, validating, and \"stringifying\" read groups.",
+            warning: "The `ID` field for each read group _must_ be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. Example: `[\"ID:rg1 PU:flowcell1.lane1 SM:sample1 PL:illumina LB:sample1_lib1\", \"ID:rg2 PU:flowcell1.lane2 SM:sample1 PL:illumina LB:sample1_lib1\"]`. These two read groups could be associated with the following four FASTQs: `[\"sample1.rg1.R1.fastq\", \"sample1.rg2.R1.fastq\"]` and `[\"sample1.rg1.R2.fastq\", \"sample1.rg2.R2.fastq\"]`",
+        }
         read_two_fastqs_gz: {
             description: "An array of gzipped FASTQ files containing read two information",
             group: "common",
@@ -477,8 +481,8 @@ task alignment {
         File star_db_tar_gz
         Array[File] read_one_fastqs_gz
         String prefix
-        String? read_groups
         Array[File] read_two_fastqs_gz = []
+        Array[String] read_groups = []
         Array[Int] out_sj_filter_intron_max_vs_read_n = [50000, 100000, 200000]
         SpliceJunctionMotifs out_sj_filter_overhang_min = SpliceJunctionMotifs {
             noncanonical_motifs: 30,
@@ -640,12 +644,12 @@ task alignment {
             --read-one-fastqs "~{sep(",", read_one_fastqs_gz)}" \
             ~{(
                 if (length(read_two_fastqs_gz) != 0)
-                then "--read-two-fastqs '~{sep(",", (read_two_fastqs_gz))}'"
+                then "--read-two-fastqs '~{sep(",", read_two_fastqs_gz)}'"
                 else ""
             )} \
             ~{(
-                if defined(read_groups)
-                then "--read-groups '~{read_groups}'"
+                if (length(read_groups) != 0)
+                then "--read-groups '~{sep(" , ", read_groups)}'"
                 else ""
             )}
 
@@ -829,7 +833,7 @@ task alignment {
         cpu: ncpu
         memory: "50 GB"
         disks: "~{disk_size_gb} GB"
-        container: "ghcr.io/stjudecloud/star:2.7.11b-4"
+        container: "ghcr.io/stjudecloud/star:2.7.11b-5"
         maxRetries: 1
     }
 }

diff --git a/tools/util.wdl b/tools/util.wdl
@@ -51,7 +51,7 @@ task get_read_groups {
     meta {
         description: "Gets read group information from a BAM file and writes it out as a string"
         outputs: {
-            read_groups: "An array of strings containing read group information. If `format_for_star = true`, all found read groups are contained in one string (`read_groups[0]`). If `format_for_star = false`, each found @RG line will be its own entry in output array `read_groups`."
+            read_groups: "An array of strings containing read group information. If `clean = true`, the `@RG\t` prefix is stripped and tabs are replaced with spaces. If `clean = false`, each unmodified @RG line will be its own entry in output array `read_groups`."
         }
     }
 
@@ -60,16 +60,17 @@ task get_read_groups {
             description: "Input BAM format file to get read groups from",
             stream: true,
         }
-        format_for_star: {
-            description: "Format read group information for the STAR aligner (true) or output @RG lines of the header without further processing (false)? STAR formatted results will be an array of length 1, where all found read groups are contained in one string (`read_groups[0]`). If no processing is selected, each found @RG line will be its own entry in output array `read_groups`.",
+        clean: {
+            description: "Clean @RG lines to remove the `@RG\t` prefix and use spaces instead of tabs (true) or output @RG lines of the header without further processing (false)?",
+            help: "`clean = true` output matches the formatting of the `read_group_to_string` task in `../data_structures/read_group.wdl`",
             group: "common",
         }
         modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
     }
 
     input {
         File bam
-        Boolean format_for_star = true
+        Boolean clean = true
         Int modify_disk_size_gb = 0
     }
 
@@ -79,13 +80,12 @@ task get_read_groups {
     command <<<
         set -euo pipefail
 
-        if ~{format_for_star}; then
+        if ~{clean}; then
             samtools view -H ~{bam} \
                 | grep "^@RG" \
                 | cut -f 2- \
                 | sed -e 's/\t/ /g' \
-                | awk '{print}' ORS=' , ' \
-                | sed 's/ , $//' > read_groups.txt
+                > read_groups.txt
         else
             samtools view -H ~{bam} | grep "^@RG" > read_groups.txt
         fi

diff --git a/workflows/chipseq/chipseq-standard.wdl b/workflows/chipseq/chipseq-standard.wdl
@@ -65,7 +65,7 @@ workflow chipseq_standard {
 
     call util.get_read_groups { input:
         bam = selected_bam,
-        format_for_star = false,
+        clean = false,
     }
 
     call b2fq.bam_to_fastqs { input:

diff --git a/workflows/methylation/CHANGELOG.md b/workflows/methylation/CHANGELOG.md
@@ -0,0 +1,7 @@
+# Change Log
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/).
+
+## Unreleased
diff --git a/workflows/rnaseq/CHANGELOG.md b/workflows/rnaseq/CHANGELOG.md
@@ -4,4 +4,8 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
-## Unreleased
+## 2025 January
+
+### Changed
+
+- `rnaseq-core` now takes an `Array[String]` for the `read_groups` param instead of a single `String` [#205](https://github.com/stjudecloud/workflows/pull/205).
diff --git a/workflows/rnaseq/rnaseq-core.wdl b/workflows/rnaseq/rnaseq-core.wdl
@@ -28,7 +28,11 @@ workflow rnaseq_core {
         read_two_fastqs_gz: "Input gzipped FASTQ format file(s) with 2nd read in pair to align"
         gtf: "Gzipped GTF feature file"
         star_db: "Database of reference files for the STAR aligner. The name of the root directory which was archived must match the archive's filename without the `.tar.gz` extension. Can be generated by `star-db-build.wdl`"
-        read_groups: "A string containing the read group information to output in the BAM file. If including multiple read group fields per-read group, they should be space delimited. Read groups should be comma separated, with a space on each side (i.e. ' , '). The ID field must come first for each read group and must be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. Example: `ID:rg1 PU:flowcell1.lane1 SM:sample1 PL:illumina LB:sample1_lib1 , ID:rg2 PU:flowcell1.lane2 SM:sample1 PL:illumina LB:sample1_lib1`. These two read groups could be associated with the following four FASTQs: `sample1.rg1.R1.fastq,sample1.rg2.R1.fastq` and `sample1.rg1.R2.fastq,sample1.rg2.R2.fastq`"
+        read_groups: {
+            description: "An array of `String`s where each `String` corresponds to one read group.",
+            help: "Each read group string should start with the `ID` field followed by any other read group fields, where fields are delimited by a space. See `../data_structures/read_group.wdl` for information about possible fields and utility tasks for constructing, validating, and \"stringifying\" read groups.",
+            warning: "The `ID` field for each read group _must_ be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. Example: `[\"ID:rg1 PU:flowcell1.lane1 SM:sample1 PL:illumina LB:sample1_lib1\", \"ID:rg2 PU:flowcell1.lane2 SM:sample1 PL:illumina LB:sample1_lib1\"]`. These two read groups could be associated with the following four FASTQs: `[\"sample1.rg1.R1.fastq\", \"sample1.rg2.R1.fastq\"]` and `[\"sample1.rg1.R2.fastq\", \"sample1.rg2.R2.fastq\"]`",
+        }
         prefix: "Prefix for output files"
         contaminant_db: "A compressed reference database corresponding to the aligner chosen with `xenocp_aligner` for the contaminant genome"
         align_sj_stitch_mismatch_n_max: {
@@ -118,7 +122,7 @@ workflow rnaseq_core {
         File star_db
         Array[File] read_one_fastqs_gz
         Array[File] read_two_fastqs_gz
-        String read_groups
+        Array[String] read_groups
         String prefix
         File? contaminant_db
         SpliceJunctionMotifs align_sj_stitch_mismatch_n_max = SpliceJunctionMotifs {

diff --git a/workflows/rnaseq/rnaseq-standard-fastq.wdl b/workflows/rnaseq/rnaseq-standard-fastq.wdl
@@ -117,9 +117,6 @@ workflow rnaseq_standard_fastq {
     scatter (rg in read_groups) {
         call read_group.read_group_to_string after parse_input { input: read_group = rg }
     }
-    String stringified_read_groups = sep(
-        " , ", read_group_to_string.stringified_read_group
-    )
 
     if (validate_input){
         scatter (reads in zip(read_one_fastqs_gz, read_two_fastqs_gz)) {
@@ -154,7 +151,7 @@ workflow rnaseq_standard_fastq {
     call rnaseq_core_wf.rnaseq_core { input:
         read_one_fastqs_gz = selected_read_one_fastqs,
         read_two_fastqs_gz = selected_read_two_fastqs,
-        read_groups = stringified_read_groups,
+        read_groups = read_group_to_string.stringified_read_group,
         prefix,
         gtf,
         star_db,

diff --git a/workflows/rnaseq/rnaseq-standard.wdl b/workflows/rnaseq/rnaseq-standard.wdl
@@ -92,7 +92,7 @@ workflow rnaseq_standard {
 
     call util.get_read_groups after parse_input { input:
         bam = selected_bam,
-        format_for_star = true,  # matches default but prevents user from overriding
+        clean = true,  # matches default but prevents user from overriding
     }
     call bam_to_fastqs_wf.bam_to_fastqs after parse_input { input:
         bam = selected_bam,
@@ -103,9 +103,7 @@ workflow rnaseq_standard {
     call rnaseq_core_wf.rnaseq_core { input:
         read_one_fastqs_gz = bam_to_fastqs.read1s,
         read_two_fastqs_gz = select_all(bam_to_fastqs.read2s),
-        # format_for_star=true in get_read_groups puts
-        # all found RG info in read_groups[0]
-        read_groups = get_read_groups.read_groups[0],
+        read_groups = get_read_groups.read_groups,
         prefix,
         gtf,
         star_db,