diff --git a/.codespellignore b/.codespellignore
new file mode 100644
index 00000000..24717958
--- /dev/null
+++ b/.codespellignore
@@ -0,0 +1,5 @@
+RNAseq
+OTU
+otu
+groupD
+groupd
\ No newline at end of file
diff --git a/.github/workflows/check_typos.yml b/.github/workflows/check_typos.yml
index 600f3d3d..1bea1476 100644
--- a/.github/workflows/check_typos.yml
+++ b/.github/workflows/check_typos.yml
@@ -15,8 +15,9 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@v3
- - uses: codespell-project/actions-codespell@master
+ - uses: codespell-project/codespell-problem-matcher@v1
+ - uses: codespell-project/actions-codespell@v2.0
with:
check_filenames: true
- skip: "*.yml,*.cff,*.js,*.lock"
- ignore_words_list: RNAseq
+ skip: "*.yml,*.cff,*.js,*.lock,*.pdf,*.ipynb"
+ ignore_words_file: ".codespellignore"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..18c89177
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,97 @@
+name: CI Updated Modules Testing
+# This workflow runs nf-test on modules whose test configuration has changed, to check that each completes without errors
+on:
+ push:
+ branches:
+ - DEV_NF_RCP-F
+
+env:
+ NXF_ANSI_LOG: false
+
+jobs:
+ changes:
+ name: Check for changes
+ runs-on: ubuntu-latest
+ outputs:
+ # Expose matched filters as job 'modules' output variable
+ modules: ${{ steps.filter.outputs.changes }}
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: dorny/paths-filter@v2
+ id: filter
+ with:
+ base: 'DEV_NF_RCP-F'
+ filters: "RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml"
+
+ test:
+ name: ${{ matrix.tags }} ${{ matrix.profile }} ${{ matrix.NXF_VER }}
+ runs-on: ubuntu-latest
+ needs: changes
+ if: needs.changes.outputs.modules != '[]'
+ strategy:
+ matrix:
+ NXF_VER:
+ - "22.10.1"
+ - "latest-everything"
+ profile:
+ - "docker"
+ - "singularity"
+ tags: ["${{ fromJson(needs.changes.outputs.modules) }}"]
+
+ steps:
+ - name: Check out pipeline code
+ uses: actions/checkout@v3
+
+ - name: Install Nextflow
+ uses: nf-core/setup-nextflow@be72b1dc0f932cea69aef64479ac863a86516c0c
+ with:
+ version: "${{ matrix.NXF_VER }}"
+
+ - name: Set up Singularity
+ if: matrix.profile == 'singularity'
+ uses: eWaterCycle/setup-singularity@v5
+ with:
+ singularity-version: 3.7.1
+
+ - name: Install nf-test
+ id: nf-test
+ run: |
+ curl -fsSL https://code.askimed.com/install/nf-test | bash
+ chmod u+x nf-test
+ echo "bin_path=$(pwd)/nf-test" >> $GITHUB_OUTPUT
+
+
+ - name: Hash GitHub Workspace
+ id: hash_workspace
+ run: |
+ echo "digest=$(echo RNA_3.10.1_${{ github.workspace }} | md5sum | cut -c 1-25)" >> $GITHUB_OUTPUT
+
+ - name: Cache test data
+ id: cache-testdata
+ uses: actions/cache@v3
+ with:
+ path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets
+ key: ${{ steps.hash_workspace.outputs.digest }}
+
+ - name: Check out test data
+ if: steps.cache-testdata.outputs.cache-hit != 'true'
+ uses: actions/checkout@v3
+ with:
+ repository: J-81/test-datasets-extended
+ ref: NF_RCP-F
+ path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets
+
+ # Test the module
+ - name: Run nf-test
+ run: |
+ cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/
+ ${{ steps.nf-test.outputs.bin_path }} test \
+ --profile=${{ matrix.profile }} \
+ --tag ${{ matrix.tags }} \
+ --tap=test.tap
+
+ - uses: pcolby/tap-summary@v1
+ with:
+ path: >-
+ ${{ github.workspace }}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test.tap
\ No newline at end of file
diff --git a/.github/workflows/ci_minimal_full_pipeline.yml b/.github/workflows/ci_minimal_full_pipeline.yml
new file mode 100644
index 00000000..d0b4c44b
--- /dev/null
+++ b/.github/workflows/ci_minimal_full_pipeline.yml
@@ -0,0 +1,77 @@
+name: CI Minimal Dataset Full Pipeline
+# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
+on: push
+
+env:
+ NXF_ANSI_LOG: false
+
+jobs:
+ Minimal_Dataset_Full_Pipeline:
+ name: ${{ matrix.tags }} ${{ matrix.profile }} ${{ matrix.NXF_VER }}
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ NXF_VER:
+ - "22.10.1"
+ - "latest-everything"
+ profile:
+ - "docker"
+ - "singularity"
+
+ steps:
+ - name: Check out pipeline code
+ uses: actions/checkout@v3
+
+ - name: Install Nextflow
+ uses: nf-core/setup-nextflow@v1.3.0
+ with:
+ version: "${{ matrix.NXF_VER }}"
+
+ - name: Set up Singularity
+ if: matrix.profile == 'singularity'
+ uses: eWaterCycle/setup-singularity@v5
+ with:
+ singularity-version: 3.7.1
+
+ - name: Install nf-test
+ id: nf-test
+ run: |
+ curl -fsSL https://code.askimed.com/install/nf-test | bash
+ chmod u+x nf-test
+ echo "bin_path=$(pwd)/nf-test" >> $GITHUB_OUTPUT
+
+
+ - name: Hash GitHub Workspace
+ id: hash_workspace
+ run: |
+ echo "digest=$(echo RNA_3.10.1_${{ github.workspace }} | md5sum | cut -c 1-25)" >> $GITHUB_OUTPUT
+
+ - name: Cache test data
+ id: cache-testdata
+ uses: actions/cache@v3
+ with:
+ path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets
+ key: ${{ steps.hash_workspace.outputs.digest }}
+
+ - name: Check out test data
+ if: steps.cache-testdata.outputs.cache-hit != 'true'
+ uses: actions/checkout@v3
+ with:
+ repository: J-81/test-datasets-extended
+ ref: NF_RCP-F
+ path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets
+
+ # Test the module
+ - name: Run nf-test on minimal core test datasets
+ run: |
+ cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/
+ ${{ steps.nf-test.outputs.bin_path }} test \
+ --profile=${{ matrix.profile }} \
+ --tag core \
+ --tap=test.tap \
+ tests/*.test
+
+ - uses: pcolby/tap-summary@v1
+ with:
+ path: >-
+ ${{ github.workspace }}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test.tap
\ No newline at end of file
diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml
index 04a60dc9..774d6745 100644
--- a/.github/workflows/markdown-link-check.yml
+++ b/.github/workflows/markdown-link-check.yml
@@ -1,7 +1,8 @@
name: Check Markdown links
-on: push
-
+on:
+ create: # runs when a reference (branch or tag) is created
+
jobs:
markdown-link-check:
runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..c05e0a71
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.nextflow
+*.pyc
\ No newline at end of file
diff --git a/.gitpod.yml b/.gitpod.yml
new file mode 100644
index 00000000..cd0f7c9f
--- /dev/null
+++ b/.gitpod.yml
@@ -0,0 +1,13 @@
+image: nfcore/gitpod:latest
+
+vscode:
+ extensions:
+ - ms-python.python
+ - eamodio.gitlens
+ - GitHub.copilot
+ - REditorSupport.r
+ - esbenp.prettier-vscode # Code formatter, including Markdown/CommonMark style checking
+ - mechatroner.rainbow-csv # Highlight columns in csv files in different colors
+ - nextflow.nextflow # Nextflow syntax highlighting
+ - oderwat.indent-rainbow # Highlight indentation level
+ - streetsidesoftware.code-spell-checker # Spelling checker for source code
diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md
index 363d9a36..7db21261 100644
--- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md
+++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md
@@ -43,6 +43,7 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform
- Fixed rare edge case where groupwise mean and standard deviations could become misassociated to incorrect groups. This had affected [step 9f](#9f-prepare-genelab-dge-tables-with-annotations-on-datasets-with-ercc-spike-in) and [step 9i](#9i-prepare-genelab-dge-tables-with-annotations-on-datasets-without-ercc-spike-in).
+- [Step 2a](#2a-trimfilter-raw-data) adapter type argument removed in favor of TrimGalore!'s built-in adapter [autodetection](https://github.com/FelixKrueger/TrimGalore/blob/0.6.7/Docs/Trim_Galore_User_Guide.md#adapter-auto-detection).
---
# Table of contents
@@ -122,7 +123,7 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform
|tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)|
|tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)|
|stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)|
-|dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
+|dp_tools|1.3.3|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
|pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)|
|seaborn|0.12.0|[https://seaborn.pydata.org/](https://seaborn.pydata.org/)|
|matplotlib|3.6.0|[https://matplotlib.org/stable](https://matplotlib.org/stable)|
@@ -204,7 +205,6 @@ trim_galore --gzip \
--path_to_cutadapt /path/to/cutadapt \
--cores NumberOfThreads \
--phred33 \
- --illumina \ # if adapters are not illumina, replace with adapters used
--output_dir /path/to/TrimGalore/output/directory \
--paired \ # only for PE studies, remove this parameter if raw data are SE
sample1_R1_raw.fastq.gz sample1_R2_raw.fastq.gz sample2_R1_raw.fastq.gz sample2_R2_raw.fastq.gz
@@ -218,7 +218,6 @@ trim_galore --gzip \
- `--path_to_cutadapt` - specify path to cutadapt software if it is not in your `$PATH`
- `--cores` - specify the number of threads available on the server node to perform trimming
- `--phred33` - instructs cutadapt to use ASCII+33 quality scores as Phred scores for quality trimming
-- `--illumina` - defines the adapter sequence to be trimmed as the first 13bp of the Illumina universal adapter `AGATCGGAAGAGC`
- `--output_dir` - the output directory to store results
- `--paired` - indicates paired-end reads - both reads, forward (R1) and reverse (R2) must pass length threshold or else both reads are removed
- `sample1_R1_raw.fastq.gz sample1_R2_raw.fastq.gz sample2_R1_raw.fastq.gz sample2_R2_raw.fastq.gz` – the input reads are specified as a positional argument, paired-end read files are listed pairwise such that the forward reads (*R1_raw.fastq.gz) are immediately followed by the respective reverse reads (*R2_raw.fastq.gz) for each sample
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md b/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md
index eef3c524..a89e3ab6 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+
+### Added
+- GitHub Actions support for CI testing (a83006ba91b1209e1857fefd96e9ff950ebb0cdc)
+
+### Fixed
+- Workflow usage files now all follow the output directory set by the workflow user (3e69f06432f62b7924d2e043ef4768c5d09bf614)
+
+### Changed
+- TrimGalore! now uses autodetection for the adapter type (3b7e0bab4017e90481359c48f9cf7c8837ed54d2)
+- V&V migrated from dp_tools version 1.1.8 to 1.3.2 including:
+ - Migration of V&V protocol code to this codebase instead of dp_tools (b3684a4c1db5df06eab20916ef7e130c410c147c)
+  - Fix for sample-wise checks reusing the same samples (dca4fdad7518ac9ead3ee2e4c5f57ac0fe25c715)
+
## [1.0.3](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.3/RNAseq/Workflow_Documentation/NF_RCP-F) - 2023-01-25
### Added
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md
index 25d4f1a7..1f564486 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md
@@ -101,9 +101,9 @@ All files required for utilizing the NF_RCP-F GeneLab workflow for processing RN
copy of latest NF_RCP-F version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:
```bash
-wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP-F_1.0.3/NF_RCP-F_1.0.3.zip
+wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP-F_1.0.4/NF_RCP-F_1.0.4.zip
-unzip NF_RCP-F_1.0.3.zip
+unzip NF_RCP-F_1.0.4.zip
```
@@ -115,10 +115,10 @@ unzip NF_RCP-F_1.0.3.zip
Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210).
To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_RCP-F workflow:
-> Note: This command should be run in the location containing the `NF_RCP-F_1.0.3` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes.
+> Note: This command should be run in the location containing the `NF_RCP-F_1.0.4` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes.
```bash
-bash NF_RCP-F_1.0.3/bin/prepull_singularity.sh NF_RCP-F_1.0.3/config/software/by_docker_image.config
+bash NF_RCP-F_1.0.4/bin/prepull_singularity.sh NF_RCP-F_1.0.4/config/software/by_docker_image.config
```
@@ -134,7 +134,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity
### 4. Run the Workflow
-While in the location containing the `NF_RCP-F_1.0.3` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_RCP-F workflow:
+While in the location containing the `NF_RCP-F_1.0.4` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_RCP-F workflow:
> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
@@ -142,7 +142,7 @@ While in the location containing the `NF_RCP-F_1.0.3` directory that was downloa
#### 4a. Approach 1: Run the workflow on a GeneLab RNAseq dataset with automatic retrieval of Ensembl reference fasta and gtf files
```bash
-nextflow run NF_RCP-F_1.0.3/main.nf \
+nextflow run NF_RCP-F_1.0.4/main.nf \
-profile singularity \
--gldsAccession GLDS-194
```
@@ -154,7 +154,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
> Note: The `--ref_source` and `--ensemblVersion` parameters should match the reference source and version number of the local reference fasta and gtf files used
```bash
-nextflow run NF_RCP-F_1.0.3/main.nf \
+nextflow run NF_RCP-F_1.0.4/main.nf \
-profile singularity \
--gldsAccession GLDS-194 \
--ensemblVersion 107 \
@@ -170,7 +170,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
> Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).
```bash
-nextflow run NF_RCP-F_1.0.3/main.nf \
+nextflow run NF_RCP-F_1.0.4/main.nf \
-profile singularity \
--runsheetPath
```
@@ -179,7 +179,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
**Required Parameters For All Approaches:**
-* `NF_RCP-F_1.0.3/main.nf` - Instructs Nextflow to run the NF_RCP-F workflow
+* `NF_RCP-F_1.0.4/main.nf` - Instructs Nextflow to run the NF_RCP-F workflow
* `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow
@@ -225,7 +225,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
All parameters listed above and additional optional arguments for the RCP workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:
```bash
-nextflow run NF_RCP-F_1.0.3/main.nf --help
+nextflow run NF_RCP-F_1.0.4/main.nf --help
```
See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip
index 3e0230d8..fc859190 100644
Binary files a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip and b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip differ
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd
index 84358dc8..878db1b2 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd
@@ -7,7 +7,6 @@ params:
input_gene_results_dir: ""
# One and only one of the following must be specified
runsheet_path: NULL
- isa_path: NULL
primary_keytype: "" # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
normalization: "default" # ENUM like, supports "ERCC-groupB" and "default"
@@ -15,41 +14,63 @@ params:
dge_output_prefix: ""
DEBUG_MODE_LIMIT_GENES: FALSE
DEBUG_MODE_ADD_DUMMY_COUNTS: FALSE
- work_dir: "." # should be set to launch directory
- verbose: FALSE
+ work_dir: "." # NON_DPPD: should be set to launch directory
+ SUMMARY_FILE_PATH: "summary.txt"
---
## Substeps {.tabset}
### 1. Setup
-
+
```{r, setup, include=FALSE}
knitr::opts_knit$set(root.dir = params$work_dir)
library(knitr)
```
-```{r libary-loading, message = params$verbose, warning = params$verbose}
+```{r library-loading}
# allow more flexibility in download time
# useful for slower connections where the default of 60 seconds might be exceeded
options(timeout=600)
-# Import libraries (tximport, DESeq2, tidyverse, Risa)
+# Import libraries (tximport, DESeq2, tidyverse)
library(tximport)
library(DESeq2)
library(stringr)
params
+SUMMARY_FILE_PATH <- params$SUMMARY_FILE_PATH
yaml::write_yaml(params, "last_params.yml")
-```
-```{r validate_params}
-# assert either runsheet_path OR isa_path supplied in params
-if (!xor(!is.null(params$runsheet_path), !is.null(params$isa_path))) {
- stop("Must supply EITHER runsheet_path or isa_path in params")
-}
+# END:NON_DPPD
+
+# START:ONLY_DPPD
+# params <- c(
+# runsheet_path = "/path/to/runsheet", # Used for downloading
+# input_gene_results_dir = "/path/to/genes_results_files", # Location of the gene results files
+# primary_keytype = "", # Denotes the name of the identifier column (e.g. ENSEMBL, TAIR)
+# normalization = "", # ENUM like, supports "ERCC-groupB" and "default"
+# normalized_counts_output_prefix = "", # Output prefix for normalized counts files
+# dge_output_prefix = "" # Output prefix for DGE files
+# )
+# END:ONLY_DPPD
```
### 2. Load Study Metadata
-```{r runsheet-to-compare_df, include=(!is.null(params$runsheet_path)), eval=(!is.null(params$runsheet_path))}
+```{r runsheet-to-compare_df}
+#' Parse the runsheet into a comparison data frame
+#'
+#' Reads the runsheet csv and keeps the sample identifier column plus the
+#' 'Factor Value' columns used to define experimental groups.
+#'
+#' @param runsheet_path Path to the runsheet csv file.
+#'
+#' @return A data frame with one row per sample: the sample identifier
+#'   followed by its Factor Value columns.
+#'
compare_csv_from_runsheet <- function(runsheet_path) {
df = read.csv(runsheet_path)
# get only Factor Value columns
@@ -64,25 +85,6 @@ compare_csv <- compare_csv_from_runsheet(params$runsheet_path)
#DT::datatable(compare_csv, caption = "Data Frame of parsed runsheet filtered to required columns")
```
-```{r isa-to-compare_df, include=(!is.null(params$isa_path)), eval=(!is.null(params$isa_path))}
-# TODO: Remove this route, ISA zip support will be dropped as of DPPD-7101-F
-library(Risa)
-
-compare_csv_from_isa_archive <- function(isa_path) {
- td = tempdir()
- unzip(isa_path, exdir = td)
- isa <- Risa::readISAtab(path = td)
- n = as.numeric(which(isa@assay.technology.types == "RNA Sequencing (RNA-Seq)"))
- isa_tabs <- isa@assay.tabs[[n]]@assay.file
- factors <- as.data.frame(isa@factors[[1]], stringsAsFactors = FALSE)
- colnames(factors) <- paste("factor",1:dim(factors)[2], sep = "_")
- return(data.frame(sample_id = isa_tabs$`Sample Name`, factors))
-}
-# Loading metadata from isa archive
-compare_csv <- compare_csv_from_isa_archive(params$isa_path)
-#DT::datatable(compare_csv, caption = "Data Frame of parsed isa archive filtered to required metadata")
-```
-
```{r compare_df-to-study_df}
study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
@@ -130,8 +132,7 @@ files <- list.files(
## Reorder the *genes.results files to match the ordering of the ISA samples
-# Replace spaces in sample names from ISA with "_", consistent with runsheet generation
-samples = str_replace_all(rownames(study), " ", "_")
+samples = rownames(study)
reordering <- sapply(samples, function(x)grep(paste0("Rsem_gene_counts/", x,".genes.results$"), files, value=FALSE))
files <- files[reordering]
names(files) <- samples
@@ -335,12 +336,12 @@ output_table_1$LRT.p.value <- res_1_lrt@listData$padj
```{r wald-test-iteration}
## Iterate through Wald Tests to generate pairwise comparisons of all groups
for (i in 1:dim(contrasts)[2]){
- res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
- res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
- colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
- output_table_1<-cbind(output_table_1,res_1)
- rm(res_1)
+ res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
+ res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
+ colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
+ output_table_1<-cbind(output_table_1,res_1)
}
+
```
```{r}
@@ -385,6 +386,16 @@ write.csv(
sampleTable,
file = paste0(params$dge_output_prefix, "SampleTable.csv")
)
+
+# Create summary file based on output_table_1
+output <- capture.output(summary(output_table_1))
+
+# Open file connection
+conn <- file(paste0(params$dge_output_prefix, "summary.txt"), "w")
+
+# Write the captured output to the file and release the connection
+writeLines(output, conn)
+close(conn)
+
# DT::datatable(head(output_table_1, n = 30),
# caption = "First 30 rows of differential gene expression table",
# extensions = "FixedColumns",
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R
index fedf44d6..1d5965e0 100755
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R
@@ -4,10 +4,6 @@ library("here")
library("cli")
parser <- OptionParser()
-parser <- add_option(parser, c("-v", "--verbose"),
- action = "store_true",
- default = FALSE, help = "Print extra output [default]"
-)
parser <- add_option(parser, c("--skip_perform_dge"),
action = "store_true", default = FALSE,
help = "Skips running the DGE, this can be used when the output from the DGE already exist",
@@ -43,9 +39,6 @@ parser <- add_option(parser, c("--DEBUG_MODE_ADD_DUMMY_COUNTS"),
default = FALSE, action = "store_true",
help = "Replaces all gene counts with random values from 0 to 5000",
)
-parser <- add_option(parser, c("--isa_path"),
- help = "ISA Archive path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
-)
parser <- add_option(parser, c("--runsheet_path"),
help = "runsheet csv path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
)
@@ -69,14 +62,11 @@ if (!args$skip_perform_dge) {
cli_alert_warning("Running Perform_DGE.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Perform_DGE.Rmd"),
output_dir = args$work_dir,
- quiet = !args$verbose,
params = list(
work_dir = args$work_dir,
- verbose = args$verbose,
input_gene_results_dir = args$input_gene_results_dir,
primary_keytype = args$primary_keytype,
runsheet_path = args$runsheet_path,
- isa_path = args$isa_path,
normalization = args$normalization,
dge_output_prefix = args$dge_output_prefix,
normalized_counts_output_prefix = args$normalized_counts_output_prefix,
@@ -93,7 +83,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Add_Gene_Annotations.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Add_Gene_Annotations.Rmd"),
output_dir = args$work_dir,
- quiet = !args$verbose,
params = list(
input_table_path = paste0(args$dge_output_prefix, "differential_expression_no_annotations.csv"),
work_dir = args$work_dir,
@@ -111,7 +100,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Extend_DGE_Table.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Extend_DGE_Table.Rmd"),
output_dir = args$work_dir,
- quiet = !args$verbose,
params = list(
input_table_path = paste0(args$dge_output_prefix, "differential_expression.csv"),
work_dir = args$work_dir,
@@ -128,7 +116,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Generate_PCA_Table.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Generate_PCA_Table.Rmd"),
output_dir = args$work_dir,
- quiet = !args$verbose,
params = list(
input_table_path = paste0(args$normalized_counts_output_prefix, "Normalized_Counts.csv"),
work_dir = args$work_dir,
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py
new file mode 100644
index 00000000..5faa427c
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+# Import for access at the module level
+from . import checks
+from . import protocol
+from . import schemas
+
+# Set config path
+config = Path(__file__).parent / "config.yaml"
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py
new file mode 100644
index 00000000..885d2160
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py
@@ -0,0 +1,1537 @@
+from collections import defaultdict
+import copy
+import enum
+import gzip
+import itertools
+import logging
+import math
+from pathlib import Path
+from statistics import mean
+import string
+import subprocess
+from typing import Callable, Dict, Union
+from importlib.metadata import files
+
+import pandas as pd
+
+from dp_tools.core.entity_model import Dataset, Sample, multiqc_run_to_dataframes
+from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers
+
+log = logging.getLogger(__name__)
+
+
+def r_style_make_names(s: str) -> str:
+ """Recreates R's make.names function for individual strings.
+ This function is often used to create syntactically valid names in R which are then saved in R outputs.
+ Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names
+
+ Args:
+ s (str): A string to convert
+
+ Returns:
+ str: A string converted in the same way as R's make.names function
+ """
+    EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ"  # Note: this string includes two visually identical mu-like characters: the Greek letter mu and the micro sign
+ VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS
+ REPLACEMENT_CHAR = "."
+ new_string_chars = list()
+ for char in s:
+ if char in VALID_CHARACTERS:
+ new_string_chars.append(char)
+ else:
+ new_string_chars.append(REPLACEMENT_CHAR)
+ return "".join(new_string_chars)
+
+
+# adapted from reference: https://stackoverflow.com/questions/56048627/round-floats-in-a-nested-dictionary-recursively
+# used to round values for easier to read messages
+def formatfloat(x):
+ return "%.3g" % float(x)
+
+
+def pformat(original_dictionary, function):
+ dictionary = copy.deepcopy(
+ original_dictionary
+ ) # we don't want to override original values
+ if isinstance(dictionary, dict):
+ new_dict = dict()
+ for k, v in dictionary.items():
+ new_dict[k] = function(v) if isinstance(v, float) else pformat(v, function)
+ return new_dict
+ return dictionary
+
+
+def convert_nan_to_zero(input: Dict[str, Union[float, int]]) -> Dict:
+ """Convert any Nan into zero"""
+ output = dict()
+ for key, value in input.items():
+ output[key] = value if not math.isnan(value) else 0
+ return output
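+
+# e.g. (illustrative): convert_nan_to_zero({"a": float("nan"), "b": 2.0}) -> {"a": 0, "b": 2.0}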
+
+
+## Functions that use the following syntax to merge values from general stats:
+# "stat1 + stat2" should search and sum the stats
+# TODO: refine dict typehint
+def stat_string_to_value(stat_string: str, mqcData: dict) -> float:
+    """Sum the values of each stat named in a 'stat1 + stat2' style string"""
+    total = float(0)
+    direct_keys = stat_string.split(" + ")
+    for direct_key in direct_keys:
+        log.debug(direct_key)
+        total += mqcData[direct_key]
+    return float(total)
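+
+# Illustrative usage (the stat keys and dict name are hypothetical MultiQC
+# general stats entries):
+#   stat_string_to_value("percent_duplicates + percent_gc", mqc_general_stats)
+#   returns the float sum of both named stats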
+
+
+## Dataframe and Series specific helper functions
+def nonNull(df: pd.DataFrame) -> bool:
+ # negation since it checks if any are null
+ return ~df.isnull().any(axis=None)
+
+
+def nonNegative(df: pd.DataFrame) -> bool:
+ """This ignores null values, use nonNull to validate that condition"""
+ return ((df >= 0) | (df.isnull())).all(axis=None)
+
+
+def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool:
+ """This ignores null values, use nonNull to validate that condition"""
+ return ((df.isin(allowed_values)) | (df.isnull())).all(axis=None)
+
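+# e.g. (illustrative): onlyAllowedValues(pd.DataFrame({"a": [1, None]}), [1]) -> True,
+# since null cells are ignored; pair with nonNull to catch them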
+
+def check_forward_and_reverse_reads_counts_match(
+ sample: Sample, reads_key_1: str, reads_key_2: str
+) -> FlagEntry:
+ # data specific preprocess
+ count_fwd_reads = float(
+ sample.compile_multiqc_data([reads_key_1])["general_stats"]["FastQC"][
+ "total_sequences"
+ ]
+ )
+ count_rev_reads = float(
+ sample.compile_multiqc_data([reads_key_2])["general_stats"]["FastQC"][
+ "total_sequences"
+ ]
+ )
+
+ # check logic
+ if count_fwd_reads == count_rev_reads:
+ code = FlagCode.GREEN
+ message = (
+ f"Forward and reverse read counts match at "
+ f"{int(count_rev_reads)} sequences "
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Forward and reverse read counts do not "
+ f"match: forward_Count:{int(count_fwd_reads)}, "
+ f"reverse_Count:{int(count_rev_reads)}"
+ )
+
+ return {"code": code, "message": message}
+
+
+def check_file_exists(file: Path) -> FlagEntry:
+ # check logic
+ if file.is_file():
+ code = FlagCode.GREEN
+ message = f"File exists: {file.name} "
+ else:
+ code = FlagCode.HALT
+ message = f"Missing file: {file.name} expected at {str(file)} "
+
+ return {"code": code, "message": message}
+
+
+def check_fastqgz_file_contents(file: Path, count_lines_to_check: int) -> FlagEntry:
+ """Check fastqgz by:
+ 1. Decompressing as a stream of lines.
+ 2. Affirming expected headers (every 4th line) look correct.
+
+ :param file: Input fastqGZ file path
+ :type file: Path
+ :param count_lines_to_check: Maximum number of lines to check. Setting this to a negative value will remove the limit
+ :type count_lines_to_check: int
+ :return: A required fields-only flag entry dictionary
+ :rtype: FlagEntry
+ """
+
+ lines_with_issues: list[int] = list()
+
+ # check logic
+ # truncated files raise EOFError
+ # catch this as HALT3
+ try:
+ with gzip.open(file, "rb") as f:
+ for i, byte_line in enumerate(f):
+ # checks if lines counted equals the limit input
+ if i + 1 == count_lines_to_check:
+ log.debug(
+ f"Reached {count_lines_to_check} lines, ending line check"
+ )
+ break
+
+ line = byte_line.decode()
+ # every fourth line should be an identifier
+ expected_identifier_line = i % 4 == 0
+ # check if line is actually an identifier line
+ if expected_identifier_line and line[0] != "@":
+ lines_with_issues.append(i + 1)
+                # log progress every 2,000,000 lines
+                if i % 2_000_000 == 0:
+                    log.debug(f"Checked {i} lines for {file}")
+
+        if lines_with_issues:
+            code = FlagCode.HALT
+            message = (
+                f"Following decompressed fastqGZ lines have issues: {lines_with_issues}"
+            )
+        else:
+            code = FlagCode.GREEN
+            message = f"First {count_lines_to_check} lines checked found no issues: header lines were identifiable and no decompression errors occurred."
+ except (EOFError, gzip.BadGzipFile):
+ code = FlagCode.HALT
+ message = (
+ f"Error during decompression, likely a compression or truncation issue."
+ )
+
+ return {"code": code, "message": message}
+
+def check_gzip_file_integrity(file: Path, gzip_bin: Path = Path("gzip")) -> FlagEntry:
+ """ Check gzip file integrity using 'gzip -t' as per https://www.gnu.org/software/gzip/manual/gzip.html """
+ output = subprocess.run(
+ [str(gzip_bin), "-t", str(file)], capture_output=True
+ )
+ stdout_string = output.stdout.decode()
+ if stdout_string == "":
+ code = FlagCode.GREEN
+ message = f"Gzip integrity test raised no issues"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Gzip integrity test failed on this file with output: {stdout_string}"
+ )
+ return {"code": code, "message": message}
+
+def check_bam_file_integrity(
+ file: Path, samtools_bin: Path = Path("samtools")
+) -> FlagEntry:
+ """Uses http://www.htslib.org/doc/samtools-quickcheck.html"""
+ # data specific preprocess
+
+ # check logic
+ output = subprocess.run(
+ [str(samtools_bin), "quickcheck", "-v", str(file)], capture_output=True
+ )
+ stdout_string = output.stdout.decode()
+ if stdout_string == "":
+ code = FlagCode.GREEN
+ message = f"Samtools quickcheck raised no issues"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Samtools quickcheck failed on this file with output: {stdout_string}"
+ )
+ return {"code": code, "message": message}
+
+
+def check_thresholds(
+ multiqc_inputs: list[Path], mqc_key: str, stat_string: str, thresholds: list[dict]
+) -> FlagEntry:
+ # data specific preprocess
+ data = multiqc_run_to_dataframes(multiqc_inputs)
+ value = stat_string_to_value(stat_string, data["general_stats"][mqc_key])
+
+ # check logic
+ # Assuming GREEN unless reassigned
+ code = FlagCode.GREEN
+ for threshold in thresholds:
+ match threshold["type"]:
+ case "lower":
+ if value < threshold["value"]:
+ code = (
+ FlagCode[threshold["code"]]
+ if code < FlagCode[threshold["code"]]
+ else code
+ )
+
+    if code == FlagCode.GREEN:
+        message = f"Value: ({value}) did not breach any configured thresholds"
+    else:
+        message = f"Value: ({value}) breached configured thresholds"
+ return {"code": code, "message": message}
+
+
+def check_metadata_attributes_exist(
+ dataset: Dataset, expected_attrs: list[str]
+) -> FlagEntry:
+ missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata))
+
+ # check if any missing_metadata_fields are present
+ # check logic
+ if not missing_metadata_fields:
+ code = FlagCode.GREEN
+ message = f"All expected metadata keys found: Expected {expected_attrs}, Found {set(dataset.metadata)}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing dataset metadata (source from Runsheet): {missing_metadata_fields}"
+ return {"code": code, "message": message}
+
+
+def check_for_outliers(
+ dataset: Dataset,
+ data_asset_keys: list[str],
+ mqc_module: str,
+ mqc_plot: str,
+ mqc_keys: list[str],
+ thresholds: list[dict],
+) -> FlagEntryWithOutliers:
+ # assume code is GREEN until outliers detected
+ code = FlagCode.GREEN
+ # dataframe extraction
+ compiled_mqc_data = dataset.compile_multiqc_data(data_asset_keys=data_asset_keys)
+
+ if mqc_plot == "general_stats":
+ df = compiled_mqc_data["general_stats"][mqc_module]
+ else:
+ df = compiled_mqc_data["plots"][mqc_module][mqc_plot]
+
+ def default_to_regular(d):
+ if isinstance(d, defaultdict):
+ d = {k: default_to_regular(v) for k, v in d.items()}
+ return d
+
+ # track for outliers
+ outliers: dict[str, dict[str, dict[str, str]]] = defaultdict(
+ lambda: defaultdict(dict)
+ )
+
+ # override if mqc_keys is a special value
+ if mqc_keys == ["_ALL"]:
+ mqc_keys = df.columns
+
+ for mqc_key in mqc_keys:
+ for threshold in thresholds:
+ if threshold["middle_fcn"] == "mean":
+ middle = df[mqc_key].mean()
+ elif threshold["middle_fcn"] == "median":
+ middle = df[mqc_key].median()
+ else:
+ raise ValueError(
+ f"Cannot compute middle from supplied middle_fcn name: {threshold['middle_fcn']}. Must supply either 'median' or 'mean'"
+ )
+
+            # bail if standard deviation == 0
+            # e.g. if all values are identical (and thus there are no outliers)
+ if df[mqc_key].std() == 0:
+ continue
+
+ # compute difference
+ df_diffs = df[mqc_key] - middle
+
+ # compute as number of standard deviations
+ df_diffs_in_std = df_diffs / df[mqc_key].std()
+
+ # add to outlier tracker if over the threshold
+ for key, value in df_diffs_in_std.iteritems():
+ # if an outlier
+ if abs(value) > threshold["stdev_threshold"]:
+ # track it
+ outliers[key][mqc_module][mqc_key] = value
+ # elevate code if current code is lower severity
+ if code < FlagCode[threshold["code"]]:
+ code = FlagCode[threshold["code"]]
+
+ # convert defaultdict to regular for all reporting
+ outliers = default_to_regular(outliers)
+ # check logic
+ if code == FlagCode.GREEN:
+ message = f"No outliers found for {mqc_keys} in {mqc_plot} part of {mqc_module} multiQC module"
+ else:
+ message = (
+ f"Outliers found in {mqc_module} multiQC module as follows: {outliers}"
+ )
+ return {"code": code, "message": message, "outliers": outliers}
+
+
+def _check_expected_files_exist(
+ input_dir: Path, expected_extensions: list[str], parent_dir_is_filename: bool = True
+):
+    if parent_dir_is_filename:
+        fname = input_dir.name
+    else:
+        # guard: only the parent-directory-as-filename convention is currently used
+        raise NotImplementedError("Only parent_dir_is_filename=True is currently supported")
+    expected_files = [input_dir / f"{fname}{ext}" for ext in expected_extensions]
+ missing_files = list()
+ for expected_file in expected_files:
+ if not expected_file.is_file():
+ missing_files.append(str(expected_file))
+
+ expected_file_str = [str(f) for f in expected_files]
+ return missing_files, expected_file_str
+
+
+def check_genebody_coverage_output(input_dir: Path):
+ EXPECTED_EXTENSIONS = [
+ ".geneBodyCoverage.r",
+ ".geneBodyCoverage.txt",
+ ".geneBodyCoverage.curves.pdf",
+ ]
+
+ missing_files, expected_file_str = _check_expected_files_exist(
+ input_dir, expected_extensions=EXPECTED_EXTENSIONS
+ )
+
+ if not missing_files:
+ code = FlagCode.GREEN
+ message = f"All output from geneBody coverage found: {expected_file_str}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing output from geneBody coverage: {missing_files}. Expected: {expected_file_str}"
+ return {"code": code, "message": message}
+
+
+def check_inner_distance_output(input_dir: Path):
+ EXPECTED_EXTENSIONS = [
+ ".inner_distance_plot.r",
+ ".inner_distance_freq.txt",
+ ".inner_distance.txt",
+ ".inner_distance_plot.pdf",
+ ]
+
+ missing_files, expected_file_str = _check_expected_files_exist(
+ input_dir, expected_extensions=EXPECTED_EXTENSIONS
+ )
+
+ if not missing_files:
+ code = FlagCode.GREEN
+ message = f"All output from inner distance found: {expected_file_str}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing output from inner distance: {missing_files}. Expected: {expected_file_str}"
+ return {"code": code, "message": message}
+
+
+def check_strandedness_assessable_from_infer_experiment(
+ dataset: Dataset,
+ stranded_assessment_range: dict[str, float],
+ unstranded_assessment_range: dict[str, float],
+ valid_dominant_strandedness_assessments: list[str],
+) -> FlagEntry:
+ # data specific preprocess
+ def get_median_strandedness(
+ dataset: Dataset,
+ ) -> dict[str, float]:
+
+ df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][
+ "Infer experiment"
+ ].fillna(
+ 0
+ ) # Nan is a zero for this MultiQC table
+
+ median_strandedness = df.median().to_dict()
+
+ return median_strandedness
+
+ median_strandedness = get_median_strandedness(dataset)
+
+ # check if dominant assessment is valid
+ strand_assessment: str = max(
+ median_strandedness, key=lambda k: median_strandedness[k]
+ )
+
+ # flag based on thresholds
+ assessment_value: float = median_strandedness[strand_assessment]
+
+ is_stranded: bool = (
+ stranded_assessment_range["max"]
+ > assessment_value
+ > stranded_assessment_range["min"]
+ )
+ is_unstranded: bool = (
+ unstranded_assessment_range["max"]
+ > assessment_value
+ > unstranded_assessment_range["min"]
+ )
+
+ def determine_samples_outside_range(
+ dataset: Dataset, min: float, max: float
+ ) -> list[str]:
+ df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][
+ "Infer experiment"
+ ].fillna(
+ 0
+ ) # Nan is a zero for this MultiQC table
+
+        return df.index[~df[strand_assessment].between(min, max)].to_list()
+
+ # Catalog and flag any samples outside of range
+ # flags based on samples that are out of the assessment range
+ samples_outside_range: list[str]
+ if is_stranded:
+ samples_outside_range = determine_samples_outside_range(
+ dataset,
+ stranded_assessment_range["min"],
+ stranded_assessment_range["max"],
+ )
+ elif is_unstranded:
+ samples_outside_range = determine_samples_outside_range(
+ dataset,
+ unstranded_assessment_range["min"],
+ unstranded_assessment_range["max"],
+ )
+ else: # this means that the strandedness is ambiguous
+ samples_outside_range = list()
+
+ # check logic
+ if strand_assessment not in valid_dominant_strandedness_assessments:
+ code = FlagCode.HALT
+ message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is invalid for processing. Valid assessments: {valid_dominant_strandedness_assessments}"
+ elif not samples_outside_range and any([is_stranded, is_unstranded]):
+ code = FlagCode.GREEN
+ message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with no individual samples outside the assessment range"
+ elif samples_outside_range and any([is_stranded, is_unstranded]):
+ code = FlagCode.RED
+ message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with samples outside the assessment range: {samples_outside_range}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is ambiguous due to being inside range "
+ f"({stranded_assessment_range['min']}-{unstranded_assessment_range['max']})"
+ )
+
+ return {"code": code, "message": message}
+
+
+def check_rsem_counts_and_unnormalized_tables_parity(
+ rsem_table_path: Path, deseq2_table_path: Path
+) -> FlagEntry:
+ # data specific preprocess
+ df_rsem = pd.read_csv(rsem_table_path)
+ df_deseq2 = pd.read_csv(deseq2_table_path)
+
+ # return halt flag if column labels not conserved
+ if not set(df_deseq2.columns) == set(df_rsem.columns):
+ unique_to_deseq2 = set(df_deseq2.columns) - set(df_rsem.columns)
+ unique_to_rsem = set(df_rsem.columns) - set(df_deseq2.columns)
+ return {
+ "code": FlagCode.HALT,
+ "message": f"Columns do not match: unique to rsem: {unique_to_rsem}. unique to deseq2: {unique_to_deseq2}.",
+ }
+
+ # rearrange columns to the same order
+ df_deseq2 = df_deseq2[df_rsem.columns]
+
+ # check logic
+    if df_deseq2.equals(df_rsem):
+        code = FlagCode.GREEN
+        message = "Tables of unnormalized counts match."
+    else:
+        code = FlagCode.HALT
+        message = (
+            "Tables of unnormalized counts have the same columns but values do not match."
+        )
+ return {"code": code, "message": message}
+
+
+def check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables(
+ unnormalizedCountTable: Path, samplewise_tables: dict[str, Path]
+) -> FlagEntry:
+ STAR_COUNT_MODES = ["unstranded", "sense", "antisense"]
+ # data specific preprocess
+ df_agg = pd.read_csv(unnormalizedCountTable, index_col=0)
+
+ # based on which column matches the first entry
+ # all columns must match with the same strand column
+ strand_assessment: str = None # type: ignore
+ samples_with_issues: dict[str, list[str]] = {
+ "Not in aggregate table": list(),
+ "Sample counts mismatch": list(),
+ }
+ for sample, path in samplewise_tables.items():
+ # check if samples exist as a column
+ if sample not in df_agg:
+ samples_with_issues["Not in aggregate table"].append(sample)
+            continue  # record the missing sample and keep checking the rest
+
+ # load
+ df_samp = pd.read_csv(
+ path, sep="\t", names=STAR_COUNT_MODES, index_col=0
+ ).filter(
+ regex="^(?!N_.*).*", axis="rows"
+ ) # filter out N_* entries
+
+ # check if the values match for any of the count modes
+ # unstranded, sense, antisense
+ # for remaining samples, only check the match for the first count mode
+        # TODO: Fix rare false positive related to zero counts, in those cases the strand_assessment can be prematurely determined which causes other samples to be compared with an inappropriate assessment
+ for count_mode in STAR_COUNT_MODES:
+ # make sure to sort indicies
+ if df_agg[sample].sort_index().equals(df_samp[count_mode].sort_index()):
+ # assign strand assessment if first sample
+ if strand_assessment is None:
+ strand_assessment = count_mode
+
+ if strand_assessment == count_mode:
+ # no issues found (i.e. counts match with a consistent count mode column), break out
+ break
+ else: # no break
+ samples_with_issues["Sample counts mismatch"].append(sample)
+
+ # check logic
+ if not any([issue_type for issue_type in samples_with_issues.values()]):
+ code = FlagCode.GREEN
+ message = (
+ f"All samples accounted for and with matching counts "
+ f"between samplewise and aggregate table using strand assessment: '{strand_assessment}'"
+ )
+ else:
+ code = FlagCode.HALT
+ message = f"Identified issues: {samples_with_issues}"
+ return {"code": code, "message": message}
+
+
+def check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables(
+ unnormalizedCountTable: Path, samplewise_tables: dict[str, Path]
+) -> FlagEntry:
+ # data specific preprocess
+ df_agg = pd.read_csv(unnormalizedCountTable, index_col=0)
+
+ # based on which column matches the first entry
+ # TODO: LOW PRIORITY, fix this typehint
+ samples_with_issues: dict[str, Union[list[str], list[tuple[str, list[str]]]]] = {
+ "Not in aggregate table": list(), # type: ignore
+ "Sample counts mismatch": list(), # type: ignore
+ }
+ for sample, path in samplewise_tables.items():
+ # check if samples exist as a column
+ if sample not in df_agg:
+ samples_with_issues["Not in aggregate table"].append(sample)
+            continue  # record the missing sample and keep checking the rest
+
+ # load
+ df_samp = pd.read_csv(path, sep="\t", index_col=0)
+
+ # check if values match
+ if geneID_with_mismatched_counts := (
+ list(df_agg.loc[df_agg[sample] != df_samp["expected_count"]].index)
+ ):
+ samples_with_issues["Sample counts mismatch"].append(
+ (sample, geneID_with_mismatched_counts)
+ )
+
+ # check logic
+ if not any([issue_type for issue_type in samples_with_issues.values()]):
+ code = FlagCode.GREEN
+ message = f"All samples accounted for and with matching counts between samplewise and aggregate table"
+ else:
+ code = FlagCode.HALT
+ message = f"Identified issues: {samples_with_issues}"
+ return {"code": code, "message": message}
+
+
+def check_sample_table_against_runsheet(
+ runsheet: Path, sampleTable: Path, all_samples_required: bool
+) -> FlagEntry:
+ """Check the sample table includes all samples as denoted in the runsheet.
+
+ Args:
+ runsheet (Path): csv file used for processing, the index denotes all samples
+ sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table)
+ all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay.
+
+ Returns:
+ FlagEntry: A check result
+ """
+ # data specific preprocess
+ df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index()
+ df_sample = pd.read_csv(sampleTable, index_col=0).sort_index()
+
+ extra_samples: dict[str, set[str]] = {
+ "unique_to_runsheet": set(df_rs.index) - set(df_sample.index),
+ "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index),
+ }
+
+ # check logic
+ if any(
+ [
+ (extra_samples["unique_to_runsheet"] and all_samples_required),
+ (extra_samples["unique_to_sampleTable"]),
+ ]
+ ):
+ code = FlagCode.HALT
+ message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}"
+ else:
+ code = FlagCode.GREEN
+ message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})"
+ return {"code": code, "message": message}
+
+
+class GroupFormatting(enum.Enum):
+ r_make_names = enum.auto()
+ ampersand_join = enum.auto()
+
+
+def utils_runsheet_to_expected_groups(
+ runsheet: Path,
+ formatting: GroupFormatting = GroupFormatting.ampersand_join,
+ limit_to_samples: list = None,
+ map_to_lists: bool = False,
+) -> Union[dict[str, str], dict[str, list[str]]]:
+ df_rs = (
+ pd.read_csv(runsheet, index_col="Sample Name", dtype=str)
+        .filter(regex=r"^Factor Value\[.*\]")
+ .sort_index()
+ ) # using only Factor Value columns
+
+ if limit_to_samples:
+ df_rs = df_rs.filter(items=limit_to_samples, axis="rows")
+
+ match formatting:
+ case GroupFormatting.r_make_names:
+            expected_conditions_based_on_runsheet = (
+                df_rs.apply(lambda x: "...".join(x), axis="columns")  # join factors with '...'
+                .apply(r_style_make_names)  # reformat entire group in the R style
+                .to_dict()
+            )
+ case GroupFormatting.ampersand_join:
+ expected_conditions_based_on_runsheet = df_rs.apply(
+ lambda x: f"({' & '.join(x)})", axis="columns"
+ ).to_dict()
+ case _:
+ raise ValueError(
+ f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}"
+ )
+
+ # convert from {sample: group} dict
+ # to {group: [samples]} dict
+ if map_to_lists:
+ unique_groups = set(expected_conditions_based_on_runsheet.values())
+ reformatted_dict: dict[str, list[str]] = dict()
+ for query_group in unique_groups:
+ reformatted_dict[query_group] = [
+ sample
+ for sample, group in expected_conditions_based_on_runsheet.items()
+ if group == query_group
+ ]
+ expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict
+
+ return expected_conditions_based_on_runsheet
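+
+# Illustrative outputs (hypothetical samples, single Factor Value column):
+#   map_to_lists=False -> {"S1": "(FLT)", "S2": "(GC)"}
+#   map_to_lists=True  -> {"(FLT)": ["S1"], "(GC)": ["S2"]}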
+
+
+def check_sample_table_for_correct_group_assignments(
+ runsheet: Path, sampleTable: Path
+) -> FlagEntry:
+ """Check the sample table is assigned to the correct experimental group.
+ An experimental group is defined by the Factor Value columns found in the runsheet.
+
+ Args:
+ runsheet (Path): csv file used for processing, includes metadata used for experimental group designation
+ sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table)
+
+ Returns:
+ FlagEntry: A check result
+ """
+ df_sample = pd.read_csv(sampleTable, index_col=0).sort_index()
+ # data specific preprocess
+ df_rs = (
+ pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric
+        .filter(regex=r"^Factor Value\[.*\]")
+ .loc[df_sample.index] # ensure only sampleTable groups are checked
+ .sort_index()
+ ) # using only Factor Value columns
+
+ # TODO: refactor with utils_runsheet_to_expected_groups
+    expected_conditions_based_on_runsheet = df_rs.apply(
+        lambda x: "...".join(x), axis="columns"  # join factors with '...'
+    ).apply(
+        r_style_make_names  # reformat entire group in the R style
+    )
+
+ mismatched_rows = expected_conditions_based_on_runsheet != df_sample["condition"]
+
+ # check logic
+ if not any(mismatched_rows):
+ code = FlagCode.GREEN
+ message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}"
+ else:
+ code = FlagCode.HALT
+ mismatch_description = (
+ df_sample[mismatched_rows]["condition"]
+ + " <--SAMPLETABLE : RUNSHEET--> "
+ + expected_conditions_based_on_runsheet[mismatched_rows]
+ ).to_dict()
+ message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}"
+ return {"code": code, "message": message}
+
+
+def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry:
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ df_contrasts = pd.read_csv(contrasts_table, index_col=0)
+
+ # check logic
+ differences = set(expected_comparisons).symmetric_difference(
+ set(df_contrasts.columns)
+ )
+ if not differences:
+ code = FlagCode.GREEN
+ message = f"Contrasts header includes expected comparisons as determined runsheet Factor Value Columns: {set(expected_comparisons)}"
+ else:
+ code = FlagCode.HALT
+ message = f"Contrasts header does not match expected comparisons as determined runsheet Factor Value Columns: {differences}"
+ return {"code": code, "message": message}
+
+
+def check_contrasts_table_rows(contrasts_table: Path, **_) -> FlagEntry:
+ # data specific preprocess
+ df_contrasts = pd.read_csv(contrasts_table, index_col=0)
+
+    def _get_groups_from_comparisons(s: str) -> set[str]:
+        """Converts '(G1)v(G2)' into the set {G1, G2},
+        where G1 and G2 are renamed as per the R make.names function
+
+        Args:
+            s (str): Input that fits this format: '(G1)v(G2)'
+
+        Returns:
+            set[str]: The two reformatted group names
+        """
+        g1, g2 = s.split(")v(")
+        # remove parens and reformat with r make names style
+        g1 = r_style_make_names(g1[1:].replace(" & ", "..."))
+        g2 = r_style_make_names(g2[:-1].replace(" & ", "..."))
+        return {g1, g2}
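+
+    # e.g. (illustrative): _get_groups_from_comparisons("(FLT & uG)v(GC & 1G)")
+    #   -> {"FLT...uG", "GC...1G"}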
+
+ bad_columns: dict[str, dict[str, set]] = dict()
+    for col_name, col_series in df_contrasts.iteritems():
+        expected_values = _get_groups_from_comparisons(col_name)
+ if not expected_values == set(col_series):
+ bad_columns[col_name] = {
+ "expected": expected_values,
+ "actual": set(col_series),
+ }
+
+ # check logic
+ if not bad_columns:
+ code = FlagCode.GREEN
+ message = f"Contrasts column and rows match expected formatting"
+ else:
+ code = FlagCode.HALT
+ message = f"Contrasts columns {bad_columns} have unexpected values"
+ return {"code": code, "message": message}
+
+
+def check_dge_table_annotation_columns_exist(
+ dge_table: Path, organism: str, **_
+) -> FlagEntry:
+ REQUIRED_ANNOTATION_KEYS = {
+ "SYMBOL",
+ "GENENAME",
+ "REFSEQ",
+ "ENTREZID",
+ "STRING_id",
+ "GOSLIM_IDS",
+ }
+ MASTER_ANNOTATION_KEY = {"_DEFAULT": "ENSEMBL", "Arabidopsis thaliana": "TAIR"}
+
+ df_dge = pd.read_csv(dge_table)
+
+ required_columns = REQUIRED_ANNOTATION_KEYS.union(
+ {MASTER_ANNOTATION_KEY.get(organism, MASTER_ANNOTATION_KEY["_DEFAULT"])}
+ )
+
+ missing_columns = required_columns - set(df_dge.columns)
+ # check logic
+ if not missing_columns:
+ code = FlagCode.GREEN
+ message = f"Found all required annotation columns: {required_columns}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Missing the following required annotation columns: {missing_columns}"
+ )
+ return {"code": code, "message": message}
+
+
+def check_dge_table_sample_columns_exist(
+ dge_table: Path, samples: set[str], **_
+) -> FlagEntry:
+ # data specific preprocess
+ df_dge = pd.read_csv(dge_table)
+
+ missing_sample_columns = samples - set(df_dge.columns)
+
+ # check logic
+ if not missing_sample_columns:
+ code = FlagCode.GREEN
+ message = f"All samplewise columns present"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing these sample count columns: {missing_sample_columns}"
+ return {"code": code, "message": message}
+
+
+def check_dge_table_sample_columns_constraints(
+ dge_table: Path, samples: set[str], **_
+) -> FlagEntry:
+ MINIMUM_COUNT = 0
+ # data specific preprocess
+ df_dge = pd.read_csv(dge_table)[samples]
+
+ column_meets_constraints = df_dge.apply(
+ lambda col: all(col >= MINIMUM_COUNT), axis="rows"
+ )
+
+ # check logic
+    constraint_description = f"All counts are greater than or equal to {MINIMUM_COUNT}"
+    if all(column_meets_constraints):
+        code = FlagCode.GREEN
+        message = (
+            f"All values in columns: {samples} met constraint: {constraint_description}"
+        )
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"These columns {list(column_meets_constraints.index[~column_meets_constraints])} "
+            f"fail the constraint: {constraint_description}."
+        )
+ )
+ return {"code": code, "message": message}
+
+
+def check_dge_table_group_columns_exist(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"]
+ expected_groups = utils_runsheet_to_expected_groups(runsheet)
+ expected_columns = {
+ "".join(comb)
+ for comb in itertools.product(GROUP_PREFIXES, expected_groups.values())
+ }
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All group summary statistic columns (Prefixes: {GROUP_PREFIXES}) present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing these group summary statistic columns (Prefixes: {GROUP_PREFIXES}): {sorted(list(missing_cols))}"
+ return {"code": code, "message": message}
+
+
+def check_dge_table_group_columns_constraints(
+ dge_table: Path, runsheet: Path, samples: set[str], **_
+) -> FlagEntry:
+ FLOAT_TOLERANCE = (
+ 0.001 # Percent allowed difference due to float precision differences
+ )
+ # data specific preprocess
+ GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"]
+ expected_groups = utils_runsheet_to_expected_groups(runsheet)
+ query_columns = {
+ "".join(comb)
+ for comb in itertools.product(GROUP_PREFIXES, expected_groups.values())
+ }
+
+ expected_group_lists = utils_runsheet_to_expected_groups(
+ runsheet, map_to_lists=True, limit_to_samples=samples
+ )
+ df_dge = pd.read_csv(dge_table)
+
+ # issue trackers
+ issues: dict[str, list[str]] = {
+ f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [],
+ f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [],
+ }
+
+ group: str
+ sample_set: list[str]
+ for group, sample_set in expected_group_lists.items():
+ abs_percent_differences = abs(
+ (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns"))
+ / df_dge[sample_set].mean(axis="columns")
+ * 100
+ )
+ if any(abs_percent_differences > FLOAT_TOLERANCE):
+ issues[
+ f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
+ ].append(group)
+
+ abs_percent_differences = abs(
+ (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns"))
+ / df_dge[sample_set].mean(axis="columns")
+ * 100
+ )
+ if any(abs_percent_differences > FLOAT_TOLERANCE):
+ issues[
+ f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
+ ].append(group)
+
+ # check logic
+ constraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accommodate minor float related differences)"
+ if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = f"All values in columns: {query_columns} met constraint: {constraint_description}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that "
+ f"fail the constraint: {constraint_description}."
+ )
+ return {"code": code, "message": message}
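+
+# Worked tolerance example for the constraint above: if a group's samples
+# average to 10.0 but the table reports Group.Mean_... = 10.00005, the
+# absolute percent difference is |10.00005 - 10| / 10 * 100 = 0.0005 %,
+# within FLOAT_TOLERANCE (0.001 %), so no issue is recorded for that group.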
+
+
+def check_dge_table_comparison_statistical_columns_exist(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ COMPARISON_PREFIXES = ["Log2fc_", "Stat_", "P.value_", "Adj.p.value_"]
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ expected_columns = {
+ "".join(comb)
+ for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons)
+ }
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing these comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}"
+ return {"code": code, "message": message}
+
+
+def utils_common_constraints_on_dataframe(
+ df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...]
+) -> dict:
+
+ issues: dict[str, list[str]] = {
+ "Failed non null constraint": list(),
+ "Failed non negative constraint": list(),
+ "Failed allowed values constraint": list(),
+ }
+
+ for (col_set, col_constraints) in constraints:
+ # this will avoid overriding the original constraints dictionary
+ # which is likely used in the check message
+ col_constraints = col_constraints.copy()
+
+ # read constraint flags once per column set so every column is checked
+ # (popping inside the column loop would exhaust them after the first column)
+ check_non_null = col_constraints.pop("nonNull", False)
+ check_non_negative = col_constraints.pop("nonNegative", False)
+ allowed_values = col_constraints.pop("allowedValues", False)
+
+ # raise exception if there are unhandled constraint keys
+ if col_constraints:
+ raise ValueError(f"Unhandled constraint types: {col_constraints}")
+
+ # limit to only columns of interest
+ query_df = df[list(col_set)]
+ for (colname, colseries) in query_df.items():
+ # check non null constraint
+ if check_non_null and not nonNull(colseries):
+ issues["Failed non null constraint"].append(colname)
+ # check non negative constraint
+ if check_non_negative and not nonNegative(colseries):
+ issues["Failed non negative constraint"].append(colname)
+ # check allowed values constraint
+ if allowed_values and not onlyAllowedValues(colseries, allowed_values):
+ issues["Failed allowed values constraint"].append(colname)
+
+ return issues
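+
+# Minimal usage sketch for the helper above (hypothetical column name
+# "All.mean"; assumes df is a pandas DataFrame containing that column):
+# constraints = (({"All.mean"}, {"nonNull": True, "nonNegative": True}),)
+# issues = utils_common_constraints_on_dataframe(df, constraints)
+# any(issues.values()) -> True only if some column failed a constraint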
+
+
+def check_dge_table_group_statistical_columns_constraints(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+
+ resolved_constraints = (
+ ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+ ({f"Stat_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+ # can be removed from analysis before p-value and adj-p-value assessed
+ # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
+ (
+ {f"P.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": True, "nonNull": False},
+ ),
+ (
+ {f"Adj.p.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": True, "nonNull": False},
+ ),
+ )
+
+ df_dge = pd.read_csv(dge_table)
+
+ # issue tracker: {constraint description: [failed columns]}
+ issues: dict[str, list[str]] = utils_common_constraints_on_dataframe(
+ df_dge, resolved_constraints
+ )
+
+ # check logic
+ if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = f"All values in columns met constraint: {resolved_constraints}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that "
+ f"fail the constraint: {resolved_constraints}."
+ )
+ return {"code": code, "message": message}
+
+
+def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry:
+ # data specific preprocess
+ fixed_stats_columns = {
+ "All.mean": {"nonNull": True, "nonNegative": True},
+ "All.stdev": {"nonNull": True, "nonNegative": True},
+ "LRT.p.value": {"nonNull": False, "nonNegative": True},
+ }
+ expected_columns = set(fixed_stats_columns)
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All dataset summary stat columns present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}"
+ )
+ return {"code": code, "message": message}
+
+
+def check_dge_table_fixed_statistical_columns_constraints(
+ dge_table: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ fixed_stats_columns = (
+ ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}),
+ ({"LRT.p.value"}, {"nonNull": False, "nonNegative": True}),
+ )
+
+ df_dge = pd.read_csv(dge_table)
+
+ # issue tracker: {constraint description: [failed columns]}
+ issues: dict[str, list[str]] = utils_common_constraints_on_dataframe(
+ df_dge, fixed_stats_columns
+ )
+
+ # check logic
+ if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = f"All values in columns met constraint: {fixed_stats_columns}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that "
+ f"fail the constraint: {fixed_stats_columns}."
+ )
+ return {"code": code, "message": message}
+
+
+def check_dge_table_log2fc_within_reason(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 10 # Percent
+ LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT = 50 # Percent
+
+ # TODO: discuss, this might even be fine to lower quite a bit
+ # e.g THRESHOLD_PERCENT_MEANS_DIFFERENCE = 1 # percent
+ THRESHOLD_PERCENT_MEANS_DIFFERENCE = 50 # percent
+
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ df_dge = pd.read_csv(dge_table)
+
+ # Track error messages
+ err_msg_yellow = ""
+ all_suspect_signs: dict[int, dict[str, float]] = dict()
+ for comparison in expected_comparisons:
+ query_column = f"Log2fc_{comparison}"
+ group1_mean_col = (
+ "Group.Mean_" + comparison.split(")v(")[0] + ")"
+ ) # Uses parens and adds them back to prevent slicing on 'v' within factor names
+ group2_mean_col = "Group.Mean_" + "(" + comparison.split(")v(")[1]
+ computed_log2fc = (df_dge[group1_mean_col] / df_dge[group2_mean_col]).apply(
+ math.log, args=[2]
+ )
+ abs_percent_difference = abs(
+ ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100
+ )
+ percent_within_tolerance = (
+ mean(
+ abs_percent_difference
+ < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD
+ )
+ * 100
+ )
+ # flag if not enough within tolerance
+ if percent_within_tolerance < LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT:
+ err_msg_yellow += (
+ f"For comparison: '{comparision}' {percent_within_tolerance:.2f} % of genes have absolute percent differences "
+ f"(between log2fc direct computation and DESeq2's approach) "
+ f"less than {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD} % which does not met the minimum percentage "
+ f"({LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT} %) of genes required. "
+ f"This may indicate misassigned or misaligned columns. "
+ )
+
+ #### sign based checks
+
+ # filter to genes based on group mean differences
+ abs_percent_differences = (
+ abs(
+ (df_dge[group1_mean_col] - df_dge[group2_mean_col])
+ / df_dge[group2_mean_col]
+ )
+ * 100
+ )
+ df_dge_filtered = df_dge.loc[
+ abs_percent_differences > THRESHOLD_PERCENT_MEANS_DIFFERENCE
+ ].copy() # copy to avoid SettingWithCopyWarning on the assignments below
+
+ df_dge_filtered["positive_sign_expected"] = (
+ df_dge[group1_mean_col] - df_dge[group2_mean_col] > 0
+ )
+
+ df_dge_filtered["matches_expected_sign"] = (
+ (df_dge[query_column] > 0) & df_dge_filtered["positive_sign_expected"]
+ ) | ((df_dge[query_column] < 0) & ~df_dge_filtered["positive_sign_expected"])
+
+ all_suspect_signs = all_suspect_signs | df_dge_filtered.loc[
+ df_dge_filtered["matches_expected_sign"] == False
+ ][[group1_mean_col, group2_mean_col, query_column]].to_dict("index")
+
+ if all_suspect_signs:
+ code = FlagCode.RED
+ message = f"At least one log2fc sign is suspect, the following log2fc compared to actual group means: {all_suspect_signs}"
+ elif err_msg_yellow:
+ code = FlagCode.YELLOW
+ message = (
+ f"Not all log2fc within reason: more than {100 - LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% "
+ f"of genes have a percent difference greater than "
+ f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}% for at least one comparison. {err_msg_yellow}"
+ )
+ else:
+ code = FlagCode.GREEN
+ message = (
+ f"All log2fc within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% "
+ f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than "
+ f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. Additionally, for comparisons with mean differences "
+ f"greater than {THRESHOLD_PERCENT_MEANS_DIFFERENCE}% all have reasonable log2fc signs"
+ )
+
+ return {"code": code, "message": message}
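+
+# Worked cross-method example for the check above: with group means of 8 and
+# 2, direct computation gives log2(8/2) = 2; if DESeq2 reports 1.9, the
+# absolute percent difference is |2 - 1.9| / 1.9 * 100, about 5.3 %, which is
+# under the 10 % threshold, so that gene counts toward percent_within_tolerance.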
+
+
+def check_viz_table_columns_exist(dge_table: Path, runsheet: Path, **_) -> FlagEntry:
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ viz_pairwise_columns_prefixes = (
+ (
+ {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons},
+ {"nonNull": False},
+ ),
+ (
+ {f"Sig.1_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Sig.05_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Log2_P.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": False, "nonNull": False},
+ ),
+ (
+ {f"Updown_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [1, 0, -1], "nonNull": True},
+ ),
+ )
+
+ expected_columns = set(
+ itertools.chain(*[c1 for c1, _ in viz_pairwise_columns_prefixes])
+ )
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All viz specific comparison columns present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing these viz specific comparison columns: {sorted(list(missing_cols))}"
+ return {"code": code, "message": message}
+
+
+def check_viz_table_columns_constraints(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ viz_pairwise_columns_constraints = (
+ (
+ {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons},
+ {"nonNull": False},
+ ),
+ (
+ {f"Sig.1_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Sig.05_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Log2_P.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": False, "nonNull": False},
+ ),
+ (
+ {f"Updown_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [1, 0, -1], "nonNull": True},
+ ),
+ )
+
+ df_viz = pd.read_csv(dge_table)
+
+ # issue tracker: {constraint description: [failed columns]}
+ issues: dict[str, list[str]] = utils_common_constraints_on_dataframe(
+ df_viz, viz_pairwise_columns_constraints
+ )
+
+ # check logic
+ if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = (
+ f"All values in columns met constraint: {viz_pairwise_columns_constraints}"
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that "
+ f"fail the constraint: {viz_pairwise_columns_constraints}."
+ )
+ return {"code": code, "message": message}
+
+
+def check_viz_pca_table_index_and_columns_exist(
+ pca_table: Path, samples: set[str], **_
+) -> FlagEntry:
+ EXPECTED_VIS_PCA_COLUMNS = {"PC1", "PC2", "PC3"}
+ err_msg = ""
+ # data specific preprocess
+ df = pd.read_csv(pca_table, index_col=0)
+
+ # check all samples included
+ if missing_samples := samples - set(df.index):
+ err_msg += f"Missing samples in index: {missing_samples}. "
+
+ # check all expected columns exist
+ if missing_cols := EXPECTED_VIS_PCA_COLUMNS - set(df.columns):
+ err_msg += f"Missing expected columns: {missing_cols}."
+
+ if not err_msg:
+ code = FlagCode.GREEN
+ message = f"PCA Table has all the samples in the index and these columns exist: {EXPECTED_VIS_PCA_COLUMNS}"
+ else:
+ code = FlagCode.HALT
+ message = err_msg
+
+ return {"code": code, "message": message}
+
+
+def utils_formatting_list(l: list[str], spaces: int = 2) -> str:
+ """Reformats list to print friendly multi line string.
+
+ Example:
+ Reformatting a list of samples::
+
+ l = ['groundControl_1','groundControl_2','spaceFlight_1','spaceFlight_2']
+ print(f"Samples: \n{utils_formatting_list(l)}")
+
+ Args:
+ l (list): A list of strings to format
+ spaces (int): Number of leading spaces per line
+
+ Returns:
+ str: Print friendly multiline string
+ """
+ leading_spaces = " " * spaces
+ return "\n".join([f"{leading_spaces}- {item}" for item in l])
+
+
+def utils_rsem_counts_table_to_dataframe(
+ counts_table: Path, describe: bool = True
+) -> pd.DataFrame:
+ df = pd.read_csv(counts_table, index_col=0).rename_axis("geneID")
+ if describe:
+ print(f"Loaded rsem counts table:")
+ print(f" Samples: \n{utils_formatting_list(list(df.columns), spaces = 4)}")
+ print(f" Number of Genes: {len(df)}")
+ return df
+
+
+def utils_get_asset(asset_name: str) -> Path:
+ [p] = (p for p in files("dp_tools") if p.name == asset_name)
+ return p.locate()
+
+
+def check_ERCC_subgroup_representation(unnormalizedCountTable: Path, **_) -> FlagEntry:
+ """Check ERCC subgroup representation is robust.
+ Specifically, counts the dataset wide ERCC IDs then categorizes each subgroup
+ by the number of represented ERCC IDs in that subgroup.
+ Finally, generates a Flag result by comparison to thresholds.
+
+ Args:
+ counts_table (Path): RSEM unnormalized counts table
+
+ Returns:
+ FlagEntry: Result of the check.
+ """
+ MINIMUM_GREEN = 21
+ MINIMUM_YELLOW = 19
+ MINIMUM_RED = 0
+ MINIMUM_HALT = 0
+
+ # data specific preprocess
+ df_counts = utils_rsem_counts_table_to_dataframe(unnormalizedCountTable)
+
+ ercc_file = utils_get_asset("cms_095046.txt")
+ df_ercc = pd.read_csv(ercc_file, sep="\t")
+
+ # filter to only ercc genes
+ df_counts = df_counts.loc[df_counts.index.isin(df_ercc["ERCC ID"])]
+
+ # filter to only genes with at least one count (i.e. ERCC genes represented in the dataset)
+ df_counts = df_counts.loc[df_counts.sum(axis="columns") > 0]
+
+ # merge to ercc table data including subgroup
+ df_counts = df_counts.merge(df_ercc, left_index=True, right_on="ERCC ID")
+
+ # generate subgroup counts
+ df_subgroup_counts = df_counts["subgroup"].value_counts().sort_index()
+
+ green_key = f"green level subgroups: > {MINIMUM_GREEN} ERCC represented"
+ yellow_key = (
+ f"yellow level subgroups: {MINIMUM_YELLOW}-{MINIMUM_GREEN} ERCC represented"
+ )
+ red_key = f"red level subgroups: {MINIMUM_RED}-{MINIMUM_YELLOW} ERCC represented"
+ halt_key = f"halt level subgroups: < {MINIMUM_HALT} ERCC represented"
+
+ # classify each representation count
+ representation_category: dict[str, dict[str, int]] = {
+ green_key: df_subgroup_counts.loc[df_subgroup_counts > MINIMUM_GREEN].to_dict(),
+ yellow_key: df_subgroup_counts.loc[
+ df_subgroup_counts.between(MINIMUM_YELLOW, MINIMUM_GREEN)
+ ].to_dict(),
+ red_key: df_subgroup_counts.loc[
+ df_subgroup_counts.between(MINIMUM_RED, MINIMUM_YELLOW, inclusive="left")
+ ].to_dict(),
+ halt_key: df_subgroup_counts.loc[df_subgroup_counts < MINIMUM_HALT].to_dict(),
+ }
+
+ # check logic
+ if representation_category[halt_key]:
+ code = FlagCode.HALT
+ message = (
+ f"Dataset wide ERCC representation is not robust: {representation_category}"
+ )
+ elif representation_category[red_key]:
+ code = FlagCode.RED
+ message = (
+ f"Dataset wide ERCC representation is not robust: {representation_category}"
+ )
+ elif representation_category[yellow_key]:
+ code = FlagCode.YELLOW
+ message = (
+ f"Dataset wide ERCC representation is not robust: {representation_category}"
+ )
+ else:
+ code = FlagCode.GREEN
+ message = (
+ f"Dataset wide ERCC representation is robust: {representation_category}"
+ )
+ return {"code": code, "message": message}
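+
+# Illustrative bucket assignment for the thresholds above (hypothetical
+# counts): a subgroup with 22 represented ERCC IDs lands in green (> 21),
+# 20 in yellow (19-21 inclusive), and 5 in red ([0, 19)); the halt bucket
+# (< 0) is unreachable by construction since counts are non-negative.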
+
+
+def check_sample_in_multiqc_report(
+ samples: list[str],
+ multiqc_report_path: Path,
+ name_reformat_func: Callable = lambda s: s,
+) -> FlagEntry:
+ """Determines if the query samples are present in the multiqc report.
+
+ This is achieved by checking the 'multiqc_sources.txt' table, 'Sample Name' column.
+ An optional name_reformat_function can be supplied to address sample name changes that occur in the multiqc report.
+ An example being the renaming of Sample '-' characters to '_' for certain RSeQC modules.
+
+ :param samples: Query sample names to check for presence
+ :type samples: list[str]
+ :param multiqc_report_path: MultiQC report directory
+ :type multiqc_report_path: Path
+ :param name_reformat_func: A function applied to the multiQC sample names before searching against query sample names, defaults to not renaming the multiQC sample names
+ :type name_reformat_func: Callable, optional
+ :return: Flag Entry denoting successful or failing results. Includes description of query sample names and any missing samples
+ :rtype: FlagEntry
+ """
+ # Load multiQC sources table and retrieve set of samples
+ [sources_table] = multiqc_report_path.glob("**/multiqc_sources.txt")
+ multiQC_samples = list(pd.read_csv(sources_table, sep="\t")["Sample Name"])
+
+ # Transform multiQC samples using name_reformat_func
+ reformatted_multiQC_samples = [name_reformat_func(s) for s in multiQC_samples]
+
+ # Check for any missing reformatted sample names.
+ missing_samples = set(samples) - set(reformatted_multiQC_samples)
+
+ # check logic
+ if len(missing_samples) == 0:
+ code = FlagCode.GREEN
+ message = f"Found all query samples after reformatting multiQC sample names. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing the following query samples: {missing_samples}. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }"
+ return {"code": code, "message": message}
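+
+
+# Usage sketch for the check above (hypothetical sample names and report
+# path; the lambda illustrates undoing an RSeQC-style '-' to '_' rename):
+# check_sample_in_multiqc_report(
+# samples=["Sample-1", "Sample-2"],
+# multiqc_report_path=Path("RSeQC_Analyses/05_read_distribution"),
+# name_reformat_func=lambda s: s.replace("_", "-"),
+# )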
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml
new file mode 100644
index 00000000..20163de3
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml
@@ -0,0 +1,1308 @@
+# TOP LEVEL
+NAME: "bulkRNASeq"
+VERSION: "1"
+
+# anchors for reuse
+_anchors:
+ rawDataDir: &rawDataDir "00-RawData"
+ trimDataDir: &trimDataDir "01-TG_Preproc"
+ alignDataDir: &alignDataDir "02-STAR_Alignment"
+ countsDataDir: &countsDataDir "03-RSEM_Counts"
+ normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts"
+ DGEDataDir: &DGEDataDir "05-DESeq2_DGE"
+ rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: renamed to "RSeQC_Analyses" for consistent casing -J.O.; note this differs from the recent bash based processings
+ ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis"
+ FastQC_Reports: &FastQC_Reports "FastQC_Reports"
+ neverPublished: &neverPublished
+ subcategory: null
+ subdirectory: null
+ publish to repo: false
+ include subdirectory in table: false
+ table order: -1
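+
+ # Illustrative note: any asset below declaring 'resource categories:
+ # *neverPublished' receives the five keys defined by the &neverPublished
+ # anchor above, exactly as if they were written inline for that asset.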
+
+Staging:
+ General:
+ Required Metadata:
+ From ISA:
+ # - ISA Field Name: Study Assay Measurement Type
+ # ISA Table Source: Investigation
+ # Investigation Subtable: STUDY ASSAYS
+ # Runsheet Column Name: Study Assay Measurement Type
+ # Processing Usage: >-
+ # Mapping to the appropriate processing pipeline for the assay.
+ # Example: transcription profiling
+
+ # - ISA Field Name: Study Assay Technology Type
+ # ISA Table Source: Investigation
+ # Investigation Subtable: STUDY ASSAYS
+ # Runsheet Column Name: Study Assay Technology Type
+ # Processing Usage: >-
+ # Mapping to the appropriate processing pipeline for the assay.
+ # Example: DNA microarray
+
+ # - ISA Field Name: Study Assay Technology Platform
+ # ISA Table Source: Investigation
+ # Investigation Subtable: STUDY ASSAYS
+ # Runsheet Column Name: Study Assay Technology Platform
+ # Processing Usage: >-
+ # Mapping to the appropriate processing pipeline for the assay.
+ # Example: Affymetrix
+
+ - ISA Field Name: Study Protocol Type
+ ISA Table Source: Investigation
+ Investigation Subtable: STUDY PROTOCOLS
+ # will return a boolean indicating if any of the following values are included
+ True If Includes At Least One:
+ - spike-in quality control role
+ - spike-in protocol
+ - spike-in control
+ - spike-in control protocol
+ Runsheet Column Name: has_ERCC
+ Processing Usage: >-
+ Indicates if ERCC spike-in has been added. This can also be automatically
+ determined from the ISA archive based on 'Study Protocol Name' and 'Study Protocol Type'
+ Example: 'TRUE'
+
+ - ISA Field Name:
+ - Characteristics[Organism]
+ - Characteristics[organism]
+ ISA Table Source: Sample
+ Runsheet Column Name: organism
+ Processing Usage: >-
+ Mapping to the appropriate alignment reference and annotation databases.
+ Example: Arabidopsis thaliana
+
+ - ISA Field Name: Sample Name
+ ISA Table Source: Assay
+ Runsheet Column Name: sample_name
+ Runsheet Index: true
+ Processing Usage: >-
+ Sample name is used as a unique sample identifier during processing
+ Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538
+
+ - ISA Field Name:
+ - Parameter Value[library layout]
+ - Parameter Value[Library Layout]
+ ISA Table Source: Assay
+ Runsheet Column Name: paired_end
+ Remapping: {"PAIRED":true, "Paired":true, "SINGLE":false}
+ Processing Usage: >-
+ Indicates if the sequencing was paired end. This controls how a variety of tools are invoked
+ including in-house written scripts.
+ Example: 'TRUE'
+
+ # this entry denotes the following:
+ # retrieve from that ISA field name
+ # multiple values (separated by ",")
+ # index those to certain runsheet columns
+ # if the index doesn't exist, 'optional' prevents raising an exception
+ # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls
+ # an exception will be raised unless exactly one url is mapped to each filename
+ - ISA Field Name:
+ - Parameter Value[Merged Sequence Data File]
+ - Characteristics[Merged Sequence Data File]
+ - Raw Data File
+ ISA Table Source: Assay
+ Multiple Values Per Entry: true
+ Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma
+ Runsheet Column Name:
+ - {'name':'read1_path', 'index':0}
+ - {'name':'read2_path', 'index':1, 'optional':true}
+ GLDS URL Mapping: true
+ Processing Usage: >-
+ Location to the raw data fastq file. May be a url or local path.
+ Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...'
+
+ - ISA Field Name: Factor Value[{factor_name}]
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: Factor Value[{factor_name}]
+ Matches Multiple Columns: true
+ Match Regex: "Factor Value\\[.*\\]"
+ Append Column Following: "Unit"
+ Processing Usage: >-
+ Factor values in a study. Used to assign experimental groups for each sample.
+ Note: On the runsheet, a subsequent 'Unit' Column value will be
+ suffix-concatenated if it exists.
+ Example: Basal Control
+
+ - ISA Field Name: Unit
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: null
+ Matches Multiple Columns: true
+ Autoload: false # handled by factor value loading above
+ Processing Usage: >-
+ Unit to be suffix-concatenated onto prior Factor value columns.
+ Example: day
+
+ From User:
+ # Removed since unused by Processing via the runsheet
+ # - Runsheet Column Name: GLDS
+ # Processing Usage: >-
+ # The GLDS accession number
+ # Example: GLDS-205
+
+ - Runsheet Column Name: read1_path
+ # used to generate candidate file names for searching GLDS repository filelisting
+ Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"]
+ Processing Usage: >-
+ The location of either the forward reads (paired end) or only reads file (single end)
+ raw fastq file. Can be either a url or local path.
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI
+ may be used to retrieve urls given the array data filename (sourced from ISA archive).
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1
+
+
+ - Runsheet Column Name: read2_path
+ Data Asset Keys: ["raw reverse reads fastq GZ"]
+ Processing Usage: >-
+ The location of the reverse reads (paired end)
+ raw fastq file. Can be either a url or local path.
+ For single end studies, this should be an empty string.
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI
+ may be used to retrieve urls given the array data filename (sourced from ISA archive).
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1
+
+ISA Meta:
+ Valid Study Assay Technology And Measurement Types:
+ - measurement: "transcription profiling"
+ technology: "RNA Sequencing (RNA-Seq)"
+
+ # this is prepended to all file names in the curation assay table
+ Global file prefix: "{datasystem}_rna_seq_"
+
+ # configuration related to updating investigation file
+ # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file
+ # LEADCAP_organism should be the studied organism's scientific name with a leading cap
+ Post Processing Add Study Protocol:
+ GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1
+
+data assets:
+ runsheet:
+ processed location:
+ - "Metadata"
+ - "{dataset}_bulkRNASeq_v1_runsheet.csv"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ ISA Archive:
+ processed location:
+ - "Metadata"
+ - "*-ISA.zip"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw MultiQC directory:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "raw_multiqc_report"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw MultiQC directory ZIP:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "raw_multiqc_report.zip"
+
+ tags:
+ - raw
+
+ resource categories: &MergedSequenceData_MultiQCReports
+ subcategory: Merged Sequence Data
+ subdirectory: Multiqc Reports
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 1
+
+ raw forward reads fastq GZ:
+ processed location:
+ - *rawDataDir
+ - "Fastq"
+ - "{sample}_R1_raw.fastq.gz"
+
+ tags:
+ - raw
+
+ resource categories: &MergedSequenceData_Fastq
+ subcategory: Merged Sequence Data
+ subdirectory: Fastq
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 0
+
+ raw reverse reads fastq GZ:
+ processed location:
+ - *rawDataDir
+ - "Fastq"
+ - "{sample}_R2_raw.fastq.gz"
+
+ tags:
+ - raw
+
+ resource categories: *MergedSequenceData_Fastq
+
+ raw reads fastq GZ:
+ processed location:
+ - *rawDataDir
+ - "Fastq"
+ - "{sample}_raw.fastq.gz"
+
+ tags:
+ - raw
+
+ resource categories: *MergedSequenceData_Fastq
+
+ raw forward reads fastQC HTML:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_R1_raw_fastqc.html"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ # J.Oribello: We should revisit this, fastQC includes some unique (not parsed
+ # into multiQC) relevant information like the actual overrepresented sequence strings
+ raw reverse reads fastQC HTML:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_R2_raw_fastqc.html"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw reads fastQC HTML:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_raw_fastqc.html"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw forward reads fastQC ZIP:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_R1_raw_fastqc.zip"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw reverse reads fastQC ZIP:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_R2_raw_fastqc.zip"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw reads fastQC ZIP:
+ processed location:
+ - *rawDataDir
+ - *FastQC_Reports
+ - "{sample}_raw_fastqc.zip"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ trimmed fastQC MultiQC directory:
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "trimmed_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ trimmed fastQC MultiQC directory ZIP:
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "trimmed_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: &TrimmedSequenceData_MultiQCReports
+ subcategory: Trimmed Sequence Data
+ subdirectory: Multiqc Reports
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 4
+
+ trimmed forward reads fastq GZ: &trimmedFastqGZ
+ processed location:
+ - *trimDataDir
+ - "Fastq"
+ - "{sample}_R1_trimmed.fastq.gz"
+
+ tags:
+ - processed
+
+ resource categories:
+ subcategory: Trimmed Sequence Data
+ subdirectory: Fastq
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 3
+
+ trimmed reverse reads fastq GZ:
+ <<: *trimmedFastqGZ
+ processed location:
+ - *trimDataDir
+ - "Fastq"
+ - "{sample}_R2_trimmed.fastq.gz"
+
+ tags:
+ - processed
+
+ trimmed reads fastq GZ:
+ <<: *trimmedFastqGZ
+ processed location:
+ - *trimDataDir
+ - "Fastq"
+ - "{sample}_trimmed.fastq.gz"
+
+ tags:
+ - processed
+
+ trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_R1_trimmed_fastqc.html"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ trimmed reverse reads fastQC HTML:
+ <<: *trimmedForwardReadsFastQCHTML
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_R2_trimmed_fastqc.html"
+
+ tags:
+ - processed
+
+ trimmed reads fastQC HTML:
+ <<: *trimmedForwardReadsFastQCHTML
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_trimmed_fastqc.html"
+
+ tags:
+ - processed
+
+ trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_R1_trimmed_fastqc.zip"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ trimmed reverse reads fastQC ZIP:
+ <<: *trimmedForwardReadsFastQCZIP
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_R2_trimmed_fastqc.zip"
+
+ tags:
+ - processed
+
+ trimmed reads fastQC ZIP:
+ <<: *trimmedForwardReadsFastQCZIP
+ processed location:
+ - *trimDataDir
+ - *FastQC_Reports
+ - "{sample}_trimmed_fastqc.zip"
+
+ tags:
+ - processed
+
+ trimming MultiQC directory:
+ processed location:
+ - *trimDataDir
+ - &trimmingReportsDir "Trimming_Reports"
+ - "trimming_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport
+ processed location:
+ - *trimDataDir
+ - *trimmingReportsDir
+ - "{sample}_R1_raw.fastq.gz_trimming_report.txt"
+
+ tags:
+ - processed
+
+ resource categories:
+ subcategory: Trimmed Sequence Data
+ subdirectory: Trimming Reports
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 5
+
+ reverse reads trimming report:
+ <<: *trimmedForwardReadsFastQCTrimmingReport
+ processed location:
+ - *trimDataDir
+ - *trimmingReportsDir
+ - "{sample}_R2_raw.fastq.gz_trimming_report.txt"
+
+ tags:
+ - processed
+
+ reads trimming report:
+ <<: *trimmedForwardReadsFastQCTrimmingReport
+ processed location:
+ - *trimDataDir
+ - *trimmingReportsDir
+ - "{sample}_raw.fastq.gz_trimming_report.txt"
+
+ tags:
+ - processed
+
+ aligned MultiQC directory:
+ processed location:
+ - *alignDataDir
+ - "align_multiqc_report"
+
+ resource categories: *neverPublished
+
+ tags:
+ - processed
+
+ aligned MultiQC directory ZIP:
+ processed location:
+ - *alignDataDir
+ - "align_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: &AlignedSequenceData_MultiQCReports
+ subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board
+ subdirectory: MultiQC Reports # RENAME: from 'Multiqc Reports'. For consistency with Title casing across the board
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 8
+
+ aligned ToTranscriptome Bam:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Aligned.toTranscriptome.out.bam"
+
+ tags:
+ - processed
+
+ resource categories: &AlignedSequenceData_AlignedData
+ subcategory: Aligned Sequence Data
+ subdirectory: Aligned Data
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 6
+
+ aligned SortedByCoord Bam:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Aligned.sortedByCoord.out.bam"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ aligned SortedByCoord ResortedBam:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Aligned.sortedByCoord_sorted.out.bam"
+
+ tags:
+ - processed
+
+ resource categories: *AlignedSequenceData_AlignedData
+
+ aligned SortedByCoord ResortedBamIndex:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai"
+
+ tags:
+ - processed
+
+ resource categories: *AlignedSequenceData_AlignedData
+
+ aligned log Final:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Log.final.out"
+
+ tags:
+ - processed
+
+ resource categories: &AlignedSequenceData_AlignmentLogs
+ subcategory: Aligned Sequence Data
+ subdirectory: Alignment Logs
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 7
+
+ aligned log Progress:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Log.progress.out"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ aligned log Full:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_Log.out"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ aligned sjTab:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_SJ.out.tab"
+
+ tags:
+ - processed
+
+ resource categories: *AlignedSequenceData_AlignedData
+
+ genebody coverage MultiQC directory:
+ processed location:
+ - *rseqcDataDir
+ - "02_geneBody_coverage"
+ - "geneBody_cov_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ genebody coverage MultiQC directory ZIP:
+ processed location:
+ - *rseqcDataDir
+ - "02_geneBody_coverage"
+ - "geneBody_cov_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: &RSeQC_MultiQCReports
+ subcategory: RSeQC
+ subdirectory: MultiQC Reports
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 9
+
+ infer experiment MultiQC directory:
+ processed location:
+ - *rseqcDataDir
+ - "03_infer_experiment"
+ - "infer_exp_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ infer experiment MultiQC directory ZIP:
+ processed location:
+ - *rseqcDataDir
+ - "03_infer_experiment"
+ - "infer_exp_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: *RSeQC_MultiQCReports
+
+ inner distance MultiQC directory:
+ processed location:
+ - *rseqcDataDir
+ - "04_inner_distance"
+ - "inner_dist_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ inner distance MultiQC directory ZIP:
+ processed location:
+ - *rseqcDataDir
+ - "04_inner_distance"
+ - "inner_dist_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: *RSeQC_MultiQCReports
+
+ read distribution MultiQC directory:
+ processed location:
+ - *rseqcDataDir
+ - "05_read_distribution"
+ - "read_dist_multiqc_report"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ read distribution MultiQC directory ZIP:
+ processed location:
+ - *rseqcDataDir
+ - "05_read_distribution"
+ - "read_dist_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: *RSeQC_MultiQCReports
+
+ genebody coverage out:
+ processed location:
+ - *rseqcDataDir
+ - "02_geneBody_coverage"
+ - "{sample}"
+
+ tags:
+ - processed
+
+ # TODO: DISCUSS Consider this for directories that are handled the same but should validate contents
+ # is directory: true
+ # contents:
+ # - ["{sample}.geneBodyCoverage.r"]
+ # - ["{sample}.geneBodyCoverage.txt"]
+ # - ["{sample}.geneBodyCoverage.curves.pdf"]
+
+ resource categories: *neverPublished
+
+ infer experiment out:
+ processed location:
+ - *rseqcDataDir
+ - "03_infer_experiment"
+ - "{sample}_infer_expt.out"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ inner distance out:
+ processed location:
+ - *rseqcDataDir
+ - "04_inner_distance"
+ - "{sample}"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ read distribution out:
+ processed location:
+ - *rseqcDataDir
+ - "05_read_distribution"
+ - "{sample}_read_dist.out"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ RSEM counts MultiQC directory:
+ processed location:
+ - *countsDataDir
+ - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ RSEM counts MultiQC directory ZIP:
+ processed location:
+ - *countsDataDir
+ - "RSEM_count_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: &RawCountsData_MultiQCReports
+ subcategory: Raw Counts Data
+ subdirectory: Multiqc Reports
+ publish to repo: true
+ include subdirectory in table: true
+ table order: 11
+
+ star number non-zero count genes table:
+ processed location:
+ - *alignDataDir
+ - "STAR_NumNonZeroGenes.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ star unnormalized counts table:
+ processed location:
+ - *alignDataDir
+ - "STAR_Unnormalized_Counts.csv"
+
+ tags:
+ - processed
+
+ resource categories: &RawCountsTables
+ subcategory: Raw Counts Tables
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 12
+
+ rsem number non-zero count genes table:
+ processed location:
+ - *countsDataDir
+ - "RSEM_NumNonZeroGenes.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ rsem unnormalized counts table:
+ processed location:
+ - *countsDataDir
+ - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv'
+
+ tags:
+ - processed
+
+ resource categories: *RawCountsTables
+
+ sample reads per gene table:
+ processed location:
+ - *alignDataDir
+ - "{sample}"
+ - "{sample}_ReadsPerGene.out.tab"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way?
+
+ sample gene counts table:
+ processed location:
+ - *countsDataDir
+ # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O.
+ - "{sample}.genes.results"
+
+ tags:
+ - processed
+
+ resource categories: &RawCountsData_CountData
+ subcategory: Raw Counts Data
+ subdirectory: Count Data
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 10
+
+ sample isoform counts table:
+ processed location:
+ - *countsDataDir
+ # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O.
+ - "{sample}.isoforms.results"
+
+ tags:
+ - processed
+
+ resource categories: *RawCountsData_CountData
+
+ sample counts stats directory:
+ processed location:
+ - *countsDataDir
+ # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O.
+ - "{sample}.stat"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ DESeq2 normalized counts table:
+ processed location:
+ - *normCountsDataDir
+ - "Normalized_Counts.csv"
+
+ tags:
+ - processed
+
+ resource categories: &normalizedCountsData
+ subcategory: Normalized Counts Data
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 13
+
+ ERCC normalized DESeq2 normalized counts table:
+ processed location:
+ - *normCountsDataDir
+ - "ERCC_Normalized_Counts.csv"
+
+ tags:
+ - processed
+
+ resource categories: *normalizedCountsData
+
+ sample table:
+ processed location:
+ - *DGEDataDir
+ - "SampleTable.csv"
+
+ tags:
+ - processed
+
+ resource categories: &DGEAnalysisData
+ subcategory: Differential Expression Analysis Data
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 14
+
+ ERCC sample table:
+ processed location:
+ - *DGEDataDir
+ - &erccSubDir "ERCC_NormDGE"
+ - "ERCCnorm_SampleTable.csv"
+
+ tags:
+ - processed
+
+ resource categories: *DGEAnalysisData
+
+ DESeq2 unnormalized counts table:
+ processed location:
+ - *normCountsDataDir
+ - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables
+
+ DESeq2 contrasts table:
+ processed location:
+ - *DGEDataDir
+ - "contrasts.csv"
+
+ tags:
+ - processed
+
+ resource categories: *DGEAnalysisData
+
+ ERCC normalized DESeq2 contrasts table:
+ processed location:
+ - *DGEDataDir
+ - *erccSubDir
+ - "ERCCnorm_contrasts.csv"
+
+ tags:
+ - processed
+
+ resource categories: *DGEAnalysisData
+
+ DESeq2 annotated DGE table:
+ processed location:
+ - *DGEDataDir
+ - "differential_expression.csv"
+
+ tags:
+ - processed
+
+ resource categories: *DGEAnalysisData
+
+ ERCC normalized DESeq2 annotated DGE table:
+ processed location:
+ - *DGEDataDir
+ - *erccSubDir
+ - "ERCCnorm_differential_expression.csv"
+
+ tags:
+ - processed
+
+ resource categories: *DGEAnalysisData
+
+ DESeq2 annotated DGE extended for viz table:
+ processed location:
+ - *DGEDataDir
+ - "visualization_output_table.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ ERCC normalized DESeq2 annotated DGE extended for viz table:
+ processed location:
+ - *DGEDataDir
+ - *erccSubDir
+ - "visualization_output_table_ERCCnorm.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ DESeq2 viz PCA table:
+ processed location:
+ - *DGEDataDir
+ - "visualization_PCA_table.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+ ERCC normalized DESeq2 viz PCA table:
+ processed location:
+ - *DGEDataDir
+ - *erccSubDir
+ - "visualization_PCA_table_ERCCnorm.csv"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+
+ ERCC analysis HTML:
+ processed location:
+ - *ERCCAnalysisDir
+ - "ERCC_analysis.html"
+
+ tags:
+ - processed
+
+ conditional on dataset:
+ - has_ERCC: [True]
+
+ resource categories:
+ subcategory: ERCC Analyses
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 15
+
+ # NOTE: this is while the ERCC analysis sits outside the full pipeline and
+ # once incorporated, it should be validated for existence!
+ validate exists: false
+
+# Assets that are no longer generated by the latest pipeline
+Archived Data Assets:
+
+ # DISCUSS: When the Trim Galore MQC is made clearer, publishing this should be revisited
+ # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore
+ # Specific metrics.
+ # - Jonathan Oribello
+ trimming MultiQC directory ZIP:
+ processed location:
+ - *trimDataDir
+ - *trimmingReportsDir
+ - "trimming_multiqc_report.zip"
+
+ tags:
+ - processed
+
+ resource categories: *neverPublished
+
+
+data asset sets:
+ # These assets are not generated in the workflow, but are generated after the workflow
+ PUTATIVE:
+ - "ERCC analysis HTML"
+ glds metadata:
+ - "ISA Archive"
+ has ercc:
+ - "ERCC normalized DESeq2 normalized counts table"
+ - "ERCC sample table"
+ - "ERCC normalized DESeq2 contrasts table"
+ - "ERCC normalized DESeq2 annotated DGE table"
+ - "ERCC normalized DESeq2 annotated DGE extended for viz table"
+ - "ERCC normalized DESeq2 viz PCA table"
+ # NOTE: Not part of NF_WF yet - "ERCC analysis HTML"
+ demultiplexed paired end raw data:
+ - "runsheet"
+ - "raw forward reads fastq GZ"
+ - "raw reverse reads fastq GZ"
+ qc reports for paired end raw data:
+ - "raw forward reads fastQC HTML"
+ - "raw reverse reads fastQC HTML"
+ - "raw forward reads fastQC ZIP"
+ - "raw reverse reads fastQC ZIP"
+ - "raw MultiQC directory"
+ - "raw MultiQC directory ZIP"
+ paired end trimmed reads:
+ - "trimmed forward reads fastq GZ"
+ - "trimmed reverse reads fastq GZ"
+ qc reports for paired end trimmed reads data:
+ - "trimmed forward reads fastQC HTML"
+ - "trimmed reverse reads fastQC HTML"
+ - "trimmed forward reads fastQC ZIP"
+ - "trimmed reverse reads fastQC ZIP"
+ - "trimmed fastQC MultiQC directory"
+ - "trimming MultiQC directory"
+ - "forward reads trimming report"
+ - "reverse reads trimming report"
+ demultiplexed single end raw data:
+ - "runsheet"
+ - "raw reads fastq GZ"
+ qc reports for single end raw data:
+ - "raw reads fastQC HTML"
+ - "raw reads fastQC ZIP"
+ - "raw MultiQC directory"
+ - "raw MultiQC directory ZIP"
+ single end trimmed reads:
+ - "trimmed reads fastq GZ"
+ qc reports for single end trimmed reads data:
+ - "trimmed reads fastQC HTML"
+ - "trimmed reads fastQC ZIP"
+ - "trimmed fastQC MultiQC directory"
+ - "trimming MultiQC directory"
+ - "reads trimming report"
+ STAR alignments:
+ - "aligned MultiQC directory"
+ - "aligned MultiQC directory ZIP"
+ - "aligned ToTranscriptome Bam"
+ - "aligned SortedByCoord Bam"
+ - "aligned SortedByCoord ResortedBam"
+ - "aligned SortedByCoord ResortedBamIndex"
+ - "aligned log Final"
+ - "aligned log Progress"
+ - "aligned log Full"
+ - "aligned sjTab"
+ - "sample reads per gene table"
+ - "star number non-zero count genes table"
+ - "star unnormalized counts table"
+ RSeQC output for paired end data:
+ - "genebody coverage MultiQC directory"
+ - "genebody coverage MultiQC directory ZIP"
+ - "infer experiment MultiQC directory"
+ - "infer experiment MultiQC directory ZIP"
+ - "inner distance MultiQC directory"
+ - "inner distance MultiQC directory ZIP"
+ - "read distribution MultiQC directory"
+ - "read distribution MultiQC directory ZIP"
+ - "genebody coverage out"
+ - "infer experiment out"
+ - "inner distance out"
+ - "read distribution out"
+ RSeQC output for single end data:
+ - "genebody coverage MultiQC directory"
+ - "genebody coverage MultiQC directory ZIP"
+ - "infer experiment MultiQC directory"
+ - "infer experiment MultiQC directory ZIP"
+ - "read distribution MultiQC directory"
+ - "read distribution MultiQC directory ZIP"
+ - "genebody coverage out"
+ - "infer experiment out"
+ - "read distribution out"
+ RSEM counts:
+ - "RSEM counts MultiQC directory"
+ - "RSEM counts MultiQC directory ZIP"
+ - "rsem number non-zero count genes table"
+ - "rsem unnormalized counts table"
+ - "sample gene counts table"
+ - "sample isoform counts table"
+ - "sample counts stats directory"
+ is single end full:
+ - "runsheet"
+ - "ISA Archive"
+ - "raw MultiQC directory"
+ - "raw MultiQC directory ZIP"
+ - "raw reads fastq GZ"
+ - "raw reads fastQC HTML"
+ - "raw reads fastQC ZIP"
+ - "trimmed fastQC MultiQC directory"
+ - "trimmed fastQC MultiQC directory ZIP"
+ - "trimmed reads fastq GZ"
+ - "trimmed reads fastQC HTML"
+ - "trimmed reads fastQC ZIP"
+ - "trimming MultiQC directory"
+ - "reads trimming report"
+ - "aligned MultiQC directory"
+ - "aligned MultiQC directory ZIP"
+ - "aligned ToTranscriptome Bam"
+ - "aligned SortedByCoord Bam"
+ - "aligned SortedByCoord ResortedBam"
+ - "aligned SortedByCoord ResortedBamIndex"
+ - "aligned log Final"
+ - "aligned log Progress"
+ - "aligned log Full"
+ - "aligned sjTab"
+ - "genebody coverage MultiQC directory"
+ - "genebody coverage MultiQC directory ZIP"
+ - "infer experiment MultiQC directory"
+ - "infer experiment MultiQC directory ZIP"
+ - "read distribution MultiQC directory"
+ - "read distribution MultiQC directory ZIP"
+ - "genebody coverage out"
+ - "infer experiment out"
+ - "read distribution out"
+ - "RSEM counts MultiQC directory"
+ - "RSEM counts MultiQC directory ZIP"
+ - "star number non-zero count genes table"
+ - "star unnormalized counts table"
+ - "rsem number non-zero count genes table"
+ - "rsem unnormalized counts table"
+ - "sample reads per gene table"
+ - "sample gene counts table"
+ - "sample isoform counts table"
+ - "sample counts stats directory"
+ - "DESeq2 normalized counts table"
+ - "sample table"
+ - "DESeq2 unnormalized counts table"
+ - "DESeq2 contrasts table"
+ - "DESeq2 annotated DGE table"
+ - "DESeq2 annotated DGE extended for viz table"
+ - "DESeq2 viz PCA table"
+ is paired end full:
+ - "runsheet"
+ - "ISA Archive"
+ - "raw MultiQC directory"
+ - "raw MultiQC directory ZIP"
+ - "raw forward reads fastq GZ"
+ - "raw reverse reads fastq GZ"
+ - "raw forward reads fastQC HTML"
+ - "raw reverse reads fastQC HTML"
+ - "raw forward reads fastQC ZIP"
+ - "raw reverse reads fastQC ZIP"
+ - "trimmed fastQC MultiQC directory"
+ - "trimmed fastQC MultiQC directory ZIP"
+ - "trimmed forward reads fastq GZ"
+ - "trimmed reverse reads fastq GZ"
+ - "trimmed forward reads fastQC HTML"
+ - "trimmed reverse reads fastQC HTML"
+ - "trimmed forward reads fastQC ZIP"
+ - "trimmed reverse reads fastQC ZIP"
+ - "trimming MultiQC directory"
+ - "forward reads trimming report"
+ - "reverse reads trimming report"
+ - "aligned MultiQC directory"
+ - "aligned MultiQC directory ZIP"
+ - "aligned ToTranscriptome Bam"
+ - "aligned SortedByCoord Bam"
+ - "aligned SortedByCoord ResortedBam"
+ - "aligned SortedByCoord ResortedBamIndex"
+ - "aligned log Final"
+ - "aligned log Progress"
+ - "aligned log Full"
+ - "aligned sjTab"
+ - "genebody coverage MultiQC directory"
+ - "genebody coverage MultiQC directory ZIP"
+ - "infer experiment MultiQC directory"
+ - "infer experiment MultiQC directory ZIP"
+ - "inner distance MultiQC directory"
+ - "inner distance MultiQC directory ZIP"
+ - "read distribution MultiQC directory"
+ - "read distribution MultiQC directory ZIP"
+ - "genebody coverage out"
+ - "infer experiment out"
+ - "inner distance out"
+ - "read distribution out"
+ - "RSEM counts MultiQC directory"
+ - "RSEM counts MultiQC directory ZIP"
+ - "star number non-zero count genes table"
+ - "star unnormalized counts table"
+ - "rsem number non-zero count genes table"
+ - "rsem unnormalized counts table"
+ - "sample reads per gene table"
+ - "sample gene counts table"
+ - "sample isoform counts table"
+ - "sample counts stats directory"
+ - "DESeq2 normalized counts table"
+ - "sample table"
+ - "DESeq2 unnormalized counts table"
+ - "DESeq2 contrasts table"
+ - "DESeq2 annotated DGE table"
+ - "DESeq2 annotated DGE extended for viz table"
+ - "DESeq2 viz PCA table"
+ DGE Output:
+ - "DESeq2 normalized counts table"
+ - "sample table"
+ - "DESeq2 unnormalized counts table"
+ - "DESeq2 contrasts table"
+ - "DESeq2 annotated DGE table"
+ - "DESeq2 annotated DGE extended for viz table"
+ - "DESeq2 viz PCA table"
+ ERCC DGE Output:
+ - "ERCC normalized DESeq2 normalized counts table"
+ - "ERCC sample table"
+ - "ERCC normalized DESeq2 contrasts table"
+ - "ERCC normalized DESeq2 annotated DGE table"
+ - "ERCC normalized DESeq2 annotated DGE extended for viz table"
+ - "ERCC normalized DESeq2 viz PCA table"
+ # NOTE: Not part of NF_WF yet - "ERCC analysis HTML"
+ RSEM Output:
+ - "RSEM counts MultiQC directory"
+ - "RSEM counts MultiQC directory ZIP"
+ - "rsem number non-zero count genes table"
+ - "rsem unnormalized counts table"
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py
new file mode 100644
index 00000000..5eaa896a
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py
@@ -0,0 +1,997 @@
+from pathlib import Path
+import re
+from typing import Union
+import yaml
+import logging
+
+from dp_tools.core.entity_model import Dataset
+
+log = logging.getLogger(__name__)
+
+from dp_tools.core.check_model import ValidationProtocol
+
+from .checks import *
+
+CONFIG = {
+ "Metadata-check_metadata_attributes_exist": {
+ "expected_attrs": ["paired_end", "has_ERCC", "organism"]
+ },
+ "Raw Reads-check_for_outliers": {
+ "mqc_module": "FastQC",
+ "mqc_plot": "general_stats",
+ "mqc_keys": [
+ "percent_gc",
+ "avg_sequence_length",
+ "total_sequences",
+ "percent_duplicates",
+ ],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "Trim Reads-check_for_outliers": {
+ "mqc_module": "FastQC",
+ "mqc_plot": "general_stats",
+ "mqc_keys": [
+ "percent_gc",
+ "avg_sequence_length",
+ "total_sequences",
+ "percent_duplicates",
+ ],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "Raw Reads By Sample-check_fastqgz_file_contents": {
+ "count_lines_to_check": 200000000
+ },
+ "Trim Reads By Sample-check_fastqgz_file_contents": {
+ "count_lines_to_check": 200000000
+ },
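+    # NOTE: 200,000,000 lines corresponds to the first ~50 million FASTQ
+    # records (4 lines per record) checked in each file.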
+ "STAR Alignments By Sample-check_thresholds-Mapped": {
+ "mqc_key": "STAR",
+ "stat_string": "uniquely_mapped_percent + multimapped_percent",
+ "thresholds": [
+ {"code": "YELLOW", "type": "lower", "value": 70},
+ {"code": "RED", "type": "lower", "value": 50},
+ ],
+ },
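+    # e.g. with these thresholds a sample mapping 75% of reads in total passes,
+    # one at 65% is flagged YELLOW, and one at 45% is flagged RED.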
+ "STAR Alignments By Sample-check_thresholds-MultiMapped": {
+ "mqc_key": "STAR",
+ "stat_string": "multimapped_toomany_percent + multimapped_percent",
+ "thresholds": [
+ {"code": "YELLOW", "type": "lower", "value": 30},
+ {"code": "RED", "type": "lower", "value": 15},
+ ],
+ },
+ "STAR Alignments-check_for_outliers": {
+ "mqc_module": "STAR",
+ "mqc_plot": "general_stats",
+ "mqc_keys": [
+ "uniquely_mapped_percent",
+ "avg_mapped_read_length",
+ "mismatch_rate",
+ "deletion_rate",
+ "deletion_length",
+ "insertion_rate",
+ "insertion_length",
+ "multimapped_percent",
+ "multimapped_toomany_percent",
+ "unmapped_mismatches_percent",
+ "unmapped_tooshort_percent",
+ "unmapped_other_percent",
+ ],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "RSeQC-check_for_outliers-geneBody_coverage": {
+ "mqc_module": "RSeQC",
+ "mqc_plot": "Gene Body Coverage",
+ "mqc_keys": ["_ALL"],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "RSeQC-check_for_outliers-infer_experiment": {
+ "mqc_module": "RSeQC",
+ "mqc_plot": "Infer experiment",
+ "mqc_keys": ["_ALL"],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "RSeQC-check_for_outliers-inner_distance": {
+ "mqc_module": "RSeQC",
+ "mqc_plot": "Inner Distance",
+ "mqc_keys": ["_ALL"],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "RSeQC-check_for_outliers-read_distribution": {
+ "mqc_module": "RSeQC",
+ "mqc_plot": "Read Distribution",
+ "mqc_keys": ["_ALL"],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+ "RSeQC-check_strandedness_assessable_from_infer_experiment": {
+ "stranded_assessment_range": {"max": 100, "min": 75},
+ "unstranded_assessment_range": {"min": 40, "max": 60},
+ "valid_dominant_strandedness_assessments": [
+ "Sense (% Tags)",
+ "Antisense (% Tags)",
+ ],
+ },
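+    # NOTE: a dominant strandedness of 75-100% is assessed as stranded and
+    # 40-60% as unstranded; values between the two ranges are presumed not
+    # assessable by this check.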
+ "RSEM Counts-check_for_outliers": {
+ "mqc_module": "Rsem",
+ "mqc_plot": "general_stats",
+ "mqc_keys": [
+ "Unalignable",
+ "Alignable",
+ "Filtered",
+ "Total",
+ "alignable_percent",
+ "Unique",
+ "Multi",
+ "Uncertain",
+ ],
+ "thresholds": [
+ {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"},
+ {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},
+ ],
+ },
+}
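+# NOTE: for every *-check_for_outliers entry above, a sample is flagged YELLOW
+# when its MultiQC value deviates more than 2 standard deviations from the
+# dataset median, and RED beyond 4 standard deviations.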
+
+# Manually kept in sync with the component names used in validate() for now
+COMPONENTS_LIST = [
+ "Metadata", # for raw reads V&V
+ "Raw Reads", # for raw reads V&V
+ "Raw Reads By Sample", # for raw reads V&V
+ "Trim Reads", # for trim reads V&V
+ "Trimmed Reads By Sample", # for trim reads V&V
+ "STAR Alignments", # for star alignment V&V
+ "STAR Alignments By Sample", # for star alignment V&V
+ "RSeQC By Sample", # for RSeQC V&V
+ "RSeQC", # for RSeQC V&V
+ "RSEM Counts", # for after RSEM V&V
+ "Unnormalized Gene Counts", # for after RSEM V&V
+ "DGE Metadata", # for post DGE
+ "DGE Metadata ERCC", # for post DGE
+ "DGE Output", # for post DGE
+ "DGE Output ERCC", # for post DGE
+]
+
+
+def validate(
+ dataset: Dataset,
+    config_path: Union[Path, None] = None,
+    run_args: Union[dict, None] = None,
+    report_args: Union[dict, None] = None,
+    protocol_args: Union[dict, None] = None,
+ defer_run: bool = False,
+) -> Union[ValidationProtocol, ValidationProtocol.Report]:
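+    """Assemble the bulkRNASeq validation protocol and, unless deferred, run it.
+
+    Args:
+        dataset: dp_tools Dataset object to validate.
+        config_path: optional YAML file that overrides the module-level CONFIG.
+        run_args: keyword arguments forwarded to ValidationProtocol.run.
+        report_args: keyword arguments forwarded to ValidationProtocol.report.
+        protocol_args: keyword arguments forwarded to the ValidationProtocol
+            constructor; a 'run_components' entry is translated into
+            'skip_components' using COMPONENTS_LIST.
+        defer_run: if True, return the assembled protocol without running it.
+
+    Returns:
+        The ValidationProtocol when defer_run is True, otherwise its report.
+    """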
+
+ if config_path is not None:
+ with open(config_path, "r") as f:
+ config = yaml.safe_load(f)
+ else:
+ config = CONFIG
+
+ if run_args is None:
+ run_args = dict()
+
+ if report_args is None:
+ report_args = dict()
+
+ if protocol_args is None:
+ protocol_args = dict()
+
+ # Modify protocol_args to convert run_components to skip_components based on COMPONENTS_LIST
+ if (
+ "run_components" in protocol_args
+ and protocol_args.get("run_components") is not None
+ ):
+ protocol_args["skip_components"] = [
+ c for c in COMPONENTS_LIST if c not in protocol_args["run_components"]
+ ]
+ # Check if any run components are not in COMPONENTS_LIST
+ if set(protocol_args["run_components"]) - set(COMPONENTS_LIST):
+ raise ValueError(
+ f"run_components contains components not in COMPONENTS_LIST. Unique to run_components: {set(protocol_args['run_components']) - set(COMPONENTS_LIST)}. All Components: {COMPONENTS_LIST}"
+ )
+ del protocol_args["run_components"]
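+        # e.g. protocol_args={"run_components": ["Metadata", "Raw Reads"]} runs
+        # only those two components, by skipping every other COMPONENTS_LIST entry.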
+
+ # init validation protocol
+ vp = ValidationProtocol(**protocol_args)
+ with vp.component_start(
+ name=dataset.name,
+ description="Validate processing from trim reads through differential gene expression output",
+ ):
+
+ with vp.component_start(
+ name="Metadata", description="Metadata file validation"
+ ):
+ with vp.payload(payloads=[{"dataset": dataset}]):
+ vp.add(
+ check_metadata_attributes_exist,
+ config=config["Metadata-check_metadata_attributes_exist"],
+ )
+
+ with vp.component_start(
+ name="Raw Reads", description="Raw Reads Outliers Detection"
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["raw reads fastQC ZIP"],
+ }
+ ]
+ if not dataset.metadata["paired_end"]
+ else [
+ {
+ "dataset": dataset,
+ "data_asset_keys": [
+ "raw forward reads fastQC ZIP",
+ ],
+ },
+ {
+ "dataset": dataset,
+ "data_asset_keys": [
+ "raw reverse reads fastQC ZIP",
+ ],
+ },
+ ]
+ ):
+ vp.add(
+ check_for_outliers, config=config["Raw Reads-check_for_outliers"]
+ )
+
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "raw MultiQC directory"
+ ].path,
+ "name_reformat_func": lambda: lambda s: re.sub(
+ "_raw|_R1_raw|_R2_raw$", "", s
+ ),
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check all samples are present in raw reads multiQC report",
+ )
+
+ with vp.component_start(
+ name="Trim Reads", description="Trimmed Reads Outliers Detection"
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["trimmed reads fastQC ZIP"],
+ }
+ ]
+ if not dataset.metadata["paired_end"]
+ else [
+ {
+ "dataset": dataset,
+ "data_asset_keys": [
+ "trimmed forward reads fastQC ZIP",
+ ],
+ },
+ {
+ "dataset": dataset,
+ "data_asset_keys": [
+ "trimmed reverse reads fastQC ZIP",
+ ],
+ },
+ ]
+ ):
+ vp.add(
+ check_for_outliers, config=config["Trim Reads-check_for_outliers"]
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "trimmed fastQC MultiQC directory"
+ ].path,
+ "name_reformat_func": lambda: lambda s: re.sub(
+ "_R1|_R2$", "", s
+ ),
+ },
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "trimming MultiQC directory"
+ ].path,
+ "name_reformat_func": lambda: lambda s: re.sub(
+ "_raw|_R1_raw|_R2_raw$", "", s
+ ),
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check that all samples are present in the trimmed FastQC and trimming report multiQC reports",
+ )
+ with vp.component_start(
+ name="STAR Alignments",
+ description="Dataset wide checks including outliers detection",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["aligned log Final"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers,
+ config=config["STAR Alignments-check_for_outliers"],
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "aligned MultiQC directory"
+ ].path,
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check all samples are present in STAR multiQC report",
+ )
+
+ with vp.component_start(
+ name="RSeQC",
+ description="RSeQC submodule outliers checking and other submodule specific dataset wide checks",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["genebody coverage out"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers,
+ description="Check for outliers in geneBody Coverage",
+ config=config["RSeQC-check_for_outliers-geneBody_coverage"],
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["infer experiment out"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers,
+ description="Check for outliers in infer experiment",
+ config=config["RSeQC-check_for_outliers-infer_experiment"],
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["inner distance out"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers,
+ description="Check for outliers in inner distance",
+ config=config["RSeQC-check_for_outliers-inner_distance"],
+ skip=(not dataset.metadata["paired_end"]),
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["read distribution out"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers,
+ description="Check for outliers in read distribution",
+ config=config["RSeQC-check_for_outliers-read_distribution"],
+ )
+
+ with vp.payload(payloads=[{"dataset": dataset}]):
+ vp.add(
+ check_strandedness_assessable_from_infer_experiment,
+ config=config[
+ "RSeQC-check_strandedness_assessable_from_infer_experiment"
+ ],
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "genebody coverage MultiQC directory"
+ ].path,
+ },
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "infer experiment MultiQC directory"
+ ].path,
+ "name_reformat_func": lambda: lambda s: re.sub(
+ "_infer_expt$", "", s
+ ),
+ },
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "read distribution MultiQC directory"
+ ].path,
+ "name_reformat_func": lambda: lambda s: re.sub(
+ "_read_dist$", "", s
+ ),
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check all samples are present in RSeQC multiQC reports",
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "inner distance MultiQC directory"
+ ].path,
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check all samples are present in RSeQC inner distance multiQC report (paired end only)",
+ skip=(not dataset.metadata["paired_end"]),
+ )
+ with vp.component_start(
+ name="RSEM Counts",
+ description="Dataset wide checks including outliers detection",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "dataset": dataset,
+ "data_asset_keys": ["sample counts stats directory"],
+ }
+ ]
+ ):
+ vp.add(
+ check_for_outliers, config=config["RSEM Counts-check_for_outliers"]
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "samples": list(dataset.samples),
+ "multiqc_report_path": lambda: dataset.data_assets[
+ "RSEM counts MultiQC directory"
+ ].path,
+ },
+ ]
+ ):
+ vp.add(
+ check_sample_in_multiqc_report,
+ description="Check all samples are present in RSEM multiQC report",
+ )
+ with vp.component_start(
+ name="Unnormalized Gene Counts",
+ description="Validate normalization related output",
+ ):
+
+ with vp.payload(
+ payloads=[
+ {
+ "unnormalizedCountTable": lambda: dataset.data_assets[
+ "star unnormalized counts table"
+ ].path,
+ "samplewise_tables": lambda: {
+ s.name: s.data_assets["sample reads per gene table"].path
+ for s in dataset.samples.values()
+ },
+ },
+ ]
+ ):
+ vp.add(
+ check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "unnormalizedCountTable": lambda: dataset.data_assets[
+ "rsem unnormalized counts table"
+ ].path,
+ "samplewise_tables": lambda: {
+ s.name: s.data_assets["sample gene counts table"].path
+ for s in dataset.samples.values()
+ },
+ },
+ ]
+ ):
+ vp.add(
+ check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables
+ )
+ vp.add(
+ check_ERCC_subgroup_representation,
+ skip=(not dataset.metadata["has_ERCC"]),
+ )
+
+ with vp.component_start(
+ name="DGE Metadata",
+ description="",
+ ):
+
+ with vp.component_start(
+ name="Sample Table",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ "sampleTable": lambda: dataset.data_assets[
+ "sample table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(
+ check_sample_table_against_runsheet,
+ config={"all_samples_required": True},
+ )
+ vp.add(check_sample_table_for_correct_group_assignments)
+
+ with vp.component_start(
+ name="Contrasts Tables",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ "contrasts_table": lambda: dataset.data_assets[
+ "DESeq2 contrasts table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(check_contrasts_table_headers)
+ vp.add(check_contrasts_table_rows)
+
+ with vp.component_start(
+ name="DGE Metadata ERCC",
+ description="",
+ skip=(not dataset.metadata["has_ERCC"]),
+ ):
+
+ with vp.component_start(
+ name="Sample Table",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ "sampleTable": lambda: dataset.data_assets[
+ "ERCC sample table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(
+ check_sample_table_against_runsheet,
+ config={"all_samples_required": False},
+ )
+ vp.add(check_sample_table_for_correct_group_assignments)
+
+ with vp.component_start(
+ name="Contrasts Tables",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ "contrasts_table": lambda: dataset.data_assets[
+ "ERCC normalized DESeq2 contrasts table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(check_contrasts_table_headers)
+ vp.add(check_contrasts_table_rows)
+
+ with vp.component_start(
+ name="DGE Output",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "rsem_table_path": lambda: dataset.data_assets[
+ "rsem unnormalized counts table"
+ ].path,
+ "deseq2_table_path": lambda: dataset.data_assets[
+ "DESeq2 unnormalized counts table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(
+ check_rsem_counts_and_unnormalized_tables_parity,
+ skip=(
+ "rsem unnormalized counts table" not in dataset.data_assets
+ or "DESeq2 unnormalized counts table" not in dataset.data_assets
+ ),
+ )
+
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(dataset.samples),
+ "dge_table": lambda: dataset.data_assets[
+ "DESeq2 annotated DGE table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(check_dge_table_annotation_columns_exist)
+ vp.add(check_dge_table_sample_columns_exist)
+ vp.add(check_dge_table_sample_columns_constraints)
+ vp.add(check_dge_table_group_columns_exist)
+ vp.add(check_dge_table_group_columns_constraints)
+ vp.add(check_dge_table_comparison_statistical_columns_exist)
+ vp.add(check_dge_table_group_statistical_columns_constraints)
+ vp.add(check_dge_table_fixed_statistical_columns_exist)
+ vp.add(check_dge_table_fixed_statistical_columns_constraints)
+ vp.add(check_dge_table_log2fc_within_reason)
+
+ with vp.component_start(
+ name="Viz Tables",
+ description="Extended from the dge tables",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(dataset.samples),
+ "dge_table": lambda: dataset.data_assets[
+ "DESeq2 annotated DGE extended for viz table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(check_dge_table_annotation_columns_exist)
+ vp.add(check_dge_table_sample_columns_exist)
+ vp.add(check_dge_table_sample_columns_constraints)
+ vp.add(check_dge_table_group_columns_exist)
+ vp.add(check_dge_table_group_columns_constraints)
+ vp.add(check_dge_table_comparison_statistical_columns_exist)
+ vp.add(check_dge_table_group_statistical_columns_constraints)
+ vp.add(check_dge_table_fixed_statistical_columns_exist)
+ vp.add(check_dge_table_fixed_statistical_columns_constraints)
+ vp.add(check_dge_table_log2fc_within_reason)
+ vp.add(check_viz_table_columns_exist)
+ vp.add(check_viz_table_columns_constraints)
+
+ with vp.payload(
+ payloads=[
+ {
+ "samples": lambda: set(dataset.samples),
+ "pca_table": lambda: dataset.data_assets[
+ "DESeq2 viz PCA table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(check_viz_pca_table_index_and_columns_exist)
+
+ with vp.component_start(
+ name="DGE Output ERCC",
+ description="",
+ skip=(not dataset.metadata["has_ERCC"]),
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(
+ pd.read_csv(
+ dataset.data_assets["ERCC sample table"].path,
+ index_col=0,
+ ).index
+ ),
+ "dge_table": lambda: dataset.data_assets[
+ "ERCC normalized DESeq2 annotated DGE table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(check_dge_table_annotation_columns_exist)
+ vp.add(check_dge_table_sample_columns_exist)
+ vp.add(check_dge_table_sample_columns_constraints)
+ vp.add(check_dge_table_group_columns_exist)
+ vp.add(check_dge_table_group_columns_constraints)
+ vp.add(check_dge_table_comparison_statistical_columns_exist)
+ vp.add(check_dge_table_group_statistical_columns_constraints)
+ vp.add(check_dge_table_fixed_statistical_columns_exist)
+ vp.add(check_dge_table_fixed_statistical_columns_constraints)
+ vp.add(check_dge_table_log2fc_within_reason)
+
+ with vp.component_start(
+ name="Viz Tables",
+ description="Extended from the dge tables",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(
+ pd.read_csv(
+ dataset.data_assets["ERCC sample table"].path,
+ index_col=0,
+ ).index
+ ),
+ "dge_table": lambda: dataset.data_assets[
+ "ERCC normalized DESeq2 annotated DGE extended for viz table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(check_dge_table_annotation_columns_exist)
+ vp.add(check_dge_table_sample_columns_exist)
+ vp.add(check_dge_table_sample_columns_constraints)
+ vp.add(check_dge_table_group_columns_exist)
+ vp.add(check_dge_table_group_columns_constraints)
+ vp.add(check_dge_table_comparison_statistical_columns_exist)
+ vp.add(check_dge_table_group_statistical_columns_constraints)
+ vp.add(check_dge_table_fixed_statistical_columns_exist)
+ vp.add(check_dge_table_fixed_statistical_columns_constraints)
+ vp.add(check_dge_table_log2fc_within_reason)
+ vp.add(check_viz_table_columns_exist)
+ vp.add(check_viz_table_columns_constraints)
+
+ with vp.payload(
+ payloads=[
+ {
+ "samples": lambda: set(
+ pd.read_csv(
+ dataset.data_assets["ERCC sample table"].path,
+ index_col=0,
+ ).index
+ ),
+ "pca_table": lambda: dataset.data_assets[
+ "ERCC normalized DESeq2 viz PCA table"
+ ].path,
+ }
+ ]
+ ):
+ vp.add(check_viz_pca_table_index_and_columns_exist)
+
+ for sample in dataset.samples.values():
+ with vp.component_start(
+                name=sample.name, description="Sample-level checks"
+ ):
+ with vp.component_start(
+ name="Raw Reads By Sample", description="Raw reads"
+ ):
+ with vp.payload(
+ payloads=(
+ [
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "raw forward reads fastq GZ"
+ ].path
+ },
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "raw reverse reads fastq GZ"
+ ].path
+ },
+ ]
+ if dataset.metadata["paired_end"]
+ else [
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "raw reads fastq GZ"
+ ].path
+ },
+ ]
+ )
+ ):
+ vp.add(
+ check_fastqgz_file_contents,
+ config=config[
+ "Raw Reads By Sample-check_fastqgz_file_contents"
+ ],
+ )
+ vp.add(
+ check_gzip_file_integrity,
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "sample": sample,
+ "reads_key_1": "raw forward reads fastQC ZIP",
+ "reads_key_2": "raw reverse reads fastQC ZIP",
+ },
+ ],
+ ):
+ vp.add(
+ check_forward_and_reverse_reads_counts_match,
+ skip=(not dataset.metadata["paired_end"]),
+ )
+ with vp.component_start(
+ name="Trimmed Reads By Sample", description="Trimmed reads"
+ ):
+ with vp.payload(
+ payloads=(
+ [
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "trimmed forward reads fastq GZ"
+ ].path
+ },
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "trimmed reverse reads fastq GZ"
+ ].path
+ },
+ ]
+ if dataset.metadata["paired_end"]
+ else [
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "trimmed reads fastq GZ"
+ ].path
+ }
+ ]
+ )
+ ):
+ vp.add(check_file_exists, description="Check reads files exist")
+ vp.add(
+ check_fastqgz_file_contents,
+ config=config[
+ "Trim Reads By Sample-check_fastqgz_file_contents"
+ ],
+ )
+
+ with vp.payload(
+ payloads=[
+ {
+ "sample": sample,
+ "reads_key_1": "trimmed forward reads fastQC ZIP",
+ "reads_key_2": "trimmed reverse reads fastQC ZIP",
+ },
+ ],
+ ):
+ vp.add(
+ check_forward_and_reverse_reads_counts_match,
+ skip=(not dataset.metadata["paired_end"]),
+ )
+
+ with vp.component_start(
+ name="STAR Alignments By Sample",
+ description="STAR Alignment outputs",
+ ):
+
+ with vp.payload(
+ payloads=[
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "aligned ToTranscriptome Bam"
+ ].path,
+ },
+ {
+ "file": lambda sample=sample: sample.data_assets[
+ "aligned SortedByCoord Bam"
+ ].path,
+ },
+ ]
+ ):
+ vp.add(
+ check_bam_file_integrity,
+ config={
+ "samtools_bin": "samtools"
+                        },  # assumes samtools is already available on PATH
+ )
+
+ with vp.payload(
+ payloads=[
+ {
+ "multiqc_inputs": lambda sample=sample: [
+ sample.data_assets["aligned log Final"].path
+ ],
+ },
+ ]
+ ):
+ vp.add(
+ check_thresholds,
+ config=config[
+ "STAR Alignments By Sample-check_thresholds-Mapped"
+ ],
+ description="Check that mapping rates are reasonable, specifically most reads map to the target genome",
+ )
+ vp.add(
+ check_thresholds,
+ config=config[
+ "STAR Alignments By Sample-check_thresholds-MultiMapped"
+ ],
+ description="Check that mapping rates are reasonable, specifically that a considerable amount of reads multimap to the target genome",
+ )
+
+ with vp.component_start(
+ name="RSeQC By Sample",
+ description="RNASeq QA outputs",
+ ):
+ with vp.component_start(
+ name="geneBody_coverage",
+ description="Assess integrity of transcripts and library prep signatures",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "input_dir": lambda sample=sample: sample.data_assets[
+ "genebody coverage out"
+ ].path
+ },
+ ]
+ ):
+ vp.add(check_genebody_coverage_output)
+ with vp.component_start(
+ name="inner_distance",
+ description="Reports on distance between mate reads based on gene annotations",
+ skip=(not dataset.metadata["paired_end"]),
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "input_dir": lambda sample=sample: sample.data_assets[
+ "inner distance out"
+ ].path
+ },
+ ]
+ ):
+ vp.add(check_inner_distance_output)
+ # return protocol object without running or generating a report
+ if defer_run:
+ return vp
+
+ vp.run(**run_args)
+
+ # return report
+ return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts)
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py
new file mode 100644
index 00000000..f12de761
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py
@@ -0,0 +1,33 @@
+import pandas as pd
+
+class runsheet:  # Lowercase by design: the class serves only as a namespace for static methods
+
+ @staticmethod
+ def check_single_value(column: pd.Series, error_msg: str, errors: list[str]) -> None:
+ if len(column.unique()) != 1:
+ errors.append(error_msg)
+
+ @staticmethod
+ def check_read2_path_populated_if_paired_end(df: pd.DataFrame, errors: list[str]) -> None:
+        # 'read2_path' should be present exactly when the dataset is paired end
+        paired_end = bool(df['paired_end'].iloc[0])
+        if ("read2_path" in df.columns) != paired_end:
+            errors.append("Expected 'read2_path' to be populated only if paired_end is True")
+
+ @staticmethod
+ def validate(df_runsheet: pd.DataFrame) -> bool:
+ errors = []
+
+ # Check for single value in specified columns
+ runsheet.check_single_value(df_runsheet['has_ERCC'], "Dataset level columns do NOT contain one unique value for 'has_ERCC'", errors)
+ runsheet.check_single_value(df_runsheet['organism'], "Dataset level columns do NOT contain one unique value for 'organism'", errors)
+ runsheet.check_single_value(df_runsheet['paired_end'], "Dataset level columns do NOT contain one unique value for 'paired_end'", errors)
+
+ # Check for 'read2_path' population if paired_end is True
+ runsheet.check_read2_path_populated_if_paired_end(df_runsheet, errors)
+
+ if errors:
+ raise ValueError("\n".join(errors))
+ else:
+ return True
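+
+
+# Usage sketch (illustrative; the runsheet path is hypothetical):
+#   df_runsheet = pd.read_csv("runsheet.csv")
+#   runsheet.validate(df_runsheet)  # returns True, or raises ValueError listing every failed check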
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config
deleted file mode 100644
index faca1b05..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config
+++ /dev/null
@@ -1,60 +0,0 @@
-// Config that specifies packaged conda yml files for each process
-
-process {
- withName: 'TO_PRED|TO_BED' {
- conda = "${projectDir}/envs/ucsc_gtf_Pred_BED.yml"
- }
-
- withName: 'INFER_EXPERIMENT|SORT_INDEX_BAM|GENEBODY_COVERAGE|INNER_DISTANCE|READ_DISTRIBUTION' {
- conda = "${projectDir}/envs/samtools_rseqc.yml"
- }
-
- withName: 'DGE_BY_DESEQ2|QUANTIFY_GENES' {
- conda = "${projectDir}/envs/RNAseq_Rtools.yml"
- }
-
- withName: 'FASTQC' {
- conda = "${projectDir}/envs/fastqc.yml"
- }
-
- withName: 'MULTIQC' {
- conda = "${projectDir}/envs/multiqc.yml"
- }
-
- withName: 'TRIMGALORE' {
- conda = "${projectDir}/envs/trim_galore.yml"
- }
-
- withName: 'DOWNLOAD_GENOME_ANNOTATIONS|GENERATE_METASHEET' {
- conda = "${projectDir}/envs/download_tools.yml"
- }
-
- withName: 'RNASEQ_RUNSHEET_FROM_GLDS' {
- conda = "${projectDir}/envs/dp_tools.yml"
- }
-
- withName: 'BUILD_STAR|ALIGN_STAR' {
- conda = "${projectDir}/envs/star.yml"
- }
-
- withName: 'BUILD_RSEM|COUNT_ALIGNED' {
- conda = "${projectDir}/envs/rsem.yml"
- }
-
- withName: 'SUBSAMPLE_GENOME' {
- conda = "${projectDir}/envs/samtools.yml"
- }
-
- withName: 'POST_PROCESSING|SOFTWARE_VERSIONS' {
- conda = "${projectDir}/envs/genelab_utils.yml"
- }
-
- withLabel: 'VV' {
- conda = "${projectDir}/envs/dp_tools.yml"
- }
-
- withName: 'GET_MAX_READ_LENGTH|ASSESS_STRANDEDNESS' {
- conda = "${projectDir}/envs/python.yml"
- }
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config
index 0166b683..34cc3370 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config
@@ -45,12 +45,12 @@ process {
container = "quay.io/biocontainers/rsem:1.3.1--pl526r341h4f16992_0"
}
- withName: 'RNASEQ_RUNSHEET_FROM_GLDS|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|SOFTWARE_VERSIONS' {
- container = "quay.io/j_81/dp_tools:1.1.8"
+ withName: 'RUNSHEET_FROM_GLDS|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|SOFTWARE_VERSIONS' {
+ container = "quay.io/j_81/dp_tools:1.3.3"
}
withLabel: 'VV' {
- container = "quay.io/j_81/dp_tools:1.1.8"
+ container = "quay.io/j_81/dp_tools:1.3.3"
}
withName: 'GET_MAX_READ_LENGTH|ASSESS_STRANDEDNESS' {
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config
deleted file mode 100644
index 00976bf2..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config
+++ /dev/null
@@ -1,15 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'GLDS-194' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- /*
- DEBUG parameters, should NOT be overwritten for production processing runs
- */
- genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome)
- truncateTo = 300 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config
deleted file mode 100644
index a4a26252..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config
+++ /dev/null
@@ -1,11 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'GLDS-207' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- truncateTo = 100 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config
deleted file mode 100644
index 0085e1f9..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config
+++ /dev/null
@@ -1,12 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'GLDS-251' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- genomeSubsample = 5 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome)
- truncateTo = 300 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config
deleted file mode 100644
index 049b4527..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config
+++ /dev/null
@@ -1,15 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'GLDS-48' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- /*
- DEBUG parameters, should NOT be overwritten for production processing runs
- */
- genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome)
- truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config
deleted file mode 100644
index a667c13e..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config
+++ /dev/null
@@ -1,15 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'GLDS-91' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- /*
- DEBUG parameters, should NOT be overwritten for production processing runs
- */
- genomeSubsample = 21 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome)
- truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config
deleted file mode 100644
index 5e72e005..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config
+++ /dev/null
@@ -1,20 +0,0 @@
-// Should be loaded by every RNASeq process.
-params {
- /*
- Parameters that MUST be supplied
- */
- gldsAccession = 'CustomAnalysis' // GeneLab Data Accession Number, e.g. GLDS-104
- use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail
-
- /*
- Parameters that CAN be overwritten
- */
- runsheetPath = "${projectDir}/test_assets/CustomAnalysis_bulkRNASeq_v1_runsheet.csv"
-
- /*
- DEBUG parameters, should NOT be overwritten for production processing runs
- */
- genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome)
- truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file.
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml
deleted file mode 100644
index 5b4107da..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: AST
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - python=3.8
- - pandas=1.2
- - isatools
- - peppy
- - pip
- - pip:
- - git+https://github.com/J-81/Analysis_Staging.git@0.4.0-beta.7
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml
deleted file mode 100644
index dcdf990a..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml
+++ /dev/null
@@ -1,378 +0,0 @@
-name: RNAseq_Rtools
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - _libgcc_mutex=0.1=conda_forge
- - _openmp_mutex=4.5=1_gnu
- - _r-mutex=1.0.1=anacondar_1
- - binutils_impl_linux-64=2.36.1=h193b22a_2
- - binutils_linux-64=2.36=hf3e587d_6
- - bioconductor-affy=1.72.0=r41h5c21468_1
- - bioconductor-affyio=1.64.0=r41h5c21468_1
- - bioconductor-annotate=1.72.0=r41hdfd78af_0
- - bioconductor-annotationdbi=1.56.1=r41hdfd78af_0
- - bioconductor-annotationhub=3.2.0=r41hdfd78af_0
- - bioconductor-biobase=2.54.0=r41h5c21468_1
- - bioconductor-biocfilecache=2.2.0=r41hdfd78af_0
- - bioconductor-biocgenerics=0.40.0=r41hdfd78af_0
- - bioconductor-biocparallel=1.28.3=r41h619a076_0
- - bioconductor-biocversion=3.14.0=r41hdfd78af_0
- - bioconductor-biocviews=1.62.0=r41hdfd78af_0
- - bioconductor-biostrings=2.62.0=r41h5c21468_1
- - bioconductor-complexheatmap=2.10.0=r41hdfd78af_0
- - bioconductor-delayedarray=0.20.0=r41h5c21468_1
- - bioconductor-deseq2=1.34.0=r41h619a076_1
- - bioconductor-enhancedvolcano=1.12.0=r41hdfd78af_0
- - bioconductor-genefilter=1.76.0=r41ha086028_1
- - bioconductor-geneplotter=1.72.0=r41hdfd78af_0
- - bioconductor-genomeinfodb=1.30.0=r41hdfd78af_0
- - bioconductor-genomeinfodbdata=1.2.7=r41hdfd78af_1
- - bioconductor-genomicranges=1.46.1=r41h5c21468_0
- - bioconductor-graph=1.72.0=r41h5c21468_1
- - bioconductor-impute=1.68.0=r41h77f299f_1
- - bioconductor-interactivedisplaybase=1.32.0=r41hdfd78af_0
- - bioconductor-iranges=2.28.0=r41h5c21468_1
- - bioconductor-keggrest=1.34.0=r41hdfd78af_0
- - bioconductor-limma=3.50.1=r41h5c21468_0
- - bioconductor-massspecwavelet=1.60.0=r41h5c21468_1
- - bioconductor-matrixgenerics=1.6.0=r41hdfd78af_0
- - bioconductor-mscoreutils=1.6.1=r41h619a076_0
- - bioconductor-msfeatures=1.2.0=r41hdfd78af_0
- - bioconductor-msnbase=2.20.4=r41h619a076_0
- - bioconductor-mzid=1.32.0=r41hdfd78af_0
- - bioconductor-mzr=2.28.0=r41h619a076_1
- - bioconductor-org.at.tair.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.ce.eg.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.dm.eg.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.eck12.eg.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.hs.eg.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.mm.eg.db=3.14.0=r41hdfd78af_0
- - bioconductor-org.sc.sgd.db=3.14.0=r41hdfd78af_0
- - bioconductor-panther.db=1.0.11=r41hdfd78af_1
- - bioconductor-pcamethods=1.86.0=r41h619a076_1
- - bioconductor-preprocesscore=1.56.0=r41h5c21468_1
- - bioconductor-protgenerics=1.26.0=r41hdfd78af_0
- - bioconductor-rbgl=1.70.0=r41h619a076_1
- - bioconductor-rhdf5lib=1.16.0=r41h5c21468_1
- - bioconductor-risa=1.36.0=r41h619a076_1
- - bioconductor-s4vectors=0.32.3=r41h5c21468_0
- - bioconductor-stringdb=2.6.0=r41hdfd78af_0
- - bioconductor-summarizedexperiment=1.24.0=r41hdfd78af_0
- - bioconductor-tximport=1.22.0=r41hdfd78af_0
- - bioconductor-vsn=3.62.0=r41h5c21468_1
- - bioconductor-xcms=3.16.1=r41h619a076_0
- - bioconductor-xvector=0.34.0=r41h5c21468_1
- - bioconductor-zlibbioc=1.40.0=r41h5c21468_1
- - bwidget=1.9.14=ha770c72_1
- - bzip2=1.0.8=h7f98852_4
- - c-ares=1.18.1=h7f98852_0
- - ca-certificates=2022.5.18.1=ha878542_0
- - cairo=1.16.0=ha00ac49_1009
- - curl=7.81.0=h494985f_0
- - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- - font-ttf-inconsolata=3.000=h77eed37_0
- - font-ttf-source-code-pro=2.038=h77eed37_0
- - font-ttf-ubuntu=0.83=hab24e00_0
- - fontconfig=2.13.96=ha180cfb_0
- - fonts-conda-ecosystem=1=0
- - fonts-conda-forge=1=0
- - freetype=2.10.4=h0708190_1
- - fribidi=1.0.10=h36c2ea0_0
- - gcc_impl_linux-64=9.4.0=h03d3576_13
- - gcc_linux-64=9.4.0=h391b98a_6
- - gettext=0.19.8.1=h73d1719_1008
- - gfortran_impl_linux-64=9.4.0=h0003116_13
- - gfortran_linux-64=9.4.0=hf0ab688_6
- - gmp=6.2.1=h58526e2_0
- - graphite2=1.3.13=h58526e2_1001
- - gsl=2.7=he838d99_0
- - gxx_impl_linux-64=9.4.0=h03d3576_13
- - gxx_linux-64=9.4.0=h0316aca_6
- - harfbuzz=3.4.0=hb4a5f5f_0
- - hdf4=4.2.15=h10796ff_3
- - hdf5=1.12.1=nompi_h7f166f4_103
- - icu=69.1=h9c3ff4c_0
- - jbig=2.1=h7f98852_2003
- - jpeg=9e=h7f98852_0
- - kernel-headers_linux-64=2.6.32=he073ed8_15
- - keyutils=1.6.1=h166bdaf_0
- - krb5=1.19.2=h08a2579_4
- - ld_impl_linux-64=2.36.1=hea4e1c9_2
- - lerc=3.0=h9c3ff4c_0
- - libblas=3.9.0=13_linux64_openblas
- - libcblas=3.9.0=13_linux64_openblas
- - libcurl=7.81.0=h494985f_0
- - libdeflate=1.10=h7f98852_0
- - libedit=3.1.20191231=he28a2e2_2
- - libev=4.33=h516909a_1
- - libffi=3.4.2=h7f98852_5
- - libgcc-devel_linux-64=9.4.0=hd854feb_13
- - libgcc-ng=11.2.0=h1d223b6_13
- - libgfortran-ng=11.2.0=h69a702a_13
- - libgfortran5=11.2.0=h5c6108e_13
- - libglib=2.70.2=h174f98d_4
- - libgomp=11.2.0=h1d223b6_13
- - libiconv=1.16=h516909a_0
- - liblapack=3.9.0=13_linux64_openblas
- - libnetcdf=4.8.1=nompi_hb3fd0d9_101
- - libnghttp2=1.47.0=he49606f_0
- - libopenblas=0.3.18=pthreads_h8fe5266_0
- - libpng=1.6.37=h21135ba_2
- - libsanitizer=9.4.0=h79bfe98_13
- - libssh2=1.10.0=ha35d2d1_2
- - libstdcxx-devel_linux-64=9.4.0=hd854feb_13
- - libstdcxx-ng=11.2.0=he4da1e4_13
- - libtiff=4.3.0=h542a066_3
- - libuuid=2.32.1=h7f98852_1000
- - libwebp-base=1.2.2=h7f98852_1
- - libxcb=1.13=h7f98852_1004
- - libxml2=2.9.12=h885dcf4_1
- - libzip=1.8.0=h1c5bbd1_1
- - libzlib=1.2.11=h36c2ea0_1013
- - lz4-c=1.9.3=h9c3ff4c_1
- - make=4.3=hd18ef5c_1
- - ncurses=6.2=h58526e2_4
- - openssl=3.0.3=h166bdaf_0
- - pandoc=2.17.1.1=ha770c72_0
- - pango=1.50.3=h9967ed3_0
- - pcre=8.45=h9c3ff4c_0
- - pcre2=10.37=h032f7d1_0
- - pixman=0.40.0=h36c2ea0_0
- - proj=8.2.1=h277dcde_0
- - pthread-stubs=0.4=h36c2ea0_1001
- - r-ash=1.0_15=r41h859d828_1006
- - r-askpass=1.1=r41hcfec24a_2
- - r-assertthat=0.2.1=r41hc72bb7e_2
- - r-backports=1.4.1=r41hcfec24a_0
- - r-base=4.1.2=h2553ce4_1
- - r-base64enc=0.1_3=r41hcfec24a_1004
- - r-beeswarm=0.4.0=r41hcfec24a_1
- - r-bh=1.78.0_0=r41hc72bb7e_0
- - r-biocmanager=1.30.16=r41hc72bb7e_0
- - r-bit=4.0.4=r41hcfec24a_0
- - r-bit64=4.0.5=r41hcfec24a_0
- - r-bitops=1.0_7=r41hcfec24a_0
- - r-blob=1.2.2=r41hc72bb7e_0
- - r-brew=1.0_7=r41hc72bb7e_0
- - r-brio=1.1.3=r41hcfec24a_0
- - r-broom=0.7.12=r41hc72bb7e_0
- - r-bslib=0.3.1=r41hc72bb7e_0
- - r-cachem=1.0.6=r41hcfec24a_0
- - r-cairo=1.5_14=r41hcfec24a_0
- - r-callr=3.7.0=r41hc72bb7e_0
- - r-catools=1.18.2=r41h03ef668_0
- - r-cellranger=1.1.0=r41hc72bb7e_1004
- - r-chron=2.3_56=r41hcfec24a_0
- - r-circlize=0.4.14=r41hc72bb7e_0
- - r-cli=3.3.0=r41h7525677_0
- - r-clipr=0.8.0=r41hc72bb7e_0
- - r-clue=0.3_60=r41hcfec24a_0
- - r-cluster=2.1.2=r41h859d828_0
- - r-codetools=0.2_18=r41hc72bb7e_0
- - r-collections=0.3.5=r41hcfec24a_0
- - r-colorspace=2.0_3=r41h06615bd_0
- - r-commonmark=1.7=r41hcfec24a_1002
- - r-cpp11=0.4.2=r41hc72bb7e_0
- - r-crayon=1.5.0=r41hc72bb7e_0
- - r-crosstalk=1.2.0=r41hc72bb7e_0
- - r-curl=4.3.2=r41hcfec24a_0
- - r-cyclocomp=1.1.0=r41hc72bb7e_1004
- - r-data.table=1.14.2=r41hcfec24a_0
- - r-dbi=1.1.2=r41hc72bb7e_0
- - r-dbplyr=2.1.1=r41hc72bb7e_0
- - r-deoptimr=1.0_10=r41hc72bb7e_0
- - r-desc=1.4.0=r41hc72bb7e_0
- - r-diffobj=0.3.5=r41hcfec24a_0
- - r-digest=0.6.29=r41h03ef668_0
- - r-doparallel=1.0.17=r41hc72bb7e_0
- - r-dplyr=1.0.7=r41h03ef668_0
- - r-dt=0.21=r41hc72bb7e_0
- - r-dtplyr=1.2.1=r41hc72bb7e_0
- - r-ellipsis=0.3.2=r41hcfec24a_0
- - r-evaluate=0.15=r41hc72bb7e_0
- - r-extrafont=0.17=r41ha770c72_1002
- - r-extrafontdb=1.0=r41hc72bb7e_1003
- - r-fansi=1.0.2=r41hcfec24a_0
- - r-farver=2.1.0=r41h03ef668_0
- - r-fastmap=1.1.0=r41h03ef668_0
- - r-filelock=1.0.2=r41hcfec24a_1002
- - r-fontawesome=0.2.2=r41hc72bb7e_0
- - r-forcats=0.5.1=r41hc72bb7e_0
- - r-foreach=1.5.2=r41hc72bb7e_0
- - r-formatr=1.11=r41hc72bb7e_0
- - r-fs=1.5.2=r41h03ef668_0
- - r-futile.logger=1.4.3=r41hc72bb7e_1003
- - r-futile.options=1.0.1=r41hc72bb7e_1002
- - r-gargle=1.2.0=r41hc72bb7e_0
- - r-generics=0.1.2=r41hc72bb7e_0
- - r-getopt=1.20.3=r41ha770c72_2
- - r-getoptlong=1.0.5=r41hc72bb7e_0
- - r-ggalt=0.4.0=r41ha770c72_2
- - r-ggbeeswarm=0.6.0=r41ha770c72_1003
- - r-ggdendro=0.1.23=r41hc72bb7e_0
- - r-ggfortify=0.4.14=r41hc72bb7e_0
- - r-ggplot2=3.3.5=r41hc72bb7e_0
- - r-ggrastr=1.0.1=r41hc72bb7e_0
- - r-ggrepel=0.9.1=r41h03ef668_0
- - r-globaloptions=0.1.2=r41ha770c72_0
- - r-glue=1.6.2=r41h06615bd_0
- - r-googledrive=2.0.0=r41hc72bb7e_0
- - r-googlesheets4=1.0.0=r41h785f33e_0
- - r-gplots=3.1.1=r41hc72bb7e_0
- - r-gridextra=2.3=r41hc72bb7e_1003
- - r-gsubfn=0.7=r41hc72bb7e_1002
- - r-gtable=0.3.0=r41hc72bb7e_3
- - r-gtools=3.9.2=r41hcfec24a_0
- - r-hash=3.0.1=r41hc72bb7e_2
- - r-haven=2.4.3=r41h2713e49_0
- - r-here=1.0.1=r41hc72bb7e_0
- - r-hexbin=1.28.2=r41h8da6f51_0
- - r-highr=0.9=r41hc72bb7e_0
- - r-hms=1.1.1=r41hc72bb7e_0
- - r-htmltools=0.5.2=r41h03ef668_0
- - r-htmlwidgets=1.5.4=r41hc72bb7e_0
- - r-httpuv=1.6.5=r41h03ef668_0
- - r-httr=1.4.2=r41hc72bb7e_0
- - r-ids=1.0.1=r41hc72bb7e_1
- - r-igraph=1.2.11=r41he0372cf_0
- - r-isoband=0.2.5=r41h03ef668_0
- - r-iterators=1.0.14=r41hc72bb7e_0
- - r-jquerylib=0.1.4=r41hc72bb7e_0
- - r-jsonlite=1.8.0=r41h06615bd_0
- - r-kernsmooth=2.23_20=r41h742201e_0
- - r-knitr=1.37=r41hc72bb7e_1
- - r-labeling=0.4.2=r41hc72bb7e_1
- - r-lambda.r=1.2.4=r41hc72bb7e_1
- - r-languageserver=0.3.12=r41h06615bd_0
- - r-later=1.2.0=r41h03ef668_0
- - r-lattice=0.20_45=r41hcfec24a_0
- - r-lazyeval=0.2.2=r41hcfec24a_2
- - r-lifecycle=1.0.1=r41hc72bb7e_0
- - r-lintr=3.0.0=r41hc72bb7e_0
- - r-locfit=1.5_9.5=r41h06615bd_0
- - r-lubridate=1.8.0=r41h03ef668_0
- - r-magrittr=2.0.2=r41hcfec24a_0
- - r-maldiquant=1.21=r41h7f98852_0
- - r-maps=3.4.0=r41hcfec24a_0
- - r-mass=7.3_55=r41hcfec24a_0
- - r-matrix=1.4_0=r41he454529_0
- - r-matrixstats=0.61.0=r41hcfec24a_0
- - r-memoise=2.0.1=r41hc72bb7e_0
- - r-mgcv=1.8_39=r41h0154571_0
- - r-mime=0.12=r41hcfec24a_0
- - r-modelr=0.1.8=r41hc72bb7e_0
- - r-munsell=0.5.0=r41hc72bb7e_1004
- - r-ncdf4=1.19=r41h186726c_0
- - r-nlme=3.1_155=r41h859d828_0
- - r-openssl=2.0.0=r41h1f3e0c5_0
- - r-optparse=1.7.1=r41hc72bb7e_0
- - r-pillar=1.7.0=r41hc72bb7e_0
- - r-pkgconfig=2.0.3=r41hc72bb7e_1
- - r-pkgload=1.2.4=r41h03ef668_0
- - r-plogr=0.2.0=r41hc72bb7e_1003
- - r-plotly=4.10.0=r41hc72bb7e_0
- - r-plotrix=3.8_2=r41hc72bb7e_0
- - r-plyr=1.8.6=r41h03ef668_1
- - r-png=0.1_7=r41hcfec24a_1004
- - r-praise=1.0.0=r41hc72bb7e_1005
- - r-prettyunits=1.1.1=r41hc72bb7e_1
- - r-processx=3.5.2=r41hcfec24a_0
- - r-progress=1.2.2=r41hc72bb7e_2
- - r-proj4=1.0_11=r41h0ae476a_0
- - r-promises=1.2.0.1=r41h03ef668_0
- - r-proto=1.0.0=r41ha770c72_2003
- - r-ps=1.6.0=r41hcfec24a_0
- - r-purrr=0.3.4=r41hcfec24a_1
- - r-r.cache=0.15.0=r41hc72bb7e_0
- - r-r.methodss3=1.8.2=r41hc72bb7e_0
- - r-r.oo=1.25.0=r41hc72bb7e_0
- - r-r.utils=2.11.0=r41hc72bb7e_0
- - r-r6=2.5.1=r41hc72bb7e_0
- - r-ragg=1.2.2=r41hc1f6985_0
- - r-rann=2.6.1=r41h03ef668_2
- - r-rappdirs=0.3.3=r41hcfec24a_0
- - r-rcolorbrewer=1.1_2=r41h785f33e_1003
- - r-rcpp=1.0.8=r41h03ef668_0
- - r-rcpparmadillo=0.10.8.1.0=r41h306847c_0
- - r-rcurl=1.98_1.6=r41hcfec24a_0
- - r-readr=2.1.2=r41h03ef668_0
- - r-readxl=1.3.1=r41h2713e49_4
- - r-rematch=1.0.1=r41hc72bb7e_1004
- - r-rematch2=2.1.2=r41hc72bb7e_1
- - r-remotes=2.4.2=r41hc72bb7e_0
- - r-repr=1.1.4=r41h785f33e_0
- - r-reprex=2.0.1=r41hc72bb7e_0
- - r-reshape2=1.4.4=r41h03ef668_1
- - r-rex=1.2.1=r41hc72bb7e_0
- - r-rjson=0.2.21=r41h03ef668_0
- - r-rlang=0.4.12=r41hcfec24a_0
- - r-rmarkdown=2.12=r41hc72bb7e_0
- - r-robustbase=0.93_9=r41h52d45c5_0
- - r-roxygen2=7.2.0=r41h7525677_0
- - r-rprojroot=2.0.2=r41hc72bb7e_0
- - r-rsqlite=2.2.8=r41h03ef668_0
- - r-rstudioapi=0.13=r41hc72bb7e_0
- - r-rttf2pt1=1.3.10=r41hcfec24a_0
- - r-runit=0.4.32=r41hc72bb7e_1002
- - r-rvest=1.0.2=r41hc72bb7e_0
- - r-sass=0.4.0=r41h03ef668_0
- - r-scales=1.1.1=r41hc72bb7e_0
- - r-selectr=0.4_2=r41hc72bb7e_1
- - r-shape=1.4.6=r41ha770c72_0
- - r-shiny=1.7.1=r41h785f33e_0
- - r-snow=0.4_4=r41hc72bb7e_0
- - r-sourcetools=0.1.7=r41h03ef668_1002
- - r-sqldf=0.4_11=r41hc72bb7e_2
- - r-stringdist=0.9.8=r41hcfec24a_1
- - r-stringi=1.7.6=r41h337692f_1
- - r-stringr=1.4.0=r41hc72bb7e_2
- - r-styler=1.7.0=r41hc72bb7e_0
- - r-survival=3.3_1=r41h06615bd_0
- - r-sys=3.4=r41hcfec24a_0
- - r-systemfonts=1.0.4=r41hef9c87a_0
- - r-testthat=3.1.2=r41h03ef668_0
- - r-textshaping=0.3.6=r41hcb6d10c_0
- - r-tibble=3.1.6=r41hcfec24a_0
- - r-tidyr=1.2.0=r41h03ef668_0
- - r-tidyselect=1.1.1=r41hc72bb7e_0
- - r-tidyverse=1.3.1=r41hc72bb7e_0
- - r-tinytex=0.37=r41hc72bb7e_0
- - r-tzdb=0.2.0=r41h03ef668_0
- - r-utf8=1.2.2=r41hcfec24a_0
- - r-uuid=1.0_3=r41hcfec24a_0
- - r-vctrs=0.3.8=r41hcfec24a_1
- - r-vipor=0.4.5=r41hc72bb7e_1003
- - r-viridislite=0.4.0=r41hc72bb7e_0
- - r-vroom=1.5.7=r41h03ef668_0
- - r-waldo=0.3.1=r41hc72bb7e_0
- - r-waveslim=1.8.2=r41h859d828_2
- - r-withr=2.5.0=r41hc72bb7e_0
- - r-xfun=0.30=r41h7525677_0
- - r-xml=3.99_0.9=r41h06615bd_0
- - r-xml2=1.3.3=r41h03ef668_0
- - r-xmlparsedata=1.0.5=r41hc72bb7e_0
- - r-xtable=1.8_4=r41hc72bb7e_3
- - r-yaml=2.3.5=r41h06615bd_0
- - readline=8.1=h46c0cb4_0
- - sed=4.8=he412f7d_0
- - sqlite=3.37.0=h9cd32fc_0
- - sysroot_linux-64=2.12=he073ed8_15
- - tk=8.6.12=h27826a3_0
- - tktable=2.10=hb7b940f_3
- - xorg-kbproto=1.0.7=h7f98852_1002
- - xorg-libice=1.0.10=h7f98852_0
- - xorg-libsm=1.2.3=hd9c2040_1000
- - xorg-libx11=1.7.2=h7f98852_0
- - xorg-libxau=1.0.9=h7f98852_0
- - xorg-libxdmcp=1.1.3=h7f98852_0
- - xorg-libxext=1.3.4=h7f98852_1
- - xorg-libxrender=0.9.10=h7f98852_1003
- - xorg-libxt=1.2.1=h7f98852_2
- - xorg-renderproto=0.11.1=h7f98852_1002
- - xorg-xextproto=7.3.0=h7f98852_1002
- - xorg-xproto=7.0.31=h7f98852_1007
- - xz=5.2.5=h516909a_1
- - zlib=1.2.11=h36c2ea0_1013
- - zstd=1.5.2=ha95c52a_0
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml
deleted file mode 100644
index bc39c7c0..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: VV
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - python=3.8
- - pandas=1.2
- - samtools
- - isatools
- - pip
- - pip:
- - git+https://github.com/J-81/JDO_V-V.git@0.6.0-beta.3
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml
deleted file mode 100644
index be88ce26..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: download_tools
-channels:
- - anaconda
- - conda-forge
- - bioconda
- - defaults
-dependencies:
- - tqdm=4.59
- - requests=2.25
- - pandas=1.2
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml
deleted file mode 100644
index 03494e70..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-name: dp_tools
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - python=3.10 # required for in-house codebase compatibility
- - pandas
- - samtools
- - isatools
- - multiqc
- - schema
- - pytest
- - pip
- - pip:
- - git+https://github.com/J-81/dp_tools.git@1.0.7rc2
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml
deleted file mode 100644
index 2adf7e89..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-# source: https://raw.githubusercontent.com/J-81/gl_dockerfiles/6bb3de66396b98e1533119e203a26ed3a8abcdc8/assets/conda.yaml
-channels:
- - conda-forge
- - bioconda
- - r
- - defaults
-dependencies:
- - python==3.10
- - jupyter
- - seaborn
- - plotly
- - matplotlib
- - pandas
- - scikit-learn
- - statsmodels
- - papermill
- - r-base==4.1.2
- - bioconductor-deseq2=1.34.0
- - r-tidyverse==1.3.1
- - r-plotly
- - r-knitr
- - r-irkernel
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml
deleted file mode 100644
index 9362200d..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - defaults
-dependencies:
- - fastqc
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml
deleted file mode 100644
index b7bfe567..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: genelab_utils
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - python=3.8
- - pandas
- - tabulate
- - openpyxl
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml
deleted file mode 100644
index 26c52ade..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - isatools
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml
deleted file mode 100644
index 3d965dbe..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-name: main
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - nextflow=20.07
- - python=3.8
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml
deleted file mode 100644
index 944c1bbf..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - multiqc
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml
deleted file mode 100644
index e46f23db..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-name: python
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - python=3.8
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml
deleted file mode 100644
index 6add00db..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - R
- - r-biocmanager
- - r-rnetcdf
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml
deleted file mode 100644
index 4caada28..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml
+++ /dev/null
@@ -1,322 +0,0 @@
-name: rnaseq_v1.0_modify
-channels:
- - conda-forge
- - bioconda
- - defaults
-dependencies:
- - _libgcc_mutex=0.1=conda_forge
- - _openmp_mutex=4.5=1_gnu
- - _r-mutex=1.0.1=anacondar_1
- - alsa-lib=1.2.3=h516909a_0
- - bcftools=1.10.2=h4f4756c_3
- - binutils_impl_linux-64=2.35.1=h193b22a_1
- - binutils_linux-64=2.35=hc3fd857_29
- - bioconductor-biobase=2.50.0=r40h037d062_0
- - bioconductor-biocgenerics=0.36.0=r40_0
- - bioconductor-biocparallel=1.24.0=r40h5f743cb_0
- - bioconductor-biostrings=2.58.0=r40h037d062_0
- - bioconductor-delayedarray=0.16.0=r40h037d062_0
- - bioconductor-ebseq=1.30.0=r40_0
- - bioconductor-genomeinfodb=1.26.0=r40_0
- - bioconductor-genomeinfodbdata=1.2.4=r40_0
- - bioconductor-genomicalignments=1.26.0=r40h037d062_0
- - bioconductor-genomicranges=1.42.0=r40h037d062_0
- - bioconductor-iranges=2.24.0=r40h037d062_0
- - bioconductor-matrixgenerics=1.2.0=r40_0
- - bioconductor-noiseq=2.34.0=r40_0
- - bioconductor-rhtslib=1.22.0=r40h037d062_0
- - bioconductor-rsamtools=2.6.0=r40h5f743cb_0
- - bioconductor-rtracklayer=1.50.0=r40h9bb0e53_1
- - bioconductor-s4vectors=0.28.0=r40h037d062_0
- - bioconductor-summarizedexperiment=1.20.0=r40_0
- - bioconductor-xvector=0.30.0=r40h037d062_0
- - bioconductor-zlibbioc=1.36.0=r40h037d062_0
- - brotlipy=0.7.0=py38h8df0ef7_1001
- - bwidget=1.9.14=ha770c72_0
- - bx-python=0.8.9=py38hb90e610_2
- - bzip2=1.0.8=h7f98852_4
- - c-ares=1.17.1=h36c2ea0_0
- - ca-certificates=2021.1.19=h06a4308_0
- - cairo=1.16.0=h9f066cc_1006
- - certifi=2020.12.5=py38h578d9bd_1
- - cffi=1.14.4=py38ha65f79e_1
- - chardet=4.0.0=py38h578d9bd_1
- - click=7.1.2=pyh9f0ad1d_0
- - coloredlogs=15.0=py38h578d9bd_0
- - colormath=3.0.0=py_2
- - cryptography=3.3.1=py38h2b97feb_1
- - curl=7.71.1=he644dc0_8
- - cutadapt=3.2=py38h0213d0e_0
- - cycler=0.10.0=py_2
- - decorator=4.4.2=py_0
- - dnaio=0.5.0=py38h0213d0e_0
- - fastqc=0.11.9=0
- - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- - font-ttf-inconsolata=2.001=hab24e00_0
- - font-ttf-source-code-pro=2.030=hab24e00_0
- - font-ttf-ubuntu=0.83=hab24e00_0
- - fontconfig=2.13.1=h7e3eb15_1002
- - fonts-conda-ecosystem=1=0
- - fonts-conda-forge=1=0
- - freetype=2.10.4=h7ca028e_0
- - fribidi=1.0.10=h36c2ea0_0
- - future=0.18.2=py38h578d9bd_3
- - gcc_impl_linux-64=9.3.0=h28f5a38_17
- - gcc_linux-64=9.3.0=h7247604_29
- - gettext=0.19.8.1=h0b5b191_1005
- - gfortran_impl_linux-64=9.3.0=h2bb4189_17
- - gfortran_linux-64=9.3.0=ha1c937c_29
- - giflib=5.2.1=h36c2ea0_2
- - graphite2=1.3.13=h58526e2_1001
- - gsl=2.6=he838d99_1
- - gxx_impl_linux-64=9.3.0=h53cdd4c_17
- - gxx_linux-64=9.3.0=h0d07fa4_29
- - harfbuzz=2.7.2=ha5b49bf_1
- - htslib=1.10.2=hd3b49d5_1
- - humanfriendly=9.1=py38h578d9bd_0
- - icu=67.1=he1b5a44_0
- - idna=2.10=pyh9f0ad1d_0
- - importlib-metadata=3.4.0=py38h578d9bd_0
- - isa-l=2.30.0=h36c2ea0_0
- - jinja2=2.11.2=pyh9f0ad1d_0
- - jpeg=9d=h36c2ea0_0
- - kernel-headers_linux-64=2.6.32=h77966d4_13
- - kiwisolver=1.3.1=py38h1fd1430_1
- - krb5=1.17.2=h926e7f8_0
- - lcms2=2.11=hcbb858e_1
- - ld_impl_linux-64=2.35.1=hea4e1c9_1
- - libblas=3.8.0=17_openblas
- - libcblas=3.8.0=17_openblas
- - libcurl=7.71.1=hcdd3856_8
- - libdeflate=1.6=h516909a_0
- - libedit=3.1.20191231=he28a2e2_2
- - libev=4.33=h516909a_1
- - libffi=3.3=h58526e2_2
- - libgcc-devel_linux-64=9.3.0=hfd08b2a_17
- - libgcc-ng=9.3.0=h5dbcf3e_17
- - libgfortran-ng=9.3.0=he4bcb1c_17
- - libgfortran5=9.3.0=he4bcb1c_17
- - libglib=2.66.4=h164308a_1
- - libgomp=9.3.0=h5dbcf3e_17
- - libiconv=1.16=h516909a_0
- - liblapack=3.8.0=17_openblas
- - libnghttp2=1.41.0=h8cfc5f6_2
- - libopenblas=0.3.10=pthreads_h4812303_5
- - libpng=1.6.37=h21135ba_2
- - libssh2=1.9.0=hab1572f_5
- - libstdcxx-devel_linux-64=9.3.0=h4084dd6_17
- - libstdcxx-ng=9.3.0=h2ae2ef3_17
- - libtiff=4.2.0=hdc55705_0
- - libuuid=2.32.1=h7f98852_1000
- - libwebp-base=1.1.0=h36c2ea0_3
- - libxcb=1.13=h7f98852_1003
- - libxml2=2.9.10=h68273f3_2
- - lz4-c=1.9.3=h9c3ff4c_0
- - lzo=2.10=h516909a_1000
- - lzstring=1.0.4=py_1001
- - make=4.3=hd18ef5c_1
- - markdown=3.3.3=pyh9f0ad1d_0
- - markupsafe=1.1.1=py38h497a2fe_3
- - matplotlib-base=3.3.3=py38h5c7f4ab_0
- - multiqc=1.9=py_1
- - mysql-connector-c=6.1.11=h6eb9d5d_1007
- - ncurses=6.2=h58526e2_4
- - networkx=2.5=py_0
- - numpy=1.19.5=py38h18fd61f_1
- - olefile=0.46=pyh9f0ad1d_1
- - openjdk=11.0.8=hacce0ff_0
- - openssl=1.1.1i=h7f98852_0
- - pandas=1.2.0=py38h51da96c_0
- - pango=1.42.4=h69149e4_5
- - pcre=8.44=he1b5a44_0
- - pcre2=10.36=h032f7d1_0
- - perl=5.26.2=h36c2ea0_1008
- - perl-app-cpanminus=1.7044=pl526_1
- - perl-carp=1.38=pl526_3
- - perl-constant=1.33=pl526_1
- - perl-cpan-meta=2.150010=pl526_0
- - perl-cpan-meta-requirements=2.140=pl526_0
- - perl-cpan-meta-yaml=0.018=pl526_0
- - perl-data-dumper=2.173=pl526_0
- - perl-encode=2.88=pl526_1
- - perl-exporter=5.72=pl526_1
- - perl-extutils-cbuilder=0.280230=pl526_1
- - perl-extutils-makemaker=7.36=pl526_1
- - perl-extutils-manifest=1.72=pl526_0
- - perl-extutils-parsexs=3.35=pl526_0
- - perl-file-path=2.16=pl526_0
- - perl-file-temp=0.2304=pl526_2
- - perl-getopt-long=2.50=pl526_1
- - perl-ipc-cmd=1.02=pl526_0
- - perl-json-pp=4.04=pl526_0
- - perl-locale-maketext-simple=0.21=pl526_2
- - perl-module-build=0.4224=pl526_3
- - perl-module-corelist=5.20190524=pl526_0
- - perl-module-load=0.32=pl526_1
- - perl-module-load-conditional=0.68=pl526_2
- - perl-module-metadata=1.000036=pl526_0
- - perl-params-check=0.38=pl526_1
- - perl-parent=0.236=pl526_1
- - perl-perl-ostype=1.010=pl526_1
- - perl-scalar-list-utils=1.52=pl526h516909a_0
- - perl-text-abbrev=1.02=pl526_0
- - perl-text-parsewords=3.30=pl526_0
- - perl-version=0.9924=pl526_0
- - pigz=2.3.4=hed695b0_1
- - pillow=8.1.0=py38h357d4e7_1
- - pip=20.3.3=pyhd8ed1ab_0
- - pixman=0.40.0=h36c2ea0_0
- - pthread-stubs=0.4=h36c2ea0_1001
- - pybigwig=0.3.17=py38h55f8d50_2
- - pybktree=1.1=pyh9f0ad1d_0
- - pycparser=2.20=pyh9f0ad1d_2
- - pyopenssl=20.0.1=pyhd8ed1ab_0
- - pyparsing=2.4.7=pyh9f0ad1d_0
- - pysam=0.16.0.1=py38hbdc2ae9_1
- - pysocks=1.7.1=py38h578d9bd_3
- - python=3.8.6=hffdb5ce_4_cpython
- - python-dateutil=2.8.1=py_0
- - python-lzo=1.12=py38h86e1cee_1003
- - python_abi=3.8=1_cp38
- - pytz=2020.5=pyhd8ed1ab_0
- - pyyaml=5.3.1=py38h497a2fe_2
- - qualimap=2.2.2d=1
- - r-assertthat=0.2.1=r40h6115d3f_2
- - r-backports=1.2.1=r40hcfec24a_0
- - r-base=4.0.3=ha43b4e8_3
- - r-bh=1.75.0_0=r40hc72bb7e_0
- - r-bibtex=0.4.2.3=r40hcdcec82_0
- - r-bitops=1.0_6=r40hcdcec82_1004
- - r-blockmodeling=1.0.0=r40h580db52_1
- - r-brio=1.1.0=r40h9e2df91_1
- - r-callr=3.5.1=r40h142f84f_0
- - r-catools=1.18.1=r40h03ef668_0
- - r-cli=2.2.0=r40hc72bb7e_0
- - r-codetools=0.2_18=r40hc72bb7e_0
- - r-crayon=1.3.4=r40h6115d3f_1003
- - r-desc=1.2.0=r40h6115d3f_1003
- - r-diffobj=0.3.3=r40hcfec24a_0
- - r-digest=0.6.27=r40h1b71b39_0
- - r-doparallel=1.0.16=r40h142f84f_0
- - r-dorng=1.8.2=r40h6115d3f_1
- - r-ellipsis=0.3.1=r40hcdcec82_0
- - r-evaluate=0.14=r40h6115d3f_2
- - r-fansi=0.4.1=r40hcdcec82_1
- - r-foreach=1.5.1=r40h142f84f_0
- - r-formatr=1.7=r40h6115d3f_2
- - r-futile.logger=1.4.3=r40h6115d3f_1003
- - r-futile.options=1.0.1=r40h6115d3f_1002
- - r-getopt=1.20.3=r40_2
- - r-glue=1.4.2=r40hcdcec82_0
- - r-gplots=3.1.1=r40hc72bb7e_0
- - r-gtools=3.8.2=r40hcdcec82_1
- - r-iterators=1.0.13=r40h142f84f_0
- - r-jsonlite=1.7.2=r40hcfec24a_0
- - r-kernsmooth=2.23_18=r40h742201e_0
- - r-lambda.r=1.2.4=r40h6115d3f_1
- - r-lattice=0.20_41=r40hcfec24a_2
- - r-lifecycle=0.2.0=r40h6115d3f_1
- - r-magrittr=2.0.1=r40h9e2df91_1
- - r-matrix=1.3_2=r40he454529_0
- - r-matrixstats=0.57.0=r40hcfec24a_0
- - r-optparse=1.6.6=r40h6115d3f_1
- - r-pillar=1.4.7=r40hc72bb7e_0
- - r-pkgbuild=1.2.0=r40hc72bb7e_0
- - r-pkgconfig=2.0.3=r40h6115d3f_1
- - r-pkgload=1.1.0=r40h0357c0b_0
- - r-pkgmaker=0.32.2=r40h142f84f_0
- - r-praise=1.0.0=r40h6115d3f_1004
- - r-prettyunits=1.1.1=r40h6115d3f_1
- - r-processx=3.4.5=r40hcfec24a_0
- - r-ps=1.5.0=r40hcfec24a_0
- - r-r6=2.5.0=r40hc72bb7e_0
- - r-rcurl=1.98_1.2=r40hcdcec82_1
- - r-registry=0.5_1=r40h6115d3f_2
- - r-rematch2=2.1.2=r40h6115d3f_1
- - r-rlang=0.4.10=r40hcfec24a_0
- - r-rngtools=1.5=r40h6115d3f_1
- - r-rprojroot=2.0.2=r40hc72bb7e_0
- - r-rstudioapi=0.13=r40hc72bb7e_0
- - r-snow=0.4_3=r40h6115d3f_1002
- - r-stringi=1.5.3=r40h604b29c_0
- - r-stringr=1.4.0=r40h6115d3f_2
- - r-testthat=3.0.1=r40h03ef668_0
- - r-tibble=3.0.4=r40h0eb13af_0
- - r-utf8=1.1.4=r40hcdcec82_1003
- - r-vctrs=0.3.6=r40hcfec24a_0
- - r-waldo=0.2.3=r40hc72bb7e_0
- - r-withr=2.3.0=r40h6115d3f_0
- - r-xml=3.99_0.5=r40hcfec24a_0
- - r-xtable=1.8_4=r40h6115d3f_3
- - r-zeallot=0.1.0=r40h6115d3f_1002
- - readline=8.0=he28a2e2_2
- - regex=2020.11.13=py38h497a2fe_1
- - requests=2.25.1=pyhd3deb0d_0
- - rsem=1.3.3=pl526hfbaaabd_1
- - rseqc=4.0.0=py38h0213d0e_0
- - samtools=1.10=h2e538c0_3
- - scipy=1.6.0=py38hb2138dd_0
- - sed=4.8=he412f7d_0
- - seqtk=1.3=hed695b0_2
- - setuptools=49.6.0=py38h578d9bd_3
- - simplejson=3.17.2=py38h497a2fe_2
- - six=1.15.0=pyh9f0ad1d_0
- - spectra=0.0.11=py_1
- - sqlite=3.34.0=h74cdb3f_0
- - star=2.7.7a=0
- - sysroot_linux-64=2.12=h77966d4_13
- - tk=8.6.10=h21135ba_1
- - tktable=2.10=hb7b940f_3
- - tornado=6.1=py38h497a2fe_1
- - trim-galore=0.6.6=0
- - ucsc-bigwigsummary=377=h446ed27_1
- - umi_tools=1.1.1=py38h0213d0e_1
- - urllib3=1.26.2=pyhd8ed1ab_0
- - wheel=0.36.2=pyhd3deb0d_0
- - xopen=1.0.1=py38h578d9bd_1
- - xorg-fixesproto=5.0=h14c3975_1002
- - xorg-inputproto=2.3.2=h7f98852_1002
- - xorg-kbproto=1.0.7=h7f98852_1002
- - xorg-libice=1.0.10=h516909a_0
- - xorg-libsm=1.2.3=h84519dc_1000
- - xorg-libx11=1.6.12=h516909a_0
- - xorg-libxau=1.0.9=h7f98852_0
- - xorg-libxdmcp=1.1.3=h7f98852_0
- - xorg-libxext=1.3.4=h516909a_0
- - xorg-libxfixes=5.0.3=h516909a_1004
- - xorg-libxi=1.7.10=h516909a_0
- - xorg-libxrender=0.9.10=h516909a_1002
- - xorg-libxt=1.2.0=h516909a_0
- - xorg-libxtst=1.2.3=h516909a_1002
- - xorg-recordproto=1.14.2=h516909a_1002
- - xorg-renderproto=0.11.1=h14c3975_1002
- - xorg-xextproto=7.3.0=h7f98852_1002
- - xorg-xproto=7.0.31=h7f98852_1007
- - xz=5.2.5=h516909a_1
- - yaml=0.2.5=h516909a_0
- - zipp=3.4.0=py_0
- - zlib=1.2.11=h516909a_1010
- - zstd=1.4.8=ha95c52a_1
- - pip:
- - appdirs==1.4.4
- - attrs==20.3.0
- - beautifulsoup4==4.9.3
- - biopython==1.78
- - cached-property==1.5.2
- - deepdiff==5.2.2
- - et-xmlfile==1.0.1
- - fs==2.4.12
- - isatools==0.11.0
- - iso8601==0.1.13
- - jdcal==1.4.1
- - jsonschema==3.2.0
- - lxml==4.6.2
- - mzml2isa==1.0.3
- - openpyxl==2.6.4
- - ordered-set==4.0.2
- - progressbar2==3.53.1
- - pronto==0.12.2
- - pyrsistent==0.17.3
- - python-utils==2.5.5
- - soupsieve==2.1
-prefix: /home/joribello/anaconda3/envs/rnaseq_v1.0_modify
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml
deleted file mode 100644
index cd5c013a..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-channels:
- - bioconda
- - conda-forge
- - defaults
-dependencies:
- - rsem
- - R
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml
deleted file mode 100644
index 0509dd42..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - defaults
-dependencies:
- - samtools
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml
deleted file mode 100644
index 3c4afe5e..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: samtools_rseqc
-channels:
- - conda-forge
- - bioconda
- - defaults
-
-dependencies:
- - python
- - rseqc
- - samtools >= 1.13
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml
deleted file mode 100644
index 7f0da546..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
- - bioconda
- - defaults
-dependencies:
- - star=2.7.8a # pinned due to a bug in 2.7.9a in outputting transcriptTome.bam, should update once the bug is fixed in a newer release
- - python=3.8
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml
deleted file mode 100644
index dd50a690..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - defaults
-dependencies:
- - trim-galore
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml
deleted file mode 100644
index 6ae38a48..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: ucsc_gtf_Pred_BED
-channels:
- - conda-forge
- - bioconda
- - defaults
-
-dependencies:
- - python
- - ucsc-gtftogenepred
- - ucsc-genepredtobed
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf
index 1862ef81..477b1929 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf
@@ -22,7 +22,7 @@ include { BUILD_STAR;
CONCAT_ERCC;
QUANTIFY_STAR_GENES;
QUANTIFY_RSEM_GENES } from './modules/genome.nf'
-include { DGE_BY_DESEQ2 } from './modules/dge.nf'
+include { DGE_BY_DESEQ2 } from './modules/DGE_BY_DESEQ2'
include { VV_RAW_READS;
VV_TRIMMED_READS;
VV_STAR_ALIGNMENTS;
@@ -240,6 +240,7 @@ workflow {
RAW_FASTQC.out.fastqc | map { it -> [ it[1], it[2] ] } | flatten | collect,
RAW_MULTIQC.out.zipped_report,
RAW_MULTIQC.out.unzipped_report,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
VV_TRIMMED_READS( ch_meta,
STAGING.out.runsheet,
@@ -250,13 +251,15 @@ workflow {
TRIMGALORE.out.reports | collect,
TRIM_MULTIQC.out.zipped_report,
TRIM_MULTIQC.out.unzipped_report,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
VV_STAR_ALIGNMENTS( STAGING.out.runsheet,
ALIGN_STAR.out.publishables | collect,
QUANTIFY_STAR_GENES.out.publishables | collect,
ALIGN_MULTIQC.out.zipped_report,
ALIGN_MULTIQC.out.unzipped_report,
- STRANDEDNESS.out.bam_bed | collect
+ STRANDEDNESS.out.bam_bed | collect,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
VV_RSEQC( ch_meta,
STAGING.out.runsheet,
@@ -265,12 +268,14 @@ workflow {
STRANDEDNESS.out.infer_experiment_multiqc,
STRANDEDNESS.out.inner_distance_multiqc,
STRANDEDNESS.out.read_distribution_multiqc,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
VV_RSEM_COUNTS( STAGING.out.runsheet,
COUNT_ALIGNED.out.only_counts | collect,
QUANTIFY_RSEM_GENES.out.publishables,
COUNT_MULTIQC.out.zipped_report,
COUNT_MULTIQC.out.unzipped_report,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
VV_DESEQ2_ANALYSIS( ch_meta,
STAGING.out.runsheet,
@@ -281,6 +286,7 @@ workflow {
DGE_BY_DESEQ2.out.dge,
DGE_BY_DESEQ2.out.norm_counts_ercc | ifEmpty( { file("NO_FILES.placeholder") }),
DGE_BY_DESEQ2.out.dge_ercc | ifEmpty( { file("NO_FILES.placeholder") }),
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
)
// Software Version Capturing
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf
similarity index 90%
rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf
rename to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf
index 00b09b05..25e5a59a 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf
@@ -28,6 +28,9 @@ process DGE_BY_DESEQ2 {
path("dge_output_ercc/visualization_output_table_ERCCnorm.csv"),
path("dge_output_ercc/visualization_PCA_table_ERCCnorm.csv"), optional: true, emit: dge_ercc
+ path("dge_output/summary.txt"), emit: summary
+ path("dge_output_ercc/ERCCnorm_summary.txt"), optional: true, emit: summary_ercc
+
path("versions.txt"), emit: version
script:
@@ -45,8 +48,7 @@ process DGE_BY_DESEQ2 {
--dge_output_prefix "dge_output/" \\
--annotation_file_path ${annotation_file} \\
--extended_table_output_prefix "dge_output/"\\
- --extended_table_output_suffix ".csv" \\
- --verbose
+ --extended_table_output_suffix ".csv"
if ${ meta.has_ercc ? 'true' : 'false'}
then
@@ -60,8 +62,8 @@ process DGE_BY_DESEQ2 {
--dge_output_prefix "dge_output_ercc/ERCCnorm_" \\
--annotation_file_path ${annotation_file} \\
--extended_table_output_prefix "dge_output_ercc/"\\
- --extended_table_output_suffix "_ERCCnorm.csv" \\
- --verbose
+ --extended_table_output_suffix "_ERCCnorm.csv"
fi
+ # bump: comment-only edit to invalidate the cached task result (assumed intent)
"""
}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf
new file mode 100644
index 00000000..e5d003f7
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf
@@ -0,0 +1,28 @@
+process RUNSHEET_FROM_GLDS {
+ // Downloads the ISA archive and creates a runsheet using the GeneLab API
+ tag "${ gldsAccession }"
+ publishDir "${ params.outputDir }/${ gldsAccession }/Metadata",
+ pattern: "*.{zip,csv}",
+ mode: params.publish_dir_mode
+
+ input:
+ // TEMP: RESTORE ONCE OSD SUPPORT ADDED val(osdAccession)
+ val(gldsAccession)
+ path(dp_tools_plugin)
+
+ output:
+ path("${ gldsAccession }_*_v?_runsheet.csv"), emit: runsheet
+ path("*.zip"), emit: isaArchive
+
+ script:
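+ // optionally forward params.biomart_attribute into the runsheet via dpt's --inject flag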
+ def injects = params.biomart_attribute ? "--inject biomart_attribute='${ params.biomart_attribute }'" : ''
+ """
+
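+ # download the ISA archive for this accession from the GeneLab API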
+ dpt-get-isa-archive --accession ${ gldsAccession }
+ ls ${dp_tools_plugin} # log the staged plugin directory contents (sanity check; intent assumed)
+
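+ # convert the ISA archive into a runsheet; the plugin dir supplies the assay-specific configuration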
+ dpt-isa-to-runsheet --accession ${ gldsAccession } \
+ --plugin-dir ${dp_tools_plugin} \
+ --isa-archive *.zip ${ injects }
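+ # e.g. GLDS-194 yields GLDS-194_bulkRNASeq_v1_runsheet.csv, matching the output glob above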
+ """
+}
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf
index 0d84a93d..9e60c283 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf
@@ -65,14 +65,9 @@ process TRIMGALORE {
path("versions.txt"), emit: version
script:
- /*
- * comments -> --ilumina # if adapters are not illumina, replace with adapters
- * --paired # only for PE studies, # if SE use only single read file
- */
"""
trim_galore --gzip \
--cores $task.cpus \
- --illumina \
--phred33 \
${ meta.paired_end ? '--paired' : '' } \
$reads \
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf
index ae929f44..17842f69 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf
@@ -19,11 +19,12 @@ process VV_RAW_READS {
input:
val(meta)
- path("VV_INPUT/Metadata/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
+ path("VV_INPUT/Metadata/*") // runsheet
path("VV_INPUT/00-RawData/Fastq/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
+ path(dp_tools__NF_RCP)
output:
val(meta)
@@ -38,21 +39,17 @@ process VV_RAW_READS {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true # tolerate a partial move when optional inputs are absent (assumed intent of || true)
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
- ${ meta.paired_end ? "'demuliplexed paired end raw data'" : "'demuliplexed single end raw data'"} \\
- ${ meta.paired_end ? "'qc reports for paired end raw data'" : "'qc reports for single end raw data'"} \\
+ if ${ !params.skipVV } ; then
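+ # dpt validation run <plugin dir> <root path> <runsheet>; argument roles inferred from the VV_data_assets.py flags this replaces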
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
+ ${ meta.paired_end ? "'demuliplexed paired end raw data,qc reports for paired end raw data'" : "'demuliplexed single end raw data,qc reports for single end raw data'"} \\
--run-components \\
- 'Metadata' \\
- 'Raw Reads' \\
- 'Raw Reads By Sample' \\
- --max-flag-code ${ params.max_flag_code }
+ 'Metadata,Raw Reads,Raw Reads By Sample' \\
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
"""
}
@@ -79,6 +76,7 @@ process VV_TRIMMED_READS {
path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports
path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports multiqc zipped report
path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports multiqc unzipped report
+ path(dp_tools__NF_RCP)
output:
path("01-TG_Preproc/Fastq"), emit: VVed_trimmed_reads
@@ -92,20 +90,18 @@ process VV_TRIMMED_READS {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true
+
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
- ${ meta.paired_end ? "'paired end trimmed reads'" : "'single end trimmed reads'"} \\
- ${ meta.paired_end ? "'qc reports for paired end trimmed reads data'" : "'qc reports for single end trimmed reads data'"} \\
+ if ${ !params.skipVV } ; then
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
+ ${ meta.paired_end ? "'paired end trimmed reads,qc reports for paired end trimmed reads data'" : "'single end trimmed reads,qc reports for single end trimmed reads data'"} \\
--run-components \\
- 'Trim Reads' \\
- 'Trimmed Reads By Sample' \\
- --max-flag-code ${ params.max_flag_code }
+ 'Trim Reads,Trimmed Reads By Sample' \\
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
"""
}
@@ -129,6 +125,7 @@ process VV_STAR_ALIGNMENTS {
path("VV_INPUT/02-STAR_Alignment/*") // zipped multiqc report
path("VV_INPUT/02-STAR_Alignment/*") // unzipped multiqc report
path("VV_INPUT/02-STAR_Alignment/*") // reindexed, sorted bam/bed files
+ path(dp_tools__NF_RCP)
output:
path("02-STAR_Alignment")
@@ -138,20 +135,18 @@ process VV_STAR_ALIGNMENTS {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true
sort_into_subdirectories_by_sample.py 02-STAR_Alignment 02-STAR_Alignment '_*'
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
+ if ${ !params.skipVV } ; then
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
'STAR alignments' \\
--run-components \\
- 'STAR Alignments' \\
- 'STAR Alignments By Sample' \\
- --max-flag-code ${ params.max_flag_code }
+ 'STAR Alignments,STAR Alignments By Sample' \\
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
"""
@@ -176,7 +171,7 @@ process VV_RSEQC {
path("VV_INPUT/RSeQC_Analyses/03_infer_experiment/*") // genebody multiqc
path("VV_INPUT/RSeQC_Analyses/04_inner_distance/*") // genebody multiqc
path("VV_INPUT/RSeQC_Analyses/05_read_distribution/*") // genebody multiqc
-
+ path(dp_tools__NF_RCP)
output:
path("RSeQC_Analyses")
@@ -186,7 +181,7 @@ process VV_RSEQC {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true
sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.txt'
sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.curves.pdf'
sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.r'
@@ -199,18 +194,15 @@ process VV_RSEQC {
# These are not in sub directories: sort_into_subdirectories_by_sample.py RSeQC_Analyses/05_read_distribution RSeQC_Analyses/05_read_distribution '_read_dist.out'
mv RSeQC_Analyses/*_read_dist.out RSeQC_Analyses/05_read_distribution
-
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
+ if ${ !params.skipVV } ; then
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
${ meta.paired_end ? "'RSeQC output for paired end data'" : "'RSeQC output for single end data'"} \\
--run-components \\
- 'RSeQC' \\
- 'RSeQC By Sample' \\
- --max-flag-code ${ params.max_flag_code }
+ 'RSeQC,RSeQC By Sample' \\
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
# Remove all placeholder files and empty directories to prevent publishing
@@ -239,7 +231,7 @@ process VV_RSEM_COUNTS {
path("VV_INPUT/03-RSEM_Counts/*") // RSEM dataset output
path("VV_INPUT/03-RSEM_Counts/*") // zipped multiqc report
path("VV_INPUT/03-RSEM_Counts/*") // unzipped multiqc report
-
+ path(dp_tools__NF_RCP)
output:
path("03-RSEM_Counts")
@@ -249,18 +241,17 @@ process VV_RSEM_COUNTS {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
+ if ${ !params.skipVV } ; then
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
'RSEM counts' \\
--run-components \\
'RSEM Counts' \\
- --max-flag-code ${ params.max_flag_code }
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
"""
}
@@ -285,9 +276,9 @@ process VV_DESEQ2_ANALYSIS {
path("VV_INPUT/03-RSEM_Counts/*") // unzipped multiqc report
path("VV_INPUT/04-DESeq2_NormCounts/*") // norm counts files
path("VV_INPUT/05-DESeq2_DGE/*") // dge files
- path("VV_INPUT/04-DESeq2_NormCounts/*") // ERCC norm counts files
+ path("VV_INPUT/ 04-DESeq2_NormCounts/*") // ERCC norm counts files
path("VV_INPUT/05-DESeq2_DGE/ERCC_NormDGE/*") // ERCC dge files
-
+ path(dp_tools__NF_RCP)
output:
path("04-DESeq2_NormCounts")
@@ -298,25 +289,18 @@ process VV_DESEQ2_ANALYSIS {
"""
# move from VV_INPUT to task directory
# This allows detection as output files for publishing
- mv VV_INPUT/* .
+ mv VV_INPUT/* . || true
# Run V&V unless user requests to skip V&V
- if ${ !params.skipVV} ; then
- VV_data_assets.py --root-path . \\
- --accession ${ params.gldsAccession } \\
- --runsheet-path Metadata/*_runsheet.csv \\
- --data-asset-sets \\
- 'RSEM Output' \\
- 'DGE Output' \\
- ${ meta.has_ercc ? "'ERCC DGE Output'" : ''} \\
+ if ${ !params.skipVV } ; then
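+ # ERCC-specific key sets and run components are appended via the Groovy ternaries at script-render time, before bash executes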
+ dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\
+ --data-asset-key-sets \\
+ 'RSEM Output,DGE Output${ meta.has_ercc ? ",ERCC DGE Output" : ''}' \\
--run-components \\
- 'DGE Metadata' \\
- ${ meta.has_ercc ? "'DGE Metadata ERCC'" : '' } \\
- 'DGE Output' \\
- ${ meta.has_ercc ? "'DGE Output ERCC'" : '' } \\
- --max-flag-code ${ params.max_flag_code }
+ 'DGE Metadata${ meta.has_ercc ? ",DGE Metadata ERCC" : '' },DGE Output${ meta.has_ercc ? ",DGE Output ERCC" : '' }' \\
+ --max-flag-code ${ params.max_flag_code } \\
+ --output VV_log.tsv
fi
-
# Remove all placeholder files and empty directories to prevent publishing
find . -type f,l -name *.placeholder -delete
find . -empty -type d -delete
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config
index bcf31521..364a00f2 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config
@@ -15,6 +15,7 @@ profiles {
charliecloud.enabled = false
// Address issue: https://github.com/nextflow-io/nextflow/issues/1210
process {
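+ // "--no-home" stops Singularity from auto-mounting $HOME, so host-installed packages cannot leak into the container (rationale assumed)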
+ containerOptions = "--no-home"
errorStrategy = {
task.exitStatus == 255 ? 'retry' : 'terminate'
}
@@ -38,34 +39,9 @@ profiles {
includeConfig 'config/software/by_docker_image.config'
}
- test_nonGLDS {
- includeConfig 'config/executor/default_CI_test.config'
- includeConfig 'config/tests/test_nonGLDS.config'
- }
-
test {
includeConfig 'config/executor/default_CI_test.config'
}
-
- test91 {
- includeConfig 'config/executor/default_CI_test.config'
- includeConfig 'config/tests/test_glds91.config'
- }
-
- test194 {
- includeConfig 'config/executor/default_CI_test.config'
- includeConfig 'config/tests/test_glds194.config'
- }
-
- test207 {
- includeConfig 'config/executor/default_CI_test.config'
- includeConfig 'config/tests/test_glds207.config'
- }
-
- test251 {
- includeConfig 'config/executor/default_CI_test.config'
- includeConfig 'config/tests/test_glds251.config'
- }
}
manifest {
@@ -74,24 +50,24 @@ manifest {
mainScript = 'main.nf'
defaultBranch = 'main'
nextflowVersion = '>=22.10.1'
- version = '1.0.3'
+ version = '1.0.4'
}
// Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config
def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
timeline {
enabled = true
- file = "${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html"
+ file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html"
}
report {
enabled = true
- file = "${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html"
+ file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html"
}
trace {
enabled = true
- file = "${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt"
+ file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt"
}
dag {
enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram
- file = "${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg"
+ file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg"
}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf
index 5fc69522..4593f022 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf
@@ -15,10 +15,10 @@ def mutate_to_single_end(it) {
}
// Import process from separate module file
-include { RNASEQ_RUNSHEET_FROM_GLDS as GENERATE_RUNSHEET;
- GENERATE_METASHEET;
+include { GENERATE_METASHEET;
STAGE_RAW_READS;
get_runsheet_paths } from'./modules/genelab.nf'
+include { RUNSHEET_FROM_GLDS } from './modules/RUNSHEET_FROM_GLDS.nf'
/**************************************************
* ACTUAL WORKFLOW ********************************
@@ -31,9 +31,12 @@ workflow staging{
sample_limit = params.limitSamplesTo ? params.limitSamplesTo : -1 // -1 in take means no limit
if (!params.runsheetPath) {
- ch_glds_accession | GENERATE_RUNSHEET
- GENERATE_RUNSHEET.out.runsheet | set{ ch_runsheet }
- GENERATE_METASHEET( GENERATE_RUNSHEET.out.isazip, GENERATE_RUNSHEET.out.runsheet )
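+ // build the runsheet and ISA archive directly from the accession, pointing dpt at the bundled plugin dir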
+ RUNSHEET_FROM_GLDS(
+ ch_glds_accession,
+ "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
+ )
+ RUNSHEET_FROM_GLDS.out.runsheet | set{ ch_runsheet }
+ GENERATE_METASHEET( RUNSHEET_FROM_GLDS.out.isaArchive, RUNSHEET_FROM_GLDS.out.runsheet )
} else {
ch_runsheet = channel.fromPath(params.runsheetPath)
}
@@ -86,7 +89,7 @@ workflow staging{
emit:
raw_reads = stageLocal ? STAGE_RAW_READS.out : null
- isa = params.runsheetPath ? null : GENERATE_RUNSHEET.out.isazip
+ isa = params.runsheetPath ? null : RUNSHEET_FROM_GLDS.out.isaArchive
runsheet = ch_runsheet
metasheet = params.runsheetPath ? null : GENERATE_METASHEET.out.metasheet
}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml
new file mode 100644
index 00000000..f2722c8e
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml
@@ -0,0 +1,3 @@
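+# maps each nf-test tag to the path globs whose changes trigger it (read by the CI changes/paths-filter job)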
+DGE_BY_DESEQ2:
+ - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/**
+ - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/**
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test
index 7c6438b7..17dd250a 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test
@@ -2,6 +2,7 @@ nextflow_pipeline {
name "Test Workflow main.nf"
script "main.nf"
+ tag "core"
test("GLDS-48:Mouse,SingleEnd,NonERCC: Should run without failures") {
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap
index b3f56516..eb4ad0e6 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap
@@ -61,7 +61,7 @@
"Mmus_C57-6J_LVR_GC_C_Rep5_M40_infer_expt.out:md5,cf4b9b80507493cd8232113620b4a765",
"Mmus_C57-6J_LVR_GC_I_Rep2_M32_read_dist.out:md5,fe7bfe2aa0a2774fa2754ee0485c8b6e"
],
- "timestamp": "2023-01-25T23:47:22+0000"
+ "timestamp": "2023-05-09T20:33:40+0000"
},
"GLDS-48:Mouse,SingleEnd,NonERCC: Should run without failures": {
"content": [
@@ -126,6 +126,6 @@
"Mmus_C57-6J_LVR_GC_C_Rep5_M40_infer_expt.out:md5,cf4b9b80507493cd8232113620b4a765",
"Mmus_C57-6J_LVR_GC_I_Rep2_M32_read_dist.out:md5,fe7bfe2aa0a2774fa2754ee0485c8b6e"
],
- "timestamp": "2023-01-25T23:47:23+0000"
+ "timestamp": "2023-05-09T20:33:40+0000"
}
}
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test
new file mode 100644
index 00000000..443c7188
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test
@@ -0,0 +1,71 @@
+nextflow_process {
+
+ name "Test Process DGE_BY_DESEQ2"
+ script "modules/DGE_BY_DESEQ2/main.nf"
+ process "DGE_BY_DESEQ2"
+
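+ // NOTE: relative test-datasets paths below assume the test-datasets checkout staged by the CI workflow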
+ test("GLDS-194") {
+ tag 'DGE_BY_DESEQ2'
+
+ when {
+ params {
+ // parameters used by the process under test
+ use_dummy_gene_counts = true
+ }
+ process {
+ """
+ // inputs of the process under test
+ input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv")
+ input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results")
+ input[2] = [ primary_keytype:'ENSEMBL', has_ercc:true ]
+ input[3] = file("https://figshare.com/ndownloader/files/36597114")
+ input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
+ """
+ }
+ }
+
+ then {
+ assert process.success
+ assert snapshot(
+ process.out.summary,
+ process.out.norm_counts,
+ process.out.summary_ercc,
+ process.out.norm_counts_ercc,
+ process.out.version
+ ).match()
+ }
+
+ }
+
+ test("GLDS-321:55_.ISSUE") {
+ tag 'DGE_BY_DESEQ2'
+
+ when {
+ params {
+ // parameters used by the process under test
+ use_dummy_gene_counts = true
+ }
+ process {
+ """
+ // inputs of the process under test
+ input[0] = file("test-datasets/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv")
+ input[1] = file("test-datasets/testdata/GLDS-321/03-RSEM_Counts/*.genes.results")
+ input[2] = [ primary_keytype:'TAIR', has_ercc:false ]
+ input[3] = file("https://figshare.com/ndownloader/files/36597132")
+ input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
+ """
+ }
+ }
+
+ then {
+ assert process.success
+ assert snapshot(
+ process.out.summary,
+ process.out.norm_counts,
+ process.out.version,
+ ).match()
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap
new file mode 100644
index 00000000..8223ca2a
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap
@@ -0,0 +1,42 @@
+{
+ "GLDS-321:55_.ISSUE": {
+ "content": [
+ [
+ "summary.txt:md5,2ae67caf20a32f00b87e0f340a4c505b"
+ ],
+ [
+ [
+ "Normalized_Counts.csv:md5,c148732be1d0b1bb61278bfef612f07b",
+ "RSEM_Unnormalized_Counts.csv:md5,fd101e235076c3ae66c513bc96017b33"
+ ]
+ ],
+ [
+ "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311"
+ ]
+ ],
+ "timestamp": "2023-07-11T22:30:32+0000"
+ },
+ "GLDS-194": {
+ "content": [
+ [
+ "summary.txt:md5,6c202fd3c11a747e40a49d1369e8875f"
+ ],
+ [
+ [
+ "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b",
+ "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b"
+ ]
+ ],
+ [
+ "ERCCnorm_summary.txt:md5,1f77ed6cd1a8435038859b0361f6b047"
+ ],
+ [
+ "ERCC_Normalized_Counts.csv:md5,b1d9d5a546a23b6709a9f8c60548b6a7"
+ ],
+ [
+ "versions.txt:md5,1865b6b8900d83ec7881f58fe301da11"
+ ]
+ ],
+ "timestamp": "2023-07-11T22:30:32+0000"
+ }
+}
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test
deleted file mode 100644
index d43f32c4..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test
+++ /dev/null
@@ -1,106 +0,0 @@
-nextflow_process {
-
- name "Test Process DGE_BY_DESEQ2"
- script "modules/dge.nf"
- process "DGE_BY_DESEQ2"
-
- test("Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION") {
-
- when {
- params {
- // define parameters here. Example:
- use_dummy_gene_counts = true
- }
- process {
- """
- // define inputs of the process here. Example:
- input[0] = file("test-datasets-extended/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv")
- input[1] = file("test-datasets-extended/testdata/GLDS-194/03-RSEM_Counts/*.genes.results")
- input[2] = [ primary_keytype:'ENSEMBL' ]
- input[3] = file("https://figshare.com/ndownloader/files/36597114")
- input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
- """
- }
- }
-
- then {
- assert process.success
- assert snapshot(
- process.out.dge,
- process.out.norm_counts,
- process.out.dge_ercc,
- process.out.norm_counts_ercc,
- process.out.version,
- ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated!
- ).match()
- }
-
- }
-
- test("ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION") {
-
- when {
- params {
- // define parameters here. Example:
- use_dummy_gene_counts = true
- }
- process {
- """
- // define inputs of the process here. Example:
- input[0] = file("test-datasets-extended/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv")
- input[1] = file("test-datasets-extended/testdata/GLDS-321/03-RSEM_Counts/*.genes.results")
- input[2] = [ primary_keytype:'TAIR' ]
- input[3] = file("https://figshare.com/ndownloader/files/36597132")
- input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
- """
- }
- }
-
- then {
- assert process.success
- assert snapshot(
- process.out.dge,
- process.out.norm_counts,
- // NON_ERCC process.out.dge_ercc,
- // NON_ERCC process.out.norm_counts_ercc,
- process.out.version,
- ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated!
- ).match()
- }
-
- }
-
- test("ISSUE_55_ON_Mock:Should run without failures") {
-
- when {
- params {
- // define parameters here. Example:
- use_dummy_gene_counts = true
- }
- process {
- """
- // define inputs of the process here. Example:
- input[0] = file("test-datasets-extended/testdata/mocks/overlapping_samplenames/MOCK_bulkRNASeq_v1_runsheet.csv")
- input[1] = file("test-datasets-extended/testdata/mocks/overlapping_samplenames/*.genes.results")
- input[2] = [ primary_keytype:'ENSEMBL' ]
- input[3] = file("https://figshare.com/ndownloader/files/36597114")
- input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
- """
- }
- }
-
- then {
- assert process.success
- assert snapshot(
- process.out.dge,
- process.out.norm_counts,
- // NON_ERCC process.out.dge_ercc,
- // NON_ERCC process.out.norm_counts_ercc,
- process.out.version,
- ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated!
- ).match()
- }
-
- }
-
-}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap
deleted file mode 100644
index 6d0ff5be..00000000
--- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap
+++ /dev/null
@@ -1,89 +0,0 @@
-{
- "Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION": {
- "content": [
- [
- [
- "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce",
- "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9",
- "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59",
- "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5",
- "visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3"
- ]
- ],
- [
- [
- "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b",
- "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b"
- ]
- ],
- [
-
- ],
- [
-
- ],
- [
- "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a"
- ],
- {
- "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true,
- "dp_tools_version": "1.1.8"
- }
- ],
- "timestamp": "2023-01-25T20:29:00+0000"
- },
- "ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION": {
- "content": [
- [
- [
- "contrasts.csv:md5,af3bef64a768dd6220b6a143d2fbb1bc",
- "SampleTable.csv:md5,0b64b62678b9903bda2a431129cf52af",
- "differential_expression.csv:md5,e33ffaa350a90f7dd0f4607292db68de",
- "visualization_output_table.csv:md5,89c4b8722bf2a8fe25c6fcfa915e5c56",
- "visualization_PCA_table.csv:md5,c19f946356e520bd9bf68606d639f21c"
- ]
- ],
- [
- [
- "Normalized_Counts.csv:md5,c148732be1d0b1bb61278bfef612f07b",
- "RSEM_Unnormalized_Counts.csv:md5,fd101e235076c3ae66c513bc96017b33"
- ]
- ],
- [
- "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a"
- ],
- {
- "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true,
- "dp_tools_version": "1.1.8"
- }
- ],
- "timestamp": "2023-01-25T20:29:00+0000"
- },
- "ISSUE_55_ON_Mock:Should run without failures": {
- "content": [
- [
- [
- "contrasts.csv:md5,b99c88e9c92f1d1588727df47523c4ad",
- "SampleTable.csv:md5,229c988f09fbfeca182da7011f6f93b4",
- "differential_expression.csv:md5,f58b6f602598a0c25379afd0c5e87a71",
- "visualization_output_table.csv:md5,d056472d2ac135cad9ee4d9f33bde387",
- "visualization_PCA_table.csv:md5,1293b99878d7a7eb0e02dc6a38e33d39"
- ]
- ],
- [
- [
- "Normalized_Counts.csv:md5,393160aee08165165ccd2b8579a45161",
- "RSEM_Unnormalized_Counts.csv:md5,6759e0e7ec07960691d3913b3877c129"
- ]
- ],
- [
- "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a"
- ],
- {
- "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true,
- "dp_tools_version": "1.1.8"
- }
- ],
- "timestamp": "2023-01-25T20:29:00+0000"
- }
-}
\ No newline at end of file
diff --git a/RNAseq/Workflow_Documentation/README.md b/RNAseq/Workflow_Documentation/README.md
index ead4736c..8c7dc6ef 100644
--- a/RNAseq/Workflow_Documentation/README.md
+++ b/RNAseq/Workflow_Documentation/README.md
@@ -8,7 +8,7 @@ GeneLab has wrapped each step of the pipeline into a workflow with validation an
|Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version|
|:---------------|:---------------------------------------------------------|:---------------|
-|*[GL-DPPD-7101-F.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md)|[NF_RCP-F_1.0.3](NF_RCP-F)|22.10.1|
+|*[GL-DPPD-7101-F.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md)|[NF_RCP-F_1.0.4](NF_RCP-F)|22.10.1|
*Current GeneLab Pipeline/Workflow Implementation