Skip to content

Commit

Permalink
produce per-region ASV tables and fasta
Browse files Browse the repository at this point in the history
  • Loading branch information
d4straub committed Feb 7, 2024
1 parent 45fefd4 commit 715ae05
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 0 deletions.
8 changes: 8 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,14 @@ process {
]
}

withName: DADA2_SPLITREGIONS {
publishDir = [
path: { "${params.outdir}/dada2/per_region" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: BARRNAP {
ext.kingdom = "bac,arc,mito,euk"
ext.args = "--quiet --reject 0.1"
Expand Down
67 changes: 67 additions & 0 deletions modules/local/dada2_splitregions.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Split the merged DADA2 ASV table into one table + fasta per amplicon region:
// keep only the samples belonging to the region, rename sample columns from the
// internal 'id' to the user-facing 'sample' name, and drop ASVs whose counts are
// zero across all remaining samples.
process DADA2_SPLITREGIONS {
    label 'process_low'

    // TODO: DADA2 not necessary, R base sufficient!
    conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' :
        'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }"

    input:
    // meta: per-region metadata (read below: region, fw_primer, rv_primer)
    // mapping: grouped per-sample metadata maps; each must carry 'id' and 'sample'
    tuple val(meta), val(mapping)
    // merged DADA2 ASV table: TSV with an ASV_ID column, one column per sample id,
    // and a trailing 'sequence' column (see column slicing below)
    path(table)

    output:
    tuple val(meta), path( "DADA2_table_*.tsv" ), emit: dada2asv  // counts + sequences
    tuple val(meta), path( "ASV_table_*.tsv" ) , emit: asv        // counts only
    tuple val(meta), path( "ASV_seqs_*.fasta" ) , emit: fasta     // sequences only
    path "versions.yml" , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    //make groovy map to R list
    // Textually rewrite Groovy's toString() of a list of maps into R list() syntax,
    // e.g. [['id':'s1', 'sample':'A']] -> list(list(id='s1',sample='A')).
    // NOTE(review): assumes no key/value contains the substrings "':", ", '", "[",
    // "]" or "false" that these replacements could also match — verify upstream.
    def mapping_r_list = mapping
        .toString()
        .replaceAll("':","=")
        .replaceAll(", '",",")
        .replaceAll("\\['","list(")
        .replaceAll("\\[","list(")
        .replaceAll("\\]",")")
        .replaceAll("false","'false'") // quote bare booleans so R parses them as strings
    // Suffix identifying this region and its primer pair in all output file names
    def suffix = "region" + meta.region + "_" + meta.fw_primer + "_" + meta.rv_primer
    """
    #!/usr/bin/env Rscript
    nested_list <- $mapping_r_list
    # one row per sample; columns are the metadata keys (incl. 'id' and 'sample')
    mapping <- as.data.frame(do.call(rbind, nested_list))
    df <- read.csv("$table", header=TRUE, sep="\\t")
    # extract samples of this region
    keep <- intersect( colnames(df), c("ASV_ID", unlist(mapping\$id), "sequence" ) )
    df <- subset( df, select = keep )
    # modify sample names from .id to .sample
    # NOTE(review): gsub matches substrings, so an id that is a prefix of another
    # id could rename the wrong column — consider anchoring the pattern
    for (row in 1:nrow(mapping)) {
        colnames(df) <- gsub( mapping[row,]\$id, mapping[row,]\$sample, colnames(df) )
    }
    # filter rows with only 0, occurring because many samples were removed
    # (columns 2..ncol-1 are the sample count columns between ASV_ID and sequence)
    df <- df[as.logical(rowSums(df[,2:(ncol(df)-1)] != 0)), ]
    # Write file with ASV abundance and sequences to file
    write.table(df, file = "DADA2_table_${suffix}.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '')
    # Write fasta file with ASV sequences to file
    write.table(data.frame(s = sprintf(">%s\n%s", df\$ASV_ID, df\$sequence)), 'ASV_seqs_${suffix}.fasta', col.names = FALSE, row.names = FALSE, quote = FALSE, na = '')
    # Write ASV file with ASV abundances to file
    df\$sequence <- NULL
    write.table(df, file = "ASV_table_${suffix}.tsv", sep="\\t", row.names = FALSE, quote = FALSE, na = '')
    writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml")
    """
}
15 changes: 15 additions & 0 deletions workflows/ampliseq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ include { DADA2_DENOISING } from '../modules/local/dada2_denoising
include { DADA2_RMCHIMERA } from '../modules/local/dada2_rmchimera'
include { DADA2_STATS } from '../modules/local/dada2_stats'
include { DADA2_MERGE } from '../modules/local/dada2_merge'
include { DADA2_SPLITREGIONS } from '../modules/local/dada2_splitregions'
include { BARRNAP } from '../modules/local/barrnap'
include { BARRNAPSUMMARY } from '../modules/local/barrnapsummary'
include { FILTER_SSU } from '../modules/local/filter_ssu'
Expand Down Expand Up @@ -417,6 +418,20 @@ workflow AMPLISEQ {
ch_stats = DADA2_MERGE.out.dada2stats
}

//separate sequences and abundances when several regions
if ( params.input_multiregion ) {
DADA2_SPLITREGIONS (
//DADA2_DENOISING per run & region -> per run
ch_reads
.map {
info, reads ->
def meta = info.subMap( info.keySet() - 'id' - 'sample' - 'run' ) //All of 'id', 'sample', 'run' must be removed to merge by region; downstream unused: 'single_end'
[ meta, info ] }
.groupTuple(by: 0 ).dump(tag:'DADA2_SPLITREGIONS:meta'),
DADA2_MERGE.out.dada2asv.first() )
ch_versions = ch_versions.mix(DADA2_SPLITREGIONS.out.versions)
}

//
// MODULE : ASV post-clustering with VSEARCH
//
Expand Down

0 comments on commit 715ae05

Please sign in to comment.