nf-core
diff --git a/‎CHANGELOG.md
Lines changed: 22 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 22 additions & 0 deletions
diff --git a/‎assets/multiqc_config.yml
Lines changed: 2 additions & 2 deletions b/‎assets/multiqc_config.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎conf/modules.config
Lines changed: 6 additions & 5 deletions b/‎conf/modules.config
Lines changed: 6 additions & 5 deletions
diff --git a/‎docs/output.md
Lines changed: 21 additions & 18 deletions b/‎docs/output.md
Lines changed: 21 additions & 18 deletions
diff --git a/‎modules.json
Lines changed: 1 addition & 1 deletion b/‎modules.json
Lines changed: 1 addition & 1 deletion
diff --git a/‎modules/local/cat.nf
Lines changed: 23 additions & 20 deletions b/‎modules/local/cat.nf
Lines changed: 23 additions & 20 deletions
diff --git a/‎modules/local/quast_bins.nf
Lines changed: 5 additions & 4 deletions b/‎modules/local/quast_bins.nf
Lines changed: 5 additions & 4 deletions
@@ -3,6 +3,28 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 2.5.1 - [2023-11-17]
+
+### `Added`
+
+### `Changed`
+
+### `Fixed`
+
+- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collisions for CHECKM, CAT, GTDBTK, and QUAST (reported by @tillenglert and @maxibor, fix by @maxibor)
+- [#533](https://github.com/nf-core/mag/pull/533) Fix glob pattern for publishing MetaBAT2 bins in results (reported by @patriciatran, fix by @jfy133)
+- [#535](https://github.com/nf-core/mag/pull/535) Fix input validation pattern to again allow direct FASTQ input (reported by @lennijusten, @emnilsson, fix by @jfy133, @d4straub, @mahesh-panchal, @nvnieuwk)
+
+### `Dependencies`
+
+| Tool | Previous version | New version |
+| ---- | ---------------- | ----------- |
+| CAT  | 4.6              | 5.2.3       |
+
+### `Deprecated`
+
+- [#536](https://github.com/nf-core/mag/pull/536) Replace custom function with native Nextflow for checking file extension (reported by @d4straub, fix by @jfy133)
+
 ## 2.5.0 - [2023-10-10]
 
 ### `Added`
 
@@ -1,8 +1,8 @@
 report_comment: >
 
-  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.0" target="_blank">nf-core/mag</a>
+  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.1" target="_blank">nf-core/mag</a>
   analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/mag/2.5.0/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/mag/2.5.1/docs/output" target="_blank">documentation</a>.
 
 report_section_order:
   "nf-core-mag-methods-description":
 
@@ -377,8 +377,8 @@ process {
     }
 
     withName: 'CHECKM_LINEAGEWF' {
-        tag = { "${meta.assembler}-${meta.binner}-${meta.id}" }
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" }
+        tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" }
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
             mode: params.publish_dir_mode,
@@ -387,7 +387,7 @@ process {
     }
 
     withName: 'CHECKM_QA' {
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }
         ext.args = "-o 2 --tab_table"
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
@@ -458,6 +458,7 @@ process {
 
     withName: GTDBTK_CLASSIFYWF {
         ext.args   = "--extension fa"
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
         publishDir = [
             path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
             mode: params.publish_dir_mode,
@@ -569,9 +570,9 @@ process {
     withName: METABAT2_METABAT2 {
         publishDir = [
             [
-                path: { "${params.outdir}/GenomeBinning/MetaBAT2/" },
+                path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" },
                 mode: params.publish_dir_mode,
-                pattern: 'bins/*.fa.gz'
+                pattern: '*[!lowDepth|tooShort|unbinned].fa.gz'
             ],
             [
                 path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" },
 
@@ -333,6 +333,8 @@ These depth files are used for downstream binning steps.
 
 All the files and contigs in these folders will be assessed by QUAST and BUSCO.
 
+All other files that were discarded by the tool, or that originate from the low-quality unbinned contigs, can be found here.
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -476,6 +478,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
   - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format
   - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
 - `GenomeBinning/QC/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition.
   - `quast_summary.tsv`: QUAST output for all bins summarized
 
 </details>
@@ -531,9 +534,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place
 <summary>Output files</summary>
 
 - `GenomeBinning/QC/CheckM/`
-  - `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
-  - `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
-  - `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
   - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
 
 </details>
@@ -581,14 +584,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req
 <summary>Output files</summary>
 
 - `Taxonomy/CAT/[assembler]/[binner]/`
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
-  - `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
 - `Taxonomy/CAT/[assembler]/[binner]/raw/`
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
-  - `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
-  - `[assembler]-[binner]-[sample/group].log`: Log files
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
 
 </details>
 
@@ -609,14 +612,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
 <summary>Output files</summary>
 
 - `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/`
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html).
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
-- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
+- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
 
 </details>
 
 
@@ -118,7 +118,7 @@
                     },
                     "gtdbtk/classifywf": {
                         "branch": "master",
-                        "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df",
+                        "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
                         "installed_by": ["modules"]
                     },
                     "gunc/downloaddb": {
 
@@ -1,39 +1,42 @@
 process CAT {
-    tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}"
+    tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}"
 
-    conda "bioconda::cat=4.6 bioconda::diamond=2.0.6"
+    conda "bioconda::cat=5.2.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' :
-        'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }"
+        'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' :
+        'biocontainers/cat:5.2.3--hdfd78af_1' }"
 
     input:
     tuple val(meta), path("bins/*")
     tuple val(db_name), path("database/*"), path("taxonomy/*")
 
     output:
-    path("*.names.txt.gz")                 , emit: tax_classification
-    path("raw/*.ORF2LCA.txt.gz")           , emit: orf2lca
-    path("raw/*.predicted_proteins.faa.gz"), emit: faa
-    path("raw/*.predicted_proteins.gff.gz"), emit: gff
-    path("raw/*.log")                      , emit: log
-    path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
-    path "versions.yml"                    , emit: versions
+    path("*.ORF2LCA.names.txt.gz")            , emit: orf2lca_classification
+    path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
+    path("raw/*.ORF2LCA.txt.gz")              , emit: orf2lca
+    path("raw/*.predicted_proteins.faa.gz")   , emit: faa
+    path("raw/*.predicted_proteins.gff.gz")   , emit: gff
+    path("raw/*.log")                         , emit: log
+    path("raw/*.bin2classification.txt.gz")   , emit: tax_classification_taxids
+    path "versions.yml"                       , emit: versions
 
     script:
     def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
     """
-    CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing
+    CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
 
     mkdir raw
     mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
-    gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt"
+    gzip "raw/${prefix}.ORF2LCA.txt" \
+        "raw/${prefix}.concatenated.predicted_proteins.faa" \
+        "raw/${prefix}.concatenated.predicted_proteins.gff" \
+        "raw/${prefix}.bin2classification.txt" \
+        "${prefix}.ORF2LCA.names.txt" \
+        "${prefix}.bin2classification.names.txt"
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
 
@@ -1,5 +1,5 @@
 process QUAST_BINS {
-    tag "${meta.assembler}-${meta.binner}-${meta.id}"
+    tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
 
     conda "bioconda::quast=5.0.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -15,15 +15,16 @@ process QUAST_BINS {
     path "versions.yml"             , emit: versions
 
     script:
+    def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
     """
     BINS=\$(echo \"$bins\" | sed 's/[][]//g')
     IFS=', ' read -r -a bins <<< \"\$BINS\"
     for bin in \"\${bins[@]}\"; do
         metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}"
-        if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then
-            cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+        if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then
+            cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv"
         else
-            tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+            tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv"
         fi
     done