Skip to content

Commit e728900

Browse files
authored
Merge pull request #541 from nf-core/dev
Patch release 2.5.1
2 parents ba72349 + 4c3349d commit e728900

File tree

16 files changed

+211
-121
lines changed

16 files changed

+211
-121
lines changed

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,28 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## 2.5.1 - [2023-11-17]
7+
8+
### `Added`
9+
10+
### `Changed`
11+
12+
### `Fixed`
13+
14+
- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collisions for CHECKM, CAT, GTDBTK, and QUAST (reported by @tillenglert and @maxibor, fix by @maxibor)
15+
- [#533](https://github.com/nf-core/mag/pull/533) Fix glob pattern for publishing MetaBAT2 bins in results (reported by @patriciatran, fix by @jfy133)
16+
- [#535](https://github.com/nf-core/mag/pull/535) Fix input validation pattern to again allow direct FASTQ input (reported by @lennijusten, @emnilsson, fix by @jfy133, @d4straub, @mahesh-panchal, @nvnieuwk)
17+
18+
### `Dependencies`
19+
20+
| Tool | Previous version | New version |
21+
| ---- | ---------------- | ----------- |
22+
| CAT | 4.6 | 5.2.3 |
23+
24+
### `Deprecated`
25+
26+
- [#536](https://github.com/nf-core/mag/pull/536) Replace custom function with native Nextflow method for checking file extension (reported by @d4straub, fix by @jfy133)
27+
628
## 2.5.0 - [2023-10-10]
729

830
### `Added`

assets/multiqc_config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
report_comment: >
22
3-
This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.0" target="_blank">nf-core/mag</a>
3+
This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.1" target="_blank">nf-core/mag</a>
44
analysis pipeline. For information about how to interpret these results, please see the
5-
<a href="https://nf-co.re/mag/2.5.0/docs/output" target="_blank">documentation</a>.
5+
<a href="https://nf-co.re/mag/2.5.1/docs/output" target="_blank">documentation</a>.
66
77
report_section_order:
88
"nf-core-mag-methods-description":

conf/modules.config

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -377,8 +377,8 @@ process {
377377
}
378378

379379
withName: 'CHECKM_LINEAGEWF' {
380-
tag = { "${meta.assembler}-${meta.binner}-${meta.id}" }
381-
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" }
380+
tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
381+
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" }
382382
publishDir = [
383383
path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
384384
mode: params.publish_dir_mode,
@@ -387,7 +387,7 @@ process {
387387
}
388388

389389
withName: 'CHECKM_QA' {
390-
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" }
390+
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }
391391
ext.args = "-o 2 --tab_table"
392392
publishDir = [
393393
path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
@@ -458,6 +458,7 @@ process {
458458

459459
withName: GTDBTK_CLASSIFYWF {
460460
ext.args = "--extension fa"
461+
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
461462
publishDir = [
462463
path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
463464
mode: params.publish_dir_mode,
@@ -569,9 +570,9 @@ process {
569570
withName: METABAT2_METABAT2 {
570571
publishDir = [
571572
[
572-
path: { "${params.outdir}/GenomeBinning/MetaBAT2/" },
573+
path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" },
573574
mode: params.publish_dir_mode,
574-
pattern: 'bins/*.fa.gz'
575+
pattern: '*[!lowDepth|tooShort|unbinned].fa.gz'
575576
],
576577
[
577578
path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" },

docs/output.md

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,8 @@ These depth files are used for downstream binning steps.
333333

334334
All the files and contigs in these folders will be assessed by QUAST and BUSCO.
335335

336+
All other files that were discarded by the tool, or from the low-quality unbinned contigs, can be found here.
337+
336338
<details markdown="1">
337339
<summary>Output files</summary>
338340

@@ -476,6 +478,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
476478
- `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format
477479
- `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
478480
- `GenomeBinning/QC/`
481+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition.
479482
- `quast_summary.tsv`: QUAST output for all bins summarized
480483

481484
</details>
@@ -531,9 +534,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place
531534
<summary>Output files</summary>
532535

533536
- `GenomeBinning/QC/CheckM/`
534-
- `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
535-
- `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
536-
- `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
537+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
538+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
539+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
537540
- `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
538541

539542
</details>
@@ -581,14 +584,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req
581584
<summary>Output files</summary>
582585

583586
- `Taxonomy/CAT/[assembler]/[binner]/`
584-
- `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
585-
- `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
587+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
588+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
586589
- `Taxonomy/CAT/[assembler]/[binner]/raw/`
587-
- `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
588-
- `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
589-
- `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
590-
- `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
591-
- `[assembler]-[binner]-[sample/group].log`: Log files
590+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
591+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
592+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
593+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
594+
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
592595

593596
</details>
594597

@@ -609,14 +612,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
609612
<summary>Output files</summary>
610613

611614
- `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/`
612-
- `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html).
613-
- `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
614-
- `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
615-
- `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
616-
- `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
617-
- `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files.
618-
- `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
619-
- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
615+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)).
616+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
617+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
618+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
619+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
620+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files.
621+
- `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
622+
- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
620623

621624
</details>
622625

modules.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@
118118
},
119119
"gtdbtk/classifywf": {
120120
"branch": "master",
121-
"git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df",
121+
"git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
122122
"installed_by": ["modules"]
123123
},
124124
"gunc/downloaddb": {

modules/local/cat.nf

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,42 @@
11
process CAT {
2-
tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}"
2+
tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}"
33

4-
conda "bioconda::cat=4.6 bioconda::diamond=2.0.6"
4+
conda "bioconda::cat=5.2.3"
55
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
6-
'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' :
7-
'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }"
6+
'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' :
7+
'biocontainers/cat:5.2.3--hdfd78af_1' }"
88

99
input:
1010
tuple val(meta), path("bins/*")
1111
tuple val(db_name), path("database/*"), path("taxonomy/*")
1212

1313
output:
14-
path("*.names.txt.gz") , emit: tax_classification
15-
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
16-
path("raw/*.predicted_proteins.faa.gz"), emit: faa
17-
path("raw/*.predicted_proteins.gff.gz"), emit: gff
18-
path("raw/*.log") , emit: log
19-
path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
20-
path "versions.yml" , emit: versions
14+
path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
15+
path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
16+
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
17+
path("raw/*.predicted_proteins.faa.gz") , emit: faa
18+
path("raw/*.predicted_proteins.gff.gz") , emit: gff
19+
path("raw/*.log") , emit: log
20+
path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
21+
path "versions.yml" , emit: versions
2122

2223
script:
2324
def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
25+
def args = task.ext.args ?: ''
26+
def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
2427
"""
25-
CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
26-
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
27-
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
28+
CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing
29+
CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
30+
CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
2831
2932
mkdir raw
3033
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
31-
gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \
32-
"raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \
33-
"raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \
34-
"raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \
35-
"${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \
36-
"${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt"
34+
gzip "raw/${prefix}.ORF2LCA.txt" \
35+
"raw/${prefix}.concatenated.predicted_proteins.faa" \
36+
"raw/${prefix}.concatenated.predicted_proteins.gff" \
37+
"raw/${prefix}.bin2classification.txt" \
38+
"${prefix}.ORF2LCA.names.txt" \
39+
"${prefix}.bin2classification.names.txt"
3740
3841
cat <<-END_VERSIONS > versions.yml
3942
"${task.process}":

modules/local/quast_bins.nf

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
process QUAST_BINS {
2-
tag "${meta.assembler}-${meta.binner}-${meta.id}"
2+
tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
33

44
conda "bioconda::quast=5.0.2"
55
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -15,15 +15,16 @@ process QUAST_BINS {
1515
path "versions.yml" , emit: versions
1616

1717
script:
18+
def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
1819
"""
1920
BINS=\$(echo \"$bins\" | sed 's/[][]//g')
2021
IFS=', ' read -r -a bins <<< \"\$BINS\"
2122
for bin in \"\${bins[@]}\"; do
2223
metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}"
23-
if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then
24-
cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
24+
if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then
25+
cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv"
2526
else
26-
tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
27+
tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv"
2728
fi
2829
done
2930

0 commit comments

Comments
 (0)