From 74668137aa7379ba5881358b6612b22bee5fa5de Mon Sep 17 00:00:00 2001 From: d4straub Date: Wed, 20 Mar 2024 15:05:10 +0100 Subject: [PATCH 1/2] fix vsearch cluster filtering --- CHANGELOG.md | 7 ++++--- bin/filt_clusters.py | 17 ++++++++++------- modules/local/filter_clusters.nf | 3 ++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bdab84a5c..fc09e5a13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,11 +15,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1 +- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases +- [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters ### `Dependencies` ### `Removed` +- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0 + ## nf-core/ampliseq version 2.8.0 - 2024-01-16 ### `Added` @@ -33,7 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - [#677](https://github.com/nf-core/ampliseq/pull/677) - Added cut_its information to SDBI export -- [#711](https://github.com/nf-core/ampliseq/pull/711) - Changed code in taxref_reformat_gtdb.sh so it can take both bacteria and Archaea. Check issue [#708](https://github.com/nf-core/ampliseq/issues/708) for more info. ### `Fixed` @@ -48,8 +51,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Removed` -- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0 - ## nf-core/ampliseq version 2.7.1 - 2023-11-14 ### `Added` diff --git a/bin/filt_clusters.py b/bin/filt_clusters.py index 53681f033..c54310e3f 100755 --- a/bin/filt_clusters.py +++ b/bin/filt_clusters.py @@ -3,6 +3,7 @@ import argparse import gzip import pandas as pd +import sys usage = """This program filters ASVs that aren't centroids after post-clustering.""" @@ -30,23 +31,25 @@ "-c", "--cluster-fastas", dest="cluster_fastas", - type=str, + type=argparse.FileType('r'), + default=sys.stdin, help="Space separated list of fasta files of the clusters. First read of the cluster should be the centroid of that cluster.", required=True, ) -args = parser.parse_args() +count = parser.parse_args().count +prefix = parser.parse_args().prefix # This dictionary will store the centroid ASVs as keys, and the values will be the ASVs clustered to that centroid cluster_dict = {} # Loop though list of cluster fasta files to populate cluster_dict and to create centroid fasta file -cluster_fastas = args.cluster_fastas.split(" ") +cluster_fastas = parser.parse_args().cluster_fastas.read().rstrip().split(" ") for cluster_fasta in cluster_fastas: read_num = 0 # Loop through each line of current fasta file and open output fasta file in append mode - with gzip.open(cluster_fasta, "rt") as in_fasta, open(args.prefix + "_filtered.fna", "a") as out_fasta: + with gzip.open(cluster_fasta, "rt") as in_fasta, open(prefix + "_filtered.fna", "a") as out_fasta: for line in in_fasta: line = line.rstrip("\n") @@ -75,7 +78,7 @@ sam_asv_counts = {} # This count_df will have ASVs as the index, and samples as the header -count_df = pd.read_table(args.count, delimiter="\t", index_col=0, header=0) +count_df = pd.read_table(count, delimiter="\t", index_col=0, header=0) # Get the number of ASVs per sample before clustering for sample in count_df.columns: @@ -103,5 +106,5 @@ stats_df["ASVs_after_clustering"] = list(sam_asv_counts.values()) # Output filtered count tsv and stats tsv -count_df.to_csv(args.prefix + "_filtered.table.tsv", sep="\t") -stats_df.to_csv(args.prefix + "_filtered.stats.tsv", sep="\t", index=False) +count_df.to_csv(prefix + "_filtered.table.tsv", sep="\t") +stats_df.to_csv(prefix + "_filtered.stats.tsv", sep="\t", index=False) diff --git a/modules/local/filter_clusters.nf b/modules/local/filter_clusters.nf index 18b02fcb3..5c0dd20b6 100644 --- a/modules/local/filter_clusters.nf +++ b/modules/local/filter_clusters.nf @@ -24,7 +24,8 @@ process FILTER_CLUSTERS { def prefix = task.ext.prefix ?: "'$meta.id'" def clusters = "'$clusters'" """ - filt_clusters.py -t ${asv} -p ${prefix} -c ${clusters} + ulimit -s unlimited + echo ${clusters} | filt_clusters.py -t ${asv} -p ${prefix} -c - cat <<-END_VERSIONS > versions.yml "${task.process}": From 583deca30c82ae665781585b6457df9feb1d971a Mon Sep 17 00:00:00 2001 From: Daniel Straub <42973691+d4straub@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:57:05 +0100 Subject: [PATCH 2/2] Update CHANGELOG.md Co-authored-by: Daniel Lundin --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc09e5a13..cf1bda40a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1 -- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases +- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.) - [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters ### `Dependencies`