diff --git a/CHANGELOG.md b/CHANGELOG.md
index bdab84a5..cf1bda40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,11 +15,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1
+- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 onwards, Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.)
+- [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering of vsearch clusters for a high number of clusters
 
 ### `Dependencies`
 
 ### `Removed`
 
+- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
+
 ## nf-core/ampliseq version 2.8.0 - 2024-01-16
 
 ### `Added`
@@ -33,7 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Changed`
 
 - [#677](https://github.com/nf-core/ampliseq/pull/677) - Added cut_its information to SDBI export
-- [#711](https://github.com/nf-core/ampliseq/pull/711) - Changed code in taxref_reformat_gtdb.sh so it can take both bacteria and Archaea. Check issue [#708](https://github.com/nf-core/ampliseq/issues/708) for more info.
 
 ### `Fixed`
 
@@ -48,8 +51,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Removed`
 
-- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
-
 ## nf-core/ampliseq version 2.7.1 - 2023-11-14
 
 ### `Added`
diff --git a/bin/filt_clusters.py b/bin/filt_clusters.py
index 53681f03..c54310e3 100755
--- a/bin/filt_clusters.py
+++ b/bin/filt_clusters.py
@@ -3,6 +3,7 @@
 import argparse
 import gzip
 import pandas as pd
+import sys
 
 usage = """This program filters ASVs that aren't centroids after post-clustering."""
 
@@ -30,23 +31,25 @@
     "-c",
     "--cluster-fastas",
     dest="cluster_fastas",
-    type=str,
+    type=argparse.FileType('r'),
+    default=sys.stdin,
     help="Space separated list of fasta files of the clusters. First read of the cluster should be the centroid of that cluster.",
     required=True,
 )
 
-args = parser.parse_args()
+count = parser.parse_args().count
+prefix = parser.parse_args().prefix
 
 # This dictionary will store the centroid ASVs as keys, and the values will be the ASVs clustered to that centroid
 cluster_dict = {}
 
 # Loop though list of cluster fasta files to populate cluster_dict and to create centroid fasta file
-cluster_fastas = args.cluster_fastas.split(" ")
+cluster_fastas = parser.parse_args().cluster_fastas.read().rstrip().split(" ")
 for cluster_fasta in cluster_fastas:
     read_num = 0
 
     # Loop through each line of current fasta file and open output fasta file in append mode
-    with gzip.open(cluster_fasta, "rt") as in_fasta, open(args.prefix + "_filtered.fna", "a") as out_fasta:
+    with gzip.open(cluster_fasta, "rt") as in_fasta, open(prefix + "_filtered.fna", "a") as out_fasta:
         for line in in_fasta:
             line = line.rstrip("\n")
 
@@ -75,7 +78,7 @@
 sam_asv_counts = {}
 
 # This count_df will have ASVs as the index, and samples as the header
-count_df = pd.read_table(args.count, delimiter="\t", index_col=0, header=0)
+count_df = pd.read_table(count, delimiter="\t", index_col=0, header=0)
 
 # Get the number of ASVs per sample before clustering
 for sample in count_df.columns:
@@ -103,5 +106,5 @@
 stats_df["ASVs_after_clustering"] = list(sam_asv_counts.values())
 
 # Output filtered count tsv and stats tsv
-count_df.to_csv(args.prefix + "_filtered.table.tsv", sep="\t")
-stats_df.to_csv(args.prefix + "_filtered.stats.tsv", sep="\t", index=False)
+count_df.to_csv(prefix + "_filtered.table.tsv", sep="\t")
+stats_df.to_csv(prefix + "_filtered.stats.tsv", sep="\t", index=False)
diff --git a/modules/local/filter_clusters.nf b/modules/local/filter_clusters.nf
index 18b02fcb..5c0dd20b 100644
--- a/modules/local/filter_clusters.nf
+++ b/modules/local/filter_clusters.nf
@@ -24,7 +24,8 @@ process FILTER_CLUSTERS {
     def prefix = task.ext.prefix ?: "'$meta.id'"
     def clusters = "'$clusters'"
     """
-    filt_clusters.py -t ${asv} -p ${prefix} -c ${clusters}
+    ulimit -s unlimited
+    echo ${clusters} | filt_clusters.py -t ${asv} -p ${prefix} -c -
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
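
The module change above pairs with the argparse change in bin/filt_clusters.py: the space separated list of cluster fasta files is now piped to the script on stdin ("-c -") instead of being passed as one long command-line argument, so a high number of clusters no longer overflows the shell's argument-length limit (on Linux that limit scales with the stack soft limit, which is why the process also runs "ulimit -s unlimited"). Below is a minimal, self-contained sketch of the stdin pattern, separate from the patch itself; the script and file names are hypothetical, not from the repository.

#!/usr/bin/env python3
# Sketch: accept a space separated file list on stdin via argparse.
# argparse.FileType("r") maps the conventional "-" value to sys.stdin,
# so callers can pipe an arbitrarily long list instead of passing it
# as a single (length-limited) command-line argument.

import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument(
    "-c",
    "--cluster-fastas",
    dest="cluster_fastas",
    type=argparse.FileType("r"),
    default=sys.stdin,
    help="File with a space separated list of fasta files; use '-' for stdin.",
)
args = parser.parse_args()

# Read the whole stream once, drop the trailing newline, split into names.
cluster_fastas = args.cluster_fastas.read().rstrip().split(" ")
print(cluster_fastas)

A caller would then pipe the list in, e.g. (hypothetical file names):

echo "cluster1.fna.gz cluster2.fna.gz" | ./sketch.py -c -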