Merge pull request #715 from d4straub/fix-vsearch-cluster
fix vsearch cluster filtering
d4straub authored Mar 25, 2024
2 parents d05e42e + 583deca commit f4ac5f6
Showing 3 changed files with 16 additions and 11 deletions.
CHANGELOG.md (7 changes: 4 additions & 3 deletions)

@@ -15,11 +15,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1
+- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.)
+- [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters
 
 ### `Dependencies`
 
 ### `Removed`
 
+- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
+
 ## nf-core/ampliseq version 2.8.0 - 2024-01-16
 
 ### `Added`
@@ -33,7 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Changed`
 
 - [#677](https://github.com/nf-core/ampliseq/pull/677) - Added cut_its information to SDBI export
-- [#711](https://github.com/nf-core/ampliseq/pull/711) - Changed code in taxref_reformat_gtdb.sh so it can take both bacteria and Archaea. Check issue [#708](https://github.com/nf-core/ampliseq/issues/708) for more info.
 
 ### `Fixed`
 
@@ -48,8 +51,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Removed`
 
-- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
-
 ## nf-core/ampliseq version 2.7.1 - 2023-11-14
 
 ### `Added`
bin/filt_clusters.py (17 changes: 10 additions & 7 deletions)

@@ -3,6 +3,7 @@
 import argparse
 import gzip
 import pandas as pd
+import sys
 
 usage = """This program filters ASVs that aren't centroids after post-clustering."""
 
@@ -30,23 +31,25 @@
     "-c",
     "--cluster-fastas",
     dest="cluster_fastas",
-    type=str,
+    type=argparse.FileType('r'),
+    default=sys.stdin,
     help="Space separated list of fasta files of the clusters. First read of the cluster should be the centroid of that cluster.",
     required=True,
 )
 
-args = parser.parse_args()
+count = parser.parse_args().count
+prefix = parser.parse_args().prefix
 
 # This dictionary will store the centroid ASVs as keys, and the values will be the ASVs clustered to that centroid
 cluster_dict = {}
 
 # Loop though list of cluster fasta files to populate cluster_dict and to create centroid fasta file
-cluster_fastas = args.cluster_fastas.split(" ")
+cluster_fastas = parser.parse_args().cluster_fastas.read().rstrip().split(" ")
 for cluster_fasta in cluster_fastas:
     read_num = 0
 
     # Loop through each line of current fasta file and open output fasta file in append mode
-    with gzip.open(cluster_fasta, "rt") as in_fasta, open(args.prefix + "_filtered.fna", "a") as out_fasta:
+    with gzip.open(cluster_fasta, "rt") as in_fasta, open(prefix + "_filtered.fna", "a") as out_fasta:
         for line in in_fasta:
             line = line.rstrip("\n")
 
@@ -75,7 +78,7 @@
 sam_asv_counts = {}
 
 # This count_df will have ASVs as the index, and samples as the header
-count_df = pd.read_table(args.count, delimiter="\t", index_col=0, header=0)
+count_df = pd.read_table(count, delimiter="\t", index_col=0, header=0)
 
 # Get the number of ASVs per sample before clustering
 for sample in count_df.columns:
@@ -103,5 +106,5 @@
 stats_df["ASVs_after_clustering"] = list(sam_asv_counts.values())
 
 # Output filtered count tsv and stats tsv
-count_df.to_csv(args.prefix + "_filtered.table.tsv", sep="\t")
-stats_df.to_csv(args.prefix + "_filtered.stats.tsv", sep="\t", index=False)
+count_df.to_csv(prefix + "_filtered.table.tsv", sep="\t")
+stats_df.to_csv(prefix + "_filtered.stats.tsv", sep="\t", index=False)
modules/local/filter_clusters.nf (3 changes: 2 additions & 1 deletion)

@@ -24,7 +24,8 @@ process FILTER_CLUSTERS {
     def prefix = task.ext.prefix ?: "'$meta.id'"
     def clusters = "'$clusters'"
     """
-    filt_clusters.py -t ${asv} -p ${prefix} -c ${clusters}
+    ulimit -s unlimited
+    echo ${clusters} | filt_clusters.py -t ${asv} -p ${prefix} -c -
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
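On the module side, the command now pipes the cluster list through stdin (`echo ${clusters} | filt_clusters.py ... -c -`) instead of interpolating it into the argument list, which sidesteps per-argument size limits when vsearch produces many clusters. A rough Python illustration of the same hand-off; the inline receiver script is a made-up stand-in for `filt_clusters.py`:

```python
import subprocess
import sys

# Build a space-separated file list, as the Nextflow variable ${clusters} would hold
clusters = " ".join(f"cluster_{i}.fna.gz" for i in range(1, 4))

# Pipe the list to a child process that reads it from stdin, mirroring
# `echo ${clusters} | filt_clusters.py -c -`; the receiver here just
# counts how many filenames arrived.
result = subprocess.run(
    [sys.executable, "-c", "import sys; print(len(sys.stdin.read().split()))"],
    input=clusters + "\n",
    capture_output=True,
    text=True,
)
print(result.stdout.strip())  # 3
```

Unlike command-line arguments, data sent over a pipe is streamed, so the list length is bounded only by what the reader can hold in memory.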
