From 74668137aa7379ba5881358b6612b22bee5fa5de Mon Sep 17 00:00:00 2001
From: d4straub <daniel.straub@uni-tuebingen.de>
Date: Wed, 20 Mar 2024 15:05:10 +0100
Subject: [PATCH 1/2] fix vsearch cluster filtering

---
 CHANGELOG.md                     |  7 ++++---
 bin/filt_clusters.py             | 17 ++++++++++-------
 modules/local/filter_clusters.nf |  3 ++-
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bdab84a5c..fc09e5a13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,11 +15,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1
+- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases
+- [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters
 
 ### `Dependencies`
 
 ### `Removed`
 
+- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
+
 ## nf-core/ampliseq version 2.8.0 - 2024-01-16
 
 ### `Added`
@@ -33,7 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Changed`
 
 - [#677](https://github.com/nf-core/ampliseq/pull/677) - Added cut_its information to SDBI export
-- [#711](https://github.com/nf-core/ampliseq/pull/711) - Changed code in taxref_reformat_gtdb.sh so it can take both bacteria and Archaea. Check issue [#708](https://github.com/nf-core/ampliseq/issues/708) for more info.
 
 ### `Fixed`
 
@@ -48,8 +51,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Removed`
 
-- [#710](https://github.com/nf-core/ampliseq/pull/710) - Removed Phyloref from DADA2 reference option because it's part of PR2 5.0.0
-
 ## nf-core/ampliseq version 2.7.1 - 2023-11-14
 
 ### `Added`
diff --git a/bin/filt_clusters.py b/bin/filt_clusters.py
index 53681f033..c54310e3f 100755
--- a/bin/filt_clusters.py
+++ b/bin/filt_clusters.py
@@ -3,6 +3,7 @@
 import argparse
 import gzip
 import pandas as pd
+import sys
 
 usage = """This program filters ASVs that aren't centroids after post-clustering."""
 
@@ -30,23 +31,25 @@
     "-c",
     "--cluster-fastas",
     dest="cluster_fastas",
-    type=str,
+    type=argparse.FileType('r'),
+    default=sys.stdin,
     help="Space separated list of fasta files of the clusters. First read of the cluster should be the centroid of that cluster.",
     required=True,
 )
 
-args = parser.parse_args()
+count = parser.parse_args().count
+prefix = parser.parse_args().prefix
 
 # This dictionary will store the centroid ASVs as keys, and the values will be the ASVs clustered to that centroid
 cluster_dict = {}
 
 # Loop though list of cluster fasta files to populate cluster_dict and to create centroid fasta file
-cluster_fastas = args.cluster_fastas.split(" ")
+cluster_fastas = parser.parse_args().cluster_fastas.read().rstrip().split(" ")
 for cluster_fasta in cluster_fastas:
     read_num = 0
 
     # Loop through each line of current fasta file and open output fasta file in append mode
-    with gzip.open(cluster_fasta, "rt") as in_fasta, open(args.prefix + "_filtered.fna", "a") as out_fasta:
+    with gzip.open(cluster_fasta, "rt") as in_fasta, open(prefix + "_filtered.fna", "a") as out_fasta:
         for line in in_fasta:
             line = line.rstrip("\n")
 
@@ -75,7 +78,7 @@
 sam_asv_counts = {}
 
 # This count_df will have ASVs as the index, and samples as the header
-count_df = pd.read_table(args.count, delimiter="\t", index_col=0, header=0)
+count_df = pd.read_table(count, delimiter="\t", index_col=0, header=0)
 
 # Get the number of ASVs per sample before clustering
 for sample in count_df.columns:
@@ -103,5 +106,5 @@
 stats_df["ASVs_after_clustering"] = list(sam_asv_counts.values())
 
 # Output filtered count tsv and stats tsv
-count_df.to_csv(args.prefix + "_filtered.table.tsv", sep="\t")
-stats_df.to_csv(args.prefix + "_filtered.stats.tsv", sep="\t", index=False)
+count_df.to_csv(prefix + "_filtered.table.tsv", sep="\t")
+stats_df.to_csv(prefix + "_filtered.stats.tsv", sep="\t", index=False)
diff --git a/modules/local/filter_clusters.nf b/modules/local/filter_clusters.nf
index 18b02fcb3..5c0dd20b6 100644
--- a/modules/local/filter_clusters.nf
+++ b/modules/local/filter_clusters.nf
@@ -24,7 +24,8 @@ process FILTER_CLUSTERS {
     def prefix   = task.ext.prefix ?: "'$meta.id'"
     def clusters = "'$clusters'"
     """
-    filt_clusters.py -t ${asv} -p ${prefix} -c ${clusters}
+    ulimit -s unlimited
+    echo ${clusters} | filt_clusters.py -t ${asv} -p ${prefix} -c -
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 583deca30c82ae665781585b6457df9feb1d971a Mon Sep 17 00:00:00 2001
From: Daniel Straub <42973691+d4straub@users.noreply.github.com>
Date: Wed, 20 Mar 2024 16:57:05 +0100
Subject: [PATCH 2/2] Update CHANGELOG.md

Co-authored-by: Daniel Lundin <erik.rikard.daniel@gmail.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc09e5a13..cf1bda40a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#697](https://github.com/nf-core/ampliseq/pull/697),[#699](https://github.com/nf-core/ampliseq/pull/699),[#713](https://github.com/nf-core/ampliseq/pull/713) - Template update for nf-core/tools version 2.13.1
-- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases
+- [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 onwards, Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.)
 - [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters
 
 ### `Dependencies`