Skip to content

Commit f3879e9

Browse files
committed
do fastq average quality calculation in SQL instead of a UDF
1 parent dd16999 commit f3879e9

File tree

1 file changed

+7 −19 lines changed

1 file changed

+7 −19 lines changed

countess/plugins/fastq.py

Lines changed: 7 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,6 @@
1111
logger = logging.getLogger(__name__)
1212

1313

14-
# UDF for calculating average quality so it can be filtered.
15-
# XXX it would be great to do this as a native DuckDB function
16-
# as calling a Python UDF is quite slow.
17-
def _fastq_avg_quality(quality: str) -> float:
18-
q_bytes = quality.encode("ascii")
19-
return sum(q_bytes) / len(q_bytes) - 33
20-
21-
2214
class LoadFastqPlugin(DuckdbLoadFilePlugin):
2315
"""Load counts from one or more FASTQ files, by first building a dask dataframe of raw sequences
2416
with count=1 and then grouping by sequence and summing counts. It supports counting
@@ -46,16 +38,10 @@ def load_file(
4638
rel = rel.limit(row_limit)
4739

4840
if self.min_avg_quality > 0:
49-
try:
50-
cursor.create_function(
51-
"fastq_avg_quality",
52-
_fastq_avg_quality,
53-
exception_handling=duckdb.PythonExceptionHandling.RETURN_NULL,
54-
side_effects=False,
55-
)
56-
except duckdb.CatalogException as exc:
57-
assert "fastq_avg_quality" in str(exc)
58-
rel = rel.filter("fastq_avg_quality(quality_scores) >= %f" % self.min_avg_quality.value)
41+
rel = rel.filter(
42+
"list_aggregate(list_transform(string_split(quality_scores, ''), x -> ord(x)), 'avg') - 33 >= %f"
43+
% self.min_avg_quality.value
44+
)
5945

6046
if self.group:
6147
rel = rel.aggregate("sequence, count(*) as count")
@@ -69,7 +55,9 @@ def combine(
6955
self, ddbc: duckdb.DuckDBPyConnection, tables: Iterable[duckdb.DuckDBPyRelation]
7056
) -> Optional[duckdb.DuckDBPyRelation]:
7157
combined_view = super().combine(ddbc, tables)
72-
if self.filename_column or self.header_column or combined_view is None:
58+
if combined_view is None:
59+
return None
60+
elif self.filename_column or self.header_column:
7361
return combined_view
7462
else:
7563
return combined_view.aggregate("sequence, sum(count) as count")

0 commit comments

Comments
 (0)