1111logger = logging .getLogger (__name__ )
1212
1313
14- # UDF for calculating average quality so it can be filtered.
15- # XXX it would be great to do this as a native DuckDB function
16- # as calling a Python UDF is quite slow.
17- def _fastq_avg_quality (quality : str ) -> float :
18- q_bytes = quality .encode ("ascii" )
19- return sum (q_bytes ) / len (q_bytes ) - 33
20-
21-
2214class LoadFastqPlugin (DuckdbLoadFilePlugin ):
2315 """Load counts from one or more FASTQ files, by first building a dask dataframe of raw sequences
2416 with count=1 and then grouping by sequence and summing counts. It supports counting
@@ -46,16 +38,10 @@ def load_file(
4638 rel = rel .limit (row_limit )
4739
4840 if self .min_avg_quality > 0 :
49- try :
50- cursor .create_function (
51- "fastq_avg_quality" ,
52- _fastq_avg_quality ,
53- exception_handling = duckdb .PythonExceptionHandling .RETURN_NULL ,
54- side_effects = False ,
55- )
56- except duckdb .CatalogException as exc :
57- assert "fastq_avg_quality" in str (exc )
58- rel = rel .filter ("fastq_avg_quality(quality_scores) >= %f" % self .min_avg_quality .value )
41+ rel = rel .filter (
42+ "list_aggregate(list_transform(string_split(quality_scores, ''), x -> ord(x)), 'avg') - 33 >= %f"
43+ % self .min_avg_quality .value
44+ )
5945
6046 if self .group :
6147 rel = rel .aggregate ("sequence, count(*) as count" )
@@ -69,7 +55,9 @@ def combine(
6955 self , ddbc : duckdb .DuckDBPyConnection , tables : Iterable [duckdb .DuckDBPyRelation ]
7056 ) -> Optional [duckdb .DuckDBPyRelation ]:
7157 combined_view = super ().combine (ddbc , tables )
72- if self .filename_column or self .header_column or combined_view is None :
58+ if combined_view is None :
59+ return None
60+ elif self .filename_column or self .header_column :
7361 return combined_view
7462 else :
7563 return combined_view .aggregate ("sequence, sum(count) as count" )
0 commit comments