diff --git a/CHANGELOG.md b/CHANGELOG.md index e2466879..6a5cd236 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#717](https://github.com/nf-core/ampliseq/pull/717) - Fix edge case for sorting file names by using radix method - [#718](https://github.com/nf-core/ampliseq/pull/718) - Require a minimum sequence length of 50bp for taxonomic classifcation after using ITSx - [#721](https://github.com/nf-core/ampliseq/pull/721) - Fix error `unknown recognition error type: groovyjarjarantlr4.v4.runtime.LexerNoViableAltException` caused by a missing `\` in nf-core module `pigz/uncompress` (which had no consequences but was confusing) +- [#722](https://github.com/nf-core/ampliseq/pull/722) - When barrnap detects several genes, select the one with the lowest e-value ### `Dependencies` diff --git a/bin/summarize_barrnap.py b/bin/summarize_barrnap.py index c7690bbe..8a87f90b 100755 --- a/bin/summarize_barrnap.py +++ b/bin/summarize_barrnap.py @@ -3,7 +3,8 @@ # Takes a list of files with barrnap predictions (rrna.arc.gff, rrna.bac.gff, etc) # for ASV sequences, extracts evalues for each prediction and summarize the results # in a new file "summary.gff". Assumes that the same program/barrnap version is -# used for all predictions. +# used for all predictions. If there is more than one gene for a given domain, +# retains only the lowest e-value (e.g. for full rRNA operon sequences). # import pandas as pd import sys @@ -27,7 +28,8 @@ method[asv] = rowparts[1] if asv not in evalues: evalues[asv] = dict() - evalues[asv][org] = rowparts[5] + if (org not in evalues[asv]) or (float(evalues[asv][org]) > float(rowparts[5])): + evalues[asv][org] = rowparts[5] fh.close() # Write results