From 0bc07984e205cb941d8358644f048dcbd300319c Mon Sep 17 00:00:00 2001 From: Tomo Oga Date: Tue, 5 Nov 2024 11:01:17 -0500 Subject: [PATCH 1/2] normalize stopwords as they are loaded from the stopwords file --- gilda/ner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gilda/ner.py b/gilda/ner.py index 49dae9e..cf7e0d3 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -70,7 +70,7 @@ def _load_stoplist() -> Set[str]: """Load NER stoplist from file.""" stoplist_path = STOPLIST_PATH with open(stoplist_path, 'r') as file: - stoplist = {line.strip() for line in file} + stoplist = {normalize(line.strip()) for line in file} return stoplist From 692fb2f794eea2029eac2b21ac3840f0c29e72ec Mon Sep 17 00:00:00 2001 From: Tomo Oga Date: Tue, 5 Nov 2024 11:22:28 -0500 Subject: [PATCH 2/2] add raw-word comparison against the stoplist; use this approach instead of normalizing the stoplist text --- gilda/ner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gilda/ner.py b/gilda/ner.py index cf7e0d3..153369f 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -70,7 +70,7 @@ def _load_stoplist() -> Set[str]: """Load NER stoplist from file.""" stoplist_path = STOPLIST_PATH with open(stoplist_path, 'r') as file: - stoplist = {normalize(line.strip()) for line in file} + stoplist = {line.strip() for line in file} return stoplist @@ -144,6 +144,8 @@ def annotate( continue if word in stop_words: continue + if raw_words[idx] in stop_words: + continue spans = grounder.prefix_index.get(word, set()) if not spans: continue