Skip to content

Commit

Permalink
Fix the role of raw vs processed words
Browse files: browse the repository at this point in the history
  • Loading branch information
bgyori committed Jul 24, 2024
1 parent 72fe19c commit cfc0dcd
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 8 deletions.
7 changes: 2 additions & 5 deletions gilda/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,19 +140,16 @@ def annotate(
for span in sorted(applicable_spans, reverse=True):
# We have to reconstruct a text span while adding spaces
# where needed
txt_span = ''
raw_span = ''
for w, rw, c in zip(words[idx:idx+span],
raw_words[idx:idx+span],
for rw, c in zip(raw_words[idx:idx+span],
raw_word_coords[idx:idx+span]):
# Figure out if we need a space before this word, then
# append the word.
spaces = ' ' * (c[0] - len(raw_span) -
raw_word_coords[idx][0])
txt_span += spaces + w
raw_span += spaces + rw
context = text if context_text is None else context_text
matches = grounder.ground(txt_span,
matches = grounder.ground(raw_span,
context=context,
organisms=organisms,
namespaces=namespaces)
Expand Down
6 changes: 3 additions & 3 deletions gilda/tests/test_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def test_context_test():
context_text = "Calcium is released from the ER."
results = gilda.annotate(text, context_text=context_text)
assert len(results) == 1
assert results[1].matches[0].term.get_curie() == "GO:0005783"
assert results[1].text == "ER"
assert (results[1].start, results[0].end) == (14, 16)
assert results[0].matches[0].term.get_curie() == "GO:0005783"
assert results[0].text == "ER"
assert (results[0].start, results[0].end) == (14, 16)


def test_punctuation_comma_in_entity():
Expand Down

0 comments on commit cfc0dcd

Please sign in to comment.