
Merge pull request #147 from gyorilab/use-coords
Use coordinates in custom sentence split function
bgyori authored Jul 25, 2024
2 parents 1ee3e85 + 4bcb4ea commit 1a29a1a
Showing 2 changed files with 18 additions and 17 deletions.
9 changes: 5 additions & 4 deletions gilda/api.py
```diff
@@ -120,11 +120,12 @@ def annotate(
     ----------
     text : str
         The text to be annotated.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[str, Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
```
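Under the revised contract, a custom `sent_split_fun` returns coordinate pairs rather than sentence strings, which keeps annotation offsets aligned with the original text no matter how sentences are split. A minimal sketch of a conforming splitter (the `newline_split` helper and the sample text are illustrative, not part of this change):

```python
import re

from gilda import annotate

def newline_split(text):
    # Hypothetical splitter: one "sentence" per newline-delimited line,
    # reported as (start, end) coordinates into the original string.
    for match in re.finditer(r"[^\n]+", text):
        yield match.start(), match.end()

text = "BRAF phosphorylates MEK1\nMEK1 then activates ERK"
annotations = annotate(text, sent_split_fun=newline_split)
```

Because only coordinates are returned, each resulting annotation's start and end offsets index directly into the original `text`.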
26 changes: 13 additions & 13 deletions gilda/ner.py
```diff
@@ -79,12 +79,13 @@ def annotate(
         The text to be annotated.
     grounder : gilda.grounder.Grounder, optional
         The Gilda grounder to use for grounding.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[str, Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
-    organisms : List[str], optional
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
+    organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
     namespaces : List[str], optional
```
```diff
@@ -103,18 +104,18 @@
     """
     if grounder is None:
        grounder = get_grounder()
-    sent_tokenizer = PunktSentenceTokenizer()
     if sent_split_fun is None:
-        sent_split_fun = sent_tokenizer.tokenize
+        sent_tokenizer = PunktSentenceTokenizer()
+        sent_split_fun = sent_tokenizer.span_tokenize
     # Get sentences
-    sentences = sent_split_fun(text)
-    sentence_coords = list(sent_tokenizer.span_tokenize(text))
+    sentence_coords = sent_split_fun(text)
     text_coord = 0
     annotations = []
     word_tokenizer = TreebankWordTokenizer()
-    # FIXME: a custom sentence split function can be inconsistent
-    # with the coordinates being used here which come from NLTK
-    for sentence, sentence_coord in zip(sentences, sentence_coords):
+    for sent_start, sent_end in sentence_coords:
+        sentence = text[sent_start:sent_end]
         # FIXME: one rare corner case is named entities with single quotes
         # in them which get tokenized in a weird way
         raw_word_coords = \
```
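The property the new default relies on is that `span_tokenize` yields `(start, end)` pairs that index into the original string, so `text[start:end]` recovers each sentence exactly. A quick illustration with made-up text:

```python
from nltk.tokenize import PunktSentenceTokenizer

text = "BRAF phosphorylates MEK1. MEK1 then activates ERK."
for start, end in PunktSentenceTokenizer().span_tokenize(text):
    # Each coordinate pair slices the original text back into a sentence,
    # so no separate bookkeeping is needed to align sentences with offsets.
    print((start, end), text[start:end])
```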
```diff
@@ -154,9 +155,8 @@ def annotate(
                 organisms=organisms,
                 namespaces=namespaces)
             if matches:
-                start_coord = sentence_coord[0] + raw_word_coords[idx][0]
-                end_coord = sentence_coord[0] + \
-                    raw_word_coords[idx+span-1][1]
+                start_coord = sent_start + raw_word_coords[idx][0]
+                end_coord = sent_start + raw_word_coords[idx+span-1][1]
                 annotations.append(Annotation(
                     raw_span, matches, start_coord, end_coord
                 ))
```
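The coordinate arithmetic above rebases sentence-local word offsets onto the full input: `raw_word_coords` (presumably produced by the `TreebankWordTokenizer`'s `span_tokenize`, given the truncated assignment in the earlier hunk) are relative to `sentence`, and adding `sent_start` yields absolute positions. A self-contained check under that assumption, with illustrative text:

```python
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

text = "BRAF phosphorylates MEK1. MEK1 then activates ERK."
spans = list(PunktSentenceTokenizer().span_tokenize(text))
sent_start, sent_end = spans[-1]
sentence = text[sent_start:sent_end]
for w_start, w_end in TreebankWordTokenizer().span_tokenize(sentence):
    # Word offsets are sentence-local; shifting by sent_start makes them
    # absolute offsets into the full text, as the diff above does.
    assert text[sent_start + w_start:sent_start + w_end] == sentence[w_start:w_end]
```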
