From 50cb274131964e9c61a0a5be89f7c714ddf32fe2 Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 12:05:35 -0700
Subject: [PATCH 1/4] Assume splitter function returns coordinates instead of
 sentences

---
 gilda/ner.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 6feb574..6aa1ec8 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -79,12 +79,12 @@ def annotate(
         The text to be annotated.
     grounder : gilda.grounder.Grounder, optional
         The Gilda grounder to use for grounding.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
         :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
-    organisms : List[str], optional
+        as input and return an iterable of coordinate pairs corresponding to the
+        start and end coordinates for each sentence in the input text.
+    organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
     namespaces : List[str], optional
@@ -103,18 +103,18 @@
     """
     if grounder is None:
         grounder = get_grounder()
-    sent_tokenizer = PunktSentenceTokenizer()
     if sent_split_fun is None:
-        sent_split_fun = sent_tokenizer.tokenize
+        sent_tokenizer = PunktSentenceTokenizer()
+        sent_split_fun = sent_tokenizer.span_tokenize
     # Get sentences
-    sentences = sent_split_fun(text)
-    sentence_coords = list(sent_tokenizer.span_tokenize(text))
+    sentence_coords = sent_split_fun(text)
    text_coord = 0
    annotations = []
    word_tokenizer = TreebankWordTokenizer()
    # FIXME: a custom sentence split function can be inconsistent
    # with the coordinates being used here which come from NLTK
-    for sentence, sentence_coord in zip(sentences, sentence_coords):
+    for sentence_coord in sentence_coords:
+        sentence = text[sentence_coord[0]:sentence_coord[1]]
         # FIXME: one rare corner case is named entities with single quotes
         # in them which get tokenized in a weird way
         raw_word_coords = \

From 9caa293298c749cd7f2d5ccfcd707823e561261a Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 12:07:06 -0700
Subject: [PATCH 2/4] Use specific variables for start and end coordinates for
 clarity

---
 gilda/ner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 6aa1ec8..4aeb101 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -113,8 +113,8 @@
     word_tokenizer = TreebankWordTokenizer()
     # FIXME: a custom sentence split function can be inconsistent
     # with the coordinates being used here which come from NLTK
-    for sentence_coord in sentence_coords:
-        sentence = text[sentence_coord[0]:sentence_coord[1]]
+    for sent_start, sent_end in sentence_coords:
+        sentence = text[sent_start:sent_end]
         # FIXME: one rare corner case is named entities with single quotes
         # in them which get tokenized in a weird way
         raw_word_coords = \
@@ -154,9 +154,8 @@
                                         organisms=organisms,
                                         namespaces=namespaces)
         if matches:
-            start_coord = sentence_coord[0] + raw_word_coords[idx][0]
-            end_coord = sentence_coord[0] + \
-                raw_word_coords[idx+span-1][1]
+            start_coord = sent_start + raw_word_coords[idx][0]
+            end_coord = sent_end + raw_word_coords[idx+span-1][1]
             annotations.append(Annotation(
                 raw_span, matches, start_coord, end_coord
             ))
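The contract these first two patches establish for sent_split_fun — take the
full text, return one (start, end) character-offset pair per sentence — can be
exercised with any custom splitter. A minimal sketch of that contract follows;
the regex-based naive_span_split below is hypothetical and only illustrates
the expected return type, while the annotate keyword argument comes from the
patches above:

    import re
    from typing import Iterable, Tuple

    from gilda.ner import annotate

    def naive_span_split(text: str) -> Iterable[Tuple[int, int]]:
        # Yield (start, end) offsets anchored to the original text, so that
        # text[start:end] recovers each sentence — the same shape that
        # PunktSentenceTokenizer.span_tokenize returns.
        for match in re.finditer(r"[^.!?]+[.!?]?", text):
            leading_ws = len(match.group()) - len(match.group().lstrip())
            start = match.start() + leading_ws
            end = start + len(match.group().strip())
            if end > start:
                yield start, end

    annotations = annotate("BRAF is a kinase. It phosphorylates MEK.",
                           sent_split_fun=naive_span_split)

As the FIXME in the diff notes, a custom splitter must keep its offsets
consistent with the original text, since downstream word coordinates are
computed relative to each sentence slice.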
From 56c0e61ad7ccc7853d69d45bc03db041b15269ab Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 14:56:14 -0700
Subject: [PATCH 3/4] Fix typo

---
 gilda/ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 4aeb101..d6cc126 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -155,7 +155,7 @@ def annotate(
                                         namespaces=namespaces)
         if matches:
             start_coord = sent_start + raw_word_coords[idx][0]
-            end_coord = sent_end + raw_word_coords[idx+span-1][1]
+            end_coord = sent_start + raw_word_coords[idx+span-1][1]
             annotations.append(Annotation(
                 raw_span, matches, start_coord, end_coord
             ))

From 4bcb4ea2f8e440aa5eccde0b95063814bc0a061f Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 15:22:23 -0700
Subject: [PATCH 4/4] Update docstring

---
 gilda/api.py | 9 +++++----
 gilda/ner.py | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/gilda/api.py b/gilda/api.py
index 1c988e8..afd971f 100644
--- a/gilda/api.py
+++ b/gilda/api.py
@@ -120,11 +120,12 @@ def annotate(
     ----------
     text : str
         The text to be annotated.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
diff --git a/gilda/ner.py b/gilda/ner.py
index d6cc126..39a5009 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -81,9 +81,10 @@ def annotate(
         The Gilda grounder to use for grounding.
     sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of coordinate pairs corresponding to the
-        start and end coordinates for each sentence in the input text.
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
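The arithmetic PATCH 3 corrects can be sanity-checked in isolation: both ends
of an annotation are word offsets relative to the sentence, so both must be
shifted by the sentence's start offset, never its end. A self-contained sketch
using the same NLTK tokenizers the patches rely on — the sample text is
arbitrary, and the snippet is illustrative rather than part of the series:

    from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

    text = "BRAF is a kinase. KRAS is mutated in many cancers."
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()

    for sent_start, sent_end in sent_tokenizer.span_tokenize(text):
        sentence = text[sent_start:sent_end]
        for word_start, word_end in word_tokenizer.span_tokenize(sentence):
            # Both word offsets are sentence-relative, so both shift by
            # sent_start. Shifting the end by sent_end (the pre-PATCH-3 typo)
            # would point past the annotated span.
            start_coord = sent_start + word_start
            end_coord = sent_start + word_end
            assert text[start_coord:end_coord] == sentence[word_start:word_end]

The assert holds for every word in every sentence; replacing sent_start with
sent_end in the end_coord line makes it fail, which is exactly the off-by-a-
sentence bug the one-line fix removes.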