From 50cb274131964e9c61a0a5be89f7c714ddf32fe2 Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 12:05:35 -0700
Subject: [PATCH 1/4] Assume splitter function returns coordinates instead of
 sentences

---
 gilda/ner.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 6feb574..6aa1ec8 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -79,12 +79,12 @@ def annotate(
         The text to be annotated.
     grounder : gilda.grounder.Grounder, optional
         The Gilda grounder to use for grounding.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
         :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
-    organisms : List[str], optional
+        as input and return an iterable of coordinate pairs corresponding to the
+        start and end coordinates for each sentence in the input text.
+    organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
     namespaces : List[str], optional
@@ -103,18 +103,18 @@
     """
     if grounder is None:
         grounder = get_grounder()
-    sent_tokenizer = PunktSentenceTokenizer()
     if sent_split_fun is None:
-        sent_split_fun = sent_tokenizer.tokenize
+        sent_tokenizer = PunktSentenceTokenizer()
+        sent_split_fun = sent_tokenizer.span_tokenize
     # Get sentences
-    sentences = sent_split_fun(text)
-    sentence_coords = list(sent_tokenizer.span_tokenize(text))
+    sentence_coords = sent_split_fun(text)
    text_coord = 0
    annotations = []
    word_tokenizer = TreebankWordTokenizer()
    # FIXME: a custom sentence split function can be inconsistent
    # with the coordinates being used here which come from NLTK
-    for sentence, sentence_coord in zip(sentences, sentence_coords):
+    for sentence_coord in sentence_coords:
+        sentence = text[sentence_coord[0]:sentence_coord[1]]
         # FIXME: one rare corner case is named entities with single quotes
         # in them which get tokenized in a weird way
         raw_word_coords = \

From 9caa293298c749cd7f2d5ccfcd707823e561261a Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 12:07:06 -0700
Subject: [PATCH 2/4] Use specific variables for start and end coordinates for
 clarity

---
 gilda/ner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 6aa1ec8..4aeb101 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -113,8 +113,8 @@
     word_tokenizer = TreebankWordTokenizer()
     # FIXME: a custom sentence split function can be inconsistent
     # with the coordinates being used here which come from NLTK
-    for sentence_coord in sentence_coords:
-        sentence = text[sentence_coord[0]:sentence_coord[1]]
+    for sent_start, sent_end in sentence_coords:
+        sentence = text[sent_start:sent_end]
         # FIXME: one rare corner case is named entities with single quotes
         # in them which get tokenized in a weird way
         raw_word_coords = \
@@ -154,9 +154,8 @@
                                         organisms=organisms,
                                         namespaces=namespaces)
         if matches:
-            start_coord = sentence_coord[0] + raw_word_coords[idx][0]
-            end_coord = sentence_coord[0] + \
-                raw_word_coords[idx+span-1][1]
+            start_coord = sent_start + raw_word_coords[idx][0]
+            end_coord = sent_end + raw_word_coords[idx+span-1][1]
             annotations.append(Annotation(
                 raw_span, matches, start_coord, end_coord
             ))
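The contract these first two patches establish for sent_split_fun — take the
full text, return one (start, end) character-offset pair per sentence — can be
exercised with any custom splitter. A minimal sketch of that contract follows;
the regex-based naive_span_split below is hypothetical and only illustrates
the expected return type, while the annotate keyword argument comes from the
patches above:

    import re
    from typing import Iterable, Tuple

    from gilda.ner import annotate

    def naive_span_split(text: str) -> Iterable[Tuple[int, int]]:
        # Yield (start, end) offsets anchored to the original text, so that
        # text[start:end] recovers each sentence — the same shape that
        # PunktSentenceTokenizer.span_tokenize returns.
        for match in re.finditer(r"[^.!?]+[.!?]?", text):
            leading_ws = len(match.group()) - len(match.group().lstrip())
            start = match.start() + leading_ws
            end = start + len(match.group().strip())
            if end > start:
                yield start, end

    annotations = annotate("BRAF is a kinase. It phosphorylates MEK.",
                           sent_split_fun=naive_span_split)

As the FIXME in the diff notes, a custom splitter must keep its offsets
consistent with the original text, since downstream word coordinates are
computed relative to each sentence slice.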
From 56c0e61ad7ccc7853d69d45bc03db041b15269ab Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 14:56:14 -0700
Subject: [PATCH 3/4] Fix typo

---
 gilda/ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 4aeb101..d6cc126 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -155,7 +155,7 @@ def annotate(
                                         namespaces=namespaces)
         if matches:
             start_coord = sent_start + raw_word_coords[idx][0]
-            end_coord = sent_end + raw_word_coords[idx+span-1][1]
+            end_coord = sent_start + raw_word_coords[idx+span-1][1]
             annotations.append(Annotation(
                 raw_span, matches, start_coord, end_coord
             ))

From 4bcb4ea2f8e440aa5eccde0b95063814bc0a061f Mon Sep 17 00:00:00 2001
From: kkaris
Date: Wed, 24 Jul 2024 15:22:23 -0700
Subject: [PATCH 4/4] Update docstring

---
 gilda/api.py | 9 +++++----
 gilda/ner.py | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/gilda/api.py b/gilda/api.py
index 1c988e8..afd971f 100644
--- a/gilda/api.py
+++ b/gilda/api.py
@@ -120,11 +120,12 @@ def annotate(
     ----------
     text : str
         The text to be annotated.
-    sent_split_fun : Callable, optional
+    sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of strings corresponding to the sentences
-        in the input text.
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
diff --git a/gilda/ner.py b/gilda/ner.py
index d6cc126..39a5009 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -81,9 +81,10 @@ def annotate(
         The Gilda grounder to use for grounding.
     sent_split_fun : Callable[[str], Iterable[Tuple[int, int]]], optional
         A function that splits the text into sentences. The default is
-        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
-        as input and return an iterable of coordinate pairs corresponding to the
-        start and end coordinates for each sentence in the input text.
+        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The function
+        should take a string as input and return an iterable of coordinate pairs
+        corresponding to the start and end coordinates for each sentence in the
+        input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
         human is used.
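The arithmetic PATCH 3 corrects can be sanity-checked in isolation: both ends
of an annotation are word offsets relative to the sentence, so both must be
shifted by the sentence's start offset, never its end. A self-contained sketch
using the same NLTK tokenizers the patches rely on — the sample text is
arbitrary, and the snippet is illustrative rather than part of the series:

    from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

    text = "BRAF is a kinase. KRAS is mutated in many cancers."
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()

    for sent_start, sent_end in sent_tokenizer.span_tokenize(text):
        sentence = text[sent_start:sent_end]
        for word_start, word_end in word_tokenizer.span_tokenize(sentence):
            # Both word offsets are sentence-relative, so both shift by
            # sent_start. Shifting the end by sent_end (the pre-PATCH-3 typo)
            # would point past the annotated span.
            start_coord = sent_start + word_start
            end_coord = sent_start + word_end
            assert text[start_coord:end_coord] == sentence[word_start:word_end]

The assert holds for every word in every sentence; replacing sent_start with
sent_end in the end_coord line makes it fail, which is exactly the off-by-a-
sentence bug the one-line fix removes.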