From ce29d90c720ca8f382ab29f76bf5b3553c447b40 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 30 Jun 2023 15:55:23 +0200
Subject: [PATCH 1/6] Add NER tutorial and update documentation configuration

---
 doc/conf.py           |  4 ++--
 doc/modules/index.rst |  7 ++++++
 doc/requirements.txt  |  2 +-
 gilda/ner.py          | 50 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index ff0cd72..77d2929 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -68,7 +68,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -171,6 +171,6 @@
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'https://docs.python.org/': None,
+    'python': ('https://docs.python.org/', None),
     'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None),
 }
diff --git a/doc/modules/index.rst b/doc/modules/index.rst
index 825d3d7..4b405d1 100644
--- a/doc/modules/index.rst
+++ b/doc/modules/index.rst
@@ -36,6 +36,13 @@ Process
     :members:
     :show-inheritance:
 
+Named Entity Recognition
+------------------------
+.. automodule:: gilda.ner
+    :members:
+    :show-inheritance:
+
+
 Pandas Utilities
 ----------------
 .. automodule:: gilda.pandas_utils
diff --git a/doc/requirements.txt b/doc/requirements.txt
index 6e30d5e..b079ec9 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -1,4 +1,4 @@
-sphinx
+sphinx<7.0
 sphinx_autodoc_typehints
 sphinx_rtd_theme
 mock
diff --git a/gilda/ner.py b/gilda/ner.py
index 737dc14..ddeb804 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -1,9 +1,59 @@
+"""
+Gilda implements a simple dictionary-based named entity
+recognition (NER) algorithm. It can be used as follows:
+
+>>> from gilda.ner import annotate
+>>> text = "MEK phosphorylates ERK"
+>>> results = annotate(text)
+
+The results are a list of 4-tuples containing:
+- the text string matched
+- a :class:`gilda.ScoredMatch` instance containing the _best_ match
+- the position in the text string where the entity starts
+- the position in the text string where the entity ends
+
+In this example, the two concepts are grounded to FamPlex entries.
+
+>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3]
+('MEK', 'fplx:MEK', 0, 3)
+>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3]
+('ERK', 'fplx:ERK', 19, 22)
+
+If you directly look in the second part of the 4-tuple, you get a full
+description of the match itself:
+
+>>> results[0][1]
+ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
+0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
+False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))
+
+BRAT
+----
+Gilda implements a way to output annotation in a format appropriate for the
+`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_
+
+>>> from gilda.ner import get_brat
+>>> from pathlib import Path
+>>> brat_string = get_brat(results)
+>>> Path("results.ann").write_text(brat_string)
+>>> Path("results.txt").write_text(text)
+
+For brat to work, you need to store the text in a file with
+the extension `.txt` and the annotations in a file with the
+same name but extension `.ann`.
+"""
+
 from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize
 
 from gilda import ScoredMatch, get_grounder
 from gilda.process import normalize
 
+__all__ = [
+    "annotate",
+    "get_brat",
+]
+
 stop_words = set(stopwords.words('english'))
 
 

From 0d59c65af2d0fd435ad03a23c72935a2269fae0b Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 30 Jun 2023 16:00:49 +0200
Subject: [PATCH 2/6] Make build work

---
 README.md    | 6 +++---
 gilda/ner.py | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d87d4ff..bdaba4c 100644
--- a/README.md
+++ b/README.md
@@ -30,10 +30,10 @@ also be regenerated locally by running `python -m gilda.generate_terms`.
 ## Documentation and notebooks
 Documentation for Gilda is available [here](https://gilda.readthedocs.io).
 We also provide several interactive Jupyter notebooks to help use and customize Gilda:
-- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda.
-- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom
+- [Gilda Introduction](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda.
+- [Custom Grounders](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom
 grounding resources.
-- [This notebook](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training
+- [Model Training](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training
 new disambiguation models.
 
 ## Usage
diff --git a/gilda/ner.py b/gilda/ner.py
index ddeb804..171941b 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -7,8 +7,9 @@
 >>> results = annotate(text)
 
 The results are a list of 4-tuples containing:
+
 - the text string matched
-- a :class:`gilda.ScoredMatch` instance containing the _best_ match
+- a :class:`gilda.grounder.ScoredMatch` instance containing the *best* match
 - the position in the text string where the entity starts
 - the position in the text string where the entity ends
 
@@ -30,7 +31,7 @@
 BRAT
 ----
 Gilda implements a way to output annotation in a format appropriate for the
-`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_
+`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_.
 
 >>> from gilda.ner import get_brat
 >>> from pathlib import Path
@@ -39,8 +40,8 @@
 >>> Path("results.txt").write_text(text)
 
 For brat to work, you need to store the text in a file with
-the extension `.txt` and the annotations in a file with the
-same name but extension `.ann`.
+the extension ``.txt`` and the annotations in a file with the
+same name but extension ``.ann``.
 """
 
 from nltk.corpus import stopwords

From 28358070672c12525c1e4707bc9f3dd29e8516a0 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 30 Jun 2023 16:27:47 +0200
Subject: [PATCH 3/6] Add top-level annotation function

---
 README.md         |  8 ++++++++
 gilda/__init__.py |  3 ++-
 gilda/api.py      | 51 ++++++++++++++++++++++++++++++++++++++++++++++-
 gilda/ner.py      | 15 ++++++++++----
 4 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bdaba4c..ba934b1 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,14 @@ import gilda
 scored_matches = gilda.ground('ER', context='Calcium is released from the ER.')
 ```
 
+Gilda can implements a simple dictionary-based named entity recognition (NER)
+algorithm that can be used as follows:
+
+```python
+import gilda
+results = gilda.annotate('Calcium is released from the ER.')
+```
+
 ### Use as a web service
 The REST service accepts POST requests with a JSON header on the /ground
 endpoint. There is a public REST service running at http://grounding.indra.bio
diff --git a/gilda/__init__.py b/gilda/__init__.py
index 2783935..3d9e85f 100644
--- a/gilda/__init__.py
+++ b/gilda/__init__.py
@@ -2,13 +2,14 @@
 
 import logging
 
-from .api import get_grounder, get_models, get_names, ground, make_grounder
+from .api import get_grounder, get_models, get_names, ground, make_grounder, annotate
 from .grounder import Grounder, ScoredMatch
 from .pandas_utils import ground_df, ground_df_map
 from .term import Term
 
 __all__ = [
     'ground',
+    'annotate',
     'get_models',
     'get_names',
     'get_grounder',
diff --git a/gilda/api.py b/gilda/api.py
index 1e2f798..142143a 100644
--- a/gilda/api.py
+++ b/gilda/api.py
@@ -1,4 +1,11 @@
-__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder']
+__all__ = [
+    'ground',
+    'get_models',
+    'get_names',
+    'get_grounder',
+    'make_grounder',
+    'annotate',
+]
 
 from typing import List, Mapping, Union, Optional
 
@@ -96,6 +103,48 @@ def ground(text, context=None, organisms=None, namespaces=None):
     return grounder.ground(text=text, context=context, organisms=organisms, namespaces=namespaces)
 
 
+def annotate(
+    text: str,
+    sent_split_fun=None,
+    organisms=None,
+    namespaces=None,
+):
+    """Annotate a given text with Gilda (i.e., do named entity recognition).
+
+    Parameters
+    ----------
+    text : str
+        The text to be annotated.
+    sent_split_fun : Callable, optional
+        A function that splits the text into sentences. The default is
+        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
+        as input and return an iterable of strings corresponding to the sentences
+        in the input text.
+    organisms : list[str], optional
+        A list of organism names to pass to the grounder. If not provided,
+        human is used.
+    namespaces : list[str], optional
+        A list of namespaces to pass to the grounder to restrict the matches
+        to. By default, no restriction is applied.
+
+    Returns
+    -------
+    list[tuple[str, ScoredMatch, int, int]]
+        A list of 4-tuples, each containing the matched entity text, the
+        ScoredMatch object for the best match, and the start and end
+        character offsets of the entity in the input text.
+    """
+    from .ner import annotate as _annotate
+
+    return _annotate(
+        text,
+        grounder=grounder,
+        sent_split_fun=sent_split_fun,
+        organisms=organisms,
+        namespaces=namespaces
+    )
+
+
 def get_models():
     """Return a list of entity texts for which disambiguation models exist.
 
diff --git a/gilda/ner.py b/gilda/ner.py
index 171941b..c2c83c6 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -44,6 +44,8 @@
 same name but extension ``.ann``.
 """
 
+from typing import List, Tuple
+
 from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize
 
@@ -53,18 +55,21 @@
 __all__ = [
     "annotate",
     "get_brat",
+    "Annotation",
 ]
 
 stop_words = set(stopwords.words('english'))
 
+Annotation = Tuple[str, ScoredMatch, int, int]
+
 
 def annotate(
     text, *,
     grounder=None,
-    sent_split_fun=sent_tokenize,
+    sent_split_fun=None,
     organisms=None,
     namespaces=None,
-):
+) -> List[Annotation]:
     """Annotate a given text with Gilda.
 
     Parameters
@@ -75,8 +80,8 @@ def annotate(
         The Gilda grounder to use for grounding.
     sent_split_fun : Callable, optional
         A function that splits the text into sentences. The default is
-        nltk.tokenize.sent_tokenize. The function should take a string as
-        input and return an iterable of strings corresponding to the sentences
+        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
+        as input and return an iterable of strings corresponding to the sentences
         in the input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
@@ -94,6 +99,8 @@ def annotate(
     """
     if grounder is None:
         grounder = get_grounder()
+    if sent_split_fun is None:
+        sent_split_fun = sent_tokenize
     # Get sentences
     sentences = sent_split_fun(text)
     text_coord = 0

From a746315927f611a824307233924b36fad4e37fcd Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 30 Jun 2023 16:30:10 +0200
Subject: [PATCH 4/6] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ba934b1..1b194fe 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,8 @@ https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb
 ### Use as a Python package
 For using Gilda as a Python package, the documentation at
 http://gilda.readthedocs.org provides detailed descriptions of each module of
-Gilda and their usage. A basic usage example is as follows
+Gilda and their usage. A basic usage example for named entity normalization (NEN),
+or _grounding_, is as follows:
 
 ```python
 import gilda

From dcda7f6aa244c0ef54000f2898596155a0ea3361 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 30 Jun 2023 16:45:27 +0200
Subject: [PATCH 5/6] Add additional test

Found that something else needs to get exposed
---
 gilda/api.py            | 4 ++++
 gilda/tests/test_ner.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/gilda/api.py b/gilda/api.py
index 142143a..5906d02 100644
--- a/gilda/api.py
+++ b/gilda/api.py
@@ -36,6 +36,10 @@ def get_names(self, db, id, status=None, source=None):
                                              status=status,
                                              source=source)
 
+    @property
+    def prefix_index(self):
+        return self.get_grounder().prefix_index
+
 
 grounder = GrounderInstance()
 
diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
index 9de2f19..5d6b508 100644
--- a/gilda/tests/test_ner.py
+++ b/gilda/tests/test_ner.py
@@ -8,7 +8,7 @@ def test_annotate():
     full_text = \
         "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein."
 
-    annotations = annotate(full_text)
+    annotations = gilda.annotate(full_text)
     assert isinstance(annotations, list)
 
     # Check that we get 7 annotations
@@ -42,7 +42,7 @@ def test_get_brat():
     full_text = \
         "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein."
 
-    brat_str = get_brat(annotate(full_text))
+    brat_str = get_brat(gilda.annotate(full_text))
 
     assert isinstance(brat_str, str)
     match_str = dedent("""

From 816b1dfac2114cc16884686e6d3520809cf2f8a3 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Fri, 30 Jun 2023 11:02:48 -0400
Subject: [PATCH 6/6] Fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1b194fe..0e5fd18 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ import gilda
 scored_matches = gilda.ground('ER', context='Calcium is released from the ER.')
 ```
 
-Gilda can implements a simple dictionary-based named entity recognition (NER)
+Gilda also implements a simple dictionary-based named entity recognition (NER)
 algorithm that can be used as follows:
 
 ```python