Merge pull request #118 from cthoyt/ner-tutorial

Add NER tutorial and update documentation configuration
gyorilab · Jun 30, 2023 · 4408ee3 · 4408ee3
2 parents 76daa5b + 816b1df
commit 4408ee3
Show file tree

Hide file tree

Showing 8 changed files with 143 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -30,10 +30,10 @@ also be regenerated locally by running `python -m gilda.generate_terms`.
 ## Documentation and notebooks
 Documentation for Gilda is available [here](https://gilda.readthedocs.io).
 We also provide several interactive Jupyter notebooks to help use and customize Gilda:
-- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda.
-- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom
+- [Gilda Introduction](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda.
+- [Custom Grounders](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom
 grounding resources.
-- [This notebook](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training
+- [Model Training](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training
 new disambiguation models.
 
 ## Usage
@@ -45,13 +45,22 @@ https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb
 ### Use as a Python package
 For using Gilda as a Python package, the documentation at
 http://gilda.readthedocs.org provides detailed descriptions of each module of
-Gilda and their usage. A basic usage example is as follows
+Gilda and their usage. A basic usage example for named entity normalization (NEN),
+or _grounding_, is as follows:
 
 ```python
 import gilda
 scored_matches = gilda.ground('ER', context='Calcium is released from the ER.')
 ```
 
+Gilda also implements a simple dictionary-based named entity recognition (NER)
+algorithm that can be used as follows:
+
+```python
+import gilda
+results = gilda.annotate('Calcium is released from the ER.')
+```
+
 ### Use as a web service
 The REST service accepts POST requests with a JSON header on the /ground
 endpoint. There is a public REST service running at http://grounding.indra.bio

diff --git a/doc/conf.py b/doc/conf.py
@@ -68,7 +68,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -171,6 +171,6 @@
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'https://docs.python.org/': None,
+    'python': ('https://docs.python.org/', None),
     'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None),
 }
diff --git a/doc/modules/index.rst b/doc/modules/index.rst
@@ -36,6 +36,13 @@ Process
     :members:
     :show-inheritance:
 
+Named Entity Recognition
+------------------------
+.. automodule:: gilda.ner
+    :members:
+    :show-inheritance:
+
+
 Pandas Utilities
 ----------------
 .. automodule:: gilda.pandas_utils

diff --git a/doc/requirements.txt b/doc/requirements.txt
@@ -1,4 +1,4 @@
-sphinx
+sphinx<7.0
 sphinx_autodoc_typehints
 sphinx_rtd_theme
 mock

diff --git a/gilda/__init__.py b/gilda/__init__.py
@@ -2,13 +2,14 @@
 
 import logging
 
-from .api import get_grounder, get_models, get_names, ground, make_grounder
+from .api import get_grounder, get_models, get_names, ground, make_grounder, annotate
 from .grounder import Grounder, ScoredMatch
 from .pandas_utils import ground_df, ground_df_map
 from .term import Term
 
 __all__ = [
     'ground',
+    'annotate',
     'get_models',
     'get_names',
     'get_grounder',

diff --git a/gilda/api.py b/gilda/api.py
@@ -1,4 +1,11 @@
-__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder']
+__all__ = [
+    'ground',
+    'get_models',
+    'get_names',
+    'get_grounder',
+    'make_grounder',
+    'annotate',
+]
 
 from typing import List, Mapping, Union, Optional
 
@@ -29,6 +36,10 @@ def get_names(self, db, id, status=None, source=None):
                                              status=status,
                                              source=source)
 
+    @property
+    def prefix_index(self):
+        return self.get_grounder().prefix_index
+
 
 grounder = GrounderInstance()
 
@@ -96,6 +107,48 @@ def ground(text, context=None, organisms=None, namespaces=None):
     return grounder.ground(text=text, context=context, organisms=organisms, namespaces=namespaces)
 
 
+def annotate(
+    text: str,
+    sent_split_fun=None,
+    organisms=None,
+    namespaces=None,
+):
+    """Annotate a given text with Gilda (i.e., do named entity recognition).
+
+    Parameters
+    ----------
+    text : str
+        The text to be annotated.
+    sent_split_fun : Callable, optional
+        A function that splits the text into sentences. The default is
+        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
+        as input and return an iterable of strings corresponding to the sentences
+        in the input text.
+    organisms : list[str], optional
+        A list of organism names to pass to the grounder. If not provided,
+        human is used.
+    namespaces : list[str], optional
+        A list of namespaces to pass to the grounder to restrict the matches
+        to. By default, no restriction is applied.
+
+    Returns
+    -------
+    list[tuple[str, ScoredMatch, int, int]]
+        A list of tuples of the entity text, the ScoredMatch object
+        corresponding to the entity, and the start and end character
+        offsets of the entity within the input text.
+    """
+    from .ner import annotate as _annotate
+
+    return _annotate(
+        text,
+        grounder=grounder,
+        sent_split_fun=sent_split_fun,
+        organisms=organisms,
+        namespaces=namespaces
+    )
+
+
 def get_models():
     """Return a list of entity texts for which disambiguation models exist.
 

diff --git a/gilda/ner.py b/gilda/ner.py
@@ -1,19 +1,75 @@
+"""
+Gilda implements a simple dictionary-based named entity
+recognition (NER) algorithm. It can be used as follows:
+
+>>> from gilda.ner import annotate
+>>> text = "MEK phosphorylates ERK"
+>>> results = annotate(text)
+
+The results are a list of 4-tuples containing:
+
+- the text string matched
+- a :class:`gilda.grounder.ScoredMatch` instance containing the *best* match
+- the position in the text string where the entity starts
+- the position in the text string where the entity ends
+
+In this example, the two concepts are grounded to FamPlex entries.
+
+>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3]
+('MEK', 'fplx:MEK', 0, 3)
+>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3]
+('ERK', 'fplx:ERK', 19, 22)
+
+If you look directly at the second element of the 4-tuple, you get a full
+description of the match itself:
+
+>>> results[0][1]
+ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
+0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
+False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))
+
+BRAT
+----
+Gilda implements a way to output annotation in a format appropriate for the
+`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_.
+
+>>> from gilda.ner import get_brat
+>>> from pathlib import Path
+>>> brat_string = get_brat(results)
+>>> Path("results.ann").write_text(brat_string)
+>>> Path("results.txt").write_text(text)
+
+For brat to work, you need to store the text in a file with
+the extension ``.txt`` and the annotations in a file with the
+same name but extension ``.ann``.
+"""
+
+from typing import List, Tuple
+
 from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize
 
 from gilda import ScoredMatch, get_grounder
 from gilda.process import normalize
 
+__all__ = [
+    "annotate",
+    "get_brat",
+    "Annotation",
+]
+
 stop_words = set(stopwords.words('english'))
 
+Annotation = Tuple[str, ScoredMatch, int, int]
+
 
 def annotate(
     text, *,
     grounder=None,
-    sent_split_fun=sent_tokenize,
+    sent_split_fun=None,
     organisms=None,
     namespaces=None,
-):
+) -> List[Annotation]:
     """Annotate a given text with Gilda.
 
     Parameters
@@ -24,8 +80,8 @@ def annotate(
         The Gilda grounder to use for grounding.
     sent_split_fun : Callable, optional
         A function that splits the text into sentences. The default is
-        nltk.tokenize.sent_tokenize. The function should take a string as
-        input and return an iterable of strings corresponding to the sentences
+        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
+        as input and return an iterable of strings corresponding to the sentences
         in the input text.
     organisms : list[str], optional
         A list of organism names to pass to the grounder. If not provided,
@@ -43,6 +99,8 @@ def annotate(
     """
     if grounder is None:
         grounder = get_grounder()
+    if sent_split_fun is None:
+        sent_split_fun = sent_tokenize
     # Get sentences
     sentences = sent_split_fun(text)
     text_coord = 0

diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
@@ -8,7 +8,7 @@ def test_annotate():
     full_text = \
         "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein."
 
-    annotations = annotate(full_text)
+    annotations = gilda.annotate(full_text)
     assert isinstance(annotations, list)
 
     # Check that we get 7 annotations
@@ -42,7 +42,7 @@ def test_get_brat():
     full_text = \
         "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein."
 
-    brat_str = get_brat(annotate(full_text))
+    brat_str = get_brat(gilda.annotate(full_text))
 
     assert isinstance(brat_str, str)
     match_str = dedent("""