diff --git a/README.md b/README.md index d87d4ff..0e5fd18 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,10 @@ also be regenerated locally by running `python -m gilda.generate_terms`. ## Documentation and notebooks Documentation for Gilda is available [here](https://gilda.readthedocs.io). We also provide several interactive Jupyter notebooks to help use and customize Gilda: -- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda. -- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom +- [Gilda Introduction](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda. +- [Custom Grounders](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom grounding resources. -- [This notebook](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training +- [Model Training](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training new disambiguation models. ## Usage @@ -45,13 +45,22 @@ https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb ### Use as a Python package For using Gilda as a Python package, the documentation at http://gilda.readthedocs.org provides detailed descriptions of each module of -Gilda and their usage. A basic usage example is as follows +Gilda and their usage. 
A basic usage example for named entity normalization (NEN), +or _grounding_, is as follows: ```python import gilda scored_matches = gilda.ground('ER', context='Calcium is released from the ER.') ``` +Gilda also implements a simple dictionary-based named entity recognition (NER) +algorithm that can be used as follows: + +```python +import gilda +results = gilda.annotate('Calcium is released from the ER.') +``` + ### Use as a web service The REST service accepts POST requests with a JSON header on the /ground endpoint. There is a public REST service running at http://grounding.indra.bio diff --git a/doc/conf.py b/doc/conf.py index ff0cd72..77d2929 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -68,7 +68,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -171,6 +171,6 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - 'https://docs.python.org/': None, + 'python': ('https://docs.python.org/', None), 'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None), } diff --git a/doc/modules/index.rst b/doc/modules/index.rst index 825d3d7..4b405d1 100644 --- a/doc/modules/index.rst +++ b/doc/modules/index.rst @@ -36,6 +36,13 @@ Process :members: :show-inheritance: +Named Entity Recognition +------------------------ +.. automodule:: gilda.ner + :members: + :show-inheritance: + + Pandas Utilities ---------------- .. 
automodule:: gilda.pandas_utils diff --git a/doc/requirements.txt b/doc/requirements.txt index 6e30d5e..b079ec9 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -sphinx +sphinx<7.0 sphinx_autodoc_typehints sphinx_rtd_theme mock diff --git a/gilda/__init__.py b/gilda/__init__.py index 2783935..3d9e85f 100644 --- a/gilda/__init__.py +++ b/gilda/__init__.py @@ -2,13 +2,14 @@ import logging -from .api import get_grounder, get_models, get_names, ground, make_grounder +from .api import get_grounder, get_models, get_names, ground, make_grounder, annotate from .grounder import Grounder, ScoredMatch from .pandas_utils import ground_df, ground_df_map from .term import Term __all__ = [ 'ground', + 'annotate', 'get_models', 'get_names', 'get_grounder', diff --git a/gilda/api.py b/gilda/api.py index 1e2f798..5906d02 100644 --- a/gilda/api.py +++ b/gilda/api.py @@ -1,4 +1,11 @@ -__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder'] +__all__ = [ + 'ground', + 'get_models', + 'get_names', + 'get_grounder', + 'make_grounder', + 'annotate', +] from typing import List, Mapping, Union, Optional @@ -29,6 +36,10 @@ def get_names(self, db, id, status=None, source=None): status=status, source=source) + @property + def prefix_index(self): + return self.get_grounder().prefix_index + grounder = GrounderInstance() @@ -96,6 +107,48 @@ def ground(text, context=None, organisms=None, namespaces=None): return grounder.ground(text=text, context=context, organisms=organisms, namespaces=namespaces) +def annotate( + text: str, + sent_split_fun=None, + organisms=None, + namespaces=None, +): + """Annotate a given text with Gilda (i.e., do named entity recognition). + + Parameters + ---------- + text : str + The text to be annotated. + sent_split_fun : Callable, optional + A function that splits the text into sentences. The default is + :func:`nltk.tokenize.sent_tokenize`. 
The function should take a string + as input and return an iterable of strings corresponding to the sentences + in the input text. + organisms : list[str], optional + A list of organism names to pass to the grounder. If not provided, + human is used. + namespaces : list[str], optional + A list of namespaces to pass to the grounder to restrict the matches + to. By default, no restriction is applied. + + Returns + ------- + list[tuple[str, ScoredMatch, int, int]] + A list of tuples of the entity text, the ScoredMatch object + corresponding to the entity, and the start and end character offsets + of the entity in the text. + """ + from .ner import annotate as _annotate + + return _annotate( + text, + grounder=grounder, + sent_split_fun=sent_split_fun, + organisms=organisms, + namespaces=namespaces + ) + + def get_models(): """Return a list of entity texts for which disambiguation models exist. diff --git a/gilda/ner.py b/gilda/ner.py index 737dc14..c2c83c6 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -1,19 +1,75 @@ +""" +Gilda implements a simple dictionary-based named entity +recognition (NER) algorithm. It can be used as follows: + +>>> from gilda.ner import annotate +>>> text = "MEK phosphorylates ERK" +>>> results = annotate(text) + +The results are a list of 4-tuples containing: + +- the text string matched +- a :class:`gilda.grounder.ScoredMatch` instance containing the *best* match +- the position in the text string where the entity starts +- the position in the text string where the entity ends + +In this example, the two concepts are grounded to FamPlex entries. 
+ +>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3] +('MEK', 'fplx:MEK', 0, 3) +>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3] +('ERK', 'fplx:ERK', 19, 22) + +If you directly look in the second part of the 4-tuple, you get a full +description of the match itself: + +>>> results[0][1] +ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\ +0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\ +False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')])) + +BRAT +---- +Gilda implements a way to output annotations in a format appropriate for the +`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org>`_. + +>>> from gilda.ner import get_brat +>>> from pathlib import Path +>>> brat_string = get_brat(results) +>>> Path("results.ann").write_text(brat_string) +>>> Path("results.txt").write_text(text) + +For brat to work, you need to store the text in a file with +the extension ``.txt`` and the annotations in a file with the +same name but extension ``.ann``. +""" + +from typing import List, Tuple + from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize from gilda import ScoredMatch, get_grounder from gilda.process import normalize +__all__ = [ + "annotate", + "get_brat", + "Annotation", +] + stop_words = set(stopwords.words('english')) +Annotation = Tuple[str, ScoredMatch, int, int] + def annotate( text, *, grounder=None, - sent_split_fun=sent_tokenize, + sent_split_fun=None, organisms=None, namespaces=None, -): +) -> List[Annotation]: """Annotate a given text with Gilda. Parameters @@ -24,8 +80,8 @@ def annotate( The Gilda grounder to use for grounding. sent_split_fun : Callable, optional A function that splits the text into sentences. The default is - nltk.tokenize.sent_tokenize. The function should take a string as - input and return an iterable of strings corresponding to the sentences + :func:`nltk.tokenize.sent_tokenize`. 
The function should take a string + as input and return an iterable of strings corresponding to the sentences in the input text. organisms : list[str], optional A list of organism names to pass to the grounder. If not provided, @@ -43,6 +99,8 @@ def annotate( """ if grounder is None: grounder = get_grounder() + if sent_split_fun is None: + sent_split_fun = sent_tokenize # Get sentences sentences = sent_split_fun(text) text_coord = 0 diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index 9de2f19..5d6b508 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -8,7 +8,7 @@ def test_annotate(): full_text = \ "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein." - annotations = annotate(full_text) + annotations = gilda.annotate(full_text) assert isinstance(annotations, list) # Check that we get 7 annotations @@ -42,7 +42,7 @@ def test_get_brat(): full_text = \ "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein." - brat_str = get_brat(annotate(full_text)) + brat_str = get_brat(gilda.annotate(full_text)) assert isinstance(brat_str, str) match_str = dedent("""