From ce29d90c720ca8f382ab29f76bf5b3553c447b40 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 30 Jun 2023 15:55:23 +0200 Subject: [PATCH 1/6] Add NER tutorial and update documentation configuration --- doc/conf.py | 4 ++-- doc/modules/index.rst | 7 ++++++ doc/requirements.txt | 2 +- gilda/ner.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index ff0cd72..77d2929 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -68,7 +68,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -171,6 +171,6 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - 'https://docs.python.org/': None, + 'python': ('https://docs.python.org/', None), 'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None), } diff --git a/doc/modules/index.rst b/doc/modules/index.rst index 825d3d7..4b405d1 100644 --- a/doc/modules/index.rst +++ b/doc/modules/index.rst @@ -36,6 +36,13 @@ Process :members: :show-inheritance: +Named Entity Recognition +------------------------ +.. automodule:: gilda.ner + :members: + :show-inheritance: + + Pandas Utilities ---------------- .. automodule:: gilda.pandas_utils diff --git a/doc/requirements.txt b/doc/requirements.txt index 6e30d5e..b079ec9 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -sphinx +sphinx<7.0 sphinx_autodoc_typehints sphinx_rtd_theme mock diff --git a/gilda/ner.py b/gilda/ner.py index 737dc14..ddeb804 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -1,9 +1,59 @@ +""" +Gilda implements a simple dictionary-based named entity +recognition (NER) algorithm. 
It can be used as follows: + +>>> from gilda.ner import annotate +>>> text = "MEK phosphorylates ERK" +>>> results = annotate(text) + +The results are a list of 4-tuples containing: +- the text string matched +- a :class:`gilda.ScoredMatch` instance containing the _best_ match +- the position in the text string where the entity starts +- the position in the text string where the entity ends + +In this example, the two concepts are grounded to FamPlex entries. + +>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3] +('MEK', 'fplx:MEK', 0, 3) +>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3] +('ERK', 'fplx:ERK', 19, 22) + +If you directly look in the second part of the 4-tuple, you get a full +description of the match itself: + +>>> results[0][1] +ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\ +0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\ +False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')])) + +BRAT +---- +Gilda implements a way to output annotation in a format appropriate for the +`BRAT Rapid Annotation Tool (BRAT) `_ + +>>> from gilda.ner import get_brat +>>> from pathlib import Path +>>> brat_string = get_brat(results) +>>> Path("results.ann").write_text(brat_string) +>>> Path("results.txt").write_text(text) + +For brat to work, you need to store the text in a file with +the extension `.txt` and the annotations in a file with the +same name but extension `.ann`. 
+""" + from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize from gilda import ScoredMatch, get_grounder from gilda.process import normalize +__all__ = [ + "annotate", + "get_brat", +] + stop_words = set(stopwords.words('english')) From 0d59c65af2d0fd435ad03a23c72935a2269fae0b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 30 Jun 2023 16:00:49 +0200 Subject: [PATCH 2/6] Make build work --- README.md | 6 +++--- gilda/ner.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d87d4ff..bdaba4c 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,10 @@ also be regenerated locally by running `python -m gilda.generate_terms`. ## Documentation and notebooks Documentation for Gilda is available [here](https://gilda.readthedocs.io). We also provide several interactive Jupyter notebooks to help use and customize Gilda: -- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda. -- [This notebook](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom +- [Gilda Introduction](https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb) provides an interactive tutorial for using Gilda. +- [Custom Grounders](https://github.com/indralab/gilda/blob/master/notebooks/custom_grounders.ipynb) shows several examples of how Gilda can be instantiated with custom grounding resources. -- [This notebook](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training +- [Model Training](https://github.com/indralab/gilda/blob/master/models/model_training.ipynb) provides interactive sample code for training new disambiguation models. 
## Usage diff --git a/gilda/ner.py b/gilda/ner.py index ddeb804..171941b 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -7,8 +7,9 @@ >>> results = annotate(text) The results are a list of 4-tuples containing: + - the text string matched -- a :class:`gilda.ScoredMatch` instance containing the _best_ match +- a :class:`gilda.grounder.ScoredMatch` instance containing the _best_ match - the position in the text string where the entity starts - the position in the text string where the entity ends @@ -30,7 +31,7 @@ BRAT ---- Gilda implements a way to output annotation in a format appropriate for the -`BRAT Rapid Annotation Tool (BRAT) `_ +`BRAT Rapid Annotation Tool (BRAT) `_. >>> from gilda.ner import get_brat >>> from pathlib import Path @@ -39,8 +40,8 @@ >>> Path("results.txt").write_text(text) For brat to work, you need to store the text in a file with -the extension `.txt` and the annotations in a file with the -same name but extension `.ann`. +the extension ``.txt`` and the annotations in a file with the +same name but extension ``.ann``. 
""" from nltk.corpus import stopwords From 28358070672c12525c1e4707bc9f3dd29e8516a0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 30 Jun 2023 16:27:47 +0200 Subject: [PATCH 3/6] Add top-level annotation function --- README.md | 8 ++++++++ gilda/__init__.py | 3 ++- gilda/api.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++- gilda/ner.py | 15 ++++++++++---- 4 files changed, 71 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index bdaba4c..ba934b1 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,14 @@ import gilda scored_matches = gilda.ground('ER', context='Calcium is released from the ER.') ``` +Gilda can implements a simple dictionary-based named entity recognition (NER) +algorithm that can be used as follows: + +```python +import gilda +results = gilda.annotate('Calcium is released from the ER.') +``` + ### Use as a web service The REST service accepts POST requests with a JSON header on the /ground endpoint. There is a public REST service running at http://grounding.indra.bio diff --git a/gilda/__init__.py b/gilda/__init__.py index 2783935..3d9e85f 100644 --- a/gilda/__init__.py +++ b/gilda/__init__.py @@ -2,13 +2,14 @@ import logging -from .api import get_grounder, get_models, get_names, ground, make_grounder +from .api import get_grounder, get_models, get_names, ground, make_grounder, annotate from .grounder import Grounder, ScoredMatch from .pandas_utils import ground_df, ground_df_map from .term import Term __all__ = [ 'ground', + 'annotate', 'get_models', 'get_names', 'get_grounder', diff --git a/gilda/api.py b/gilda/api.py index 1e2f798..142143a 100644 --- a/gilda/api.py +++ b/gilda/api.py @@ -1,4 +1,11 @@ -__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder'] +__all__ = [ + 'ground', + 'get_models', + 'get_names', + 'get_grounder', + 'make_grounder', + 'annotate', +] from typing import List, Mapping, Union, Optional @@ -96,6 +103,48 @@ def ground(text, context=None, organisms=None, 
namespaces=None): return grounder.ground(text=text, context=context, organisms=organisms, namespaces=namespaces) +def annotate( + text: str, + sent_split_fun=None, + organisms=None, + namespaces=None, +): + """Annotate a given text with Gilda (i.e., do named entity recognition). + + Parameters + ---------- + text : str + The text to be annotated. + sent_split_fun : Callable, optional + A function that splits the text into sentences. The default is + :func:`nltk.tokenize.sent_tokenize`. The function should take a string + as input and return an iterable of strings corresponding to the sentences + in the input text. + organisms : list[str], optional + A list of organism names to pass to the grounder. If not provided, + human is used. + namespaces : list[str], optional + A list of namespaces to pass to the grounder to restrict the matches + to. By default, no restriction is applied. + + Returns + ------- + list[tuple[str, ScoredMatch, int, int]] + A list of 4-tuples containing the matched entity text, the + ScoredMatch object for the best match, and the start and end + character offsets of the entity in the input text. + """ + from .ner import annotate as _annotate + + return _annotate( + text, + grounder=grounder, + sent_split_fun=sent_split_fun, + organisms=organisms, + namespaces=namespaces + ) + + def get_models(): """Return a list of entity texts for which disambiguation models exist. diff --git a/gilda/ner.py b/gilda/ner.py index 171941b..c2c83c6 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -44,6 +44,8 @@ same name but extension ``.ann``. 
""" +from typing import List, Tuple + from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize @@ -53,18 +55,21 @@ __all__ = [ "annotate", "get_brat", + "Annotation", ] stop_words = set(stopwords.words('english')) +Annotation = Tuple[str, ScoredMatch, int, int] + def annotate( text, *, grounder=None, - sent_split_fun=sent_tokenize, + sent_split_fun=None, organisms=None, namespaces=None, -): +) -> List[Annotation]: """Annotate a given text with Gilda. Parameters @@ -75,8 +80,8 @@ def annotate( The Gilda grounder to use for grounding. sent_split_fun : Callable, optional A function that splits the text into sentences. The default is - nltk.tokenize.sent_tokenize. The function should take a string as - input and return an iterable of strings corresponding to the sentences + :func:`nltk.tokenize.sent_tokenize`. The function should take a string + as input and return an iterable of strings corresponding to the sentences in the input text. organisms : list[str], optional A list of organism names to pass to the grounder. If not provided, @@ -94,6 +99,8 @@ def annotate( """ if grounder is None: grounder = get_grounder() + if sent_split_fun is None: + sent_split_fun = sent_tokenize # Get sentences sentences = sent_split_fun(text) text_coord = 0 From a746315927f611a824307233924b36fad4e37fcd Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 30 Jun 2023 16:30:10 +0200 Subject: [PATCH 4/6] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ba934b1..1b194fe 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,8 @@ https://github.com/indralab/gilda/blob/master/notebooks/gilda_introduction.ipynb ### Use as a Python package For using Gilda as a Python package, the documentation at http://gilda.readthedocs.org provides detailed descriptions of each module of -Gilda and their usage. A basic usage example is as follows +Gilda and their usage. 
A basic usage example for named entity normalization (NEN), +or _grounding_, is as follows: ```python import gilda From dcda7f6aa244c0ef54000f2898596155a0ea3361 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 30 Jun 2023 16:45:27 +0200 Subject: [PATCH 5/6] Add additional test Found that something else needs to get exposed --- gilda/api.py | 4 ++++ gilda/tests/test_ner.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gilda/api.py b/gilda/api.py index 142143a..5906d02 100644 --- a/gilda/api.py +++ b/gilda/api.py @@ -36,6 +36,10 @@ def get_names(self, db, id, status=None, source=None): status=status, source=source) + @property + def prefix_index(self): + return self.get_grounder().prefix_index + grounder = GrounderInstance() diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index 9de2f19..5d6b508 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -8,7 +8,7 @@ def test_annotate(): full_text = \ "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein." - annotations = annotate(full_text) + annotations = gilda.annotate(full_text) assert isinstance(annotations, list) # Check that we get 7 annotations @@ -42,7 +42,7 @@ def test_get_brat(): full_text = \ "The protein BRAF is a kinase.\nBRAF is a gene.\nBRAF is a protein." 
- brat_str = get_brat(annotate(full_text)) + brat_str = get_brat(gilda.annotate(full_text)) assert isinstance(brat_str, str) match_str = dedent(""" From 816b1dfac2114cc16884686e6d3520809cf2f8a3 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Fri, 30 Jun 2023 11:02:48 -0400 Subject: [PATCH 6/6] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b194fe..0e5fd18 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ import gilda scored_matches = gilda.ground('ER', context='Calcium is released from the ER.') ``` -Gilda can implements a simple dictionary-based named entity recognition (NER) +Gilda also implements a simple dictionary-based named entity recognition (NER) algorithm that can be used as follows: ```python