diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index c409646..8d044c5 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -17,7 +17,7 @@ build: # Build documentation in the "docs/" directory with Sphinx sphinx:
- configuration: docs/conf.py
+ configuration: docs/source/conf.py # Optionally build your docs in additional formats such as PDF and ePub # formats:
diff --git a/README-UI.md b/README-UI.md
new file mode 100644
index 0000000..a096884
--- /dev/null
+++ b/README-UI.md
@@ -0,0 +1,90 @@
+# ontology-mapper-ui
The following information pertains to the text2term UI, which is developed [here](https://github.com/ccb-hms/ontology-mapper-ui) and runs online [here](https://text2term.hms.harvard.edu/). It supports fewer features than the base package does, but provides a user interface for non-programmers.
+
+### Running Locally via Node + Python
+
+#### Requirements
+
+- Node >= 16.0.0
+- npm >= 8.0.0
+- Python >= 3.9.0
+- pip >= 21.0.0
+- text2term >= 1.1.0
+
+**\*** These are versions known to work; Python 3.9 or higher is required, but the other tools may not strictly require the listed versions.
+
+**\*\*** If you are running this locally in Google Chrome, you will likely run into issues with CORS (Cross-Origin Resource Sharing) that have not been fully resolved. It is recommended to use a different browser, use the Docker method, or disable CORS in Chrome while running this.
+
+#### Instructions
+
+##### Initial Setup
+
+After cloning the repository for the first time, run the command:
+
+
+```
+npm install
+```
+
+to install all necessary packages for the React frontend.
+
+Next, go into the `flask-api` folder (e.g. by running `cd flask-api`) and run
+
+```
+pip install -r requirements-flask.txt
+```
+
+to install the necessary packages for the Flask API.
+
+##### Running
+
+To run, make sure you are in the root of the repository and run, in two separate command line instances, the command
+
+```
+npm start
+```
+
+to start the front-end, which can be seen at `localhost:3000`, and the command
+
+```
+npm run flask-api
+```
+
+to start the back-end, which can be interacted with at `localhost:5000`.
+
+### Running Locally via Docker
+
+#### Requirements
+
+- Docker
+
+#### Instructions
+
+##### Initial Setup
+
+Before running, make sure the latest version of the repository is built by running the command
+
+```
+docker-compose build
+```
+
+Docker should build two images:
+
+- `ontology-mapper-api`: the Flask backend API
+- `ontology-mapper-client`: the React frontend
+
+##### Running
+
+To run the website, run the command:
+
+```
+docker-compose up
+```
+
+Docker should build two containers corresponding to the two images.
+
+In a browser, navigate to `localhost:8602` to see the front-end.
+
+### Acknowledgements
+
+The initial setup of React and Flask, as well as the Dockerization, was aided by an [article series](https://blog.miguelgrinberg.com/post/how-to-dockerize-a-react-flask-project) by Miguel Grinberg.
\ No newline at end of file
diff --git a/README.md b/README.md
index 08a52ee..f936def 100644
--- a/README.md
+++ b/README.md
@@ -65,22 +65,22 @@ text2term.map_terms(source_terms, save_mappings=False, separator=',', use_cache=False,
- term_type='classes',
+ term_type=OntologyTermType.CLASS, incl_unmapped=False) ``` NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into one function.
Users can now simply rename any of these function calls in old code to `map_terms`; the function inspects its input to provide the behavior of each of the former functions.
### Arguments
-For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3)dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below).
+For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, 3) a dictionary where the keys are the terms to be mapped and the values are tags or lists of tags, or 4) a list of TaggedTerm objects (see below).
Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which causes the term not to be mapped at all, though it is still included in the results if the incl_unmapped argument is True (see below). All other arguments are the same, and have the same functionality:
`target_ontology` : str
- Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma,
- provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies
- As of version 2.3.0, passing a recognized acronym to `target_ontology` will generate the download link automatically. This is done using the `bioregistry` python package.
+ Path or URL or acronym of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma,
+ provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. When the target ontology has been previously cached, provide the ontology name that was used to cache it.
+ As of version 2.3.0, it is possible to specify ontology acronyms as the `target_ontology` (eg "EFO" or "CL"), which is achieved using [bioregistry](https://bioregistry.io) to retrieve URLs for those acronyms.
`base_iris` : tuple Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example:
@@ -116,16 +116,16 @@ All other arguments are the same, and have the same functionality: Save the generated mappings to a file (specified by `output_file`)
`separator` : str
- Character that seperates the source term values if a file input is given. Ignored if the input is not a file path.
+ Character that separates the source term values if a file input is given. Ignored if the input is not a file path.
`use_cache` : bool Use the cache for the ontology. More details are below.
-`term_type` : str
- Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are ['classes', 'properties', 'both']. If it does not match one of these values, the program will throw a ValueError.
+`term_type` : term.OntologyTermType
+ Specifies whether to map to ontology classes, properties, or both. Possible values are ['class', 'property', 'any'].
`incl_unmapped` : bool
- Include all unmapped terms in the output. If something has been tagged Ignore (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output.
+ Include all unmapped terms in the output. If something has been tagged 'Ignore' (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output data frame.
All default values, if they exist, can be seen above. diff --git a/docs/source/_static/ccb_logo.jpg b/docs/source/_static/ccb_logo.jpg new file mode 100644 index 0000000..422182b Binary files /dev/null and b/docs/source/_static/ccb_logo.jpg differ diff --git a/docs/conf.py b/docs/source/conf.py similarity index 95% rename from docs/conf.py rename to docs/source/conf.py index ded1330..d745823 100644 --- a/docs/conf.py +++ b/docs/source/conf.py @@ -23,5 +23,6 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'alabaster' +html_theme = 'pyramid' html_static_path = ['_static'] +html_logo = "ccb_logo.jpg" diff --git a/docs/index.rst b/docs/source/index.rst similarity index 61% rename from docs/index.rst rename to docs/source/index.rst index 6456e30..46ed444 100644 --- a/docs/index.rst +++ b/docs/source/index.rst @@ -3,18 +3,26 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to text2term's documentation! +.. .. image:: ccb_logo.jpg +.. :alt: CCB's Logo +.. :scale: 50 % +.. :align: left + +Text2term ===================================== .. toctree:: :maxdepth: 2 - :caption: Contents: - +.. include:: ../../README.md + :parser: myst_parser.sphinx_ +.. include:: ../../README-UI.md + :parser: myst_parser.sphinx_ Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` + + diff --git a/requirements.txt b/requirements.txt index 0617121..98714ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy~=1.24.2 gensim~=4.3.0 scipy~=1.10.1 scikit-learn~=1.2.1 -setuptools~=67.6.0 +setuptools~=68.2.2 requests~=2.31.0 tqdm~=4.66.1 sparse_dot_topn~=0.3.4 diff --git a/test/simple-test.py b/test/simple-test.py deleted file mode 100644 index be7ddaa..0000000 --- a/test/simple-test.py +++ /dev/null @@ -1,21 +0,0 @@ -import text2term -import bioregistry - -def main(): - efo = "http://www.ebi.ac.uk/efo/efo.owl#" - pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" - ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - if not text2term.cache_exists("EFO"): - cached_onto = text2term.cache_ontology("EFO") - # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") - print("Cache exists:", cached_onto.cache_exists()) - # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") - # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") - # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") - df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":None}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) - # taggedterms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt") - # df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) - print(df.to_string()) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/test/simple_tests.py b/test/simple_tests.py new file mode 100644 index 0000000..7e65169 --- /dev/null +++ b/test/simple_tests.py @@ -0,0 +1,223 @@ +import os +import unittest +import pandas as pd +import 
text2term +from term import OntologyTermType +from mapper import Mapper +from text2term import OntologyTermCollector + +pd.set_option('display.max_columns', None) + + +class Text2TermTestSuite(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(Text2TermTestSuite, cls).setUpClass() + print("Setting up test suite global variables...") + cls.EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" + cls.SOURCE_TERM_ID_COLUMN = "Source Term ID" + cls.MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" + cls.MAPPING_SCORE_COLUMN = "Mapping Score" + cls.TAGS_COLUMN = "Tags" + + def test_caching_ontology_from_url(self): + # Test caching an ontology loaded from a URL + print("Test caching an ontology loaded from a URL...") + efo_cache = text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + print(f"Cache exists: {efo_cache.cache_exists()}\n") + assert efo_cache.cache_exists() is True + + print("Test using the returned ontology cache object to map a list of terms...") + mappings_efo_cache = efo_cache.map_terms(["asthma", "disease location", "food allergy"], + term_type=OntologyTermType.ANY) + assert mappings_efo_cache.size > 0 + + def test_caching_ontology_from_acronym(self): + # Test caching an ontology by resolving its acronym using bioregistry + print("Test caching an ontology by resolving its acronym using bioregistry...") + clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") + print(f"Cache exists: {clo_cache.cache_exists()}\n") + assert clo_cache.cache_exists() is True + + def test_caching_ontology_set(self): + nr_ontologies_in_registry = 8 + # Test caching the set of ontologies specified in resources/ontologies.csv + caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) + assert len(caches) == nr_ontologies_in_registry + + def test_mapping_to_cached_ontology(self): + # Test mapping a list of terms to EFO loaded from cache + print("Test mapping a list of terms to EFO loaded from cache...") + mappings_efo_cache = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology="EFO", + use_cache=True, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_cache}\n") + assert mappings_efo_cache.size > 0 + + # Test mapping a list of terms to EFO loaded from a URL + print("Test mapping a list of terms to EFO loaded from a URL...") + mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], + target_ontology=self.EFO_URL, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_url}\n") + assert mappings_efo_url.size > 0 + + # Test that mapping to cached ontology is the same as to ontology loaded from its URL + print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") + mappings_match = self.check_df_equals(self.drop_source_term_ids(mappings_efo_cache), + self.drop_source_term_ids(mappings_efo_url)) + print(f"...{mappings_match}") + assert mappings_match is True + + def test_mapping_to_cached_efo_using_syntactic_mapper(self): + # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric + print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") + df = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", use_cache=True, + mapper=text2term.Mapper.JARO_WINKLER, term_type=OntologyTermType.ANY) + print(f"{df}\n") + assert df.size > 0 + + def test_mapping_to_efo_using_ontology_acronym(self): 
+ # Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry + print( + "Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) + print(f"{df2}\n") + assert df2.size > 0 + + def test_mapping_tagged_terms(self): + # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output + print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") + df3 = text2term.map_terms( + {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], + "isdjfnsdfwd": None}, target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + print(f"{df3}\n") + assert df3.size > 0 + assert df3[self.TAGS_COLUMN].str.contains("disease").any() + assert df3[self.TAGS_COLUMN].str.contains("measurement").any() + + def test_preprocessing_from_file(self): + # Test processing tagged terms where the tags are provided in a file + print("Test processing tagged terms where the tags are provided in a file...") + tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") + df4 = text2term.map_terms(tagged_terms, target_ontology="EFO", use_cache=True, incl_unmapped=True) + print(f"{df4}\n") + assert df4.size > 0 + assert df4[self.TAGS_COLUMN].str.contains("disease").any() + assert df4[self.TAGS_COLUMN].str.contains("important").any() + + def test_mapping_to_properties(self): + # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties + print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") + df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=self.EFO_URL, + term_type=OntologyTermType.PROPERTY) + print(f"{df5}\n") + assert df5.size > 0 + + # Test mapping a list of properties to EFO loaded from cache and restrict search to properties + print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") + if not text2term.cache_exists("EFO"): + text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, + term_type=OntologyTermType.PROPERTY) + print(f"{df6}\n") + assert df6.size > 0 + + # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL + properties_df_match = self.check_df_equals(self.drop_source_term_ids(df5), self.drop_source_term_ids(df6)) + print(f"Properties match: {properties_df_match}") + assert properties_df_match is True + + def test_mapping_zooma_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the Zooma mapper + print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") + df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) + print(f"{df_zooma}\n") + assert df_zooma.size > 0 + assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_mapping_bioportal_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper + print("Test mapping a list of terms to 
multiple ontologies using the BioPortal Annotator mapper...") + df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + print(f"{df_bioportal}\n") + assert df_bioportal.size > 0 + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_term_collector(self): + expected_nr_efo_terms = 50867 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms() + assert len(terms) == expected_nr_efo_terms + + def test_term_collector_classes_only(self): + expected_nr_efo_classes = 50643 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.CLASS) + assert len(terms) == expected_nr_efo_classes + + def test_term_collector_properties_only(self): + expected_nr_efo_properties = 224 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_efo_properties + + def test_term_collector_iri_limit(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_terms_with_efo_iri = 17383 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.ANY) + assert len(terms) == expected_nr_terms_with_efo_iri + + def test_term_collector_iri_limit_properties_only(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_properties_with_efo_iri = 29 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_properties_with_efo_iri + + def test_mapping_with_min_score_filter(self): + min_score = 0.6 + search_terms = ["asthma attack", "location"] + + print("Test mapping to cached EFO using Zooma mapper and min_score filter...") + df_zooma = text2term.map_terms(search_terms, target_ontology="EFO,NCIT", mapper=Mapper.ZOOMA, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_zooma[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test mapping to cached EFO using TFIDF similarity metric and min_score filter...") + df_tfidf = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_tfidf[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test mapping to cached EFO using Levenshtein similarity metric and min_score filter...") + df_leven = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.LEVENSHTEIN, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + def test_include_unmapped_terms(self): + df = text2term.map_terms(["asthma", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + + def test_include_unmapped_terms_when_no_mappings_are_returned(self): + df = text2term.map_terms(["mojito", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, 
min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + + def drop_source_term_ids(self, df): + # Unless specified, source term IDs are randomly generated UUIDs. We have to drop the ID column to be able to + # get a meaningful diff between two dataframes. Otherwise, the dataframes would always differ because of the IDs + return df.drop(self.SOURCE_TERM_ID_COLUMN, axis=1) + + def check_df_equals(self, df, expected_df): + # Use pandas::assert_frame_equal function to determine if two data frames are equal + pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) + return True + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test-pypi.py b/test/test-pypi.py deleted file mode 100644 index 54e2390..0000000 --- a/test/test-pypi.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import contextmanager -import sys, os -import text2term - -def main(): - try: - with suppress_stdout(): - # Simple set up and testing - text2term.map_terms(["fever", "headache"], "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") - text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") - text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) - text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) - - # Properties and classes tests - text2term.map_terms(["fever", "headache"], "EFO", term_type="classes", use_cache=True) - text2term.map_terms(["contains", "location"], "EFO", term_type="properties", use_cache=True) - text2term.map_terms(["fever", "contains"], "EFO", term_type="both", use_cache=True) - - # Clear cache and set down - text2term.clear_cache("EFO") - except: - print("ERROR") - -# From https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python -@contextmanager -def suppress_stdout(): - with open(os.devnull, "w") as devnull: - old_stdout = sys.stdout - old_stderr = sys.stderr - sys.stdout = devnull - sys.stderr = devnull - try: - yield - finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/test/test_pypi.py b/test/test_pypi.py new file mode 100644 index 0000000..6d04fe2 --- /dev/null +++ b/test/test_pypi.py @@ -0,0 +1,46 @@ +import os +import sys +import text2term +from text2term.term import OntologyTermType +from contextlib import contextmanager + + +def main(): + try: + with suppress_stdout(): + # Simple set up and testing + text2term.map_terms(["fever", "headache"], + "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") + text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") + text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) + text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), + mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) + + # Properties and classes tests + text2term.map_terms(["fever", "headache"], "EFO", term_type=OntologyTermType.CLASS, use_cache=True) + text2term.map_terms(["contains", "location"], "EFO", term_type=OntologyTermType.PROPERTY, use_cache=True) + text2term.map_terms(["fever", "contains"], "EFO", term_type=OntologyTermType.ANY, use_cache=True) + + # Clear cache and set down + text2term.clear_cache("EFO") + except: + print("ERROR") + + +# From 
https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python +@contextmanager +def suppress_stdout(): + with open(os.devnull, "w") as devnull: + old_stdout = sys.stdout + old_stderr = sys.stderr + sys.stdout = devnull + sys.stderr = devnull + try: + yield + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + + +if __name__ == '__main__': + main() diff --git a/text2term/__init__.py b/text2term/__init__.py index 33b75b5..ad9f676 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -6,4 +6,5 @@ from .mapper import Mapper from .preprocess import preprocess_terms from .preprocess import preprocess_tagged_terms -from .tagged_terms import TaggedTerm \ No newline at end of file +from .tagged_term import TaggedTerm +from .term_collector import OntologyTermCollector diff --git a/text2term/__main__.py b/text2term/__main__.py index 39fa830..df9863b 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -4,19 +4,20 @@ from t2t import map_terms, cache_ontology from onto_cache import cache_exists from mapper import Mapper +from term import OntologyTermType if __name__ == "__main__": parser = argparse.ArgumentParser(description='A tool for mapping free-text descriptions of (biomedical) ' 'entities to controlled terms in an ontology') parser.add_argument("-s", "--source", required=True, type=str, - help="Input file containing 'source' terms to map to ontology terms (list of terms or CSV file)") + help="Input file containing 'source' terms to map to ontology terms: list of terms or CSV file") parser.add_argument("-t", "--target", required=True, type=str, help="Path or URL of 'target' ontology to map source terms to. When the chosen mapper is " "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " "'all' to search all ontologies") parser.add_argument("-o", "--output", required=False, type=str, default="", help="Path to desired output file for the mappings (default=current working directory)") - parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf", + parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, help="Method used to compare source terms with ontology terms. 
One of: " + str(Mapper.list()) + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), @@ -38,7 +39,7 @@ help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", help="Store the target ontology into local cache under acronym") - parser.add_argument("-type", "--term_type", required=False, type=str, default="classes", + parser.add_argument("-type", "--term_type", required=False, type=str, default=OntologyTermType.CLASS, help="Define whether to return ontology classes, properties, or both") arguments = parser.parse_args() diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index dedcb3e..2e08bf0 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -30,12 +30,9 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional BioPortal Annotator-specific parameters to include in the request """ - self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) - start = time.time() mappings = [] for term, term_id in zip(source_terms, source_terms_ids): mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) - self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): diff --git a/text2term/config.py b/text2term/config.py index a2ded2f..189c03b 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "3.0.2" +VERSION = "4.0.0" diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py index 7af3e40..614f912 100644 --- a/text2term/onto_cache.py +++ b/text2term/onto_cache.py @@ -1,14 +1,19 @@ -import text2term -from .mapper import Mapper import os -from shutil import rmtree import sys -import pandas as pd +import text2term import owlready2 +import pandas as pd +from .term import OntologyTermType +from .mapper import Mapper +from shutil import rmtree + +CACHE_FOLDER = "cache" """ CACHING FUNCTIONS -- Public """ + + # Caches many ontologies from a csv def cache_ontology_set(ontology_registry_path): registry = pd.read_csv(ontology_registry_path) @@ -16,62 +21,47 @@ def cache_ontology_set(ontology_registry_path): for index, row in registry.iterrows(): try: cache = text2term.cache_ontology(row.url, row.acronym) - cache_set.update({row.acronym : cache}) + cache_set.update({row.acronym: cache}) except Exception as err: err_message = "Could not cache ontology " + row.acronym + " due to error: " + str(err) sys.stderr.write(err_message) owlready2.default_world.ontologies.clear() return cache_set + # Will check if an acronym exists in the cache def cache_exists(ontology_acronym=''): - return os.path.exists("cache/" + ontology_acronym) + return os.path.exists(os.path.join(CACHE_FOLDER, ontology_acronym)) + # Clears the cache def clear_cache(ontology_acronym=''): - cache_dir = "cache/" + cache_dir = CACHE_FOLDER if ontology_acronym != '': - cache_dir = os.path.join(cache_dir, ontology_acronym) + cache_dir = os.path.join(CACHE_FOLDER, ontology_acronym) # Is equivalent to: rm -r cache_dir try: rmtree(cache_dir) sys.stderr.write("Cache has been cleared successfully\n") except OSError as error: 
sys.stderr.write("Cache cannot be removed:") - sys.stderr.write(error) + sys.stderr.write(str(error)) + -## Class that is returned to run +# Class that is returned to run class OntologyCache: def __init__(self, ontology_acronym): self.acronym = ontology_acronym - self.ontology = "cache/" + ontology_acronym + "/" + self.ontology = os.path.join(CACHE_FOLDER, ontology_acronym) def map_terms(self, source_terms, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - term_type='classes'): - return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) - - def map_tagged_terms(self, tagged_terms_dict, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - term_type='classes'): - return text2term.map_tagged_terms(tagged_terms_dict, self.acronym, base_iris=base_iris, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) - - def map_file(self, input_file, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=',', term_type='classes'): - return text2term.map_file(source_terms, self.acronym, base_iris=base_iris, csv_columns=csv_columns, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, separator=separator, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + term_type=OntologyTermType.CLASS): + return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, + mapper=mapper, output_file=output_file, save_graphs=save_graphs, + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, + term_type=term_type) def clear_cache(self): clear_cache(self.acronym) @@ -79,5 +69,5 @@ def clear_cache(self): def cache_exists(self): return cache_exists(self.acronym) - def acroynm(self): + def acronym(self): return self.acronym diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 222f6c2..9cbd9ac 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -1,6 +1,4 @@ import logging -import re -import sys import pandas as pd import bioregistry import shortuuid @@ -21,8 +19,8 @@ 'later', 'trimester'} QUANTITY_WORDS = {'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'frequently', 'per', 'hour', 'day', 'week', 'month', - 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 'counts', 'percentage', - 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} + 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 
'counts', + 'percentage', 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} def normalize_list(token_list): diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 44e4f0f..2e97883 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -1,21 +1,20 @@ import re -import os -from enum import Enum -from .tagged_terms import TaggedTerm +from .tagged_term import TaggedTerm + ## Tags should be stored with their terms in the same line, delineated by ";:;" ## ex: Age when diagnosed with (.*) ;:; age,diagnosis ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} -def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ - blocklist_char='', rem_duplicates=False, separator=";:;"): - # Seperate tags from the terms, put in TaggedTerm and add to list +def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", + blocklist_char='', rem_duplicates=False, separator=";:;"): + # Separate tags from the terms, put in TaggedTerm and add to list raw_terms = _get_values(file_path) terms = [] for raw_term in raw_terms: - seperated = raw_term.split(separator) + separated = raw_term.split(separator) try: - tags = seperated[1].split(",") - term = TaggedTerm(original_term=seperated[0], tags=tags) + tags = separated[1].split(",") + term = TaggedTerm(original_term=separated[0], tags=tags) except IndexError: term = TaggedTerm(original_term=raw_term) terms.append(term) @@ -25,10 +24,10 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ if template_path != "": raw_templates = _get_values(template_path) for raw_template in raw_templates: - seperated = raw_template.split(separator) + separated = raw_template.split(separator) try: - tags = seperated[1].split(",") - regex_term = re.compile(seperated[0]) + tags = separated[1].split(",") + regex_term = re.compile(separated[0]) templates[regex_term] = tags except IndexError: regex_term = re.compile(raw_template) @@ -45,12 +44,12 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ for term in terms: if _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=True): continue - for template, tem_tags in templates.items(): + for template, term_tags in templates.items(): match = template.fullmatch(term.get_original_term()) if match: combined_matches = ' '.join(map(str, match.groups())) if combined_matches: - _update_tagged_term(processed_terms, term, combined_matches, tem_tags) + _update_tagged_term(processed_terms, term, combined_matches, term_tags) break if rem_duplicates: @@ -58,10 +57,10 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ return processed_terms -def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ - blocklist_char='', rem_duplicates=False): + +def preprocess_terms(terms, template_path, output_file="", blocklist_path="", blocklist_char='', rem_duplicates=False): if isinstance(terms, str): - terms = _get_values(file_path) + terms = _get_values(terms) # if 'terms' is a string, we assume it is a filepath # Form the templates as regular expressions template_strings = [] if template_path != "": @@ -96,6 +95,7 @@ def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ fp.write('\n'.join(processed_terms.values())) return processed_terms + ## Note: Because Python Dictionaries and Lists are passed by reference (sort of), updating the ## dictionary/list here will update the dictionary in the caller def 
_blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=False): @@ -110,20 +110,24 @@ def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=Fal return True return False -def _update_tagged_term(processed_terms, term, new_term, tags=[]): + +def _update_tagged_term(processed_terms, term, new_term, tags=()): term.update_term(new_term) term.add_tags(tags) processed_terms.append(term) + def _get_values(path): return open(path).read().splitlines() + def _make_regex_list(strings): regexes = [] for string in strings: regexes.append(re.compile(string)) return regexes + def _remove_duplicates(terms): if type(terms) is dict: temp = {val : key for key, val in terms.items()} diff --git a/text2term/resources/ontologies.csv b/text2term/resources/ontologies.csv index 77edfb6..910acbd 100644 --- a/text2term/resources/ontologies.csv +++ b/text2term/resources/ontologies.csv @@ -1,11 +1,9 @@ acronym,version,url -CLO,2.1.178,http://purl.obolibrary.org/obo/clo.owl -CL,9/15/22,http://purl.obolibrary.org/obo/cl/releases/2022-09-15/cl.owl -EFO,3.46.0,https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl -GO,9/19/22,http://purl.obolibrary.org/obo/go/releases/2022-09-19/go.owl -HPO,6/11/22,http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl -MONDO,8/1/22,http://purl.obolibrary.org/obo/mondo/releases/2022-08-01/mondo.owl -NCIT,22.07d,http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl -PRO,67,http://purl.obolibrary.org/obo/pr/67.0/pr.owl -UBERON,8/19/22,http://purl.obolibrary.org/obo/uberon/releases/2022-08-19/uberon.owl -MP,8/4/22,http://purl.obolibrary.org/obo/mp/releases/2022-08-04/mp.owl \ No newline at end of file +CL,2023-09-21,https://github.com/obophenotype/cell-ontology/releases/download/v2023-09-21/cl.owl +EFO,3.57.0,https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl +FOODON,0.6.0,https://github.com/FoodOntology/foodon/raw/v0.6.0/foodon.owl +GO,2023-07-27,http://purl.obolibrary.org/obo/go/releases/2023-07-27/go.owl +HPO,2023-09-01,https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-09-01/hp.owl +MONDO,2023-09-12,https://github.com/monarch-initiative/mondo/releases/download/v2023-08-02/mondo.owl +NCIT,2022-08-19,https://github.com/NCI-Thesaurus/thesaurus-obo-edition/releases/download/v2022-08-19/ncit.owl +UBERON,2023-09-05,https://github.com/obophenotype/uberon/releases/download/v2023-09-05/uberon.owl \ No newline at end of file diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index a9ab4ff..5316303 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -1,7 +1,6 @@ """Provides SyntacticMapper class""" import logging -import time import nltk import rapidfuzz from tqdm import tqdm @@ -26,14 +25,10 @@ def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_ma :param mapper: Mapping method to be used for matching :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned """ - self.logger.info("Mapping %i source terms...", len(source_terms)) - start = time.time() mappings = [] for term, term_id in tqdm(zip(source_terms, source_terms_ids), total=len(source_terms)): matches = self._map(term, term_id, mapper, max_mappings) mappings.extend(matches) - end = time.time() - self.logger.info('done (mapping time: %.2fs seconds)', end - start) return TermMappingCollection(mappings).mappings_df() def _map(self, source_term, source_term_id, mapper, max_matches=3): diff --git a/text2term/t2t.py 
b/text2term/t2t.py index 66f1233..bf03965 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -1,14 +1,14 @@ import os -import sys import json import pickle -import time +import logging import datetime -import owlready2 +import time import pandas as pd from text2term import onto_utils -from text2term.mapper import Mapper from text2term import onto_cache +from text2term.mapper import Mapper +from text2term.term import OntologyTermType from text2term.term_collector import OntologyTermCollector from text2term.term_graph_generator import TermGraphGenerator from text2term.bioportal_mapper import BioPortalAnnotatorMapper @@ -16,57 +16,75 @@ from text2term.tfidf_mapper import TFIDFMapper from text2term.zooma_mapper import ZoomaMapper from text2term.config import VERSION -from text2term.tagged_terms import TaggedTerm +from text2term.tagged_term import TaggedTerm from text2term.term_mapping import TermMapping IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] UNMAPPED_TAG = "unmapped" -""" -Maps the terms in the given list to the specified target ontology. - -Parameters ----------- -source_terms : list - List of 'source' terms to map to ontology terms -target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies -base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') -source_terms_ids : tuple - Collection of identifiers for the given source terms -excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` -mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal -max_mappings : int - Maximum number of top-ranked mappings returned per source term -min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) -output_file : str - Path to desired output file for the mappings -save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term -save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - -Returns ----------- -df - Data frame containing the generated ontology mappings -""" +LOGGER = onto_utils.get_logger(__name__, level=logging.INFO) + + def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - separator=',', use_cache=False, term_type='classes', incl_unmapped=False): + min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, + source_terms_ids=(), separator=',', use_cache=False, term_type=OntologyTermType.CLASS, + incl_unmapped=False): + """ + Maps the terms in the given list to the specified target ontology. + + Parameters + ---------- + source_terms : str or list or dict + Path to file containing the terms to map to. Or list of terms to map to an ontology. Or dictionary containing + tagged terms, where the keys are the source terms and the values are tags attached to those terms + target_ontology : str + Filepath or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. + When the target ontology has been previously cached, provide the ontology name as used when it was cached + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + csv_columns : tuple + Name of column containing the terms to map, optionally followed by another column name containing the term IDs, + for example: ('disease', 'disease_identifier') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + separator : str + Symbol used to separate columns in the input table (eg ',' or '\t' for csv or tsv, respectively) + use_cache : bool + Use a previously cached ontology + term_type : OntologyTermType + The type(s) of ontology terms to map to, which can be 'class' or 'property' or 'any' + incl_unmapped : bool + Include unmapped terms in the output data frame + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ # Parse the possible source terms options and tags source_terms, source_term_ids, tags = _parse_source_terms(source_terms, source_terms_ids, csv_columns, separator) - # Create Source Term Ids if they are not provided + # Create source term IDs if they are not provided if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: - sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") + LOGGER.warning(f"The number of Source Term IDs provided ({len(source_terms_ids)}) is different than the " + f"number of Source Terms ({len(source_terms)}). 
New Source Term IDs will be used instead.") source_terms_ids = onto_utils.generate_iris(len(source_terms)) # Create the output file if output_file == '': @@ -78,32 +96,38 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ else: target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) # Run the mapper - mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, incl_unmapped) + LOGGER.info(f"Mapping {len(source_terms)} source terms to {target_ontology}") + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, + incl_unmapped) mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) if save_mappings: - _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, \ - excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) if save_graphs: _save_graphs(target_terms, output_file) return mappings_df + # Caches a single ontology def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): if ontology_acronym == "": ontology_acronym = ontology_url - ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type='both') - cache_dir = "cache/" + ontology_acronym + "/" + ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type=OntologyTermType.ANY) + cache_dir = os.path.join("cache", ontology_acronym) + LOGGER.info(f"Caching ontology {ontology_url} to: {cache_dir}") if not os.path.exists(cache_dir): os.makedirs(cache_dir) - _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) - _save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) + _save_graphs(ontology_terms, output_file=os.path.join(cache_dir, ontology_acronym)) ontology_terms.clear() return onto_cache.OntologyCache(ontology_acronym) + """ PRIVATE/HELPER FUNCTIONS """ + + # Parses the source terms and returns what is to be mapped, the term ids, and the tags def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separator=','): # If source_terms is a string, we assume it is a file location @@ -120,7 +144,7 @@ def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separ source_terms_id_list = [] for tagged_term in source_terms: terms.append(tagged_term.get_term()) - if tagged_term.get_source_term_id() != None: + if tagged_term.get_source_term_id() is None: source_terms_id_list.append(tagged_term.get_source_term_id()) source_terms_ids = source_terms_id_list tags = source_terms @@ -129,11 +153,11 @@ def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separ tags = dict.fromkeys(terms) return terms, source_terms_ids, tags + def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): - start = time.time() - with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: + with open(os.path.join(cache_dir, ontology_acronym + "-term-details.pickle"), 'wb+') as out_file: pickle.dump(ontology_terms, out_file) - end = time.time() + def _load_data(input_file_path, csv_column_names, separator): if len(csv_column_names) >= 1: @@ -148,21 +172,27 @@ def _load_data(input_file_path, csv_column_names, separator): term_ids = onto_utils.generate_iris(len(terms)) 
return terms, term_ids -def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type='classes'): - term_collector = OntologyTermCollector() + +def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type=OntologyTermType.CLASS): + term_collector = OntologyTermCollector(ontology_iri=ontology) if use_cache: - pickle_file = "cache/" + ontology + "/" + ontology + "-term-details.pickle" + pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") + LOGGER.info(f"Loading cached ontology from: {pickle_file}") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: - - onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated, + term_type=term_type) + term_collector.close() + LOGGER.info(f"Filtered ontology terms to those of type: {term_type}") if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms + def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): to_map, tags = _process_tags(source_terms, tags) + start = time.time() if mapper == Mapper.TFIDF: term_mapper = TFIDFMapper(ontology_terms) mappings_df = term_mapper.map(to_map, source_term_ids, max_mappings=max_mappings, min_score=min_score) @@ -177,14 +207,23 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi mappings_df = term_mapper.map(to_map, source_term_ids, mapper, max_mappings=max_mappings) else: raise ValueError("Unsupported mapper: " + mapper) + LOGGER.info("...done (mapping time: %.2fs seconds)", time.time() - start) - # Add tags, process, and filter - df = _filter_mappings(mappings_df, min_score) + # Filter terms by the mapping score specified + if mapper == Mapper.BIOPORTAL: + LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score " + "filter has no effect on BioPortal mappings. 
The mapping score is hardcoded to 1 by text2term.")
+        df = mappings_df
+    else:
+        df = _filter_mappings(mappings_df, min_score)
+    # Include in output data frame any input terms that did not meet min_score threshold
     if incl_unmapped:
-        df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids)
-    df = _add_tags_to_df(mappings_df, tags)
+        df = _add_unmapped_terms(df, tags, source_terms, source_term_ids)
+    # Add tags
+    df = _add_tags_to_df(df, tags)
     return df
 
+
 # Takes in the tags and source terms and processes them accordingly
 def _process_tags(source_terms, tags):
     to_map = []
@@ -205,6 +244,7 @@ def _process_tags(source_terms, tags):
             to_map.append(term)
     return to_map, tags
 
+
 def _add_tags_to_df(df, tags):
     if isinstance(tags, dict):
         for key, value in tags.items():
@@ -213,12 +253,13 @@ def _add_tags_to_df(df, tags):
             else:
                 to_store = str(value)
             df.loc[df['Source Term'] == key, "Tags"] = to_store
-    else:
+    else:
         for term in tags:
             to_store = ','.join(term.get_tags())
             df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store
     return df
 
+
 def _filter_mappings(mappings_df, min_score):
     new_df = pd.DataFrame(columns=mappings_df.columns)
     for index, row in mappings_df.iterrows():
@@ -226,8 +267,12 @@ def _filter_mappings(mappings_df, min_score):
             new_df.loc[len(new_df.index)] = row
     return new_df
 
+
 def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
-    mapped = pd.unique(mappings_df["Source Term"])
+    if mappings_df.size == 0:
+        mapped = ()
+    else:
+        mapped = pd.unique(mappings_df["Source Term"])
     for (term, term_id) in zip(source_terms, source_terms_ids):
         if term not in mapped:
             non_mapping = TermMapping(term, term_id, "", "", 0)
@@ -235,6 +280,7 @@ def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
             mappings_df.loc[len(mappings_df.index)] = non_mapping.to_dict()
     return mappings_df
 
+
 def _add_tag(tags, term, to_add, ignore=False):
     if isinstance(tags, dict):
         new_tags = tags.get(term, [])
@@ -254,14 +300,15 @@ def _add_tag(tags, term, to_add, ignore=False):
             if tagged_term.get_term() == term and check_ignore:
                 tagged_term.add_tags([to_add])
 
-def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, \
-                   excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped):
+
+def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris,
+                   excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped):
     if os.path.dirname(output_file):  # create output directories if needed
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
     with open(output_file, "a") as f:
-        f.write("# Date and time run: %s\n" % datetime.datetime.now())
+        f.write("# Timestamp: %s\n" % datetime.datetime.now())
         f.write("# Target Ontology: %s\n" % target_ontology)
-        f.write("# Text2term version: %s\n" % VERSION)
+        f.write("# text2term version: %s\n" % VERSION)
         f.write("# Minimum Score: %.2f\n" % min_score)
         f.write("# Mapper: %s\n" % mapper.value)
         f.write("# Base IRIs: %s\n" % (base_iris,))
@@ -272,10 +319,12 @@ def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, ba
         f.write("# Unmapped Terms ")
         f.write("Excluded\n" if not incl_unmapped else "Included\n")
         writestring = "# Of " + str(len(source_terms)) + " entries, " + str(len(pd.unique(mappings["Source Term ID"])))
-        writestring += " were successfully mapped to " + str(len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n"
+        writestring += " were mapped to " + str(
+            len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n"
         f.write(writestring)
     mappings.to_csv(output_file, index=False, mode='a')
 
+
 def _save_graphs(terms, output_file):
     term_graphs = TermGraphGenerator(terms).graphs_dicts()
     with open(output_file + "-term-graphs.json", 'w') as json_file:
diff --git a/text2term/tagged_terms.py b/text2term/tagged_term.py
similarity index 80%
rename from text2term/tagged_terms.py
rename to text2term/tagged_term.py
index 53d3441..7891f63 100644
--- a/text2term/tagged_terms.py
+++ b/text2term/tagged_term.py
@@ -1,6 +1,8 @@
+"""Provides TaggedTerm class"""
+
 class TaggedTerm:
-    def __init__(self, term=None, tags=[], original_term=None, source_term_id=None):
+    def __init__(self, term=None, tags=(), original_term=None, source_term_id=None):
         self.term = term
         self.tags = tags
         self.original_term = original_term
@@ -10,7 +12,7 @@ def __repr__(self):
         return f"
need to unload previously loaded ontologies
-        try:
-            ontology.destroy()
-        except Exception as err:
-            self.logger.debug("Unable to destroy ontology: ", err)
         return ontology_terms
 
-    def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type='classes'):
+    def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY):
         filtered_onto_terms = {}
         for base_iri, term in onto_terms.items():
             if type(iris) == str:
@@ -61,10 +60,10 @@ def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type='cl
                 filtered_onto_terms.update({base_iri: term})
         return filtered_onto_terms
 
-    def _get_ontology_signature(self, ontology, term_type='classes'):
+    def _get_ontology_signature(self, ontology):
         signature = list(ontology.classes())
         signature.extend(list(ontology.properties()))
-        # ontology.classes() does not include classes in imported ontologies; we need to explicitly add them to our list
+        # owlready2::ontology.classes() does not include classes in imported ontologies; we need to explicitly add them
         for imported_ontology in ontology.imported_ontologies:
             signature.extend(list(imported_ontology.classes()))
             signature.extend(list(imported_ontology.properties()))
@@ -80,55 +79,83 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type
                 iri = ontology_term.iri
                 labels = self._get_labels(ontology_term)
                 synonyms = self._get_synonyms(ontology_term)
-                parents = self._get_parents(ontology_term)
+                named_parents, complex_parents = self._get_parents(ontology_term)
                 children = self._get_children(ontology_term, ontology)
                 instances = self._get_instances(ontology_term, ontology)
                 definitions = self._get_definitions(ontology_term)
                 is_deprecated = deprecated[ontology_term] == [True]
-                if self._filter_term_type(ontology_term, "classes", False):
-                    termtype = 'class'
-                elif self._filter_term_type(ontology_term, "properties", False):
-                    termtype = 'property'
+                if self._filter_term_type(ontology_term, OntologyTermType.CLASS, False):
+                    owl_term_type = OntologyTermType.CLASS
+                elif self._filter_term_type(ontology_term, OntologyTermType.PROPERTY, False):
+                    owl_term_type = OntologyTermType.PROPERTY
                 else:
-                    termtype = None
+                    owl_term_type = "undetermined"
+                    self.logger.warn("Term has undetermined type %s %s", iri, labels)
                 term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms,
-                                            parents=parents, children=children, instances=instances,
-                                            deprecated=is_deprecated, termtype=termtype)
+                                            parents=named_parents, children=children, instances=instances,
+                                            restrictions=complex_parents, deprecated=is_deprecated,
+                                            term_type=owl_term_type)
                 ontology_terms[iri] = term_details
             else:
                 self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri)
         return ontology_terms
 
     def _filter_term_type(self, ontology_term, term_type, cached):
-        if term_type == 'classes':
+        if term_type == OntologyTermType.CLASS:
             if cached:
-                return ontology_term.termtype == 'class'
+                return ontology_term.term_type == OntologyTermType.CLASS
             else:
-                return not isinstance(ontology_term, PropertyClass)
-        elif term_type == 'properties':
+                return isinstance(ontology_term, ThingClass)
+        elif term_type == OntologyTermType.PROPERTY:
             if cached:
-                return ontology_term.termtype == 'property'
+                return ontology_term.term_type == OntologyTermType.PROPERTY
             else:
                 return isinstance(ontology_term, PropertyClass)
-        elif term_type == 'both':
+        elif term_type == OntologyTermType.ANY:
             return True
         else:
-            raise ValueError("Option to include Properties or Classes is not valid")
+            raise ValueError("Invalid term-type option. Acceptable term types are: 'class' or 'property' or 'any'")
 
     def _get_parents(self, ontology_term):
         parents = dict()  # named/atomic superclasses except owl:Thing
+        restrictions = dict()  # restrictions are class expressions such as 'pancreatitis disease_has_location pancreas'
         try:
             all_parents = ontology_term.is_a  # obtain direct parents of this entity
             for parent in all_parents:
-                # exclude OWL restrictions and owl:Thing and Self
-                if isinstance(parent, ThingClass) and parent is not Thing and parent is not ontology_term:
-                    if len(parent.label) > 0:
-                        parents.update({parent.iri: parent.label[0]})
-                    else:
-                        parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)})
+                # exclude owl:Thing and Self
+                if parent is not Thing and parent is not ontology_term:
+                    if isinstance(parent, ThingClass):  # get named parents (i.e. classes with IRIs)
+                        self._add_named_parent(parent, parents)
+                    elif isinstance(parent, And):  # get conjuncts and add them to the respective structures
+                        for conjunct in parent.Classes:
+                            if isinstance(conjunct, ThingClass):  # if conjunct is a named class, add it to parents dict
+                                self._add_named_parent(conjunct, parents)
+                            else:
+                                self._add_complex_parent(conjunct, restrictions)
+                    elif isinstance(parent, Restriction):  # get complex parents, i.e. restrictions or class expressions
+                        self._add_complex_parent(parent, restrictions)
         except (AttributeError, ValueError) as err:
             self.logger.debug(err)
-        return parents
+        return parents, restrictions
+
+    def _add_named_parent(self, parent, parents):
+        if len(parent.label) > 0:
+            parents.update({parent.iri: parent.label[0]})
+        else:
+            parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)})
+
+    def _add_complex_parent(self, parent, restrictions):
+        property_iri = parent.property.iri
+        if isinstance(parent.value, ThingClass):  # the filler is a named term (i.e., it has an IRI)
+            value = parent.value.iri
+        else:  # the filler is another complex class expression
+            value = parent.value
+        if property_iri in restrictions.keys():
+            current_restrictions = restrictions[property_iri]
+            current_restrictions.add(value)
+            restrictions.update({property_iri: current_restrictions})
+        else:
+            restrictions.update({property_iri: str(value)})
 
     def _get_children(self, ontology_term, ontology):
         children = dict()
@@ -175,7 +202,7 @@ def _get_labels(self, ontology_term):
         self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term)
         return labels
 
-    def _get_synonyms(self, ontology_term, include_broad_synonyms=False):
+    def _get_synonyms(self, ontology_term, include_related_synonyms=False, include_broad_synonyms=False):
         """
         Collect the synonyms of the given ontology term
         :param ontology_term: Ontology term
@@ -185,12 +212,13 @@ def _get_synonyms(self, ontology_term, include_broad_synonyms=False):
         synonyms = set()
         for synonym in self._get_obo_exact_synonyms(ontology_term):
             synonyms.add(synonym)
-        for synonym in self._get_obo_related_synonyms(ontology_term):
-            synonyms.add(synonym)
         for nci_synonym in self._get_nci_synonyms(ontology_term):
             synonyms.add(nci_synonym)
         for efo_alt_term in self._get_efo_alt_terms(ontology_term):
             synonyms.add(efo_alt_term)
+        if include_related_synonyms:
+            for synonym in self._get_obo_related_synonyms(ontology_term):
+                synonyms.add(synonym)
         if include_broad_synonyms:
             for synonym in self._get_obo_broad_synonyms(ontology_term):
                 synonyms.add(synonym)
@@ -339,7 +367,7 @@ def _load_ontology(self, ontology_iri):
         self.logger.info("Loading ontology %s...", ontology_iri)
         start = time.time()
         owl_link = bioregistry.get_owl_download(ontology_iri)
-        if owl_link != None:
+        if owl_link is not None:
             ontology_iri = owl_link
         ontology = get_ontology(ontology_iri).load()
         end = time.time()
@@ -359,6 +387,14 @@ def _classify_ontology(self, ontology):
         end = time.time()
         self.logger.info("...done (reasoning time: %.2fs)", end - start)
 
+    def close(self):
+        # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup
+        # for that IRI returns the term from the first ontology loaded —> need to unload previously loaded ontologies
+        try:
+            self.ontology.destroy()
+        except Exception as err:
+            self.logger.debug("Unable to destroy ontology: ", err)
+
     def _log_ontology_metrics(self, ontology):
         self.logger.debug(" Ontology IRI: %s", ontology.base_iri)
         self.logger.debug(" Class count: %i", len(list(ontology.classes())))
diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py
index 231c602..c2a061b 100644
--- a/text2term/term_graph_generator.py
+++ b/text2term/term_graph_generator.py
@@ -1,6 +1,9 @@
+"""Provides TermGraphGenerator class"""
+
 from text2term import onto_utils
 from text2term.term_graph import TermGraph, Node, Edge
 
+
 class TermGraphGenerator:
 
     def __init__(self, terms):
diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py
index 098c04f..c90c7f9 100644
--- a/text2term/tfidf_mapper.py
+++ b/text2term/tfidf_mapper.py
@@ -1,7 +1,6 @@
 """Provides TFIDFMapper class"""
 
 import logging
-import time
 import sparse_dot_topn as ct
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from text2term import onto_utils
@@ -27,15 +26,10 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3):
         :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1.
             Default set to 0, so consider all candidates
         """
-        self.logger.info("Mapping %i source terms...", len(source_terms))
-        self.logger.info("...against %i ontology terms (%i labels/synonyms)", len(self.target_ontology_terms), len(self.target_labels))
-        start = time.time()
         source_terms_norm = onto_utils.normalize_list(source_terms)
         vectorizer = self._tokenize(source_terms_norm, self.target_labels)
         results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score)
         results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms)
-        end = time.time()
-        self.logger.info("...done (mapping time: %.2fs seconds)", end-start)
         return results_df
 
     def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3):
diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py
index 26df493..8f72377 100644
--- a/text2term/zooma_mapper.py
+++ b/text2term/zooma_mapper.py
@@ -2,7 +2,6 @@
 
 import json
 import logging
-import time
 import requests
 from text2term import onto_utils
 from text2term.term_mapping import TermMappingCollection, TermMapping
@@ -23,12 +22,9 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa
         :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned
         :param api_params: Additional Zooma API-specific parameters to include in the request
         """
-        self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies)
-        start = time.time()
         mappings = []
         for term, term_id in zip(source_terms, source_terms_ids):
             mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params))
-        self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start)
         return TermMappingCollection(mappings).mappings_df()
 
     def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params):
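
The changes above rework how term types, tags, unmapped terms and complex parents flow through the mapping pipeline. The following is a minimal usage sketch of how these pieces are expected to fit together from the caller's side; it assumes `OntologyTermType` and `TaggedTerm` are importable as the README changes in this changeset suggest, and the tag values, ontology acronym and column selection are illustrative only, not taken from this diff.

```python
# Illustrative sketch only; not part of this changeset. Assumes a text2term build that
# includes the changes above and exports OntologyTermType and TaggedTerm.
import text2term
from text2term import OntologyTermType
from text2term.tagged_term import TaggedTerm  # module renamed from tagged_terms.py above

# A term tagged 'Ignore' is skipped by the mapper but, when incl_unmapped=True,
# _add_unmapped_terms appends it to the output with an empty mapping and
# _add_tags_to_df records its tags in the 'Tags' column.
tagged_terms = [
    TaggedTerm(term="asthma", tags=["disease"]),
    TaggedTerm(term="free-text comment", tags=["Ignore"]),
]

df = text2term.map_terms(
    tagged_terms,
    target_ontology="EFO",             # acronym resolved to a download URL via bioregistry
    term_type=OntologyTermType.CLASS,  # map only to ontology classes ('class'/'property'/'any')
    min_score=0.5,                     # rows below this score are dropped by _filter_mappings
    incl_unmapped=True,                # keep unmapped/ignored terms at the end of the data frame
)
print(df[["Source Term", "Mapped Term IRI", "Tags"]])
```

On the collector side, `_get_parents` now returns a pair: a dictionary of named parents keyed by IRI and a dictionary of restrictions keyed by property IRI (class expressions such as the 'pancreatitis disease_has_location pancreas' example in the comment above), which is passed to `OntologyTerm` through the new `restrictions` argument.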