diff --git a/omim2obo/main.py b/omim2obo/main.py index 45e3f90..aa0f28f 100644 --- a/omim2obo/main.py +++ b/omim2obo/main.py @@ -57,8 +57,7 @@ from omim2obo.namespaces import * from omim2obo.parsers.omim_entry_parser import cleanup_title, get_alt_and_included_titles_and_symbols, get_pubs, \ - get_mapped_ids, \ - recapitalize_acronyms_in_title + get_mapped_ids, recapitalize_acronyms_in_titles from omim2obo.config import ROOT_DIR, GLOBAL_TERMS from omim2obo.parsers.omim_txt_parser import * @@ -204,6 +203,16 @@ def omim2obo(use_cache: bool = False): get_alt_and_included_titles_and_symbols(inc_titles_str) included_is_included = included_titles or included_symbols # redundant. can't be included symbol w/out title + # Recapitalize acronyms in titles + all_abbrevs: List[str] = \ + pref_symbols + alt_symbols + former_alt_symbols + included_symbols + former_included_symbols + # todo: consider DRYing to 1 call by passing all 5 title types to a wrapper function + pref_title = recapitalize_acronyms_in_titles(pref_title, all_abbrevs) + alt_titles = recapitalize_acronyms_in_titles(alt_titles, all_abbrevs) + former_alt_titles = recapitalize_acronyms_in_titles(former_alt_titles, all_abbrevs) + included_titles = recapitalize_acronyms_in_titles(included_titles, all_abbrevs) + former_included_titles = recapitalize_acronyms_in_titles(former_included_titles, all_abbrevs) + # Special cases depending on OMIM term type is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER: # % @@ -227,16 +236,11 @@ def omim2obo(use_cache: bool = False): else: graph.add((omim_uri, RDFS.label, Literal(pref_title))) - # todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below, - # should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols - # for given term, should pass empty list. 
See: https://github.com/monarch-initiative/omim/issues/129 - pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0] - # Add synonyms # - exact titles - graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(pref_title, pref_abbrev)))) + graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(pref_title))) for title in alt_titles: - graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(title, pref_abbrev)))) + graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(title))) # - exact abbreviations for abbrevs in [pref_symbols, alt_symbols]: for abbreviation in abbrevs: @@ -244,8 +248,7 @@ def omim2obo(use_cache: bool = False): [(oboInOwl.hasSynonymType, OMO['0003000'])]) # - related, deprecated 'former' titles for title in former_alt_titles: - clean_title = recapitalize_acronyms_in_title(title, pref_abbrev) - add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title, + add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, title, [(OWL.deprecated, Literal(True))]) # - related, deprecated 'former' abbreviations for abbreviation in former_alt_symbols: @@ -259,7 +262,7 @@ def omim2obo(use_cache: bool = False): graph.add((omim_uri, RDFS['comment'], Literal(included_comment))) # - titles for title in included_titles: - graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(recapitalize_acronyms_in_title(title, pref_abbrev)))) + graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(title))) # - symbols for symbol in included_symbols: add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [ @@ -268,8 +271,7 @@ def omim2obo(use_cache: bool = False): ]) # - deprecated, 'former' for title in former_included_titles: - clean_title = recapitalize_acronyms_in_title(title, pref_abbrev) - add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), clean_title, + 
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), title, [(OWL.deprecated, Literal(True))]) for symbol in former_included_symbols: add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [ diff --git a/omim2obo/parsers/omim_entry_parser.py b/omim2obo/parsers/omim_entry_parser.py index 71424fe..fd26e36 100644 --- a/omim2obo/parsers/omim_entry_parser.py +++ b/omim2obo/parsers/omim_entry_parser.py @@ -4,7 +4,7 @@ # import re from collections import defaultdict from copy import copy -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Union import pandas as pd from rdflib import Graph, RDF, RDFS, DC, Literal, OWL, SKOS, URIRef @@ -21,7 +21,7 @@ def get_known_capitalizations() -> Dict[str, str]: """Get list of known capitalizations for proper names, acronyms, and the like. - TODO: Contains space-delimited words, e.g. "vitamin d". The way that + todo: Contains space-delimited words, e.g. "vitamin d". The way that cleanup_label is currently implemented, each word in the label gets replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence, this would fail. @@ -29,7 +29,13 @@ def get_known_capitalizations() -> Dict[str, str]: the current 'word replacement' logic, but also, (2), at the end, do a generic string replacement (e.g. my_str.replace(a, b). When implementing (2), we should also split this dictionary into two separate dictionaries, - each for 1 of these 2 different purposes.""" + each for 1 of these 2 different purposes. + + todo: known_capitalizations.tsv can be refactored possibly. It really only needs 1 column, the case to replace. The + pattern column is not used, and the first column (lowercase) can be computed by using .lower() on the case to + replace. We could also leave as-is since this file is shared elsewhere in the project infrastructure, though I do + not know its source-of-truth location. 
+ """ path = DATA_DIR / 'known_capitalizations.tsv' with open(path, "r") as file: data_io = csv.reader(file, delimiter="\t") @@ -147,8 +153,7 @@ def transform_entry(entry) -> Graph: return graph -# todo: probably best to combine explicit abbrevs outside of this func -def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalization_threshold=0.75) -> List[str]: +def detect_abbreviations(label: str, capitalization_threshold=0.75) -> List[str]: """Detect possible abbreviations / acronyms""" # Compile regexp acronyms_without_periods_compiler = re.compile('[A-Z]{1}[A-Z0-9]{1,}') @@ -165,29 +170,21 @@ def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalizatio is_largely_uppercase = \ fully_capitalized_count / len(words) >= capitalization_threshold - # Detect acronyms without periods + # Detect cases if is_largely_uppercase: acronyms_without_periods = [] # can't infer because everything was uppercase else: - acronyms_without_periods = acronyms_without_periods_compiler.findall(label) - # Detect more - title_cased_abbrevs = title_cased_abbrev_compiler.findall(label) - acronyms_with_periods = acronyms_with_periods_compiler.findall(label) - # Combine list of things to re-format - replacements = [] - candidates: List[List[str]] = [ - acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs, [explicit_abbrev]] - for item_list in candidates: - for item in item_list: - if item: - replacements.append(item) - - return replacements + acronyms_without_periods: List[str] = acronyms_without_periods_compiler.findall(label) + title_cased_abbrevs: List[str] = title_cased_abbrev_compiler.findall(label) + acronyms_with_periods: List[str] = acronyms_with_periods_compiler.findall(label) + + return acronyms_with_periods + acronyms_without_periods + title_cased_abbrevs # todo: rename? 
It's doing more than cleaning; it's mutating def cleanup_title( title: str, + replacement_case_method: str = 'lower', # 'upper', 'title', 'lower', 'capitalize' (=sentence case) conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'], little_preps: List[str] = ['at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'], articles: List[str] = ['a', 'an', 'the'], @@ -197,9 +194,10 @@ def cleanup_title( :param title: A preferred, alternative, or included title. - 1. Removes the abbreviation suffixes - 2. Converts roman numerals to arabic - 3. Makes the text Title Case, except for supplied conjunctions/prepositions/articles + 1. Converts roman numerals to arabic + 2. Makes the text adhere to the case of `replacement_case_method`, except for supplied + conjunctions, prepositions, and articles, which will always be lowercased. NOTE: The default for this is 'lower', + meaning that this operation by default does nothing. Assumptions: 1. All acronyms are capitalized @@ -233,9 +231,6 @@ def cleanup_title( e.g.: Balint syndrome, Barre-Lieou syndrome, Wallerian degeneration, etc. How to do this? Simply get/create a list of known eponyms? Is this feasible? """ - # Simple method: Lower/title case everything but acronyms - # label_newcase = getattr(label2, replacement_case_method)() - # Advanced method: iteritavely format words fixedwords = [] i = 0 for wrd in title.split(): @@ -254,8 +249,7 @@ def cleanup_title( suffix = wrd.replace(toRoman(num), '', 1) fixed = ''.join((str(num), suffix)) wrd = fixed - # todo: next few lines don't make sense. why lower 'wrd', and then conditionally lowercase it again? - wrd = wrd.lower() + wrd = getattr(wrd, replacement_case_method)() # replace interior conjunctions, prepositions, and articles with lowercase, always if wrd in (conjunctions + little_preps + articles) and i != 1: wrd = wrd.lower() @@ -267,18 +261,30 @@ def cleanup_title( return label_newcase -# todo: explicit_abbrev: Change to List[str]. 
See: https://github.com/monarch-initiative/omim/issues/129
-def recapitalize_acronyms_in_title(title: str, explicit_abbrev=None, capitalization_threshold=0.75) -> str:
-    """Re-capitalize acronyms / words based on information contained w/in original label"""
-    # todo: probably best to combine explicit abbrevs outside of this func
-    possible_abbreviations = _detect_abbreviations(
-        title, explicit_abbrev, capitalization_threshold=capitalization_threshold)
+def recapitalize_acronyms_in_title(title: str, known_abbrevs: List[str] = None, capitalization_threshold=0.75) -> str:
+    """Re-capitalize acronyms / words in `title`, using `known_abbrevs` (may be None) plus those detected in `title`
+
+    todo: If title has been used on cleanup_title() using a replacement_case_method other than the default 'lower',
+     then the replacement below will not work. To solve this, (a) capture the replacement_case_method used and pass
+     that here, or (b) duplicate the replacement line and call it on alternative casing variations (.title() and
+     capitalize() (=sentence case)).
+    """
+    abbrevs: List[str] = (known_abbrevs or []) + detect_abbreviations(title, capitalization_threshold)  # None-safe
     title2 = title
-    for abbrev in possible_abbreviations:
-        title2 = title2.replace(abbrev.upper(), abbrev)
+    for abbrev in abbrevs:  # only replace stand-alone occurrences; a plain .replace() also hits inside longer words
+        title2 = re.sub(r'(?<![A-Za-z])' + re.escape(abbrev.lower()) + r'(?![A-Za-z])', lambda _: abbrev, title2)
     return title2
 
 
+def recapitalize_acronyms_in_titles(
+    titles: Union[str, List[str]], known_abbrevs: List[str] = None, capitalization_threshold=0.75
+) -> Union[str, List[str]]:
+    """Re-capitalize acronyms in one title (str in, str out) or in each title of a list (list in, list out)"""
+    if isinstance(titles, str):
+        return recapitalize_acronyms_in_title(titles, known_abbrevs, capitalization_threshold)
+    return [recapitalize_acronyms_in_title(title, known_abbrevs, capitalization_threshold) for title in titles]
+
+
 def remove_included_and_formerly_suffixes(title: str) -> str:
     """Remove ', INCLUDED' and ', FORMERLY' suffixes from a title"""
     for suffix in ['FORMERLY', 'INCLUDED']: