Skip to content

Commit

Permalink
Abbreviation recasing: use all abbrevs
Browse files Browse the repository at this point in the history
- Update: Now considering all abbreviations when checking a title and uppercasing them. Previously was only looking at the first preferred symbol.
- Update: Refactored, added comments and todos, simplified code.
- Bug fix: Was not actually uppercasing previously. The .replace() usage was incorrect.
  • Loading branch information
joeflack4 committed Sep 23, 2024
1 parent f5e4794 commit 7eefea5
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 50 deletions.
30 changes: 16 additions & 14 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@

from omim2obo.namespaces import *
from omim2obo.parsers.omim_entry_parser import cleanup_title, get_alt_and_included_titles_and_symbols, get_pubs, \
get_mapped_ids, \
recapitalize_acronyms_in_title
get_mapped_ids, recapitalize_acronyms_in_titles
from omim2obo.config import ROOT_DIR, GLOBAL_TERMS
from omim2obo.parsers.omim_txt_parser import *

Expand Down Expand Up @@ -204,6 +203,16 @@ def omim2obo(use_cache: bool = False):
get_alt_and_included_titles_and_symbols(inc_titles_str)
included_is_included = included_titles or included_symbols # redundant. can't be included symbol w/out title

# Recapitalize acronyms in titles
all_abbrevs: List[str] = \
pref_symbols + alt_symbols + former_alt_symbols + included_symbols + former_included_symbols
# todo: consider DRYing to 1 call by passing all 5 title types to a wrapper function
pref_title = recapitalize_acronyms_in_titles(pref_title, all_abbrevs)
alt_titles = recapitalize_acronyms_in_titles(alt_titles, all_abbrevs)
former_alt_titles = recapitalize_acronyms_in_titles(former_alt_titles, all_abbrevs)
included_titles = recapitalize_acronyms_in_titles(included_titles, all_abbrevs)
former_included_titles = recapitalize_acronyms_in_titles(former_included_titles, all_abbrevs)

# Special cases depending on OMIM term type
is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE
if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER: # %
Expand All @@ -227,25 +236,19 @@ def omim2obo(use_cache: bool = False):
else:
graph.add((omim_uri, RDFS.label, Literal(pref_title)))

# todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below,
# should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols
# for given term, should pass empty list. See: https://github.com/monarch-initiative/omim/issues/129
pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0]

# Add synonyms
# - exact titles
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(pref_title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(pref_title)))
for title in alt_titles:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(title)))
# - exact abbreviations
for abbrevs in [pref_symbols, alt_symbols]:
for abbreviation in abbrevs:
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasExactSynonym, abbreviation,
[(oboInOwl.hasSynonymType, OMO['0003000'])])
# - related, deprecated 'former' titles
for title in former_alt_titles:
clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title,
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, title,
[(OWL.deprecated, Literal(True))])
# - related, deprecated 'former' abbreviations
for abbreviation in former_alt_symbols:
Expand All @@ -259,7 +262,7 @@ def omim2obo(use_cache: bool = False):
graph.add((omim_uri, RDFS['comment'], Literal(included_comment)))
# - titles
for title in included_titles:
graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(title)))
# - symbols
for symbol in included_symbols:
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
Expand All @@ -268,8 +271,7 @@ def omim2obo(use_cache: bool = False):
])
# - deprecated, 'former'
for title in former_included_titles:
clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), clean_title,
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), title,
[(OWL.deprecated, Literal(True))])
for symbol in former_included_symbols:
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
Expand Down
78 changes: 42 additions & 36 deletions omim2obo/parsers/omim_entry_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# import re
from collections import defaultdict
from copy import copy
from typing import List, Dict, Tuple
from typing import List, Dict, Tuple, Union

import pandas as pd
from rdflib import Graph, RDF, RDFS, DC, Literal, OWL, SKOS, URIRef
Expand All @@ -21,15 +21,21 @@

def get_known_capitalizations() -> Dict[str, str]:
"""Get list of known capitalizations for proper names, acronyms, and the like.
TODO: Contains space-delimited words, e.g. "vitamin d". The way that
todo: Contains space-delimited words, e.g. "vitamin d". The way that
cleanup_label is currently implemented, each word in the label gets
replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence,
this would fail.
Therefore, we should probably do this in 2 different operations: (1) use
the current 'word replacement' logic, but also, (2), at the end, do a
generic string replacement (e.g. my_str.replace(a, b). When implementing
(2), we should also split this dictionary into two separate dictionaries,
each for 1 of these 2 different purposes."""
each for 1 of these 2 different purposes.
todo: known_capitalizations.tsv can be refactored possibly. It really only needs 1 column, the case to replace. The
pattern column is not used, and the first column (lowercase) can be computed by using .lower() on the case to
replace. We could also leave as-is since this file is shared elsewhere in the project infrastructure, though I do
not know its source-of-truth location.
"""
path = DATA_DIR / 'known_capitalizations.tsv'
with open(path, "r") as file:
data_io = csv.reader(file, delimiter="\t")
Expand Down Expand Up @@ -147,8 +153,7 @@ def transform_entry(entry) -> Graph:
return graph


# todo: probably best to combine explicit abbrevs outside of this func
def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalization_threshold=0.75) -> List[str]:
def detect_abbreviations(label: str, capitalization_threshold=0.75) -> List[str]:
"""Detect possible abbreviations / acronyms"""
# Compile regexp
acronyms_without_periods_compiler = re.compile('[A-Z]{1}[A-Z0-9]{1,}')
Expand All @@ -165,29 +170,21 @@ def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalizatio
is_largely_uppercase = \
fully_capitalized_count / len(words) >= capitalization_threshold

# Detect acronyms without periods
# Detect cases
if is_largely_uppercase:
acronyms_without_periods = [] # can't infer because everything was uppercase
else:
acronyms_without_periods = acronyms_without_periods_compiler.findall(label)
# Detect more
title_cased_abbrevs = title_cased_abbrev_compiler.findall(label)
acronyms_with_periods = acronyms_with_periods_compiler.findall(label)
# Combine list of things to re-format
replacements = []
candidates: List[List[str]] = [
acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs, [explicit_abbrev]]
for item_list in candidates:
for item in item_list:
if item:
replacements.append(item)

return replacements
acronyms_without_periods: List[str] = acronyms_without_periods_compiler.findall(label)
title_cased_abbrevs: List[str] = title_cased_abbrev_compiler.findall(label)
acronyms_with_periods: List[str] = acronyms_with_periods_compiler.findall(label)

return acronyms_with_periods + acronyms_without_periods + title_cased_abbrevs


# todo: rename? It's doing more than cleaning; it's mutating
def cleanup_title(
title: str,
replacement_case_method: str = 'lower', # 'upper', 'title', 'lower', 'capitalize' (=sentence case)
conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
little_preps: List[str] = ['at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
articles: List[str] = ['a', 'an', 'the'],
Expand All @@ -197,9 +194,10 @@ def cleanup_title(
:param title: A preferred, alternative, or included title.
1. Removes the abbreviation suffixes
2. Converts roman numerals to arabic
3. Makes the text Title Case, except for supplied conjunctions/prepositions/articles
1. Converts roman numerals to arabic
2. Makes the text adhere to the case of `replacement_case_method`, except for supplied
conjunctions, prepositions, and articles, which will always be lowercased. NOTE: The default for this is 'lower',
meaning that this operation by default does nothing.
Assumptions:
1. All acronyms are capitalized
Expand Down Expand Up @@ -233,9 +231,6 @@ def cleanup_title(
e.g.: Balint syndrome, Barre-Lieou syndrome, Wallerian degeneration, etc.
How to do this? Simply get/create a list of known eponyms? Is this feasible?
"""
# Simple method: Lower/title case everything but acronyms
# label_newcase = getattr(label2, replacement_case_method)()
# Advanced method: iteritavely format words
fixedwords = []
i = 0
for wrd in title.split():
Expand All @@ -254,8 +249,7 @@ def cleanup_title(
suffix = wrd.replace(toRoman(num), '', 1)
fixed = ''.join((str(num), suffix))
wrd = fixed
# todo: next few lines don't make sense. why lower 'wrd', and then conditionally lowercase it again?
wrd = wrd.lower()
wrd = getattr(wrd, replacement_case_method)()
# replace interior conjunctions, prepositions, and articles with lowercase, always
if wrd in (conjunctions + little_preps + articles) and i != 1:
wrd = wrd.lower()
Expand All @@ -267,18 +261,30 @@ def cleanup_title(
return label_newcase


def recapitalize_acronyms_in_title(title: str, known_abbrevs: List[str] = None, capitalization_threshold=0.75) -> str:
    """Re-capitalize acronyms / words in a title, based on known and detected abbreviations.

    Every abbreviation (those passed in `known_abbrevs` plus those returned by `detect_abbreviations()` for this
    title) is looked up in the title in its lowercased form and replaced by the abbreviation as given.

    :param title: The title in which to restore the casing of abbreviations.
    :param known_abbrevs: Abbreviations / symbols already known for the term. `None` (the default) is treated the
      same as an empty list.
    :param capitalization_threshold: Passed through unchanged to `detect_abbreviations()`.
    :return: The title with each lowercased occurrence of an abbreviation replaced by the abbreviation itself.

    todo: If title has been used on cleanup_title() using a replacement_case_method other than the default 'lower',
     then the .replace() operation will not work. To solve this, (a) capture the replacement_case_method used and
     pass that here, or (b) duplicate the .replace() line and call it on alternative casing variations (.title() and
     capitalize() (=sentence case)).
    """
    # The parameter defaults to None, and `None + list` raises TypeError; normalise to a list first.
    abbrevs: List[str] = list(known_abbrevs or []) + detect_abbreviations(title, capitalization_threshold)
    title2 = title
    for abbrev in abbrevs:
        # NOTE: str.replace() is a plain substring replacement, so this also matches inside longer words.
        title2 = title2.replace(abbrev.lower(), abbrev)
    return title2


def recapitalize_acronyms_in_titles(
    titles: Union[str, List[str]], known_abbrevs: List[str] = None, capitalization_threshold=0.75
) -> Union[str, List[str]]:
    """Re-capitalize acronyms in a single title or in each title of a list.

    :param titles: Either one title (str) or a list of titles.
    :param known_abbrevs: Abbreviations forwarded as-is to `recapitalize_acronyms_in_title()`.
    :param capitalization_threshold: Forwarded as-is to `recapitalize_acronyms_in_title()`.
    :return: A str when a str was passed in; otherwise a list with one re-capitalized title per input title, in
      the same order.
    """
    is_single_title = isinstance(titles, str)
    # Wrap a lone title so both input shapes go through the same loop.
    batch = [titles] if is_single_title else titles
    recased = [recapitalize_acronyms_in_title(entry, known_abbrevs, capitalization_threshold) for entry in batch]
    return recased[0] if is_single_title else recased


def remove_included_and_formerly_suffixes(title: str) -> str:
"""Remove ', INCLUDED' and ', FORMERLY' suffixes from a title"""
for suffix in ['FORMERLY', 'INCLUDED']:
Expand Down

0 comments on commit 7eefea5

Please sign in to comment.