
Commit fb367bd

feat(intl/eu): 75x speedup text data formatting
2 parents d3d38e5 + b7ada44


4 files changed: +209 -111 lines changed

onsides_intl/onsides_eu/pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -4,7 +4,9 @@ version = "0.1.0"
 description = "ONSIDES European Union"
 readme = "README.md"
 requires-python = ">=3.11"
-dependencies = []
+dependencies = [
+    "pyahocorasick>=2.1.0",
+]

 [build-system]
 requires = ["hatchling"]
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import ahocorasick
+from pydantic import BaseModel
+
+
+class MeddraSearchTerm(BaseModel):
+    term: str
+    meddra_pt_code: int
+
+
+class FoundMeddraTerm(BaseModel):
+    term: str
+    meddra_pt_code: int
+    start: int
+    end: int
+
+
+def build_meddra_search_tree(
+    meddra_terms: list[MeddraSearchTerm],
+) -> ahocorasick.Automaton:
+    """
+    Builds an Aho-Corasick tree from a list of MedDRA terms.
+    """
+    tree = ahocorasick.Automaton(str, str)
+    for meddra_obj in meddra_terms:
+        tree.add_word(meddra_obj.term, meddra_obj.model_dump_json())
+    tree.make_automaton()
+    return tree
+
+
+def find_meddra_terms_in_text(
+    text: str,
+    meddra_tree: ahocorasick.Automaton,
+) -> list[FoundMeddraTerm]:
+    """
+    Finds all MedDRA terms in a text using an Aho-Corasick tree.
+    """
+    found_terms = list()
+    for end_index, obj_json in meddra_tree.iter(text):
+        meddra_obj = MeddraSearchTerm.model_validate_json(obj_json)
+        start_index = end_index - len(meddra_obj.term) + 1
+        obj = FoundMeddraTerm(
+            term=meddra_obj.term,
+            meddra_pt_code=meddra_obj.meddra_pt_code,
+            start=start_index,
+            end=end_index,
+        )
+        found_terms.append(obj)
+    return found_terms
+
+
+def build_bert_string(
+    text: str,
+    match: FoundMeddraTerm,
+    nwords: int = 125,
+    prop_before: float = 0.125,
+) -> str:
+    term_nwords = len(match.term.split())
+    n_words_before = prop_before * (nwords - 2 * term_nwords)
+    n_words_after = (1 - prop_before) * (nwords - 2 * term_nwords)
+    n_words_before = max(int(n_words_before), 1)
+    n_words_after = max(int(n_words_after), 1)
+    before_words = text[: match.start].split()[-n_words_before:]
+    after_words = text[match.end :].split()[:n_words_after]
+    words_list = [match.term] + before_words + ["EVENT"] + after_words
+    result = " ".join(words_list)
+    return result
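
For orientation, a minimal usage sketch of the new stringsearch helpers (not part of the commit); the terms and PT codes below are made-up placeholders rather than real MedDRA entries:

from onsides_eu.stringsearch import (
    MeddraSearchTerm,
    build_bert_string,
    build_meddra_search_tree,
    find_meddra_terms_in_text,
)

# Placeholder vocabulary; the real pipeline loads PT/LLT terms from umls_meddra_en.csv.
terms = [
    MeddraSearchTerm(term="headache", meddra_pt_code=10000001),
    MeddraSearchTerm(term="nausea", meddra_pt_code=10000002),
]
tree = build_meddra_search_tree(terms)

text = "common adverse reactions include headache and mild nausea"
for match in find_meddra_terms_in_text(text, tree):
    # Each match carries the matched term, its PT code, and its character span.
    print(match.term, match.meddra_pt_code, match.start, match.end)
    # Truncated context string of the form fed to the BERT model.
    print(build_bert_string(text, match))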
Lines changed: 116 additions & 110 deletions
@@ -1,114 +1,120 @@
-import numpy as np
-import pandas as pd
-import requests
-from tqdm import tqdm
-from glob import glob
-import ast, re, json, orjson
-from time import sleep
+"""Find MedDRA term exact matches in drug label free text. Setup data for the
+OnSIDES model.
+
+There are a couple things that I did here which should be clarified.
+
+First, I don't do any RxNorm mapping here. I saw that previous code attempted to
+do this, but it was trying to join drug names from EMA to RxNorm SET IDs, which
+look like UUIDs. Text joins didn't work, so I just explicitly set those columns
+to None below, just to ensure we have all the same columns as the original code.
+
+Second, I'm not sure about the "AR" section. That's what the previous code did,
+so I just did it here too.
+
+Third, I used the same MedDRA terms as previous code (I think). Not sure why
+we're only using 5 character or longer terms. My code looks for either PT or LLT
+and maps to PT terms.
+"""
+
 import argparse
-import warnings
-warnings.filterwarnings('ignore')
-import os
+import logging
+import pathlib
+
+import polars as pl
+import tqdm.auto as tqdm
+
+from onsides_eu.stringsearch import (
+    MeddraSearchTerm,
+    build_bert_string,
+    build_meddra_search_tree,
+    find_meddra_terms_in_text,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def format_text(
+    data_folder: pathlib.Path,
+    external_data_folder: pathlib.Path,
+) -> None:
+    drug_to_ade_text = (
+        pl.read_csv(data_folder / "ade_text_table.csv")
+        .with_columns(pl.col("ade_text").str.to_lowercase())
+        .select("drug", "ade_text")
+        .to_dicts()
+    )
+    meddra_df = (
+        pl.read_csv(external_data_folder / "umls_meddra_en.csv")
+        .filter(
+            pl.col("TTY").is_in({"PT", "LLT"}),
+        )
+        .with_columns(
+            pl.col("STR").str.to_lowercase().alias("term"),
+        )
+        .rename({"SDUI": "meddra_pt_code"})
+    )
+    meddra_pt_code_to_term = (
+        meddra_df.filter(pl.col("TTY").eq("PT"))
+        .select("STR", "meddra_pt_code")
+        .to_pandas()
+        .set_index("meddra_pt_code")["STR"]
+        .to_dict()
+    )
+    meddra_terms = (
+        meddra_df.filter(pl.col("term").str.len_chars().ge(5))
+        .select("term", "meddra_pt_code")
+        .unique()
+        .to_dicts()
+    )
+    meddra_terms = [MeddraSearchTerm.model_validate(t) for t in meddra_terms]
+    logger.info(
+        f"Found {len(drug_to_ade_text)} drugs. "
+        f"Searching for exact matches of {len(meddra_terms)} MedDRA terms."
+    )
+    meddra_tree = build_meddra_search_tree(meddra_terms)
+
+    exact_terms = list()
+    for drug_term in tqdm.tqdm(drug_to_ade_text):
+        ade_text = drug_term["ade_text"]
+        matches = find_meddra_terms_in_text(ade_text, meddra_tree)
+        for match in matches:
+            bert_string = build_bert_string(ade_text, match)
+            row = {
+                "label_id": drug_term["drug"],
+                "found_term": match.term,
+                "location": match.start,
+                "string": bert_string,
+                "section": "AR",
+                "set_id": drug_term["drug"],
+                "drug": None,
+                "spl_version": None,
+                "pt_meddra_id": match.meddra_pt_code,
+                "pt_meddra_term": meddra_pt_code_to_term.get(match.meddra_pt_code),
+            }
+            exact_terms.append(row)
+
+    logger.info(f"Found {len(exact_terms)} exact matches.")
+    pl.DataFrame(exact_terms).write_csv(data_folder / "bert_input_v2.csv")
+

 def main():
-    parser = argparse.ArgumentParser(description='let the code know where the data is held')
-    parser.add_argument('--data_folder', required=True, help='Path to the data folder.')
-    parser.add_argument('--external_data', required=True, help='Path to the where the external data is housed.')
-    parser.add_argument('--map_folder', required=True, help='Path to the where the external data used for OnSIDES model is housed.')
+    logging.basicConfig(level=logging.INFO)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_folder",
+        type=pathlib.Path,
+        required=True,
+        help="Path to the data folder.",
+    )
+    parser.add_argument(
+        "--external_data",
+        type=pathlib.Path,
+        required=True,
+        help="Path to the external data folder.",
+    )
     args = parser.parse_args()
-    data_folder = args.data_folder
-    external_data_folder = args.external_data
-    map_folder = args.map_folder
-
-    #read in table for drug-ade free-text data
-    ade_text_table_df = pd.read_csv(data_folder+'ade_text_table.csv')
-
-    ##Standard Vocabulary Mapping - here, we will use the UMLS MedDRA tables.
-    meddra_df = pd.read_csv(external_data_folder+'umls_meddra_en.csv')
-    meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
-    meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
-    meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
-    meddra_df = meddra_df[(meddra_df.TTY == 'PT')|(meddra_df['len'] > 5)]
-
-    exact_terms = []
-    for i, row in tqdm(ade_text_table_df.iterrows()):
-        label_id = row['drug']
-        text = row['ade_txt'].lower()
-        found_terms = list()
-        for mdr_term in meddra_dict.keys():
-            if text.find(mdr_term) == -1:
-                continue
-            else:
-                li = text.split(mdr_term)
-                start_pos = 0
-                for i in range(len(li)-1):
-                    # the occurrence of the word is at the end of the previous string
-                    start_pos = sum([len(li[j]) for j in range(i+1)]) + i*len(mdr_term)
-                    if not mdr_term == text[start_pos:(start_pos+len(mdr_term))]:
-                        raise Exception(f" mdr_term: '{mdr_term}', term_in_text: '{text[start_pos:(start_pos+len(mdr_term))]}'")
-                    found_terms.append((mdr_term, meddra_dict[mdr_term], start_pos, len(mdr_term)))
-        exact_terms.append([label_id, found_terms])
-
-    exact_terms_df = pd.DataFrame(exact_terms, columns=['label_id', 'found_terms'])
-    exact_terms_df = exact_terms_df.explode('found_terms')
-    exact_terms_df['len'] = exact_terms_df['found_terms'].apply(lambda x: x[3] if str(x) != 'nan' else None)
-    exact_terms_df = exact_terms_df[exact_terms_df['len'] >= 5]
-    exact_terms_df['found_term'] = exact_terms_df['found_terms'].apply(lambda x: x[0] if str(x) != 'nan' else None)
-    exact_terms_df['meddra_id'] = exact_terms_df['found_terms'].apply(lambda x: x[1] if str(x) != 'nan' else None)
-    exact_terms_df['location'] = exact_terms_df['found_terms'].apply(lambda x: x[2] if str(x) != 'nan' else None)
-    exact_terms_df = exact_terms_df.drop(['found_terms', 'len'], axis = 1)
-
-    building_strings = []
-    ade_text_table_dict = dict(zip(ade_text_table_df.drug, ade_text_table_df.ade_txt))
-    for i, row in tqdm(exact_terms_df.iterrows()):
-        term, label_id, start_pos = row['found_term'], row['label_id'], row['location']
-        #default settings
-        nwords, prop_before = 125, 0.125
-        #pull the full text
-        ar_text = ade_text_table_dict[label_id]
-
-        term_nwords = len(term.split())
-        size_before = max(int((nwords-2*term_nwords)*prop_before), 1)
-        size_after = max(int((nwords-2*term_nwords)*(1-prop_before)), 1)
-
-        before_text = ar_text[:start_pos]
-        after_text = ar_text[(start_pos+term_nwords):]
-
-        before_parts = before_text.split()[-1*size_before:]
-        after_parts = after_text.split()[:size_after]
-
-        li = [term]
-        li.extend(before_parts)
-        li.append('EVENT')
-        li.extend(after_parts)
-        example_string = ' '.join(li)
-        building_strings.append(example_string)
-    exact_terms_df['string'] = building_strings
-
-    #save dataframe
-    exact_terms_df.to_csv(data_folder+'sentences-rx_method14_nwords125_clinical_bert_application_set_AR.csv', index=False)
-
-    #further prep the data for the model
-    #required columns : section, drug, label_id, set_id, spl_version, pt_meddra_id, pt_meddra_term
-    exact_terms_df = pd.read_csv(data_folder+'sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0924.csv')
-    exact_terms_df['section'] = 'AR'
-    exact_terms_df['set_id'] = exact_terms_df['label_id']
-
-    drug_map = pd.read_csv(map_folder+'spl/maps/20230512/rxnorm_mappings.txt', delimiter = '|')
-    drug_id_dict = dict(zip(drug_map.SETID, drug_map.RXCUI))
-    drug_ver_dict = dict(zip(drug_map.SETID, drug_map.SPL_VERSION))
-    exact_terms_df['drug'] = exact_terms_df.set_id.apply(lambda x: drug_id_dict[x] if x in drug_id_dict.keys() else None)
-    exact_terms_df['spl_version'] = exact_terms_df.set_id.apply(lambda x: drug_ver_dict[x] if x in drug_ver_dict.keys() else None)
-
-    llt_pt = pd.read_csv(map_folder+'meddra_llt_pt_map.txt', delimiter = '|')
-    llt_pt_id_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_code))
-    llt_pt_term_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_name))
-    exact_terms_df['pt_meddra_id'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_id_dict[x] if x in llt_pt_id_dict.keys() else None)
-    exact_terms_df['pt_meddra_term'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_term_dict[x] if x in llt_pt_term_dict.keys() else None)
-
-    #save dataframe
-    exact_terms_df.to_csv(data_folder+'sentences-rx_method14_nwords125_clinical_bert_application_set_AR.csv', index=False)
-
-
-if __name__ == '__main__':
-    main()
+    format_text(args.data_folder, args.external_data)
+
+
+if __name__ == "__main__":
+    main()
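
The speedup in the commit title comes from replacing the deleted per-term text.find() loop with a single Aho-Corasick pass over each label text, reusing one automaton across all drug labels. A rough, self-contained sketch of that contrast (toy terms and codes, not the real MedDRA vocabulary loaded by the pipeline):

import ahocorasick

# Toy stand-ins for the MedDRA vocabulary; the real pipeline holds many
# thousands of PT/LLT terms, which is where the savings add up.
terms = {"headache": 1, "nausea": 2, "rash": 3}
text = "adverse reactions: headache, transient rash and mild nausea"

# Old approach: one full scan of the text per term, roughly O(terms * text).
naive_hits = sorted(term for term in terms if text.find(term) != -1)

# New approach: build the automaton once, then scan the text a single time.
automaton = ahocorasick.Automaton()
for term, code in terms.items():
    automaton.add_word(term, (term, code))
automaton.make_automaton()
aho_hits = sorted(term for _end, (term, _code) in automaton.iter(text))

assert naive_hits == aho_hits == ["headache", "nausea", "rash"]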

onsides_intl/uv.lock

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default.
