1
- import numpy as np
2
- import pandas as pd
3
- import requests
4
- from tqdm import tqdm
5
- from glob import glob
6
- import ast , re , json , orjson
7
- from time import sleep
1
+ """Find MedDRA term exact matches in drug label free text. Setup data for the
2
+ OnSIDES model.
3
+
4
+ There are a couple of things I did here that should be clarified.
5
+
6
+ First, I don't do any RxNorm mapping here. I saw that previous code attempted to
7
+ do this, but it was trying to join drug names from EMA to RxNorm SET IDs, which
8
+ look like UUIDs. Text joins didn't work, so I just explicitly set those columns
9
+ to None below, just to ensure we have all the same columns as the original code.
10
+
11
+ Second, I'm not sure about the "AR" section. That's what the previous code did,
12
+ so I just did it here too.
13
+
14
+ Third, I used the same MedDRA terms as previous code (I think). Not sure why
15
+ we're only using terms of 5 or more characters. My code looks for either PT or LLT
16
+ and maps to PT terms.
17
+ """
18
+
8
19
import argparse
9
- import warnings
10
- warnings .filterwarnings ('ignore' )
11
- import os
20
+ import logging
21
+ import pathlib
22
+
23
+ import polars as pl
24
+ import tqdm .auto as tqdm
25
+
26
+ from onsides_eu .stringsearch import (
27
+ MeddraSearchTerm ,
28
+ build_bert_string ,
29
+ build_meddra_search_tree ,
30
+ find_meddra_terms_in_text ,
31
+ )
32
+
33
+ logger = logging .getLogger (__name__ )
34
+
35
+
36
# Column order of the output CSV. Must mirror the keys of the row dicts built
# in ``format_text`` so that a header-only file matches a populated one.
_BERT_INPUT_COLUMNS = (
    "label_id",
    "found_term",
    "location",
    "string",
    "section",
    "set_id",
    "drug",
    "spl_version",
    "pt_meddra_id",
    "pt_meddra_term",
)


def format_text(
    data_folder: pathlib.Path,
    external_data_folder: pathlib.Path,
) -> None:
    """Find exact MedDRA term matches in ADE free text and write the model input.

    Reads ``ade_text_table.csv`` from ``data_folder`` (columns ``drug`` and
    ``ade_text``; the text is lowercased before searching) and
    ``umls_meddra_en.csv`` from ``external_data_folder`` (columns ``TTY``,
    ``STR`` and ``SDUI``). Only ``PT`` and ``LLT`` rows are kept, and only
    lowercased terms of at least 5 characters are searched for. One output row
    is produced per match and written to ``bert_input_v2.csv`` in
    ``data_folder``.

    The ``drug`` and ``spl_version`` columns are always empty; they exist only
    so the output has the column set listed in ``_BERT_INPUT_COLUMNS``.

    Args:
        data_folder: Folder holding ``ade_text_table.csv``; the output CSV is
            written here as well.
        external_data_folder: Folder holding ``umls_meddra_en.csv``.
    """
    drug_to_ade_text = (
        pl.read_csv(data_folder / "ade_text_table.csv")
        .with_columns(pl.col("ade_text").str.to_lowercase())
        .select("drug", "ade_text")
        .to_dicts()
    )
    # NOTE(review): SDUI is used as the PT code for both PT and LLT rows. This
    # assumes the UMLS export stores the parent PT code in SDUI for LLTs.
    # TODO confirm against the export.
    meddra_df = (
        pl.read_csv(external_data_folder / "umls_meddra_en.csv")
        .filter(
            pl.col("TTY").is_in({"PT", "LLT"}),
        )
        .with_columns(
            pl.col("STR").str.to_lowercase().alias("term"),
        )
        .rename({"SDUI": "meddra_pt_code"})
    )
    # PT code -> original-case PT name. Built straight from polars rows so that
    # no pandas/pyarrow round trip is required; later duplicates win, as before.
    pt_rows = meddra_df.filter(pl.col("TTY").eq("PT")).select(
        "meddra_pt_code", "STR"
    )
    meddra_pt_code_to_term = dict(pt_rows.iter_rows())

    meddra_terms = (
        meddra_df.filter(pl.col("term").str.len_chars().ge(5))
        .select("term", "meddra_pt_code")
        .unique()
        .to_dicts()
    )
    meddra_terms = [MeddraSearchTerm.model_validate(t) for t in meddra_terms]
    logger.info(
        "Found %d drugs. Searching for exact matches of %d MedDRA terms.",
        len(drug_to_ade_text),
        len(meddra_terms),
    )
    meddra_tree = build_meddra_search_tree(meddra_terms)

    exact_terms = []
    for drug_term in tqdm.tqdm(drug_to_ade_text):
        ade_text = drug_term["ade_text"]
        matches = find_meddra_terms_in_text(ade_text, meddra_tree)
        for match in matches:
            bert_string = build_bert_string(ade_text, match)
            row = {
                "label_id": drug_term["drug"],
                "found_term": match.term,
                "location": match.start,
                "string": bert_string,
                "section": "AR",
                "set_id": drug_term["drug"],
                "drug": None,
                "spl_version": None,
                "pt_meddra_id": match.meddra_pt_code,
                "pt_meddra_term": meddra_pt_code_to_term.get(match.meddra_pt_code),
            }
            exact_terms.append(row)

    logger.info("Found %d exact matches.", len(exact_terms))
    if exact_terms:
        output = pl.DataFrame(exact_terms)
    else:
        # An empty list of rows yields a frame with no columns, which would be
        # written as a CSV without a header. Emit the header explicitly so the
        # file is still readable downstream.
        output = pl.DataFrame(schema={name: pl.Utf8 for name in _BERT_INPUT_COLUMNS})
    output.write_csv(data_folder / "bert_input_v2.csv")
98
+
12
99
13
100
def main() -> None:
    """Parse the command-line folders and run ``format_text`` on them.

    Configures root logging at INFO level, then requires two options, both
    converted to ``pathlib.Path``: ``--data_folder`` and ``--external_data``.
    """
    logging.basicConfig(level=logging.INFO)

    # Both options share the same type and are mandatory; only the flag and
    # the help text differ.
    folder_options = (
        ("--data_folder", "Path to the data folder."),
        ("--external_data", "Path to the external data folder."),
    )
    cli = argparse.ArgumentParser()
    for flag, description in folder_options:
        cli.add_argument(
            flag,
            type=pathlib.Path,
            required=True,
            help=description,
        )

    namespace = cli.parse_args()
    format_text(namespace.data_folder, namespace.external_data)


if __name__ == "__main__":
    main()
0 commit comments