Skip to content

Commit ee74007

Browse files
committed
preprocess scripts for bc5cdr
1 parent d84a559 commit ee74007

File tree

5 files changed

+214
-126
lines changed

5 files changed

+214
-126
lines changed

preprocess/README.md

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,170 @@ python ./query_preprocess.py \
164164
--resolve_composites \
165165
--lowercase true \
166166
--remove_punctuation true
167+
```
168+
169+
## BC5CDR-Disease
170+
You can preprocess the BC5CDR-Disease dataset from scratch.
171+
172+
First, parse the raw `BC5CDR` data.
173+
The result will be mention files (`*.concept`) and context files (`*.txt`).
174+
```
175+
DATA_DIR=../datasets
176+
177+
python ./bc5cdr_preprocess.py \
178+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_TrainingSet.PubTator.txt \
179+
--output_dir ${DATA_DIR}/bc5cdr-disease/train \
180+
--type disease
181+
182+
python ./bc5cdr_preprocess.py \
183+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_DevelopmentSet.PubTator.txt \
184+
--output_dir ${DATA_DIR}/bc5cdr-disease/dev \
185+
--type disease
186+
187+
python ./bc5cdr_preprocess.py \
188+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_TestSet.PubTator.txt \
189+
--output_dir ${DATA_DIR}/bc5cdr-disease/test \
190+
--type disease
191+
```
192+
193+
Second, apply text preprocessing to the train/dev/test datasets and their dictionaries.
194+
```
195+
DATA_DIR=../datasets
196+
AB3P_PATH=../Ab3P/identify_abbr
197+
198+
# preprocess trainset and its dictionary
199+
python dictionary_preprocess.py \
200+
--input_dictionary_path ./resources/medic_06Jul2012.txt \
201+
--output_dictionary_path ${DATA_DIR}/bc5cdr-disease/train_dictionary.txt \
202+
--lowercase \
203+
--remove_punctuation
204+
205+
python ./query_preprocess.py \
206+
--input_dir ${DATA_DIR}/bc5cdr-disease/train/ \
207+
--output_dir ${DATA_DIR}/bc5cdr-disease/processed_train/ \
208+
--dictionary_path ${DATA_DIR}/bc5cdr-disease/train_dictionary.txt \
209+
--ab3p_path ${AB3P_PATH} \
210+
--remove_cuiless \
211+
--resolve_composites \
212+
--lowercase true \
213+
--remove_punctuation true
214+
215+
# preprocess devset and its dictionary
216+
python dictionary_preprocess.py \
217+
--input_dictionary_path ${DATA_DIR}/bc5cdr-disease/train_dictionary.txt \
218+
--additional_data_dir ${DATA_DIR}/bc5cdr-disease/processed_train/ \
219+
--output_dictionary_path ${DATA_DIR}/bc5cdr-disease/dev_dictionary.txt \
220+
--lowercase \
221+
--remove_punctuation
222+
223+
python ./query_preprocess.py \
224+
--input_dir ${DATA_DIR}/bc5cdr-disease/dev/ \
225+
--output_dir ${DATA_DIR}/bc5cdr-disease/processed_dev/ \
226+
--dictionary_path ${DATA_DIR}/bc5cdr-disease/dev_dictionary.txt \
227+
--ab3p_path ${AB3P_PATH} \
228+
--remove_cuiless \
229+
--resolve_composites \
230+
--lowercase true \
231+
--remove_punctuation true
232+
233+
# preprocess testset and its dictionary
234+
python dictionary_preprocess.py \
235+
--input_dictionary_path ${DATA_DIR}/bc5cdr-disease/dev_dictionary.txt \
236+
--additional_data_dir ${DATA_DIR}/bc5cdr-disease/processed_dev \
237+
--output_dictionary_path ${DATA_DIR}/bc5cdr-disease/test_dictionary.txt \
238+
--lowercase \
239+
--remove_punctuation
240+
241+
python ./query_preprocess.py \
242+
--input_dir ${DATA_DIR}/bc5cdr-disease/test/ \
243+
--output_dir ${DATA_DIR}/bc5cdr-disease/processed_test/ \
244+
--dictionary_path ${DATA_DIR}/bc5cdr-disease/test_dictionary.txt \
245+
--ab3p_path ${AB3P_PATH} \
246+
--remove_cuiless \
247+
--resolve_composites \
248+
--lowercase true \
249+
--remove_punctuation true
250+
```
251+
252+
## BC5CDR-Chemical
253+
You can preprocess the BC5CDR-Chemical dataset from scratch.
254+
255+
First, parse the raw `BC5CDR` data.
256+
The result will be mention files (`*.concept`) and context files (`*.txt`).
257+
```
258+
DATA_DIR=../datasets
259+
260+
python ./bc5cdr_preprocess.py \
261+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_TrainingSet.PubTator.txt \
262+
--output_dir ${DATA_DIR}/bc5cdr-chemical/train \
263+
--type chemical
264+
265+
python ./bc5cdr_preprocess.py \
266+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_DevelopmentSet.PubTator.txt \
267+
--output_dir ${DATA_DIR}/bc5cdr-chemical/dev \
268+
--type chemical
269+
270+
python ./bc5cdr_preprocess.py \
271+
--input_file ${DATA_DIR}/raw/bc5cdr/CDR_TestSet.PubTator.txt \
272+
--output_dir ${DATA_DIR}/bc5cdr-chemical/test \
273+
--type chemical
274+
```
275+
276+
Second, apply text preprocessing to the train/dev/test datasets and their dictionaries.
277+
```
278+
DATA_DIR=../datasets
279+
AB3P_PATH=../Ab3P/identify_abbr
280+
281+
# preprocess trainset and its dictionary
282+
python dictionary_preprocess.py \
283+
--input_dictionary_path ./resources/ctd_chemical_04Nov2019.txt \
284+
--output_dictionary_path ${DATA_DIR}/bc5cdr-chemical/train_dictionary.txt \
285+
--lowercase \
286+
--remove_punctuation
287+
288+
python ./query_preprocess.py \
289+
--input_dir ${DATA_DIR}/bc5cdr-chemical/train/ \
290+
--output_dir ${DATA_DIR}/bc5cdr-chemical/processed_train/ \
291+
--dictionary_path ${DATA_DIR}/bc5cdr-chemical/train_dictionary.txt \
292+
--ab3p_path ${AB3P_PATH} \
293+
--remove_cuiless \
294+
--resolve_composites \
295+
--lowercase true \
296+
--remove_punctuation true
297+
298+
# preprocess devset and its dictionary
299+
python dictionary_preprocess.py \
300+
--input_dictionary_path ${DATA_DIR}/bc5cdr-chemical/train_dictionary.txt \
301+
--additional_data_dir ${DATA_DIR}/bc5cdr-chemical/processed_train/ \
302+
--output_dictionary_path ${DATA_DIR}/bc5cdr-chemical/dev_dictionary.txt \
303+
--lowercase \
304+
--remove_punctuation
305+
306+
python ./query_preprocess.py \
307+
--input_dir ${DATA_DIR}/bc5cdr-chemical/dev/ \
308+
--output_dir ${DATA_DIR}/bc5cdr-chemical/processed_dev/ \
309+
--dictionary_path ${DATA_DIR}/bc5cdr-chemical/dev_dictionary.txt \
310+
--ab3p_path ${AB3P_PATH} \
311+
--remove_cuiless \
312+
--resolve_composites \
313+
--lowercase true \
314+
--remove_punctuation true
315+
316+
# preprocess testset and its dictionary
317+
python dictionary_preprocess.py \
318+
--input_dictionary_path ${DATA_DIR}/bc5cdr-chemical/dev_dictionary.txt \
319+
--additional_data_dir ${DATA_DIR}/bc5cdr-chemical/processed_dev \
320+
--output_dictionary_path ${DATA_DIR}/bc5cdr-chemical/test_dictionary.txt \
321+
--lowercase \
322+
--remove_punctuation
323+
324+
python ./query_preprocess.py \
325+
--input_dir ${DATA_DIR}/bc5cdr-chemical/test/ \
326+
--output_dir ${DATA_DIR}/bc5cdr-chemical/processed_test/ \
327+
--dictionary_path ${DATA_DIR}/bc5cdr-chemical/test_dictionary.txt \
328+
--ab3p_path ${AB3P_PATH} \
329+
--remove_cuiless \
330+
--resolve_composites \
331+
--lowercase true \
332+
--remove_punctuation true
167333
```

preprocess/bc5cdr_preprocess.py

Lines changed: 19 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -5,109 +5,13 @@
55
"""
66

77
import os
8-
import pdb
9-
10-
# input_files = [
11-
# './datasets/raw/bc5cdr/CDR_TrainingSet.PubTator.txt',
12-
# './datasets/raw/bc5cdr/CDR_DevelopmentSet.PubTator.txt',
13-
# './datasets/raw/bc5cdr/CDR_TestSet.PubTator.txt',
14-
# ]
15-
16-
# disease_output_dirs = [
17-
# os.path.join('./datasets/bc5cdr-disease', 'train'),
18-
# os.path.join('./datasets/bc5cdr-disease', 'dev'),
19-
# os.path.join('./datasets/bc5cdr-disease', 'test'),
20-
# ]
21-
22-
# chemical_output_dirs = [
23-
# os.path.join('./datasets/bc5cdr-chemical', 'train'),
24-
# os.path.join('./datasets/bc5cdr-chemical', 'dev'),
25-
# os.path.join('./datasets/bc5cdr-chemical', 'test'),
26-
# ]
27-
28-
for input_file, disease_output_dir, chemical_output_dir in zip(input_files, disease_output_dirs, chemical_output_dirs):
29-
if not os.path.exists(disease_output_dir):
30-
os.makedirs(disease_output_dir)
31-
if not os.path.exists(chemical_output_dir):
32-
os.makedirs(chemical_output_dir)
33-
34-
with open(input_file, 'r') as f:
35-
lines = f.readlines()
36-
37-
38-
disease_queries = []
39-
chemical_queries = []
40-
pmids = []
41-
lines = lines + ['\n']
42-
num_docs = 0
43-
num_disease_queries = 0
44-
num_chemical_queries = 0
45-
for line in lines:
46-
line = line.strip()
47-
if '|t|' in line:
48-
title = line.split("|")[2]
49-
elif '|a|' in line:
50-
abstract = line.split("|")[2]
51-
elif '\t' in line:
52-
line = line.split("\t")
53-
if len(line) == 6:
54-
pmid, start, end, mention, _class, cui = line
55-
elif len(line) == 4: # CID
56-
continue
57-
elif len(line) == 7: # Composite mention
58-
pmid, start, end, mention, _class, cui, composite_mentions = line
59-
if composite_mentions.count("|") == cui.count("|"):
60-
mention = composite_mentions
61-
query = pmid + "||"+start +"|" + end + "||" + _class + "||" + mention + "||" + cui
62-
if _class=="Chemical":
63-
chemical_queries.append(query)
64-
elif _class=="Disease":
65-
disease_queries.append(query)
66-
elif len(disease_queries) or len(chemical_queries):
67-
if pmid in pmids:
68-
print(pmid)
69-
disease_queries = []
70-
chemical_queries = []
71-
title = ""
72-
abstract = ""
73-
continue
74-
context = title + "\n\n" + abstract + "\n"
75-
76-
77-
# disease
78-
disease_concept = "\n".join(disease_queries) + "\n"
79-
output_context_file = os.path.join(disease_output_dir, "{}.txt".format(pmid))
80-
output_concept_file = os.path.join(disease_output_dir, "{}.concept".format(pmid))
81-
with open(output_context_file, 'w') as f:
82-
f.write(context)
83-
with open(output_concept_file, 'w') as f:
84-
f.write(disease_concept)
85-
86-
# chemical
87-
chemical_concept = "\n".join(chemical_queries) + "\n"
88-
output_context_file = os.path.join(chemical_output_dir, "{}.txt".format(pmid))
89-
output_concept_file = os.path.join(chemical_output_dir, "{}.concept".format(pmid))
90-
with open(output_context_file, 'w') as f:
91-
f.write(context)
92-
with open(output_concept_file, 'w') as f:
93-
f.write(chemical_concept)
94-
95-
num_docs +=1
96-
num_chemical_queries += len(chemical_queries)
97-
num_disease_queries += len(disease_queries)
98-
pmids.append(pmid)
99-
disease_queries = []
100-
chemical_queries = []
101-
title = ""
102-
abstract = ""
103-
# pdb.set_trace()
104-
105-
print("{} {} {}".format(disease_output_dir, num_docs,num_disease_queries))
106-
print("{} {} {}".format(chemical_output_dir, num_docs,num_chemical_queries))
8+
import argparse
9+
from tqdm import tqdm
10710

10811
def main(args):
10912
input_file = args.input_file
11013
output_dir = args.output_dir
14+
_type = args.type
11115

11216
# create directory if it doesn't exist
11317
if not os.path.exists(output_dir):
@@ -116,13 +20,13 @@ def main(args):
11620
# read lines from raw file
11721
with open(input_file, 'r') as f:
11822
lines = f.readlines()
119-
23+
12024
queries = []
12125
pmids = []
12226
lines = lines + ['\n']
12327
num_docs = 0
12428
num_queries = 0
125-
for line in lines:
29+
for line in tqdm(lines):
12630
line = line.strip()
12731
if '|t|' in line:
12832
title = line.split("|")[2]
@@ -132,25 +36,33 @@ def main(args):
13236
line = line.split("\t")
13337
if len(line) == 6:
13438
pmid, start, end, mention, _class, cui = line
135-
else:
136-
raise NotImplementedError()
39+
elif len(line) == 4: # CID
40+
continue
41+
elif len(line) == 7: # Composite mention
42+
pmid, start, end, mention, _class, cui, composite_mentions = line
43+
if composite_mentions.count("|") == cui.count("|"):
44+
mention = composite_mentions
13745
query = pmid + "||"+start +"|" + end + "||" + _class + "||" + mention + "||" + cui
138-
queries.append(query)
46+
if _class.lower()==_type.lower():
47+
queries.append(query)
13948
elif len(queries):
49+
14050
if pmid in pmids:
14151
print(pmid)
14252
queries = []
14353
title = ""
14454
abstract = ""
14555
continue
14656
context = title + "\n\n" + abstract + "\n"
57+
14758
concept = "\n".join(queries) + "\n"
14859
output_context_file = os.path.join(output_dir, "{}.txt".format(pmid))
14960
output_concept_file = os.path.join(output_dir, "{}.concept".format(pmid))
15061
with open(output_context_file, 'w') as f:
15162
f.write(context)
15263
with open(output_concept_file, 'w') as f:
15364
f.write(concept)
65+
15466
num_docs +=1
15567
num_queries += len(queries)
15668
pmids.append(pmid)
@@ -163,11 +75,12 @@ def main(args):
16375
if __name__ == '__main__':
16476
parser = argparse.ArgumentParser()
16577
parser.add_argument('--input_file', type=str,
166-
default="./raw/ncbi-disease/NCBItrainset_corpus.txt",
78+
default="./raw/bc5cdr/CDR_TrainingSet.PubTator.txt",
16779
help='path of input file')
16880
parser.add_argument('--output_dir', type=str,
169-
default="./ncbi-disease/train",
81+
default="./bc5cdr-disease/train",
17082
help='path of output directionary')
83+
parser.add_argument('--type', type=str, choices=["chemical", "disease"])
17184

17285
args = parser.parse_args()
17386

0 commit comments

Comments
 (0)