5
5
"""
6
6
7
7
import os
8
- import pdb
9
-
10
- # input_files = [
11
- # './datasets/raw/bc5cdr/CDR_TrainingSet.PubTator.txt',
12
- # './datasets/raw/bc5cdr/CDR_DevelopmentSet.PubTator.txt',
13
- # './datasets/raw/bc5cdr/CDR_TestSet.PubTator.txt',
14
- # ]
15
-
16
- # disease_output_dirs = [
17
- # os.path.join('./datasets/bc5cdr-disease', 'train'),
18
- # os.path.join('./datasets/bc5cdr-disease', 'dev'),
19
- # os.path.join('./datasets/bc5cdr-disease', 'test'),
20
- # ]
21
-
22
- # chemical_output_dirs = [
23
- # os.path.join('./datasets/bc5cdr-chemical', 'train'),
24
- # os.path.join('./datasets/bc5cdr-chemical', 'dev'),
25
- # os.path.join('./datasets/bc5cdr-chemical', 'test'),
26
- # ]
27
-
28
- for input_file , disease_output_dir , chemical_output_dir in zip (input_files , disease_output_dirs , chemical_output_dirs ):
29
- if not os .path .exists (disease_output_dir ):
30
- os .makedirs (disease_output_dir )
31
- if not os .path .exists (chemical_output_dir ):
32
- os .makedirs (chemical_output_dir )
33
-
34
- with open (input_file , 'r' ) as f :
35
- lines = f .readlines ()
36
-
37
-
38
- disease_queries = []
39
- chemical_queries = []
40
- pmids = []
41
- lines = lines + ['\n ' ]
42
- num_docs = 0
43
- num_disease_queries = 0
44
- num_chemical_queries = 0
45
- for line in lines :
46
- line = line .strip ()
47
- if '|t|' in line :
48
- title = line .split ("|" )[2 ]
49
- elif '|a|' in line :
50
- abstract = line .split ("|" )[2 ]
51
- elif '\t ' in line :
52
- line = line .split ("\t " )
53
- if len (line ) == 6 :
54
- pmid , start , end , mention , _class , cui = line
55
- elif len (line ) == 4 : # CID
56
- continue
57
- elif len (line ) == 7 : # Composite mention
58
- pmid , start , end , mention , _class , cui , composite_mentions = line
59
- if composite_mentions .count ("|" ) == cui .count ("|" ):
60
- mention = composite_mentions
61
- query = pmid + "||" + start + "|" + end + "||" + _class + "||" + mention + "||" + cui
62
- if _class == "Chemical" :
63
- chemical_queries .append (query )
64
- elif _class == "Disease" :
65
- disease_queries .append (query )
66
- elif len (disease_queries ) or len (chemical_queries ):
67
- if pmid in pmids :
68
- print (pmid )
69
- disease_queries = []
70
- chemical_queries = []
71
- title = ""
72
- abstract = ""
73
- continue
74
- context = title + "\n \n " + abstract + "\n "
75
-
76
-
77
- # disease
78
- disease_concept = "\n " .join (disease_queries ) + "\n "
79
- output_context_file = os .path .join (disease_output_dir , "{}.txt" .format (pmid ))
80
- output_concept_file = os .path .join (disease_output_dir , "{}.concept" .format (pmid ))
81
- with open (output_context_file , 'w' ) as f :
82
- f .write (context )
83
- with open (output_concept_file , 'w' ) as f :
84
- f .write (disease_concept )
85
-
86
- # chemical
87
- chemical_concept = "\n " .join (chemical_queries ) + "\n "
88
- output_context_file = os .path .join (chemical_output_dir , "{}.txt" .format (pmid ))
89
- output_concept_file = os .path .join (chemical_output_dir , "{}.concept" .format (pmid ))
90
- with open (output_context_file , 'w' ) as f :
91
- f .write (context )
92
- with open (output_concept_file , 'w' ) as f :
93
- f .write (chemical_concept )
94
-
95
- num_docs += 1
96
- num_chemical_queries += len (chemical_queries )
97
- num_disease_queries += len (disease_queries )
98
- pmids .append (pmid )
99
- disease_queries = []
100
- chemical_queries = []
101
- title = ""
102
- abstract = ""
103
- # pdb.set_trace()
104
-
105
- print ("{} {} {}" .format (disease_output_dir , num_docs ,num_disease_queries ))
106
- print ("{} {} {}" .format (chemical_output_dir , num_docs ,num_chemical_queries ))
8
+ import argparse
9
+ from tqdm import tqdm
107
10
108
11
def main (args ):
109
12
input_file = args .input_file
110
13
output_dir = args .output_dir
14
+ _type = args .type
111
15
112
16
# create directory if it doesn't exist
113
17
if not os .path .exists (output_dir ):
@@ -116,13 +20,13 @@ def main(args):
116
20
# read lines from raw file
117
21
with open (input_file , 'r' ) as f :
118
22
lines = f .readlines ()
119
-
23
+
120
24
queries = []
121
25
pmids = []
122
26
lines = lines + ['\n ' ]
123
27
num_docs = 0
124
28
num_queries = 0
125
- for line in lines :
29
+ for line in tqdm ( lines ) :
126
30
line = line .strip ()
127
31
if '|t|' in line :
128
32
title = line .split ("|" )[2 ]
@@ -132,25 +36,33 @@ def main(args):
132
36
line = line .split ("\t " )
133
37
if len (line ) == 6 :
134
38
pmid , start , end , mention , _class , cui = line
135
- else :
136
- raise NotImplementedError ()
39
+ elif len (line ) == 4 : # CID
40
+ continue
41
+ elif len (line ) == 7 : # Composite mention
42
+ pmid , start , end , mention , _class , cui , composite_mentions = line
43
+ if composite_mentions .count ("|" ) == cui .count ("|" ):
44
+ mention = composite_mentions
137
45
query = pmid + "||" + start + "|" + end + "||" + _class + "||" + mention + "||" + cui
138
- queries .append (query )
46
+ if _class .lower ()== _type .lower ():
47
+ queries .append (query )
139
48
elif len (queries ):
49
+
140
50
if pmid in pmids :
141
51
print (pmid )
142
52
queries = []
143
53
title = ""
144
54
abstract = ""
145
55
continue
146
56
context = title + "\n \n " + abstract + "\n "
57
+
147
58
concept = "\n " .join (queries ) + "\n "
148
59
output_context_file = os .path .join (output_dir , "{}.txt" .format (pmid ))
149
60
output_concept_file = os .path .join (output_dir , "{}.concept" .format (pmid ))
150
61
with open (output_context_file , 'w' ) as f :
151
62
f .write (context )
152
63
with open (output_concept_file , 'w' ) as f :
153
64
f .write (concept )
65
+
154
66
num_docs += 1
155
67
num_queries += len (queries )
156
68
pmids .append (pmid )
@@ -163,11 +75,12 @@ def main(args):
163
75
if __name__ == '__main__' :
164
76
parser = argparse .ArgumentParser ()
165
77
parser .add_argument ('--input_file' , type = str ,
166
- default = "./raw/ncbi-disease/NCBItrainset_corpus .txt" ,
78
+ default = "./raw/bc5cdr/CDR_TrainingSet.PubTator .txt" ,
167
79
help = 'path of input file' )
168
80
parser .add_argument ('--output_dir' , type = str ,
169
- default = "./ncbi -disease/train" ,
81
+ default = "./bc5cdr -disease/train" ,
170
82
help = 'path of output directionary' )
83
+ parser .add_argument ('--type' , type = str , choices = ["chemical" , "disease" ])
171
84
172
85
args = parser .parse_args ()
173
86
0 commit comments