-
Notifications
You must be signed in to change notification settings - Fork 1
/
main_preprocess.py
72 lines (61 loc) · 2.22 KB
/
main_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- coding: utf-8 -*-
import research_toolbox.tb_io as tb_io
import research_toolbox.tb_filesystem as tb_fs
def read_supertagging_auto_file(filepath):
    """Parse one CCG Bank AUTO file into a list of supertagging examples.

    The file is read with ``tb_io.read_textfile`` and consumed as consecutive
    pairs of lines: a header line followed by a derivation line.

    * The header must consist of exactly three space-separated ``KEY=VALUE``
      fields; their values become ``example_id``, ``parser_id`` and
      ``num_parses`` (the last one converted to ``int``).
    * Every leaf of the form ``<L f1 f2 f3 f4 f5>)`` on the derivation line
      contributes one token: ``f1`` is stored as the supertag, ``f3`` as the
      POS tag and ``f4`` as the word. ``f2`` and ``f5`` are ignored.

    Args:
        filepath: Path of the file to read. Assumes ``tb_io.read_textfile``
            returns an indexable sequence of line strings -- TODO confirm
            whether it strips newlines / blank lines.

    Returns:
        A list with one dict per header/derivation pair, with the keys
        ``"example_id"``, ``"parser_id"``, ``"num_parses"``, ``"words"``,
        ``"postags"`` and ``"supertags"``. The three list values are aligned
        token by token.

    Raises:
        ValueError: If a leaf is not terminated by ``">)"``, if a leaf does
            not have exactly five space-separated fields, or if a header does
            not have exactly three space-separated fields.
    """
    lines = tb_io.read_textfile(filepath)
    # Floor division keeps the count an integer on both Python 2 and 3; an
    # unpaired trailing line is ignored.
    num_examples = len(lines) // 2

    examples = []
    for i in range(num_examples):
        header = lines[2 * i]
        derivation = lines[2 * i + 1]

        words = []
        postags = []
        supertags = []
        # Walk the derivation with a cursor instead of re-slicing the string
        # after every leaf.
        cursor = 0
        while True:
            start_idx = derivation.find("<L ", cursor)
            if start_idx == -1:
                break
            # Look for the terminator only after the leaf start so that an
            # earlier ">)" can never be paired with this leaf.
            close_idx = derivation.find(">)", start_idx)
            if close_idx == -1:
                raise ValueError(
                    "Unterminated leaf in %s (example %d)" % (filepath, i))
            # Everything between "<L " and ">" -- all five fields intact.
            node = derivation[start_idx + 3:close_idx]
            super_t, _, pos_t, tk, _ = node.split(' ')
            postags.append(pos_t)
            supertags.append(super_t)
            words.append(tk)
            # Resume right after the ">)" that closed this leaf.
            cursor = close_idx + 2

        example_id, parser_id, num_parses = header.split(' ')
        examples.append({
            "example_id": example_id.split("=")[1],
            "parser_id": parser_id.split("=")[1],
            "num_parses": int(num_parses.split("=")[1]),
            "words": words,
            "postags": postags,
            "supertags": supertags
        })
    return examples
if __name__ == "__main__":
    # Path to CCG Bank AUTO folder.
    folderpath = "data/ccgbank_1_1/data/AUTO/"
    filepath_lst = tb_fs.list_files(folderpath, recursive=True)

    # Parse every file found under the AUTO folder into one flat list.
    examples = []
    for fpath in filepath_lst:
        examples.extend(read_supertagging_auto_file(fpath))

    # Split by section number: the two characters that follow the first
    # len("wsj_") characters of the example id are read as an integer.
    # Sections 2-21 go to train, 0 to dev and 23 to test; examples from any
    # other section are left out of all three splits.
    train_examples = []
    dev_examples = []
    test_examples = []
    idx = len("wsj_")
    for e in examples:
        section_id = int(e["example_id"][idx:idx + 2])
        if 2 <= section_id <= 21:
            train_examples.append(e)
        elif section_id == 0:
            dev_examples.append(e)
        elif section_id == 23:
            test_examples.append(e)

    # %-formatting prints the same "train dev test" line on Python 2 and 3
    # (a bare ``print a, b, c`` statement is a syntax error on Python 3).
    print("%d %d %d" % (
        len(train_examples), len(dev_examples), len(test_examples)))

    # Paths for the output files
    tb_fs.create_folder("data/supertagging", abort_if_exists=False)
    tb_io.write_jsonlogfile("data/supertagging/train.jsonl", train_examples)
    tb_io.write_jsonlogfile("data/supertagging/dev.jsonl", dev_examples)
    tb_io.write_jsonlogfile("data/supertagging/test.jsonl", test_examples)