-
Notifications
You must be signed in to change notification settings - Fork 2
/
run.py
160 lines (154 loc) · 5.51 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/python
import pickle
import argparse
import gc
import pprint
import pickle
import grammar
import cyk
import util
def build_arg_parser():
    """Create the command-line parser for this script.

    The option help texts live in a local table rather than in a module-level
    name called ``help``, which would shadow the ``help`` builtin.

    Returns:
        An ``argparse.ArgumentParser`` that knows every option of the script.
        Flags declared with ``nargs='?'`` and ``const=True`` (``-m``, ``-cyk``,
        ``-a``) yield ``True`` when given bare and the supplied string when
        given with a value; ``-s`` given bare yields ``'grammar'``.
    """
    option_help = {
        't': 'Generate a grammar from the passed treebank file.',
        'p': 'Load a grammar from the passed Python Pickle file.',
        'gl': 'Limit the grammar to including the passed number of sentences.',
        's': 'Save the generated grammar to the passed file.',
        'c': 'Check for consistency and print 4 syntactically ambiguous terminals.',
        'v': 'Print verbose output.',
        'vv': 'Print very verbose output.',
        'm': 'Print the most likely productions for the passed non-terminals.',
        'cyk': 'Run the CYK parser on given test corpus or from std input.',
        'ps': 'Limit the parsed test sentences to starting at passed line number.',
        'pl': 'Limit the parsed test sentences to including the passed number of sentences.',
        'a': 'Display ambiguous words.',
        'l': 'Lower case words in building grammar and parsing sentences.',
        'test': "Build output test sentences. If passed we don't build covering.",
    }

    parser = argparse.ArgumentParser()
    # Grammar source and persistence.
    parser.add_argument('-t', '--treebank', help=option_help['t'])
    parser.add_argument('-p', '--pickle', help=option_help['p'])
    parser.add_argument('-gl', '--grammar-limit', type=int, help=option_help['gl'])
    parser.add_argument('-s', '--save', nargs='?', const='grammar', help=option_help['s'])
    # Diagnostics and verbosity; -v stores 1 and -vv stores 2 in separate fields.
    parser.add_argument('-c', '--check', action='store_true', help=option_help['c'])
    parser.add_argument('-v', '--verbose', action='store_const', default=0, const=1,
                        help=option_help['v'])
    parser.add_argument('-vv', '--very-verbose', action='store_const', default=0, const=2,
                        help=option_help['vv'])
    parser.add_argument('-m', '--most-likely-productions', nargs='?', const=True,
                        help=option_help['m'])
    # Parsing.
    parser.add_argument('-cyk', '--cyk', nargs='?', const=True, help=option_help['cyk'])
    parser.add_argument('-ps', '--parser-test-start', type=int, help=option_help['ps'])
    parser.add_argument('-pl', '--parser-test-limit', type=int, help=option_help['pl'])
    parser.add_argument('-a', '--ambiguous', nargs='?', const=True, help=option_help['a'])
    parser.add_argument('-l', '--lower-case', action='store_true', help=option_help['l'])
    parser.add_argument('-test', '--test', action='store_true', help=option_help['test'])
    return parser


if __name__ == '__main__':
    # ``args`` stays a module-level name because main() reads it as a global.
    parser = build_arg_parser()
    args = parser.parse_args()
def _load_grammar(pickle_path, treebank_path, grammar_limit, verbose, lower_case):
    """Return a grammar unpickled from pickle_path or built from treebank_path.

    The pickle file wins when both are given. Exits with a message when
    neither source is supplied instead of failing inside ``open(None)``.
    """
    if pickle_path:
        if verbose:
            util.log_g("Loading grammar from pickle file %s" % (pickle_path))
        # SECURITY: unpickling can execute arbitrary code; only load pickle
        # files that come from a trusted source.
        with open(pickle_path, 'rb') as pkl_file:
            return pickle.load(pkl_file)

    if not treebank_path:
        raise SystemExit("No grammar source given: pass --pickle or --treebank.")
    if verbose:
        util.log_g("Loading grammar from treebank %s" % (treebank_path))
    with open(treebank_path, 'r') as treebank:
        return grammar.Grammar(treebank, grammar_limit, verbose, lower_case)


def _report_statistics(G, check, ambiguous, mlps):
    """Log the consistency, ambiguity and most-likely-production reports."""
    if check:
        util.log_g("Testing probability consistencies.")
        divergence = max(abs(1 - total) for total in G.check_pcfg_sums())
        util.log_g("Greatest divergence from unity: %0.20f." % divergence)

    if check or ambiguous:
        util.log_g("Ambiguous word tests.")
        ambig = G.ambiguous()
        # First element of every entry is the word itself; a comprehension
        # (unlike subscripting zip()) works on both Python 2 and 3.
        ambig_words = [entry[0] for entry in ambig]
        if ambiguous and not isinstance(ambiguous, bool):
            # -a was given an explicit, whitespace-separated list of words.
            for word in ambiguous.split():
                try:
                    position = ambig_words.index(word)
                except ValueError:
                    util.log_g("'%s' is not ambiguous." % (word))
                else:
                    util.log_g("'%s' is ambiguous." % (word))
                    pprint.pprint(ambig[position])
        else:
            util.log_g("4 randomly chosen syntactically ambiguous terminals:")
            pprint.pprint(ambig[0:4])

    if check or mlps:
        # A bare -m (True) or no -m at all falls back to the default symbols.
        if mlps and not isinstance(mlps, bool):
            non_terms_for_ml = mlps.split()
        else:
            non_terms_for_ml = ['VP', 'S', 'NP', 'SBAR', 'PP']
        util.log_g("Most likely production for non-terminals %s:" % non_terms_for_ml)
        pprint.pprint(G.most_likely_productions(non_terms_for_ml))


def _parse_interactively(G, verbose, lower_case):
    """Prompt for sentences and log their parses until an empty line or EOF."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    util.log_p("Enter new line to exit.")
    while True:
        try:
            s = read_line('Enter a sentence to parse: ')
        except EOFError:
            # Piped or closed stdin: treat end of input like an empty line.
            break
        if not s:
            break
        if verbose:
            util.log_p("Start CYK")
        parse = cyk.CYK(G, s, verbose, lower_case)
        if verbose > 1:
            util.log_p("Covering productions:")
            pprint.pprint(parse.covering_productions())
        util.log_p("Covering productions string: %s" % parse.covering_productions_str())
        util.log_p("Viterbi Parse: %s" % parse.viterbi_parse())


def _parse_corpus(G, corpus_path, limit, start, test, verbose, lower_case,
                  max_word_length=15):
    """Parse corpus_path line by line and write one output line per sentence.

    Writes Viterbi parses to ``viterbi_sentences.txt`` when test is true and
    covering productions to ``covering_productions.txt`` otherwise. Sentences
    with more than max_word_length whitespace-separated tokens are not parsed;
    an empty line is written in their place.
    """
    out_path = 'viterbi_sentences.txt' if test else 'covering_productions.txt'
    with open(corpus_path) as corpus, open(out_path, 'w') as sink:
        for line_no, line in enumerate(corpus, 1):
            # limit is compared against the absolute line number, so lines
            # skipped because of start still count towards it.
            if limit and line_no > limit:
                break
            if start and line_no < start:
                continue
            if max_word_length and len(line.split()) > max_word_length:
                sink.write("\n")
                continue

            util.log_p("Sentence %d, parsing sentence: << %s >>" % (line_no, line.strip()))
            # lower_case is forwarded so that -l also applies to corpus
            # sentences, as it already does for interactive input.
            parse = cyk.CYK(G, line, verbose, lower_case)
            if test:
                out = parse.viterbi_parse()
                # An unparseable sentence becomes an empty output line.
                out = "\n" if out == util.NOT_IN_GRAMMAR_ERROR else out + "\n"
                sink.write(out)
            else:
                out = parse.covering_productions_str()
                sink.write(out + "\n")
            if verbose:
                util.log_p("Wrote line: %s" % out)
            gc.collect()  # collect cyk object


def main():
    """Load or build a grammar, print the requested reports, then run CYK.

    Every option is read from the module-level ``args`` namespace. The
    effective verbosity is 2 for ``-vv``, 1 for ``-v`` and 0 otherwise.

    NOTE(review): this function was rebuilt from a copy of the file whose
    indentation had been lost. Three placements are best guesses and should
    be confirmed: the save step runs after either loading path; the
    "Covering productions string" log sits outside the very-verbose branch;
    and the "Wrote line" log and gc.collect() run only for parsed sentences.
    """
    verbose = args.very_verbose or args.verbose
    lower_case = args.lower_case

    G = _load_grammar(args.pickle, args.treebank, args.grammar_limit, verbose, lower_case)
    if args.save:
        with open(args.save + '.pkl', 'wb') as output:
            pickle.dump(G, output)
    if verbose:
        util.log_g("Grammar loaded.")

    _report_statistics(G, args.check, args.ambiguous, args.most_likely_productions)

    if args.cyk:
        if isinstance(args.cyk, bool):
            # Bare -cyk: read sentences from standard input.
            _parse_interactively(G, verbose, lower_case)
        else:
            _parse_corpus(G, args.cyk, args.parser_test_limit, args.parser_test_start,
                          args.test, verbose, lower_case)


if __name__ == '__main__':
    # Guarded because main() needs the ``args`` that are only parsed when
    # the file runs as a script.
    main()