forked from RobMcH/CYK-Parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GrammarConverter.py
87 lines (75 loc) · 3.06 KB
/
GrammarConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Global dictionary used for storing the rules.
RULE_DICT = {}
def read_grammar(grammar_file):
"""
Reads in the given grammar file and splits it into separate lists for each rule.
:param grammar_file: the grammar file to read in.
:return: the list of rules.
"""
with open(grammar_file) as cfg:
lines = cfg.readlines()
return [x.replace("->", "").split() for x in lines]
def add_rule(rule):
"""
Adds a rule to the dictionary of lists of rules.
:param rule: the rule to add to the dict.
"""
global RULE_DICT
if rule[0] not in RULE_DICT:
RULE_DICT[rule[0]] = []
RULE_DICT[rule[0]].append(rule[1:])
def convert_grammar(grammar):
"""
Converts a context-free grammar in the form of
S -> NP VP
NP -> Det ADV N
and so on into a chomsky normal form of that grammar. After the conversion rules have either
exactly one terminal symbol or exactly two non terminal symbols on its right hand side.
Therefore some new non terminal symbols might be created. These non terminal symbols are
named like the symbol they replaced with an appended index.
:param grammar: the CFG.
:return: the given grammar converted into CNF.
"""
# Remove all the productions of the type A -> X B C or A -> B a.
global RULE_DICT
unit_productions, result = [], []
res_append = result.append
index = 0
for rule in grammar:
new_rules = []
if len(rule) == 2 and rule[1][0] != "'":
# Rule is in form A -> X, so back it up for later and continue with the next rule.
unit_productions.append(rule)
add_rule(rule)
continue
elif len(rule) > 2:
# Rule is in form A -> X B C or A -> X a.
terminals = [(item, i) for i, item in enumerate(rule) if item[0] == "'"]
if terminals:
for item in terminals:
# Create a new non terminal symbol and replace the terminal symbol with it.
# The non terminal symbol derives the replaced terminal symbol.
rule[item[1]] = f"{rule[0]}{str(index)}"
new_rules += [f"{rule[0]}{str(index)}", item[0]]
index += 1
while len(rule) > 3:
new_rules += [f"{rule[0]}{str(index)}", rule[1], rule[2]]
rule = [rule[0]] + [f"{rule[0]}{str(index)}"] + rule[3:]
index += 1
# Adds the modified or unmodified (in case of A -> x i.e.) rules.
add_rule(rule)
res_append(rule)
if new_rules:
res_append(new_rules)
# Handle the unit productions (A -> X)
while unit_productions:
rule = unit_productions.pop()
if rule[1] in RULE_DICT:
for item in RULE_DICT[rule[1]]:
new_rule = [rule[0]] + item
if len(new_rule) > 2 or new_rule[1][0] == "'":
res_append(new_rule)
else:
unit_productions.append(new_rule)
add_rule(new_rule)
return result