-
Notifications
You must be signed in to change notification settings - Fork 15
/
Earley.java
716 lines (652 loc) · 19.4 KB
/
Earley.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/*
* Terminology
* ===========
* Consider the following context-free rule:
*
* X -> A B C | A hello
*
* We say rule 'X' has two __productions__: "A B C" and "A hello".
* Each production is made of __production terms__, which can be either
* __terminals__ (in our case, "hello") or __rules__ (non-terminals, such as "A", "B", and "C")
*
*/
public class Earley
{
/*
* an abstract notion of the elements that can be placed within production
*/
public interface ProductionTerm
{
}
/*
* Represents a terminal element in a production
*/
public static class Terminal implements ProductionTerm
{
public final String value;
public Terminal(String value) {
this.value = value;
}
@Override
public String toString()
{
return value;
}
@Override
public boolean equals(Object other)
{
if (this == other) {
return true;
}
if (other == null) {
return false;
}
if (other instanceof String) {
return value.equals((String)other);
}
else if (other instanceof Terminal) {
return value.equals(((Terminal)other).value);
}
return false;
}
}
/*
* Represents a production of the rule.
*/
public static class Production implements Iterable<ProductionTerm>
{
public final List<ProductionTerm> terms;
public final List<Rule> rules;
public Production(ProductionTerm... terms)
{
this.terms = Arrays.asList(terms);
this.rules = getRules();
}
public Production(Object... terms)
{
this.terms = new ArrayList<ProductionTerm>(terms.length);
for (Object item : terms) {
if (item instanceof String) {
this.terms.add(new Terminal((String)item));
}
else if (item instanceof ProductionTerm) {
this.terms.add((ProductionTerm)item);
}
else {
throw new IllegalArgumentException("Term must be ProductionTerm or String, not " + item);
}
}
this.rules = getRules();
}
public int size() {
return terms.size();
}
public ProductionTerm get(int index) {
return terms.get(index);
}
@Override
public Iterator<ProductionTerm> iterator() {
return terms.iterator();
}
private List<Rule> getRules()
{
ArrayList<Rule> rules = new ArrayList<Rule>();
for (ProductionTerm term : terms) {
if (term instanceof Rule) {
rules.add((Rule)term);
}
}
return rules;
}
@Override
public boolean equals(Object other)
{
if (this == other) {
return true;
}
if (other == null || other.getClass() != getClass()) {
return false;
}
return terms.equals(((Production)other).terms);
}
@Override
public String toString()
{
String s = "";
if (!terms.isEmpty()) {
for (int i = 0; i < terms.size() - 1; i++) {
ProductionTerm t = terms.get(i);
if (t instanceof Rule) {
s += ((Rule)t).name;
}
else {
s += t;
}
s += " ";
}
ProductionTerm t = terms.get(terms.size() - 1);
if (t instanceof Rule) {
s += ((Rule)t).name;
}
else {
s += t;
}
}
return s;
}
}
// Epsilon transition: an empty production
public static final Production Epsilon = new Production();
/*
* A CFG rule. Since CFG rules can be self-referential, more productions may be added
* to them after construction. For example:
*
* Grammar:
* SYM -> a
* OP -> + | -
* EXPR -> SYM | EXPR OP EXPR
*
* In Java:
* Rule SYM = new Rule("SYM", new Production("a"));
* Rule OP = new Rule("OP", new Production("+"), new Production("-"));
* Rule EXPR = new Rule("EXPR", new Production(SYM));
* EXPR.add(new Production(EXPR, OP, EXPR)); // needs to reference EXPR
*
*/
public static class Rule implements ProductionTerm, Iterable<Production>
{
public final String name;
public final ArrayList<Production> productions;
public Rule(String name, Production... productions) {
this.name = name;
this.productions = new ArrayList<Production>(Arrays.asList(productions));
}
public void add(Production... productions) {
this.productions.addAll(Arrays.asList(productions));
}
public int size() {
return productions.size();
}
public Production get(int index) {
return productions.get(index);
}
@Override
public Iterator<Production> iterator() {
return productions.iterator();
}
@Override
public boolean equals(Object other)
{
if (this == other) {
return true;
}
if (other == null || other.getClass() != getClass()) {
return false;
}
Rule other2 = (Rule)other;
return name.equals(other2.name) && productions.equals(other2.productions);
}
@Override
public String toString() {
String s = this.name + " -> ";
if (!productions.isEmpty()) {
for (int i = 0; i < productions.size() - 1; i++) {
s += productions.get(i) + " | ";
}
s += productions.get(productions.size() - 1);
}
return s;
}
}
/*
* Represents a state in the Earley parsing table. A state has a its rule's name,
* the rule's production, dot-location, and starting- and ending-column in the parsing
* table.
*/
public static class TableState
{
public final String name;
public final Production production;
public final int dotIndex;
public final TableColumn startCol;
public TableColumn endCol;
public TableState(String name, Production production, int dotIndex, TableColumn startCol)
{
this.name = name;
this.production = production;
this.dotIndex = dotIndex;
this.startCol = startCol;
endCol = null;
}
public boolean isCompleted() {
return dotIndex >= production.size();
}
public ProductionTerm getNextTerm() {
if (isCompleted()) {
return null;
}
return production.get(dotIndex);
}
@Override
public boolean equals(Object other)
{
if (other == this) {
return true;
}
if (other == null || other.getClass() != getClass()) {
return false;
}
TableState other2 = (TableState)other;
return name.equals(other2.name) && production.equals(other2.production) &&
dotIndex == other2.dotIndex && startCol == other2.startCol;
}
@Override
public String toString()
{
String s = "";
for (int i = 0; i < production.size(); i++) {
if (i == dotIndex) {
s += "\u00B7 ";
}
ProductionTerm t = production.get(i);
if (t instanceof Rule) {
s += ((Rule)t).name;
}
else {
s += t;
}
s += " ";
}
if (dotIndex == production.size()) {
s += "\u00B7";
}
return String.format("%-6s -> %-20s [%d-%d]", name, s, startCol.index, endCol.index);
}
}
/*
* Represents a column in the Earley parsing table
*/
protected static class TableColumn implements Iterable<TableState>
{
public final String token;
public final int index;
public final ArrayList<TableState> states;
public TableColumn(int index, String token) {
this.index = index;
this.token = token;
this.states = new ArrayList<TableState>();
}
/*
* only insert a state if it is not already contained in the list of states. return the
* inserted state, or the pre-existing one.
*/
public TableState insert(TableState state) {
int index = states.indexOf(state);
if (index < 0) {
states.add(state);
state.endCol = this;
return state;
}
else {
return states.get(index);
}
}
public int size() {
return states.size();
}
public TableState get(int index) {
return states.get(index);
}
/*
* since we may modify the list as we traverse it, the built-in list iterator is not
* suitable. this iterator wouldn't mind the list being changed.
*/
private class ModifiableIterator implements Iterator<TableState>
{
private int i = 0;
@Override
public boolean hasNext() {
return i < states.size();
}
@Override
public TableState next() {
TableState st = states.get(i);
i++;
return st;
}
@Override
public void remove() {
}
}
@Override
public Iterator<TableState> iterator() {
return new ModifiableIterator();
}
public void print(PrintStream out, boolean showUncompleted)
{
out.printf("[%d] '%s'\n", index, token);
out.println("=======================================");
for (TableState state : this) {
if (!state.isCompleted() && !showUncompleted) {
continue;
}
out.println(state);
}
out.println();
}
}
/*
* A generic tree node
*/
public static class Node<T> implements Iterable<Node<T>>
{
public final T value;
private List<Node<T>> children;
public Node(T value, List<Node<T>> children) {
this.value = value;
this.children = children;
}
@Override
public Iterator<Node<T>> iterator() {
return children.iterator();
}
public void print(PrintStream out)
{
print(out, 0);
}
private void print(PrintStream out, int level) {
String indentation = "";
for (int i = 0; i < level; i++) {
indentation += " ";
}
out.println(indentation + value);
for (Node<T> child : children) {
child.print(out, level + 1);
}
}
}
/*
* the exception raised by Parser should parsing fail
*/
public static class ParsingFailed extends Exception
{
private static final long serialVersionUID = -3489519608069949690L;
public ParsingFailed(String message) {
super(message);
}
}
/*
* The Earley Parser.
*
* Usage:
*
* Parser p = new Parser(StartRule, "my space-delimited statement");
* for (Node tree : p.getTrees()) {
* tree.print(System.out);
* }
*
*/
public static class Parser
{
protected TableColumn[] columns;
protected TableState finalState = null;
/*
* constructor: takes a start rule and a statement (made of space-separated words).
* it initializes the table and invokes earley's algorithm
*/
public Parser(Rule startRule, String text) throws ParsingFailed
{
String[] tokens = text.split(" ");
columns = new TableColumn[tokens.length + 1];
columns[0] = new TableColumn(0, "");
for (int i = 1; i <= tokens.length; i++) {
columns[i] = new TableColumn(i, tokens[i-1]);
}
finalState = parse(startRule);
if (finalState == null) {
throw new ParsingFailed("your grammar does not accept the given text");
}
}
// this is the name of the special "gamma" rule added by the algorithm
// (this is unicode for 'LATIN SMALL LETTER GAMMA')
private static final String GAMMA_RULE = "\u0263"; // "\u0194"
/*
* the Earley algorithm's core: add gamma rule, fill up table, and check if the gamma rule
* spans from the first column to the last one. return the final gamma state, or null,
* if the parse failed.
*/
protected TableState parse(Rule startRule)
{
columns[0].insert(new TableState(GAMMA_RULE, new Production(startRule), 0, columns[0]));
for (int i = 0; i < columns.length; i++) {
TableColumn col = columns[i];
for (TableState state : col) {
if (state.isCompleted()) {
complete(col, state);
}
else {
ProductionTerm term = state.getNextTerm();
if (term instanceof Rule) {
predict(col, (Rule)term);
}
else if (i + 1 < columns.length) {
scan(columns[i+1], state, ((Terminal)term).value);
}
}
}
handleEpsilons(col);
// DEBUG -- uncomment to print the table during parsing, column after column
//col.print(System.out, false);
}
// find end state (return null if not found)
for (TableState state : columns[columns.length - 1]) {
if (state.name.equals(GAMMA_RULE) && state.isCompleted()) {
return state;
}
}
return null;
}
/*
* Earley scan
*/
private void scan(TableColumn col, TableState state, String token) {
if (token.equals(col.token)) {
col.insert(new TableState(state.name, state.production, state.dotIndex + 1, state.startCol));
}
}
/*
* Earley predict. returns true if the table has been changed, false otherwise
*/
private boolean predict(TableColumn col, Rule rule) {
boolean changed = false;
for (Production prod : rule) {
TableState st = new TableState(rule.name, prod, 0, col);
TableState st2 = col.insert(st);
changed |= (st == st2);
}
return changed;
}
/*
* Earley complete. returns true if the table has been changed, false otherwise
*/
private boolean complete(TableColumn col, TableState state) {
boolean changed = false;
for (TableState st : state.startCol) {
ProductionTerm term = st.getNextTerm();
if (term instanceof Rule && ((Rule)term).name.equals(state.name)) {
TableState st1 = new TableState(st.name, st.production, st.dotIndex + 1, st.startCol);
TableState st2 = col.insert(st1);
changed |= (st1 == st2);
}
}
return changed;
}
/*
* call predict() and complete() for as long as the table keeps changing (may only happen
* if we've got epsilon transitions)
*/
private void handleEpsilons(TableColumn col)
{
boolean changed = true;
while (changed) {
changed = false;
for (TableState state : col) {
ProductionTerm term = state.getNextTerm();
if (term instanceof Rule) {
changed |= predict(col, (Rule)term);
}
if (state.isCompleted()) {
changed |= complete(col, state);
}
}
}
}
/*
* return all parse trees (forest). the forest is simply a list of root nodes, each
* representing a possible parse tree. a node is contains a value and the node's children,
* and supports pretty-printing
*/
public List<Node<TableState>> getTrees() {
return buildTrees(finalState);
}
/*
* this is a bit "magical" -- i wrote the code that extracts a single parse tree,
* and with some help from a colleague (non-student) we managed to make it return all
* parse trees.
*
* how it works: suppose we're trying to match [X -> Y Z W]. we go from finish-to-start,
* e.g., first we'll try to match W in X.endCol. let this matching state be M1. next we'll
* try to match Z in M1.startCol. let this matching state be M2. and finally, we'll try to
* match Y in M2.startCol, which must also start at X.startCol. let this matching state be
* M3.
*
* if we matched M1, M2 and M3, then we've found a parsing for X:
*
* X ->
* Y -> M3
* Z -> M2
* W -> M1
*
*/
private List<Node<TableState>> buildTrees(TableState state) {
return buildTreesHelper(new ArrayList<Node<TableState>>(), state,
state.production.rules.size() - 1, state.endCol);
}
private List<Node<TableState>> buildTreesHelper(List<Node<TableState>> children,
TableState state, int ruleIndex, TableColumn endCol)
{
ArrayList<Node<TableState>> outputs = new ArrayList<Node<TableState>>();
TableColumn startCol = null;
if (ruleIndex < 0) {
// this is the base-case for the recursion (we matched the entire rule)
outputs.add(new Node<TableState>(state, children));
return outputs;
}
else if (ruleIndex == 0) {
// if this is the first rule
startCol = state.startCol;
}
Rule rule = state.production.rules.get(ruleIndex);
for (TableState st : endCol) {
if (st == state) {
// this prevents an endless recursion: since the states are filled in order of
// completion, we know that X cannot depend on state Y that comes after it X
// in chronological order
break;
}
if (!st.isCompleted() || !st.name.equals(rule.name)) {
// this state is out of the question -- either not completed or does not match
// the name
continue;
}
if (startCol != null && st.startCol != startCol) {
// if startCol isn't null, this state must span from startCol to endCol
continue;
}
// okay, so `st` matches -- now we need to create a tree for every possible
// sub-match
for (Node<TableState> subTree : buildTrees(st)) {
// in python: children2 = [subTree] + children
ArrayList<Node<TableState>> children2 = new ArrayList<Node<TableState>>();
children2.add(subTree);
children2.addAll(children);
// now try all options
for (Node<TableState> node : buildTreesHelper(children2, state, ruleIndex - 1, st.startCol)) {
outputs.add(node);
}
}
}
return outputs;
}
}
public static void main(String[] args) throws Exception
{
///////////////////////////////////////////////////////////////////////////////////////
// Simple mathematics expressions
///////////////////////////////////////////////////////////////////////////////////////
Rule SYM = new Rule("SYM", new Production("a"));
Rule OP = new Rule("OP", new Production("+"));
Rule EXPR = new Rule("EXPR", new Production(SYM));
EXPR.add(new Production(EXPR, OP, EXPR));
// note that this yields the catalan numbers sequence (as expected) -- the number of ways
// to place parenthesis in the expression a + a + ... + a
// this serves as a "semi-proof of correctness" :)
System.out.println("catalan numbers:");
for (String text : new String[] {"a", "a + a", "a + a + a", "a + a + a + a", "a + a + a + a + a",
"a + a + a + a + a + a", "a + a + a + a + a + a + a"}) {
Parser p = new Parser(EXPR, text);
System.out.printf("%d, ", p.getTrees().size());
}
System.out.println("\n");
///////////////////////////////////////////////////////////////////////////////////////
// Simple rules for English
///////////////////////////////////////////////////////////////////////////////////////
Rule N = new Rule("N", new Production("time"), new Production("flight"), new Production("banana"),
new Production("flies"), new Production("boy"), new Production("telescope"));
Rule D = new Rule("D", new Production("the"), new Production("a"), new Production("an"));
Rule V = new Rule("V", new Production("book"), new Production("ate"), new Production("sleep"),
new Production("saw"), new Production("thinks"));
Rule P = new Rule("P", new Production("with"), new Production("in"), new Production("on"),
new Production("at"), new Production("through"));
Rule C = new Rule("C", new Production("that"));
Rule PP = new Rule("PP");
Rule NP = new Rule("NP", new Production(D, N), new Production("john"), new Production("bill"),
new Production("houston"));
NP.add(new Production(NP, PP));
PP.add(new Production(P, NP));
Rule VP = new Rule("VP", new Production(V, NP));
VP.add(new Production(VP, PP));
Rule S = new Rule("S", new Production(NP, VP), new Production(VP));
Rule Sbar = new Rule("S'", new Production(C, S));
VP.add(new Production(V, Sbar));
// let's parse some sentences!
for (String text : new String[] {"john ate a banana", "book the flight through houston",
"john saw the boy with the telescope", "john thinks that bill ate a banana"}) {
Parser p = new Parser(S, text);
System.out.printf("Parse trees for '%s'\n", text);
System.out.println("===================================================");
for (Node<TableState> tree : p.getTrees()) {
tree.print(System.out);
System.out.println();
}
}
// let's fail
boolean failed = false;
try {
Parser p = new Parser(S, "john ate");
}
catch (ParsingFailed ex) {
// okay
failed = true;
System.out.println("hurrah, this has failed (we don't allow VP without a complement)");
}
if (!failed) {
System.out.println("oops, this should have failed!");
}
}
}