diff --git a/doc/doc.md b/doc/doc.md index 1d23583a..1351f314 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -466,7 +466,7 @@ empty string as right hand side. ##### CFG Removal of Left Recursion The algorithm removes both direct and indirect left recursion. For direct -recursion it treats every rule where the first symbol on the right hand side is +recursion, it treats every rule where the first symbol on the right hand side is the same as the one on the left hand side except if it is a unary production. For each nonterminal that is the left hand side of a left recursive rule it adds a new nonterminal appended to the rules without left recursion. For @@ -477,8 +477,29 @@ recursive rule a new like `S1 -> a S1` is added. The former left recursive rules are removed. If a nonterminal is associated with only left recursive rules, an error message is printed and no conversion is performed. -For removing indirect left recursion epsilon productions have to be removed -first. The algorithm assumes an order between the nonterminals. For each +For removing indirect left recursion, epsilon productions and loops have to be +removed first. The approach following Moore introduces new nonterminals and +treats every rule according to these rules: +1. If in a rule A -> X beta the A is left-recursive, but X is not or X is +a terminal, a new rule A -> X A:X is added to the grammar. +2. If for a rule A -> B gamma there is a rule B -> X beta where both A and B +are left-recursive, a new rule A:X -> beta A:B is added to the grammar. +3. If for a rule A -> X beta the A is left-recursive, add A:X -> beta to the new +grammar. Note that the left-recursive property for X is not specified, thus a rule +already treated by 1 or 2 also applies to rule 3. +4. If in a rule A -> beta the A is not left-recursive, the rule is copied to the new grammar. + +Nonterminals from the original grammar are kept with new nonterminals added. 
+ +Note that Moore uses an underscore in new nonterminal names but in this +implementation the underscore is a special meaning character that adds +subscripts to tree nodes and gives them special properties, therefore the +underscore is forbidden as letter in a grammar symbol. The colon has no special +meaning for the grammar parser and is not overloaded with any special meaning. + +The previous implementation following Paull's algorithm still exists in the +code base but is not invoked via command line. +The algorithm assumes an order between the nonterminals. For each nonterminal, that is the lhs of a rule whose first rhs symbol is a previous nonterminal, it is replaced by all rhs of rules with those nonterminal as lhs. For example a grammar has the rules `S -> A a, S -> b, A -> S a` where the diff --git a/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/Cfg.java b/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/Cfg.java index 6c159bce..4c8417dc 100644 --- a/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/Cfg.java +++ b/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/Cfg.java @@ -247,7 +247,7 @@ Cfg getCfgWithoutDirectLeftRecursion() { * loops have to be removed first. */ public Cfg getCfgWithoutLeftRecursion() throws ParseException { - return LeftRecursion.removeLeftRecursionPaull(this); + return LeftRecursion.removeLeftRecursionMoore(this); } /** @@ -263,7 +263,10 @@ public boolean hasMixedRhs() { * its set of rules. 
*/ public void addProductionRule(String rule) throws ParseException { - this.productionRules.add(new CfgProductionRule(rule)); + CfgProductionRule newRule = new CfgProductionRule(rule); + if (!this.productionRules.contains(newRule)) { + this.productionRules.add(new CfgProductionRule(rule)); + } } /** diff --git a/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/util/LeftRecursion.java b/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/util/LeftRecursion.java index ae0503dc..b4e2d792 100644 --- a/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/util/LeftRecursion.java +++ b/src/main/java/com/github/samyadaleh/cltoolbox/common/cfg/util/LeftRecursion.java @@ -13,7 +13,7 @@ public class LeftRecursion { - public static final String DELIMITER = "-"; + public static final String DELIMITER = ":"; /** * Returns true if CFG has one rule with direct left recursion, of the form A @@ -153,13 +153,13 @@ public static Cfg removeLeftRecursionPaull(Cfg cfgOld) throws ParseException { * Removes any kind of left recursion including direct and indirect one, * but epsilon productions and loops have to be removed first. * Example: S -> A a, S -> B b, A -> S a, B -> b - * 1. If the LC is terminal or a non-recursive terminal add A -> X A-X - * For S -> B b add S -> B S-B. For B -> b add B -> b B-b. + * 1. If the LC is terminal or a non-recursive terminal add A -> X A:X + * For S -> B b add S -> B S:B. For B -> b add B -> b B:b. * 2. If LC is recursive and has more productions, add a new rule for each: - * For A add S-A -> a S-A. For S add A-S -> a A-S. + * For A add S:A -> a S:A. For S add A:S -> a A:S. * 3. For any rule I guess. - * For S -> A a add S-A -> a. For S -> B b add S-B -> b. - * For A -> S a add A-S -> a. For B -> b add B-b -> ε + * For S -> A a add S:A -> a. For S -> B b add S:B -> b. + * For A -> S a add A:S -> a. For B -> b add B:b -> ε * 4. For non-recursive nonterminals. Copy B -> b to new grammar. 
*/ public static Cfg removeLeftRecursionMoore(Cfg cfgOld) throws ParseException { @@ -170,31 +170,39 @@ public static Cfg removeLeftRecursionMoore(Cfg cfgOld) throws ParseException { Collections.addAll(newNts, cfgOld.getNonterminals()); for (CfgProductionRule rule : cfgOld.getProductionRules()) { String lhs = rule.getLhs(); - String lc = rule.getRhs()[0]; - if (nonterminalIsLhsOfLeftRecursion(cfgOld, lc)) { - // Rule 2 - for (CfgProductionRule rule2 : cfgOld.getProductionRules()) { - if (rule2.getLhs().equals(lc)) { - String lc2 = rule2.getRhs()[0]; - String rhs2Rest = ArrayUtils.getSubSequenceAsString( - rule2.getRhs(), 1, rule2.getRhs().length); - String newNt = lhs + DELIMITER + lc2; - cfg.addProductionRule(newNt + " " + ARROW_RIGHT + " " + rhs2Rest); + if (nonterminalIsLhsOfLeftRecursion(cfgOld, lhs)) { + String lc = rule.getRhs()[0]; + if (nonterminalIsLhsOfLeftRecursion(cfgOld, lc)) { + // Rule 2 + for (CfgProductionRule rule2 : cfgOld.getProductionRules()) { + if (rule2.getLhs().equals(lc)) { + String lc2 = rule2.getRhs()[0]; + String rhs2Rest = ArrayUtils.getSubSequenceAsString( + rule2.getRhs(), 1, rule2.getRhs().length); + String newNt = lhs + DELIMITER + lc2; + if (!newNts.contains(newNt)) { + newNts.add(newNt); + } + cfg.addProductionRule(newNt + " " + ARROW_RIGHT + " " + rhs2Rest); + } + } + } else { + // Rule 1 + String newNt = lhs + DELIMITER + lc; + if (!newNts.contains(newNt)) { + newNts.add(newNt); } + cfg.addProductionRule(lhs + " " + ARROW_RIGHT + " " + lc + " " + newNt); } // Rule 3 - if (nonterminalIsLhsOfLeftRecursion(cfgOld, lhs)) { - String newNt = lhs + DELIMITER + lc; + String newNt = lhs + DELIMITER + lc; + if (!newNts.contains(newNt)) { newNts.add(newNt); - String rhsRest = ArrayUtils.getSubSequenceAsString( - rule.getRhs(), 1, rule.getRhs().length); - cfg.addProductionRule(newNt + " " + ARROW_RIGHT + " " + rhsRest); } + String rhsRest = ArrayUtils.getSubSequenceAsString( + rule.getRhs(), 1, rule.getRhs().length); + 
cfg.addProductionRule(newNt + " " + ARROW_RIGHT + " " + rhsRest); } else { - // Rule 1 - String newNt = lhs + DELIMITER + lc; - newNts.add(newNt); - cfg.addProductionRule(lhs + " " + ARROW_RIGHT + " " + lc + " " + newNt); // Rule 4 cfg.addProductionRule(rule.toString()); } diff --git a/src/test/java/com/github/samyadaleh/cltoolbox/common/cfg/CfgTest.java b/src/test/java/com/github/samyadaleh/cltoolbox/common/cfg/CfgTest.java index 50883d8b..ae650acd 100644 --- a/src/test/java/com/github/samyadaleh/cltoolbox/common/cfg/CfgTest.java +++ b/src/test/java/com/github/samyadaleh/cltoolbox/common/cfg/CfgTest.java @@ -6,6 +6,7 @@ import com.github.samyadaleh.cltoolbox.chartparsing.converter.cfg.CfgToTopDownRulesConverter; import com.github.samyadaleh.cltoolbox.cli.GrammarToGrammarConverter; import com.github.samyadaleh.cltoolbox.common.GrammarLoader; +import com.github.samyadaleh.cltoolbox.common.cfg.util.LeftRecursion; import com.github.samyadaleh.cltoolbox.common.finiteautomata.NondeterministicFiniteAutomaton; import com.github.samyadaleh.cltoolbox.common.parser.CfgParser; import org.junit.Ignore; @@ -158,10 +159,10 @@ public void testRemoveDirectLeftRecursion() } @Test - public void testRemoveDirectLeftRecursion2() + public void testRemoveDirectLeftRecursion2Paull() throws ParseException, FileNotFoundException { Cfg cfg = GrammarLoader.readCfg("directleftrecursion.cfg"); - Cfg cfgwlr = cfg.getCfgWithoutLeftRecursion(); + Cfg cfgwlr = LeftRecursion.removeLeftRecursionPaull(cfg); assertEquals( "G = \n" + "N = {S, S1}\n" + "T = {a, b, c, d}\n" + "S = S\n" + "P = {S -> c, S -> d, S1 -> ε, S -> d S1, S -> c S1, S1 -> b S1, S1 -> a S1}\n", @@ -169,36 +170,80 @@ public void testRemoveDirectLeftRecursion2() } @Test - public void testRemoveIndirectLeftRecursion() + public void testRemoveDirectLeftRecursion2Moore() + throws ParseException, FileNotFoundException { + Cfg cfg = GrammarLoader.readCfg("directleftrecursion.cfg"); + Cfg cfgwlr = 
LeftRecursion.removeLeftRecursionMoore(cfg); + assertEquals( + "G = \n" + + "N = {S, S:S, S:c, S:d}\n" + + "T = {a, b, c, d}\n" + + "S = S\n" + + "P = {S:S -> ε, S:S -> a, S:S -> b, S:c -> ε, S:d -> ε, S -> c S:c, S -> d S:d}\n", + cfgwlr.toString()); + } + + @Test + public void testRemoveIndirectLeftRecursionPaull() + throws ParseException, FileNotFoundException { + Cfg cfg = GrammarLoader.readCfg("indirectleftrecursion.cfg"); + Cfg cfgwlr = LeftRecursion.removeLeftRecursionPaull(cfg + .getCfgWithoutEmptyProductions().getCfgWithoutNonGeneratingSymbols() + .getCfgWithoutNonReachableSymbols()); + assertEquals("G = \n" + + "N = {S, A, A1}\n" + + "T = {a, b}\n" + + "S = S\n" + + "P = {S -> A a, S -> b, A -> b a, A1 -> ε, A -> b a A1, A1 -> a a A1}\n" + + "", cfgwlr.toString()); + } + + @Test + public void testRemoveIndirectLeftRecursionMoore() throws ParseException, FileNotFoundException { Cfg cfg = GrammarLoader.readCfg("indirectleftrecursion.cfg"); Cfg cfgwlr = cfg .getCfgWithoutEmptyProductions().getCfgWithoutNonGeneratingSymbols() .getCfgWithoutNonReachableSymbols().getCfgWithoutLeftRecursion(); assertEquals("G = \n" + - "N = {S, A, A1}\n" + - "T = {a, b}\n" + - "S = S\n" + - "P = {S -> A a, S -> b, A -> b a, A1 -> ε, A -> b a A1, A1 -> a a A1}\n" + - "", cfgwlr.toString()); + "N = {S, A, S:S, S:A, A:A, A:b, A:S, S:b}\n" + + "T = {a, b}\n" + + "S = S\n" + + "P = {S:S -> a, S:A -> a, A:A -> a, A:b -> ε, A:S -> a, S -> b S:b, S:b -> ε}\n", + cfgwlr.toString()); } @Test - public void testRemoveLeftRecursionNoTermination() + public void testRemoveLeftRecursionNoTerminationPaull() throws ParseException, FileNotFoundException { Cfg cfg = GrammarLoader.readCfg("leftrecursionnotermination.cfg"); - Cfg cfgwlr = cfg.getCfgWithoutLeftRecursion(); + Cfg cfgwlr = LeftRecursion.removeLeftRecursionPaull(cfg); assertNull(cfgwlr); } @Test - public void testCrazyLeftRecursionRemoval() + public void testCrazyLeftRecursionRemovalPaull() throws ParseException, FileNotFoundException { 
Cfg cfg = GrammarLoader.readCfg("crazyleftrecursionremoval.cfg"); - Cfg cfgwlr = cfg.getCfgWithoutLeftRecursion(); - assertEquals("G = \n" + "N = {S1, N2, S, N1, S2, N11}\n" - + "T = {t0}\n" + "S = S1\n" - + "P = {N2 -> t0, N1 -> t0, S -> N1 S N1, S -> N1 S, S -> N1 N1, S -> N1, S1 -> S, S1 -> ε, S -> N1 S, S -> N1, S -> N1, S -> t0, S2 -> ε, S -> t0 S2, S -> N1, S -> N1 S2, S -> N1, S -> N1 S2, S -> N1 S, S -> N1 S S2, S2 -> N1 S2, S -> N1 S N1 S2, N1 -> N1 S, N1 -> N1, N1 -> N1 S, N1 -> N1, N1 -> N1, N1 -> t0, N1 -> t0 S2, N1 -> N1, N1 -> N1 S2, N1 -> t0, N1 -> t0 N11, N11 -> S N11, N11 -> N1 N11, N11 -> S N11, N11 -> S N1 N11}\n", + Cfg cfgwlr = LeftRecursion.removeLeftRecursionPaull(cfg); + assertEquals("G = \n" + + "N = {S1, N2, S, N1, S2, N11}\n" + + "T = {t0}\n" + + "S = S1\n" + + "P = {N2 -> t0, N1 -> t0, S -> N1 S N1, S -> N1 S, S -> N1 N1, S -> N1, S1 -> S, S1 -> ε, S -> t0, S2 -> ε, S -> t0 S2, S2 -> N1 S2, S -> N1 S2, S -> N1 N1 S2, S -> N1 S S2, S -> N1 S N1 S2, N1 -> t0, N1 -> t0 S2, N11 -> ε, N11 -> S N1 S2 N11, N11 -> S S2 N11, N11 -> N1 S2 N11, N11 -> S2 N11, N1 -> t0 S2 N11, N1 -> t0, N1 -> t0 N11, N11 -> N1 N11, N11 -> S N11, N11 -> S N1 N11}\n", + cfgwlr.toString()); + } + + @Test + public void testCrazyLeftRecursionRemovalMoore() + throws ParseException, FileNotFoundException { + Cfg cfg = GrammarLoader.readCfg("crazyleftrecursionremoval.cfg"); + Cfg cfgwlr = LeftRecursion.removeLeftRecursionMoore(cfg); + assertEquals("G = \n" + + "N = {S1, N2, S, N1, S:S, S:N2, S:N1, N1:S, N1:N2, N1:N1, N1:t0, S:t0}\n" + + "T = {t0}\n" + + "S = S1\n" + + "P = {S:S -> ε, S:N2 -> ε, S:N1 -> S N1, S:N1 -> S, S:N1 -> N1, S:N1 -> ε, S:S -> N1, N1:S -> ε, N1:N2 -> ε, N1:N1 -> S N1, N1:N1 -> S, N1:N1 -> N1, N1:N1 -> ε, N1:S -> N1, N1:t0 -> ε, N2 -> t0, S -> N2 S:N2, N1 -> t0 N1:t0, S:t0 -> ε, S1 -> S, S1 -> ε}\n", cfgwlr.toString()); } @@ -209,8 +254,11 @@ public void testLeftRecursionNotRemoved() Cfg cfgwlr = GrammarToGrammarConverter.checkAndMayConvertToCfg( cfg, 
"cfg-topdown", true); assertFalse(cfgwlr.hasLeftRecursion()); - assertEquals("G = \n" + "N = {N2, N21}\n" + "T = {t0}\n" - + "S = N2\n" + "P = {N2 -> t0, N21 -> ε, N21 -> N2 N21, N2 -> t0 N21}\n", + assertEquals("G = \n" + + "N = {N2, N2:t0, N2:N2}\n" + + "T = {t0}\n" + + "S = N2\n" + + "P = {N2 -> t0 N2:t0, N2:t0 -> ε, N2:N2 -> N2}\n", cfgwlr.toString()); } diff --git a/src/test/java/com/github/samyadaleh/cltoolbox/common/lcfrs/util/UselessRulesTest.java b/src/test/java/com/github/samyadaleh/cltoolbox/common/lcfrs/util/UselessRulesTest.java index c2ac53d2..cc013f6c 100644 --- a/src/test/java/com/github/samyadaleh/cltoolbox/common/lcfrs/util/UselessRulesTest.java +++ b/src/test/java/com/github/samyadaleh/cltoolbox/common/lcfrs/util/UselessRulesTest.java @@ -21,7 +21,7 @@ public class UselessRulesTest { Srcg srcg = new Srcg(cfg).getSrcgWithoutUselessRules(); assertEquals("G = \n" + "N = {N2, N3}\n" + "T = {t0, t1}\n" + "V = {X1}\n" + "P = {N2(t0 X1 t1) -> N3(X1), N2(ε) -> ε, " - + "N3(t1) -> ε, N2(ε) -> ε, N3(ε) -> ε}\n" + "S = N2\n", + + "N3(t1) -> ε, N3(ε) -> ε}\n" + "S = N2\n", srcg.toString()); } }