diff --git a/lark/lark.py b/lark/lark.py index f29d444d9..ccdcf33d8 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -66,6 +66,7 @@ class LarkOptions(Serialize): edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]] import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]' source_path: Optional[str] + syntax: Union[str, Callable[[str, str], Tree]] OPTIONS_DOC = """ **=== General Options ===** @@ -136,6 +137,9 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading + syntax + Select the syntax with which the grammar is parsed. Either a function or a registered syntax + (default guess based on extension / uses lark when that fails) **=== End of Options ===** """ if __doc__: @@ -169,6 +173,7 @@ class LarkOptions(Serialize): 'use_bytes': False, 'import_paths': [], 'source_path': None, + 'syntax': None } def __init__(self, options_dict): @@ -328,7 +333,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # Parse the grammar file and compose the grammars - self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + self.grammar, used_files = load_grammar(grammar, self.source_path, self.options) else: assert isinstance(grammar, Grammar) self.grammar = grammar diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e6604bc82..0eb34c773 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,12 +12,9 @@ from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE -from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import ParsingFrontend -from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .utils import classify, dedup_list -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput +from .exceptions import GrammarError, UnexpectedInput from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -28,7 +25,7 @@ EXT = '.lark' -_RE_FLAGS = 'imslux' +RE_FLAGS = 'imslux' _EMPTY = Symbol('__empty__') @@ -71,117 +68,6 @@ ' ' : 'SPACE', } -# Grammar Parser -TERMINALS = { - '_LPAR': r'\(', - '_RPAR': r'\)', - '_LBRA': r'\[', - '_RBRA': r'\]', - '_LBRACE': r'\{', - '_RBRACE': r'\}', - 'OP': '[+*]|[?](?![a-z])', - '_COLON': ':', - '_COMMA': ',', - '_OR': r'\|', - '_DOT': r'\.(?!\.)', - '_DOTDOT': r'\.\.', - 'TILDE': '~', - 'RULE_MODIFIERS': '(!|![?]?|[?]!?)(?=[_a-z])', - 'RULE': '_?[a-z][_a-z0-9]*', - 'TERMINAL': '_?[A-Z][_A-Z0-9]*', - 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, - '_NL': r'(\r?\n)+\s*', - '_NL_OR': r'(\r?\n)+\s*\|', - 'WS': r'[ \t]+', - 'COMMENT': r'\s*//[^\n]*', - 'BACKSLASH': r'\\[ ]*\n', - '_TO': '->', - '_IGNORE': r'%ignore', - '_OVERRIDE': r'%override', - '_DECLARE': r'%declare', - '_EXTEND': r'%extend', - '_IMPORT': r'%import', - 'NUMBER': r'[+-]?\d+', -} - -RULES = { - 'start': ['_list'], - '_list': ['_item', '_list _item'], - '_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'], - - 'rule': 
['rule_modifiers RULE template_params priority _COLON expansions _NL'], - 'rule_modifiers': ['RULE_MODIFIERS', - ''], - 'priority': ['_DOT NUMBER', - ''], - 'template_params': ['_LBRACE _template_params _RBRACE', - ''], - '_template_params': ['RULE', - '_template_params _COMMA RULE'], - 'expansions': ['_expansions'], - '_expansions': ['alias', - '_expansions _OR alias', - '_expansions _NL_OR alias'], - - '?alias': ['expansion _TO nonterminal', 'expansion'], - 'expansion': ['_expansion'], - - '_expansion': ['', '_expansion expr'], - - '?expr': ['atom', - 'atom OP', - 'atom TILDE NUMBER', - 'atom TILDE NUMBER _DOTDOT NUMBER', - ], - - '?atom': ['_LPAR expansions _RPAR', - 'maybe', - 'value'], - - 'value': ['terminal', - 'nonterminal', - 'literal', - 'range', - 'template_usage'], - - 'terminal': ['TERMINAL'], - 'nonterminal': ['RULE'], - - '?name': ['RULE', 'TERMINAL'], - '?symbol': ['terminal', 'nonterminal'], - - 'maybe': ['_LBRA expansions _RBRA'], - 'range': ['STRING _DOTDOT STRING'], - - 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], - '_template_args': ['value', - '_template_args _COMMA value'], - - 'term': ['TERMINAL _COLON expansions _NL', - 'TERMINAL _DOT NUMBER _COLON expansions _NL'], - 'override': ['_OVERRIDE rule', - '_OVERRIDE term'], - 'extend': ['_EXTEND rule', - '_EXTEND term'], - 'ignore': ['_IGNORE expansions _NL'], - 'declare': ['_DECLARE _declare_args _NL'], - 'import': ['_IMPORT _import_path _NL', - '_IMPORT _import_path _LPAR name_list _RPAR _NL', - '_IMPORT _import_path _TO name _NL'], - - '_import_path': ['import_lib', 'import_rel'], - 'import_lib': ['_import_args'], - 'import_rel': ['_DOT _import_args'], - '_import_args': ['name', '_import_args _DOT name'], - - 'name_list': ['_name_list'], - '_name_list': ['name', '_name_list _COMMA name'], - - '_declare_args': ['symbol', '_declare_args symbol'], - 'literal': ['REGEXP', 'STRING'], -} - # Value 5 keeps the number of states in the lalr parser somewhat minimal # It isn't optimal, but close to it. 
See PR #949 @@ -548,7 +434,7 @@ def _literal_to_pattern(literal): flag_start = _rfind(v, '/"')+1 assert flag_start > 0 flags = v[flag_start:] - assert all(f in _RE_FLAGS for f in flags), flags + assert all(f in RE_FLAGS for f in flags), flags if literal.type == 'STRING' and '\n' in v: raise GrammarError('You cannot put newlines in string literals') @@ -873,134 +759,12 @@ def resolve_term_references(term_dict): raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) - -def symbol_from_strcase(s): - assert isinstance(s, str) - return Terminal(s, filter_out=s.startswith('_')) if s.isupper() else NonTerminal(s) - -@inline_args -class PrepareGrammar(Transformer_InPlace): - def terminal(self, name): - return Terminal(str(name), filter_out=name.startswith('_')) - - def nonterminal(self, name): - return NonTerminal(name.value) - - def _find_used_symbols(tree): assert tree.data == 'expansions' return {t.name for x in tree.find_data('expansion') for t in x.scan_values(lambda t: isinstance(t, Symbol))} -def _get_parser(): - try: - return _get_parser.cache - except AttributeError: - terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] - - rules = [(name.lstrip('?'), x, RuleOptions(expand1=name.startswith('?'))) - for name, x in RULES.items()] - rules = [Rule(NonTerminal(r), [symbol_from_strcase(s) for s in x.split()], i, None, o) - for r, xs, o in rules for i, x in enumerate(xs)] - - callback = ParseTreeBuilder(rules, ST).create_callback() - import re - lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) - parser_conf = ParserConf(rules, callback, ['start']) - lexer_conf.lexer_type = 'basic' - parser_conf.parser_type = 'lalr' - _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) - return _get_parser.cache - -GRAMMAR_ERRORS = [ - ('Incorrect type of value', ['a: 1\n']), - ('Unclosed parenthesis', ['a: (\n']), - ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), - ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), - ('Illegal name for rules or terminals', ['Aa:\n']), - ('Alias expects lowercase name', ['a: -> "a"\n']), - ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), - ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), - ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), - ('Terminal names cannot contain dots', ['A.B\n']), - ('Expecting rule or terminal definition', ['"a"\n']), - ('%import expects a name', ['%import "a"\n']), - ('%ignore expects a value', ['%ignore %import\n']), - ] - -def _translate_parser_exception(parse, e): - error = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True) - if error: - return error - elif 'STRING' in e.expected: - return "Expecting a value" - -def _parse_grammar(text, name, start='start'): - try: - tree = _get_parser().parse(text + '\n', start) - except UnexpectedCharacters as e: - context = e.get_context(text) - raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % - (e.line, e.column, name, context)) - except UnexpectedToken as e: - context = e.get_context(text) - error = _translate_parser_exception(_get_parser().parse, e) - if error: - raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) - raise - - return PrepareGrammar().transform(tree) - - -def _error_repr(error): - if isinstance(error, UnexpectedToken): - error2 = 
_translate_parser_exception(_get_parser().parse, error) - if error2: - return error2 - expected = ', '.join(error.accepts or error.expected) - return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected) - else: - return str(error) - -def _search_interactive_parser(interactive_parser, predicate): - def expand(node): - path, p = node - for choice in p.choices(): - t = Token(choice, '') - try: - new_p = p.feed_token(t) - except ParseError: # Illegal - pass - else: - yield path + (choice,), new_p - - for path, p in bfs_all_unique([((), interactive_parser)], expand): - if predicate(p): - return path, p - -def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: - errors = [] - def on_error(e): - errors.append((e, _error_repr(e))) - - # recover to a new line - token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices()) - for token_type in token_path: - e.interactive_parser.feed_token(Token(token_type, '')) - e.interactive_parser.feed_token(Token('_NL', '\n')) - return True - - _tree = _get_parser().parse(text + '\n', start, on_error=on_error) - - errors_by_line = classify(errors, lambda e: e[0].line) - errors = [el[0] for el in errors_by_line.values()] # already sorted - - for e in errors: - e[0].interactive_parser = None - return errors - - def _get_mangle(prefix, aliases, base_mangle=None): def mangle(s): if s in aliases: @@ -1061,10 +825,11 @@ class GrammarBuilder: import_paths: List[Union[str, Callable]] used_files: Dict[str, str] - def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None: + def __init__(self, *, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, syntax_hint: str=None, used_files: Optional[Dict[str, str]]=None) -> None: self.global_keep_all_tokens = global_keep_all_tokens self.import_paths = import_paths or [] self.used_files = used_files or {} + self.syntax_hint = syntax_hint self._definitions = {} self._ignore_names = [] @@ -1207,7 +972,7 @@ def _unpack_definition(self, tree, mangle): def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: - tree = _parse_grammar(grammar_text, grammar_name) + tree = parse_grammar(grammar_text, grammar_name, self.syntax_hint) imports = {} for stmt in tree.children: @@ -1292,8 +1057,9 @@ def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], alia if self.used_files.get(joined_path, h) != h: raise RuntimeError("Grammar file was changed during importing") self.used_files[joined_path] = h - - gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) + + gb = GrammarBuilder(global_keep_all_tokens=self.global_keep_all_tokens, import_paths=self.import_paths, + used_files=self.used_files) gb.load_grammar(text, joined_path, mangle) gb._remove_unused(map(mangle, aliases)) for name in gb._definitions: @@ -1355,6 +1121,58 @@ def build(self) -> Grammar: return Grammar(rule_defs, term_defs, self._ignore_names) +_REGISTRY = {} + + +def register_syntax(name: str, parse_func: Callable[[str, str], Tree]): + if name in _REGISTRY: + raise KeyError(f"Syntax {name} already registered") + _REGISTRY[name] = parse_func + + +def unregister_syntax(name: str): + del _REGISTRY[name] + + +def parse_grammar(grammar_text, grammar_name, syntax_hint=None): + """ + Parses the grammar_text. 
grammar_name is used for error messages. + + syntax_hint is the syntax that is tried first. If it doesn't work (or syntax_hint is None), + we try to infer the syntax from the grammar_name based on the extension. + If that fails we try 'lark', and otherwise raise an exception + """ + exp_to_raise = None + tried = set() + if callable(syntax_hint): + try: + return syntax_hint(grammar_text, grammar_name) + except (GrammarError, UnexpectedInput) as e: + exp_to_raise = e + elif isinstance(syntax_hint, str): + parse_func = _REGISTRY[syntax_hint] + try: + return parse_func(grammar_text, grammar_name) + except (GrammarError, UnexpectedInput) as e: + exp_to_raise = e + tried.add(syntax_hint) + if isinstance(grammar_name, str) and not (grammar_name[0] == "<" and grammar_name[-1] == ">"): + _, ext = os.path.splitext(grammar_name) + ext = ext[1:] + if ext in _REGISTRY and ext not in tried: + try: + return _REGISTRY[ext](grammar_text, grammar_name) + except (GrammarError, UnexpectedInput) as e: + exp_to_raise = exp_to_raise or e + tried.add(ext) + if 'lark' not in tried: + try: + return _REGISTRY['lark'](grammar_text, grammar_name) + except (GrammarError, UnexpectedInput) as e: + exp_to_raise = exp_to_raise or e + raise exp_to_raise + + def verify_used_files(file_hashes): for path, old in file_hashes.items(): text = None @@ -1366,7 +1184,7 @@ def verify_used_files(file_hashes): text = pkgutil.get_data(*path).decode('utf-8') if text is None: # We don't know how to load the path. ignore it. continue - + current = hashlib.md5(text.encode()).hexdigest() if old != current: logger.info("File %r changed, rebuilding Parser" % path) @@ -1375,11 +1193,20 @@ def verify_used_files(file_hashes): def list_grammar_imports(grammar, import_paths=[]): "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" - builder = GrammarBuilder(False, import_paths) + builder = GrammarBuilder(global_keep_all_tokens=False, import_paths=import_paths) builder.load_grammar(grammar, '') return list(builder.used_files.keys()) -def load_grammar(grammar, source, import_paths, global_keep_all_tokens): - builder = GrammarBuilder(global_keep_all_tokens, import_paths) +def load_grammar(grammar, source, options): + builder = GrammarBuilder( + global_keep_all_tokens=options.keep_all_tokens, + import_paths=options.import_paths, + syntax_hint=options.syntax) builder.load_grammar(grammar, source) return builder.build(), builder.used_files + + +# parse_lark_grammar uses some stuff from this module, so we import it last +import lark.parse_lark_grammar + +register_syntax("lark", lark.parse_lark_grammar.parse_lark_grammar) diff --git a/lark/parse_lark_grammar.py b/lark/parse_lark_grammar.py new file mode 100644 index 000000000..40b466be5 --- /dev/null +++ b/lark/parse_lark_grammar.py @@ -0,0 +1,259 @@ +""" +This module contains the basic lark grammar. 
+""" + +from typing import List, Tuple + +from .utils import bfs_all_unique +from .lexer import Token, TerminalDef, PatternRE + +from .parse_tree_builder import ParseTreeBuilder +from .parser_frontends import ParsingFrontend +from .common import LexerConf, ParserConf +from .grammar import RuleOptions, Rule, Terminal, NonTerminal +from .utils import classify +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput + +from .tree import SlottedTree as ST +from .visitors import v_args, Transformer_InPlace + +from lark.load_grammar import RE_FLAGS + +# Grammar Parser +TERMINALS = { + '_LPAR': r'\(', + '_RPAR': r'\)', + '_LBRA': r'\[', + '_RBRA': r'\]', + '_LBRACE': r'\{', + '_RBRACE': r'\}', + 'OP': '[+*]|[?](?![a-z])', + '_COLON': ':', + '_COMMA': ',', + '_OR': r'\|', + '_DOT': r'\.(?!\.)', + '_DOTDOT': r'\.\.', + 'TILDE': '~', + 'RULE_MODIFIERS': '(!|![?]?|[?]!?)(?=[_a-z])', + 'RULE': '_?[a-z][_a-z0-9]*', + 'TERMINAL': '_?[A-Z][_A-Z0-9]*', + 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % RE_FLAGS, + '_NL': r'(\r?\n)+\s*', + '_NL_OR': r'(\r?\n)+\s*\|', + 'WS': r'[ \t]+', + 'COMMENT': r'\s*//[^\n]*', + 'BACKSLASH': r'\\[ ]*\n', + '_TO': '->', + '_IGNORE': r'%ignore', + '_OVERRIDE': r'%override', + '_DECLARE': r'%declare', + '_EXTEND': r'%extend', + '_IMPORT': r'%import', + 'NUMBER': r'[+-]?\d+', +} + +RULES = { + 'start': ['_list'], + '_list': ['_item', '_list _item'], + '_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'], + + 'rule': ['rule_modifiers RULE template_params priority _COLON expansions _NL'], + 'rule_modifiers': ['RULE_MODIFIERS', + ''], + 'priority': ['_DOT NUMBER', + ''], + 'template_params': ['_LBRACE _template_params _RBRACE', + ''], + '_template_params': ['RULE', + '_template_params _COMMA RULE'], + 'expansions': ['_expansions'], + '_expansions': ['alias', + '_expansions _OR alias', + '_expansions _NL_OR alias'], + + '?alias': ['expansion _TO nonterminal', 'expansion'], + 'expansion': ['_expansion'], + + '_expansion': ['', '_expansion expr'], + + '?expr': ['atom', + 'atom OP', + 'atom TILDE NUMBER', + 'atom TILDE NUMBER _DOTDOT NUMBER', + ], + + '?atom': ['_LPAR expansions _RPAR', + 'maybe', + 'value'], + + 'value': ['terminal', + 'nonterminal', + 'literal', + 'range', + 'template_usage'], + + 'terminal': ['TERMINAL'], + 'nonterminal': ['RULE'], + + '?name': ['RULE', 'TERMINAL'], + '?symbol': ['terminal', 'nonterminal'], + + 'maybe': ['_LBRA expansions _RBRA'], + 'range': ['STRING _DOTDOT STRING'], + + 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], + '_template_args': ['value', + '_template_args _COMMA value'], + + 'term': ['TERMINAL _COLON expansions _NL', + 'TERMINAL _DOT NUMBER _COLON expansions _NL'], + 'override': ['_OVERRIDE rule', + '_OVERRIDE term'], + 'extend': ['_EXTEND rule', + '_EXTEND term'], + 'ignore': ['_IGNORE expansions _NL'], + 'declare': ['_DECLARE _declare_args _NL'], + 'import': ['_IMPORT _import_path _NL', + '_IMPORT _import_path _LPAR name_list _RPAR _NL', + '_IMPORT _import_path _TO name _NL'], + + '_import_path': ['import_lib', 'import_rel'], + 'import_lib': ['_import_args'], + 'import_rel': ['_DOT _import_args'], + '_import_args': ['name', '_import_args _DOT name'], + + 'name_list': ['_name_list'], + '_name_list': ['name', '_name_list _COMMA name'], + + '_declare_args': ['symbol', '_declare_args symbol'], + 'literal': ['REGEXP', 'STRING'], +} + + +def symbol_from_strcase(s): + assert isinstance(s, str) + return 
Terminal(s, filter_out=s.startswith('_')) if s.isupper() else NonTerminal(s) + + +def _get_parser(): + try: + return _get_parser.cache + except AttributeError: + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + + rules = [(name.lstrip('?'), x, RuleOptions(expand1=name.startswith('?'))) + for name, x in RULES.items()] + rules = [Rule(NonTerminal(r), [symbol_from_strcase(s) for s in x.split()], i, None, o) + for r, xs, o in rules for i, x in enumerate(xs)] + + callback = ParseTreeBuilder(rules, ST).create_callback() + import re + lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) + parser_conf = ParserConf(rules, callback, ['start']) + lexer_conf.lexer_type = 'basic' + parser_conf.parser_type = 'lalr' + _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) + return _get_parser.cache + + +GRAMMAR_ERRORS = [ + ('Incorrect type of value', ['a: 1\n']), + ('Unclosed parenthesis', ['a: (\n']), + ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), + ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), + ('Illegal name for rules or terminals', ['Aa:\n']), + ('Alias expects lowercase name', ['a: -> "a"\n']), + ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), + ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), + ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), + ('Terminal names cannot contain dots', ['A.B\n']), + ('Expecting rule or terminal definition', ['"a"\n']), + ('%import expects a name', ['%import "a"\n']), + ('%ignore expects a value', ['%ignore %import\n']), +] + + +def _translate_parser_exception(parse, e): + error = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True) + if error: + return error + elif 'STRING' in e.expected: + return "Expecting a value" + + +@v_args(inline=True) +class PrepareGrammar(Transformer_InPlace): + def terminal(self, name): + return Terminal(str(name), filter_out=name.startswith('_')) + + def nonterminal(self, name): + return NonTerminal(name.value) + + +def parse_lark_grammar(text, name, start='start'): + try: + tree = _get_parser().parse(text + '\n', start) + except UnexpectedCharacters as e: + context = e.get_context(text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = _translate_parser_exception(_get_parser().parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise + + return PrepareGrammar().transform(tree) + + +def _error_repr(error): + if isinstance(error, UnexpectedToken): + error2 = _translate_parser_exception(_get_parser().parse, error) + if error2: + return error2 + expected = ', '.join(error.accepts or error.expected) + return "Unexpected token %r. 
Expected one of: {%s}" % (str(error.token), expected) + else: + return str(error) + + +def _search_interactive_parser(interactive_parser, predicate): + def expand(node): + path, p = node + for choice in p.choices(): + t = Token(choice, '') + try: + new_p = p.feed_token(t) + except ParseError: # Illegal + pass + else: + yield path + (choice,), new_p + + for path, p in bfs_all_unique([((), interactive_parser)], expand): + if predicate(p): + return path, p + + +def find_grammar_errors(text: str, start: str = 'start') -> List[Tuple[UnexpectedInput, str]]: + errors = [] + + def on_error(e): + errors.append((e, _error_repr(e))) + + # recover to a new line + token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices()) + for token_type in token_path: + e.interactive_parser.feed_token(Token(token_type, '')) + e.interactive_parser.feed_token(Token('_NL', '\n')) + return True + + _tree = _get_parser().parse(text + '\n', start, on_error=on_error) + + errors_by_line = classify(errors, lambda e: e[0].line) + errors = [el[0] for el in errors_by_line.values()] # already sorted + + for e in errors: + e[0].interactive_parser = None + return errors diff --git a/lark/tools/__init__.py b/lark/tools/__init__.py index 3fe66bd63..2144ad532 100644 --- a/lark/tools/__init__.py +++ b/lark/tools/__init__.py @@ -28,6 +28,7 @@ def indent(text, prefix): lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual')) k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {} lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)') +# TODO: syntax. Might require imports to find the dialect. lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file') for f in flags: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 32f4a7931..adcf14ac3 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -4,8 +4,9 @@ from unittest import TestCase, main from lark import Lark, Token, Tree, ParseError, UnexpectedInput -from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports +from lark.load_grammar import GrammarError, list_grammar_imports from lark.load_grammar import FromPackageLoader +from lark.parse_lark_grammar import GRAMMAR_ERRORS, find_grammar_errors class TestGrammar(TestCase): @@ -276,7 +277,7 @@ def test_list_grammar_imports(self): """ imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) - self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) + self.assertEqual({'test_templates_import.lark', 'templates.lark'}, {os.path.split(i)[-1] for i in imports}) imports = list_grammar_imports('%import common.WS', []) assert len(imports) == 1 and imports[0].pkg_name == 'lark' diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index 326831689..a46b37471 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit 326831689826cb1b9a4d21d1ce0d5db9278e9636 +Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
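
Usage sketch for the syntax hook introduced in this diff. This is a minimal example that assumes the
diff is applied as-is; parse_my_syntax and its "#"-comment dialect are hypothetical and only
illustrate the Callable[[str, str], Tree] contract expected by register_syntax and LarkOptions.syntax.

    from lark import Lark
    from lark.load_grammar import register_syntax
    from lark.parse_lark_grammar import parse_lark_grammar

    def parse_my_syntax(grammar_text, grammar_name):
        # Hypothetical dialect: identical to lark's native syntax, except that
        # lines starting with '#' are comments. Strip them, then delegate to
        # the built-in parser, which returns the Tree that GrammarBuilder expects.
        cleaned = "\n".join(line for line in grammar_text.splitlines()
                            if not line.lstrip().startswith("#"))
        return parse_lark_grammar(cleaned, grammar_name)

    register_syntax("mylark", parse_my_syntax)

    # Select the registered syntax by name ...
    parser = Lark('# a toy grammar\nstart: "a"\n', syntax="mylark")

    # ... or pass the callable directly; LarkOptions.syntax accepts either form.
    parser = Lark('start: "a"\n', syntax=parse_my_syntax)
    print(parser.parse("a").pretty())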
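
The fallback order documented in parse_grammar (explicit hint, then file extension, then the
built-in "lark" syntax) can also be exercised directly. A short sketch under the same assumptions;
the file names are hypothetical:

    from lark.load_grammar import parse_grammar

    # No hint is given, and "<string>"-style names skip extension inference,
    # so the ".lark" extension selects the built-in syntax from the registry.
    tree = parse_grammar('start: "a"\n', "my_grammar.lark")

    # With the "mylark" syntax registered as above, the extension alone is
    # enough to pick the custom parser; no hint or Lark option is needed.
    tree = parse_grammar('# comment\nstart: "a"\n', "my_grammar.mylark")

    # If every attempt fails, the first GrammarError/UnexpectedInput that was
    # captured is re-raised.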