diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000..cb9dfc1 --- /dev/null +++ b/.bazelrc @@ -0,0 +1 @@ +build --javacopt="--release 8" diff --git a/.travis.yml b/.travis.yml index c128646..4f6fc5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: xenial +dist: bionic addons: apt: @@ -13,7 +13,9 @@ script: - cd examples # build examples as a means of testing - bazel build --jobs 2 //antlr2/Cpp/... //antlr2/Calc/... //antlr2/Python/... //antlr3/Cpp/... //antlr3/Java/... //antlr3/Python2/... //antlr3/Python3/... //antlr4/Cpp/... //antlr4/Go/... //antlr4/Java/... //antlr4/Python2/... //antlr4/Python3/... - - cd .. + - cd antlr4-opt + - bazel build --jobs 2 //... + - cd ../.. - bazel test --jobs 2 --test_output=errors //... - bazel shutdown diff --git a/README.md b/README.md index 459b2a7..f79cb7b 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,31 @@ # ANTLR Rules for Bazel These build rules are used for processing [ANTLR](https://www.antlr.org) -grammars with [Bazel](https://bazel.build/). Currently C/C++, Go, Java and Python targets are supported. +grammars with [Bazel](https://bazel.build/). 
+ * [Support Matrix](#matrix) * [Workspace Setup](#setup) + [Details](docs/setup.md#setup) * [Build Rules](#build-rules) - [Java Example](#java-example) + +## Support Matrix + +| | antlr4 | antlr3 | antlr2 +|---------|:-------------:|:-------------:|:----:| +| C | | Gen | Gen +| C++ | Gen + Runtime | Gen + Runtime | Gen + Runtime +| Go | Gen + Runtime | | +| Java | Gen + Runtime | Gen + Runtime | Gen + Runtime +| ObjC | | Gen | +| Python2 | Gen + Runtime | Gen + Runtime | Gen + Runtime +| Python3 | Gen + Runtime | Gen + Runtime | + +Gen: Code Generation\ +Runtime: Runtime Library bundled + + ## Setup @@ -25,13 +43,13 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "rules_antlr", - sha256 = "f7c73e1fe3d3b1be3b65172da756a326d12100f6a8d1ef8327498705c0d52efc", - strip_prefix = "rules_antlr-0.4.0", - urls = ["https://github.com/marcohu/rules_antlr/archive/0.4.0.tar.gz"], + sha256 = "", + strip_prefix = "rules_antlr-0.5.0", + urls = ["https://github.com/marcohu/rules_antlr/archive/0.5.0.tar.gz"], ) load("@rules_antlr//antlr:repositories.bzl", "rules_antlr_dependencies") -rules_antlr_dependencies("4.7.2") +rules_antlr_dependencies("4.8") ``` More detailed instructions can be found in the [Setup](docs/setup.md#setup) document. 
diff --git a/antlr/impl.bzl b/antlr/impl.bzl index bd69173..326f02f 100644 --- a/antlr/impl.bzl +++ b/antlr/impl.bzl @@ -1,12 +1,12 @@ """The common ANTLR rule implementation.""" -load(":lang.bzl", "C", "CPP", "GO", "PYTHON", "PYTHON2", "PYTHON3") +load(":lang.bzl", "C", "CPP", "GO", "OBJC", "PYTHON", "PYTHON2", "PYTHON3") AntlrInfo = provider( fields = { "sources": "The generated source files.", - "headers": "For C/C++ the generated header files.", - "data": "Additional ANTLR data files", + "headers": "The generated header files (for C/C++/ObjC).", + "data": "Additional ANTLR data files.", }, doc = "A provider containing information about ANTLR code generation.", ) @@ -31,7 +31,7 @@ def antlr(version, ctx, args): data = [] sources = [] headers = [] - cc = ctx.attr.language == CPP or ctx.attr.language == C + cc = ctx.attr.language == CPP or ctx.attr.language == C or ctx.attr.language == OBJC output_type = "dir" if ctx.attr.language and ctx.attr.language != "Java" else "srcjar" if output_type == "srcjar": @@ -71,6 +71,7 @@ def antlr(version, ctx, args): "OUTPUT_DIRECTORY": output_dir, "PACKAGE_NAME": ctx.attr.package, "SRC_JAR": srcjar.path if srcjar else "", + "TARGET": ctx.attr.name, "TARGET_LANGUAGE": ctx.attr.language, "TOOL_CLASSPATH": ",".join([f.path for f in tool_inputs]), }, @@ -89,9 +90,6 @@ def antlr(version, ctx, args): headers = headers, data = [ctx.attr.name + ".antlr"], ), - platform_common.TemplateVariableInfo({ - "INCLUDES": ctx.attr.name + ".inc/" + ctx.attr.package, - }), CcInfo(compilation_context = compilation_context) if cc else _NullInfo(), DefaultInfo(files = depset(outputs)), ] @@ -106,10 +104,12 @@ def extension(language): """ if language == CPP or language == C: return ".cc" - if language == PYTHON or language == PYTHON2 or language == PYTHON3: - return ".py" if language == GO: return ".go" + if language == OBJC: + return ".objc" + if language == PYTHON or language == PYTHON2 or language == PYTHON3: + return ".py" return "" def 
lib_dir(imports): @@ -122,7 +122,10 @@ def lib_dir(imports): """ lib = {} for resource in imports: - lib[resource.path.replace("/" + resource.basename, "")] = None + if resource.path.endswith(".srcjar"): + lib[resource.path] = None + else: + lib[resource.path.replace("/" + resource.basename, "")] = None count = len(lib) # the lib directory does not allow nested directories diff --git a/antlr/lang.bzl b/antlr/lang.bzl index 2913bdb..d48c260 100644 --- a/antlr/lang.bzl +++ b/antlr/lang.bzl @@ -4,6 +4,7 @@ CSHARP = "CSharp" GO = "Go" JAVA = "Java" JAVASCRIPT = "JavaScript" +OBJC = "ObjC" PYTHON = "Python" # synonym for PYTHON3 PYTHON2 = "Python2" PYTHON3 = "Python3" @@ -16,4 +17,4 @@ def supported(): Returns: the list of supported languages. """ - return [C, CPP, GO, JAVA, PYTHON, PYTHON2, PYTHON3] + return [C, CPP, GO, JAVA, OBJC, PYTHON, PYTHON2, PYTHON3] diff --git a/antlr/repositories.bzl b/antlr/repositories.bzl index 602929c..e2525ab 100644 --- a/antlr/repositories.bzl +++ b/antlr/repositories.bzl @@ -1,12 +1,140 @@ """Loads ANTLR dependencies.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_jar") -load(":lang.bzl", "C", "CPP", "GO", "JAVA", "PYTHON", "PYTHON2", "PYTHON3", supportedLanguages = "supported") +load(":lang.bzl", "C", "CPP", "GO", "JAVA", "OBJC", "PYTHON", "PYTHON2", "PYTHON3", supportedLanguages = "supported") -v4 = [4, "4.7.1", "4.7.2"] +v4 = [4, "4.7.1", "4.7.2", "4.8"] +v4_opt = [4, "4.7.1", "4.7.2", "4.7.3", "4.7.4"] v3 = [3, "3.5.2"] v2 = [2, "2.7.7"] +PACKAGES = { + "antlr": { + "4.8": { + "url": "https://github.com/antlr/antlr4/archive/4.8.tar.gz", + "prefix": "antlr4-4.8", + "sha256": "992d52444b81ed75e52ea62f9f38ecb7652d5ce2a2130af143912b3042a6d77e", + }, + "4.7.2": { + "url": "https://github.com/antlr/antlr4/archive/4.7.2.tar.gz", + "prefix": "antlr4-4.7.2", + "sha256": "46f5e1af5f4bd28ade55cb632f9a069656b31fc8c2408f9aa045f9b5f5caad64", + }, + "4.7.1": { + "url": 
"https://github.com/antlr/antlr4/archive/4.7.1.tar.gz", + "prefix": "antlr4-4.7.1", + "sha256": "4d0714f441333a63e50031c9e8e4890c78f3d21e053d46416949803e122a6574", + }, + "3.5.2": { + "url": "https://github.com/marcohu/antlr3/archive/master.tar.gz", + "prefix": "antlr3-master", + "sha256": "53cd6c8e41995efa0b7d01c53047ad8a0e2c74e56fe03f6e938d2f0493ee7ace", + }, + "2.7.7": { + "url": "https://www.antlr2.org/download/antlr-2.7.7.tar.gz", + "prefix": "antlr-2.7.7", + "sha256": "853aeb021aef7586bda29e74a6b03006bcb565a755c86b66032d8ec31b67dbb9", + "patches": ["@rules_antlr//third_party:antlr2_strings.patch"], + }, + }, + "antlr4_runtime": { + "4.8": { + "path": "org/antlr/antlr4-runtime/4.8/antlr4-runtime-4.8.jar", + "sha256": "2337df5d81e715b39aeea07aac46ad47e4f1f9e9cd7c899f124f425913efdcf8", + }, + "4.7.2": { + "path": "org/antlr/antlr4-runtime/4.7.2/antlr4-runtime-4.7.2.jar", + "sha256": "4c518b87d4bdff8b44cd8cbc1af816e944b62a3fe5b80b781501cf1f4759bbc4", + }, + "4.7.1": { + "path": "org/antlr/antlr4-runtime/4.7.1/antlr4-runtime-4.7.1.jar", + "sha256": "43516d19beae35909e04d06af6c0c58c17bc94e0070c85e8dc9929ca640dc91d", + }, + "4.7.4-opt": { + "path": "com/tunnelvisionlabs/antlr4-runtime/4.7.4/antlr4-runtime-4.7.4.jar", + "sha256": "c0616e1eb3b7aa6b4de9a304ea458d50cac279f78b0b65bf7a8176701f8402ee", + }, + "4.7.3-opt": { + "path": "com/tunnelvisionlabs/antlr4-runtime/4.7.3/antlr4-runtime-4.7.3.jar", + "sha256": "5f4f0c4031e4b83cb369ef00f4909cdb6f62b11e3d253f83a6184d80c5eb3157", + }, + "4.7.2-opt": { + "path": "com/tunnelvisionlabs/antlr4-runtime/4.7.2/antlr4-runtime-4.7.2.jar", + "sha256": "fdec73953ba059034336a8e0b0ea5204f6897900bf0b0fa35347ce8a8bb88816", + }, + "4.7.1-opt": { + "path": "com/tunnelvisionlabs/antlr4-runtime/4.7.1/antlr4-runtime-4.7.1.jar", + "sha256": "ce4f77ff9dc014feb9a8e700de5c77101d203acb6a1e8fa3446905c391ac72b9", + }, + }, + "antlr4_tool": { + "4.8": { + "path": "org/antlr/antlr4/4.8/antlr4-4.8.jar", + "sha256": 
"6e4477689371f237d4d8aa40642badbb209d4628ccdd81234d90f829a743bac8", + }, + "4.7.2": { + "path": "org/antlr/antlr4/4.7.2/antlr4-4.7.2.jar", + "sha256": "a3811fad1e4cb6dde62c189c204cf931c5fa40e06e43839ead4a9f2e188f2fe5", + }, + "4.7.1": { + "path": "org/antlr/antlr4/4.7.1/antlr4-4.7.1.jar", + "sha256": "a2cdc2f2f8eb893728832568dc54d080eb5a1495edb3b66e51b97122a60a0d87", + }, + "4.7.4-opt": { + "path": "com/tunnelvisionlabs/antlr4/4.7.4/antlr4-4.7.4.jar", + "sha256": "f84d71d130f17b13f0934af7575626890a4dab0c588a95b80572a66f7deacca4", + }, + "4.7.3-opt": { + "path": "com/tunnelvisionlabs/antlr4/4.7.3/antlr4-4.7.3.jar", + "sha256": "06cd5f3a9488b32cb1022360df054bbe7aebe8e817c0aa58c8feec05879e0c63", + }, + "4.7.2-opt": { + "path": "com/tunnelvisionlabs/antlr4/4.7.2/antlr4-4.7.2.jar", + "sha256": "fcc2a0365de371d8676ab9b45c49aa2e784036a77b76383892887c89c5725ca3", + }, + "4.7.1-opt": { + "path": "com/tunnelvisionlabs/antlr4/4.7.1/antlr4-4.7.1.jar", + "sha256": "de9a7b94b48ea7c8100663cbb1a54465c37671841c0aefdf4c53a72212555ae8", + }, + }, + "antlr3_runtime": { + "3.5.2": { + "path": "org/antlr/antlr-runtime/3.5.2/antlr-runtime-3.5.2.jar", + "sha256": "ce3fc8ecb10f39e9a3cddcbb2ce350d272d9cd3d0b1e18e6fe73c3b9389c8734", + }, + }, + "antlr3_tool": { + "3.5.2": { + # the official release generates problematic C++ code, we therefore use a + # custom build forked from https://github.com/ibre5041/antlr3.git + "path": "https://github.com/marcohu/antlr3/raw/master/antlr-3.5.3.jar", + "sha256": "897d0b914adf2e63899ada179c5f4aeb606d59fdfbb6ccaff5bc87aec300e2ce", + }, + }, + "antlr2": { + "2.7.7": { + "path": "antlr/antlr/2.7.7/antlr-2.7.7.jar", + "sha256": "88fbda4b912596b9f56e8e12e580cc954bacfb51776ecfddd3e18fc1cf56dc4c", + }, + }, + "stringtemplate4": { + "4.3": { + "path": "org/antlr/ST4/4.3/ST4-4.3.jar", + "sha256": "28547dba48cfceb77b6efbfe069aebe9ed3324ae60dbd52093d13a1d636ed069", + }, + "4.0.8": { + "path": "org/antlr/ST4/4.0.8/ST4-4.0.8.jar", + "sha256": 
"58caabc40c9f74b0b5993fd868e0f64a50c0759094e6a251aaafad98edfc7a3b", + }, + }, + "javax_json": { + "1.0.4": { + "path": "org/glassfish/javax.json/1.0.4/javax.json-1.0.4.jar", + "sha256": "0e1dec40a1ede965941251eda968aeee052cc4f50378bc316cc48e8159bdbeb4", + }, + }, +} + def rules_antlr_dependencies(*versionsAndLanguages): """Loads the dependencies for the specified ANTLR releases. @@ -51,7 +179,9 @@ def rules_antlr_dependencies(*versionsAndLanguages): languages = [JAVA] for version in sorted(versions, key = _toString): - if version == 4 or version == "4.7.2": + if version == 4 or version == "4.8": + _antlr48_dependencies(languages) + elif version == "4.7.2": _antlr472_dependencies(languages) elif version == "4.7.1": _antlr471_dependencies(languages) @@ -74,99 +204,97 @@ def rules_antlr_optimized_dependencies(version): Args: version: the ANTLR release version to make available. """ - if version == 4 or version == "4.7.2": + if version == 4 or version == "4.7.4": + _antlr474_optimized_dependencies() + elif version == "4.7.3": + _antlr473_optimized_dependencies() + elif version == "4.7.2": _antlr472_optimized_dependencies() elif version == "4.7.1": _antlr471_optimized_dependencies() elif type(version) == "int" or str(version).isdigit(): fail('Integer version \'{}\' no longer valid. Use semantic version "{}" instead.'.format(version, ".".join(str(version).elems())), attr = "version") else: - fail('Unsupported ANTLR version provided: "{0}". Currently supported are: {1}'.format(version, v4), attr = "version") + fail('Unsupported ANTLR version provided: "{0}". 
Currently supported are: {1}'.format(version, v4_opt), attr = "version") + +def _antlr48_dependencies(languages): + _antlr4_dependencies( + "4.8", + languages, + { + "antlr4_runtime": "4.8", + "antlr4_tool": "4.8", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.3", + "javax_json": "1.0.4", + }, + ) def _antlr472_dependencies(languages): _antlr4_dependencies( + "4.7.2", languages, { - "url": "https://github.com/antlr/antlr4/archive/4.7.2.tar.gz", - "prefix": "antlr4-4.7.2", - "sha256": "46f5e1af5f4bd28ade55cb632f9a069656b31fc8c2408f9aa045f9b5f5caad64", + "antlr4_runtime": "4.7.2", + "antlr4_tool": "4.7.2", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", }, - _merge( - { - "antlr4_runtime": { - "name": "antlr4_runtime", - "path": "org/antlr/antlr4-runtime/4.7.2/antlr4-runtime-4.7.2.jar", - "sha256": "4c518b87d4bdff8b44cd8cbc1af816e944b62a3fe5b80b781501cf1f4759bbc4", - }, - "antlr4_tool": { - "name": "antlr4_tool", - "path": "org/antlr/antlr4/4.7.2/antlr4-4.7.2.jar", - "sha256": "a3811fad1e4cb6dde62c189c204cf931c5fa40e06e43839ead4a9f2e188f2fe5", - }, - }, - _antlr4_transitive_dependencies(), - ), ) def _antlr471_dependencies(languages): _antlr4_dependencies( + "4.7.1", languages, { - "url": "https://github.com/antlr/antlr4/archive/4.7.2.tar.gz", - "prefix": "antlr4-4.7.2", - "sha256": "46f5e1af5f4bd28ade55cb632f9a069656b31fc8c2408f9aa045f9b5f5caad64", + "antlr4_runtime": "4.7.1", + "antlr4_tool": "4.7.1", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", }, - _merge( - { - "antlr4_runtime": { - "name": "antlr4_runtime", - "path": "org/antlr/antlr4-runtime/4.7.1/antlr4-runtime-4.7.1.jar", - "sha256": "43516d19beae35909e04d06af6c0c58c17bc94e0070c85e8dc9929ca640dc91d", - }, - "antlr4_tool": { - "name": "antlr4_tool", - "path": "org/antlr/antlr4/4.7.1/antlr4-4.7.1.jar", - "sha256": "a2cdc2f2f8eb893728832568dc54d080eb5a1495edb3b66e51b97122a60a0d87", - }, - }, - _antlr4_transitive_dependencies(), - 
), ) +def _antlr474_optimized_dependencies(): + _dependencies({ + "antlr4_runtime": "4.7.4-opt", + "antlr4_tool": "4.7.4-opt", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", + }) + +def _antlr473_optimized_dependencies(): + _dependencies({ + "antlr4_runtime": "4.7.3-opt", + "antlr4_tool": "4.7.3-opt", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", + }) + def _antlr472_optimized_dependencies(): - _download( - name = "antlr4_runtime", - path = "com/tunnelvisionlabs/antlr4-runtime/4.7.2/antlr4-runtime-4.7.2.jar", - sha256 = "fdec73953ba059034336a8e0b0ea5204f6897900bf0b0fa35347ce8a8bb88816", - ) - _download( - name = "antlr4_tool", - path = "com/tunnelvisionlabs/antlr4/4.7.2/antlr4-4.7.2.jar", - sha256 = "fcc2a0365de371d8676ab9b45c49aa2e784036a77b76383892887c89c5725ca3", - ) - _antlr4_transitive_dependencies(False) + _dependencies({ + "antlr4_runtime": "4.7.2-opt", + "antlr4_tool": "4.7.2-opt", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", + }) def _antlr471_optimized_dependencies(): - _download( - name = "antlr4_runtime", - path = "com/tunnelvisionlabs/antlr4-runtime/4.7.1/antlr4-runtime-4.7.1.jar", - sha256 = "ce4f77ff9dc014feb9a8e700de5c77101d203acb6a1e8fa3446905c391ac72b9", - ) - _download( - name = "antlr4_tool", - path = "com/tunnelvisionlabs/antlr4/4.7.1/antlr4-4.7.1.jar", - sha256 = "de9a7b94b48ea7c8100663cbb1a54465c37671841c0aefdf4c53a72212555ae8", - ) - _antlr4_transitive_dependencies() - -def _antlr4_dependencies(languages, archive, dependencies): - for name in dependencies: - _download( - name = name, - path = dependencies[name]["path"], - sha256 = dependencies[name]["sha256"], - ) - + _dependencies({ + "antlr4_runtime": "4.7.1-opt", + "antlr4_tool": "4.7.1-opt", + "antlr3_runtime": "3.5.2", + "stringtemplate4": "4.0.8", + "javax_json": "1.0.4", + }) + +def _antlr4_dependencies(version, languages, dependencies): + _dependencies(dependencies) + 
archive = PACKAGES["antlr"][version] build_script, workspace = _antlr4_build_script(languages) if build_script: @@ -263,56 +391,20 @@ py_repositories() def _load_rules_python_defs(script): return "" if script.find('load("@rules_python//python:defs.bzl"') > -1 else 'load("@rules_python//python:defs.bzl", "py_library")' -def _antlr4_transitive_dependencies(): - return { - "antlr3_runtime": { - "path": "org/antlr/antlr-runtime/3.5.2/antlr-runtime-3.5.2.jar", - "sha256": "ce3fc8ecb10f39e9a3cddcbb2ce350d272d9cd3d0b1e18e6fe73c3b9389c8734", - }, - "stringtemplate4": { - "path": "org/antlr/ST4/4.0.8/ST4-4.0.8.jar", - "sha256": "58caabc40c9f74b0b5993fd868e0f64a50c0759094e6a251aaafad98edfc7a3b", - }, - "javax_json": { - "path": "org/glassfish/javax.json/1.0.4/javax.json-1.0.4.jar", - "sha256": "0e1dec40a1ede965941251eda968aeee052cc4f50378bc316cc48e8159bdbeb4", - }, - } - def _antlr352_dependencies(languages): _antlr3_dependencies( + "3.5.2", languages, { - "url": "https://github.com/marcohu/antlr3/archive/master.tar.gz", - "prefix": "antlr3-master", - "sha256": "53cd6c8e41995efa0b7d01c53047ad8a0e2c74e56fe03f6e938d2f0493ee7ace", - }, - { - "antlr3_runtime": { - "path": "org/antlr/antlr-runtime/3.5.2/antlr-runtime-3.5.2.jar", - "sha256": "ce3fc8ecb10f39e9a3cddcbb2ce350d272d9cd3d0b1e18e6fe73c3b9389c8734", - }, - # the official release generates problematic C++ code, we therefore use a - # custom build forked from https://github.com/ibre5041/antlr3.git - "antlr3_tool": { - "path": "https://github.com/marcohu/antlr3/raw/master/antlr-3.5.3.jar", - "sha256": "897d0b914adf2e63899ada179c5f4aeb606d59fdfbb6ccaff5bc87aec300e2ce", - }, - "stringtemplate4": { - "path": "org/antlr/ST4/4.0.8/ST4-4.0.8.jar", - "sha256": "58caabc40c9f74b0b5993fd868e0f64a50c0759094e6a251aaafad98edfc7a3b", - }, + "antlr3_runtime": "3.5.2", + "antlr3_tool": "3.5.2", + "stringtemplate4": "4.0.8", }, ) -def _antlr3_dependencies(languages, archive, dependencies): - for name in dependencies: - _download( - name = 
name, - path = dependencies[name]["path"], - sha256 = dependencies[name]["sha256"], - ) - +def _antlr3_dependencies(version, languages, dependencies): + _dependencies(dependencies) + archive = PACKAGES["antlr"][version] build_script = _antlr3_build_script(languages) if build_script: @@ -356,41 +448,30 @@ py_library( visibility = ["//visibility:public"], ) """ + return script def _antlr277_dependencies(languages): _antlr2_dependencies( + "2.7.7", languages, { - "url": "https://www.antlr2.org/download/antlr-2.7.7.tar.gz", - "prefix": "antlr-2.7.7", - "sha256": "853aeb021aef7586bda29e74a6b03006bcb565a755c86b66032d8ec31b67dbb9", - }, - { - "antlr2": { - "path": "antlr/antlr/2.7.7/antlr-2.7.7.jar", - "sha256": "88fbda4b912596b9f56e8e12e580cc954bacfb51776ecfddd3e18fc1cf56dc4c", - }, + "antlr2": "2.7.7", }, ) -def _antlr2_dependencies(languages, archive, dependencies): - for name in dependencies: - _download( - name = name, - path = dependencies[name]["path"], - sha256 = dependencies[name]["sha256"], - ) - +def _antlr2_dependencies(version, languages, dependencies): + _dependencies(dependencies) + archive = PACKAGES["antlr"][version] build_script = _antlr2_build_script(languages) if build_script: http_archive( name = "antlr2_runtimes", - sha256 = "853aeb021aef7586bda29e74a6b03006bcb565a755c86b66032d8ec31b67dbb9", + sha256 = archive["sha256"], strip_prefix = "antlr-2.7.7", - urls = ["https://www.antlr2.org/download/antlr-2.7.7.tar.gz"], - patches = ["@rules_antlr//third_party:antlr2_strings.patch"], + url = archive["url"], + patches = archive["patches"] if "patches" in archive else [], build_file_content = build_script, ) @@ -423,6 +504,15 @@ py_library( return script +def _dependencies(dependencies): + for key in dependencies: + version = dependencies[key] + _download( + name = key, + path = PACKAGES[key][version]["path"], + sha256 = PACKAGES[key][version]["sha256"], + ) + def _download(name, path, sha256): http_jar( name = name, @@ -448,7 +538,3 @@ def 
_validateVersions(versions): def _toString(x): return str(x) - -def _merge(x, y): - x.update(y) - return x diff --git a/docs/setup.md b/docs/setup.md index 2543f1b..59f52d0 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -8,9 +8,9 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "rules_antlr", - sha256 = "f7c73e1fe3d3b1be3b65172da756a326d12100f6a8d1ef8327498705c0d52efc", - strip_prefix = "rules_antlr-0.4.0", - urls = ["https://github.com/marcohu/rules_antlr/archive/0.4.0.tar.gz"], + sha256 = "", + strip_prefix = "rules_antlr-0.5.0", + urls = ["https://github.com/marcohu/rules_antlr/archive/0.5.0.tar.gz"], ) ``` @@ -74,7 +74,7 @@ The currently supported releases are: | Release Stream | Supported Versions| Bundled Runtimes |-----------------|-------------------|--- -| 4 | 4.7.1, 4.7.2 | C++, Go, Java, Python2, Python3 +| 4 | 4.7.1, 4.7.2, 4.8 | C++, Go, Java, Python2, Python3 | 3 | 3.5.2 | C++, Java, Python2, Python3 | 2 | 2.7.7 | C++, Java, Python2 diff --git a/examples/WORKSPACE b/examples/WORKSPACE index b0f45eb..8634864 100644 --- a/examples/WORKSPACE +++ b/examples/WORKSPACE @@ -32,7 +32,7 @@ load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() -load("@rules_antlr//antlr:lang.bzl", "C", "CPP", "GO", "JAVA", "PYTHON", "PYTHON2") +load("@rules_antlr//antlr:lang.bzl", "C", "CPP", "GO", "JAVA", "OBJC", "PYTHON", "PYTHON2") load("@rules_antlr//antlr:repositories.bzl", "rules_antlr_dependencies") -rules_antlr_dependencies(2, 3, "4.7.2", C, CPP, GO, PYTHON, PYTHON2) +rules_antlr_dependencies("2.7.7", 3, "4.8", C, CPP, GO, OBJC, PYTHON, PYTHON2) diff --git a/examples/antlr3/BUILD b/examples/antlr3/BUILD index 6e3ae64..8b9f424 100644 --- a/examples/antlr3/BUILD +++ b/examples/antlr3/BUILD @@ -7,6 +7,7 @@ filegroup( "InheritSameFolder", "Java", "LanguageByAttribute", + "ObjC", "Python2", "Python3", ], diff --git a/examples/antlr3/ImportGenerated/src/codegen/BUILD 
b/examples/antlr3/ImportGenerated/src/codegen/BUILD new file mode 100644 index 0000000..229fc03 --- /dev/null +++ b/examples/antlr3/ImportGenerated/src/codegen/BUILD @@ -0,0 +1,7 @@ +load("@rules_antlr//antlr:antlr3.bzl", "antlr") + +antlr( + name = "codegen", + srcs = glob(["*.g"]), + imports = ["//antlr3/ImportGenerated/src/parse:parser"], +) diff --git a/examples/antlr3/ImportGenerated/src/codegen/SourceGenTriggers.g b/examples/antlr3/ImportGenerated/src/codegen/SourceGenTriggers.g new file mode 100644 index 0000000..85e3dc4 --- /dev/null +++ b/examples/antlr3/ImportGenerated/src/codegen/SourceGenTriggers.g @@ -0,0 +1,198 @@ +/* + * [The "BSD license"] + * Copyright (c) 2012-2016 Terence Parr + * Copyright (c) 2012-2016 Sam Harwell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +tree grammar SourceGenTriggers; +options { + language = Java; + tokenVocab = ANTLRParser; + ASTLabelType = GrammarAST; +} + +@header { +package org.antlr.v4.codegen; +import org.antlr.v4.misc.Utils; +import org.antlr.v4.codegen.model.*; +import org.antlr.v4.codegen.model.decl.*; +import org.antlr.v4.tool.*; +import org.antlr.v4.tool.ast.*; +import java.util.Collections; +import java.util.Map; +import java.util.HashMap; +} + +@members { + public OutputModelController controller; + public boolean hasLookaheadBlock; + public SourceGenTriggers(TreeNodeStream input, OutputModelController controller) { + this(input); + this.controller = controller; + } +} + +dummy : block[null, null] ; + +block[GrammarAST label, GrammarAST ebnfRoot] returns [List omos] + : ^( blk=BLOCK (^(OPTIONS .+))? 
+ {List alts = new ArrayList();} + ( alternative {alts.add($alternative.altCodeBlock);} )+ + ) + { + if ( alts.size()==1 && ebnfRoot==null) return alts; + if ( ebnfRoot==null ) { + $omos = DefaultOutputModelFactory.list(controller.getChoiceBlock((BlockAST)$blk, alts, $label)); + } + else { + Choice choice = controller.getEBNFBlock($ebnfRoot, alts); + hasLookaheadBlock |= choice instanceof PlusBlock || choice instanceof StarBlock; + $omos = DefaultOutputModelFactory.list(choice); + } + } + ; + +alternative returns [CodeBlockForAlt altCodeBlock, List ops] +@init { + boolean outerMost = inContext("RULE BLOCK"); +} +@after { + controller.finishAlternative($altCodeBlock, $ops, outerMost); +} + : a=alt[outerMost] {$altCodeBlock=$a.altCodeBlock; $ops=$a.ops;} + ; + +alt[boolean outerMost] returns [CodeBlockForAlt altCodeBlock, List ops] +@init { + // set alt if outer ALT only (the only ones with alt field set to Alternative object) + AltAST altAST = (AltAST)retval.start; + if ( outerMost ) controller.setCurrentOuterMostAlt(altAST.alt); +} + : { + List elems = new ArrayList(); + // TODO: shouldn't we pass $start to controller.alternative()? + $altCodeBlock = controller.alternative(controller.getCurrentOuterMostAlt(), outerMost); + $altCodeBlock.ops = $ops = elems; + controller.setCurrentBlock($altCodeBlock); + } + ^( ALT elementOptions? ( element {if ($element.omos!=null) elems.addAll($element.omos);} )+ ) + + | ^(ALT elementOptions? 
EPSILON) + {$altCodeBlock = controller.epsilon(controller.getCurrentOuterMostAlt(), outerMost);} + ; + +element returns [List omos] + : labeledElement {$omos = $labeledElement.omos;} + | atom[null,false] {$omos = $atom.omos;} + | subrule {$omos = $subrule.omos;} + | ACTION {$omos = controller.action((ActionAST)$ACTION);} + | SEMPRED {$omos = controller.sempred((ActionAST)$SEMPRED);} + | ^(ACTION elementOptions) {$omos = controller.action((ActionAST)$ACTION);} + | ^(SEMPRED elementOptions) {$omos = controller.sempred((ActionAST)$SEMPRED);} + ; + +labeledElement returns [List omos] + : ^(ASSIGN ID atom[$ID,false] ) {$omos = $atom.omos;} + | ^(PLUS_ASSIGN ID atom[$ID,false]) {$omos = $atom.omos;} + | ^(ASSIGN ID block[$ID,null] ) {$omos = $block.omos;} + | ^(PLUS_ASSIGN ID block[$ID,null]) {$omos = $block.omos;} + ; + +subrule returns [List omos] + : ^(OPTIONAL b=block[null,$OPTIONAL]) + { + $omos = $block.omos; + } + | ( ^(op=CLOSURE b=block[null,null]) + | ^(op=POSITIVE_CLOSURE b=block[null,null]) + ) + { + List alts = new ArrayList(); + SrcOp blk = $b.omos.get(0); + CodeBlockForAlt alt = new CodeBlockForAlt(controller.delegate); + alt.addOp(blk); + alts.add(alt); + SrcOp loop = controller.getEBNFBlock($op, alts); // "star it" + hasLookaheadBlock |= loop instanceof PlusBlock || loop instanceof StarBlock; + $omos = DefaultOutputModelFactory.list(loop); + } + | block[null, null] {$omos = $block.omos;} + ; + +blockSet[GrammarAST label, boolean invert] returns [List omos] + : ^(SET atom[label,invert]+) {$omos = controller.set($SET, $label, invert);} + ; + +/* +setElement + : STRING_LITERAL + | TOKEN_REF + | ^(RANGE STRING_LITERAL STRING_LITERAL) + ; +*/ + +// TODO: combine ROOT/BANG into one then just make new op ref'ing return value of atom/terminal... 
+// TODO: same for NOT +atom[GrammarAST label, boolean invert] returns [List omos] + : ^(NOT a=atom[$label, true]) {$omos = $a.omos;} + | range[label] {$omos = $range.omos;} + | ^(DOT ID terminal[$label]) + | ^(DOT ID ruleref[$label]) + | ^(WILDCARD .) {$omos = controller.wildcard($WILDCARD, $label);} + | WILDCARD {$omos = controller.wildcard($WILDCARD, $label);} + | terminal[label] {$omos = $terminal.omos;} + | ruleref[label] {$omos = $ruleref.omos;} + | blockSet[$label, invert] {$omos = $blockSet.omos;} + ; + +ruleref[GrammarAST label] returns [List omos] + : ^(RULE_REF ARG_ACTION? elementOptions?) {$omos = controller.ruleRef($RULE_REF, $label, $ARG_ACTION);} + ; + +range[GrammarAST label] returns [List omos] + : ^(RANGE a=STRING_LITERAL b=STRING_LITERAL) + ; + +terminal[GrammarAST label] returns [List omos] + : ^(STRING_LITERAL .) {$omos = controller.stringRef($STRING_LITERAL, $label);} + | STRING_LITERAL {$omos = controller.stringRef($STRING_LITERAL, $label);} + | ^(TOKEN_REF ARG_ACTION .) {$omos = controller.tokenRef($TOKEN_REF, $label, $ARG_ACTION);} + | ^(TOKEN_REF .) {$omos = controller.tokenRef($TOKEN_REF, $label, null);} + | TOKEN_REF {$omos = controller.tokenRef($TOKEN_REF, $label, null);} + ; + +elementOptions + : ^(ELEMENT_OPTIONS elementOption+) + ; + +elementOption + : ID + | ^(ASSIGN ID ID) + | ^(ASSIGN ID STRING_LITERAL) + | ^(ASSIGN ID ACTION) + | ^(ASSIGN ID INT) + ; diff --git a/examples/antlr3/ImportGenerated/src/parse/ANTLRLexer.g b/examples/antlr3/ImportGenerated/src/parse/ANTLRLexer.g new file mode 100644 index 0000000..4f113a7 --- /dev/null +++ b/examples/antlr3/ImportGenerated/src/parse/ANTLRLexer.g @@ -0,0 +1,824 @@ +/* + * [The "BSD license"] + * Copyright (c) 2012-2016 Terence Parr + * Copyright (c) 2012-2016 Sam Harwell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// File : A3Lexer.g +// Author : Jim Idle (jimi@temporal-wave.com) +// Copyright : Free BSD - See @header clause below +// Version : First implemented as part of ANTLR 3.2 this is the self +// hosting ANTLR 3 Lexer. +// +// Description +// ----------- +// This is the definitive lexer grammar for parsing ANTLR V3.x.x grammars. All other +// gramnmars are derived from this grammar via source code control integration (perforce) +// or by the gdiff tool. 
+// +// This grammar and its associated grmmmars A3Parser.g and A3Walker.g exhibit the following +// traits, which are recommended for all production quality grammars: +// +// 1) They are separate grammars, not composite grammars; +// 2) They implement all supporting methods in a superclass (at least this is recommended +// for language targets that support inheritence; +// 3) All errors are pushed as far down the parsing chain as possible, which means +// that the lexer tries to defer error reporting to the parser, and the parser +// tries to defer error reporting to a semantic phase consisting of a single +// walk of the AST. The reason for this is that the error messages produced +// from later phases of the parse will generally have better context and so +// be more useful to the end user. Consider the message: "Syntax error at 'options'" +// vs: "You cannot specify two options{} sections in a single grammar file". +// 4) The lexer is 'programmed' to catch common mistakes such as unterminated literals +// and report them specifically and not just issue confusing lexer mismatch errors. +// + +/** Read in an ANTLR grammar and build an AST. Try not to do + * any actions, just build the tree. + * + * The phases are: + * + * A3Lexer.g (this file) + * A3Parser.g + * A3Verify.g (derived from A3Walker.g) + * assign.types.g + * define.g + * buildnfa.g + * antlr.print.g (optional) + * codegen.g + * + * Terence Parr + * University of San Francisco + * 2005 + * Jim Idle (this v3 grammar) + * Temporal Wave LLC + * 2009 + */ +lexer grammar ANTLRLexer; + +// ============================================================================== +// Note that while this grammar does not care about order of constructs +// that don't really matter, such as options before @header etc, it must first +// be parsed by the original v2 parser, before it replaces it. That parser does +// care about order of structures. 
Hence we are constrained by the v2 parser +// for at least the first bootstrap release that causes this parser to replace +// the v2 version. +// ============================================================================== + +// ------- +// Options +// +// V3 option directives to tell the tool what we are asking of it for this +// grammar. +// +options { + + // Target language is Java, which is the default but being specific + // here as this grammar is also meant as a good example grammar for + // for users. + // + language = Java; + + // The super class that this lexer should expect to inherit from, and + // which contains any and all support routines for the lexer. This is + // commented out in this baseline (definitive or normative grammar) + // - see the ANTLR tool implementation for hints on how to use the super + // class + // + //superclass = AbstractA3Lexer; +} + +tokens { SEMPRED; TOKEN_REF; RULE_REF; LEXER_CHAR_SET; ARG_ACTION; } + +// Include the copyright in this source and also the generated source +// +@lexer::header { +/* + [The "BSD licence"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +package org.antlr.v4.parse; +import org.antlr.v4.tool.*; +import org.antlr.v4.runtime.misc.Interval; +} + + +@members { + public static final int COMMENTS_CHANNEL = 2; + + public CommonTokenStream tokens; // track stream we push to; need for context info + public boolean isLexerRule = false; + + public void grammarError(ErrorType etype, org.antlr.runtime.Token token, Object... args) { } + + /** scan backwards from current point in this.tokens list + * looking for the start of the rule or subrule. + * Return token or null if for some reason we can't find the start. + */ + public Token getRuleOrSubruleStartToken() { + if ( tokens==null ) return null; + int i = tokens.index(); + int n = tokens.size(); + if ( i>=n ) i = n-1; // seems index == n as we lex + while ( i>=0 && i ' $ANTLR' SRC + | ~(NLCHARS)* + ) + + | // Multi-line comment, which may be a documentation comment + // if it starts /** (note that we protect against accidentaly + // recognizing a comment /**/ as a documentation comment + // + '*' ( + { input.LA(2) != '/'}?=> '*' { $type = DOC_COMMENT; } + | { true }?=> // Required to cover all alts with predicates + ) + + // Should we support embedded multiline comments here? 
+ // + ( + // Pick out end of multiline comment and exit the loop + // if we find it. + // + { !(input.LA(1) == '*' && input.LA(2) == '/') }? + + // Anything else other than the non-greedy match of + // the comment close sequence + // + . + )* + ( + // Look for the comment terminator, but if it is accidentally + // unterminated, then we will hit EOF, which will trigger the + // epsilon alt and hence we can issue an error message relative + // to the start of the unterminated multi-line comment + // + '*/' + + | // Unterminated comment! + // + { + // ErrorManager.msg(Msg.UNTERMINATED_DOC_COMMENT, startLine, offset, $pos, startLine, offset, $pos, (Object)null); + } + ) + + | // There was nothing that made sense following the opening '/' and so + // we issue an error regarding the malformed comment + // + { + // TODO: Insert error message relative to comment start + // + } + ) + { + // We do not wish to pass the comments in to the parser. If you are + // writing a formatter then you will want to preserve the comments off + // channel, but could just skip and save token space if not. + // + $channel=COMMENTS_CHANNEL; + } + ; + +ARG_OR_CHARSET +options {k=1;} + : {isLexerRule}?=> LEXER_CHAR_SET {$type=LEXER_CHAR_SET;} + | {!isLexerRule}?=> ARG_ACTION + { + $type=ARG_ACTION; + // Set the token text to our gathered string minus outer [ ] + String t = $text; + t = t.substring(1,t.length()-1); + setText(t); + } + ; + +fragment +LEXER_CHAR_SET + : '[' + ( '\\' ~('\r'|'\n') + | ~('\r'|'\n'|'\\'|']') + )* + ']' + ; + +// -------------- +// Argument specs +// +// Certain argument lists, such as those specifying call parameters +// to a rule invocation, or input parameters to a rule specification +// are contained within square brackets. In the lexer we consume them +// all at once and sort them out later in the grammar analysis. 
+// +fragment +ARG_ACTION + : '[' + ( + ARG_ACTION + + | ('"')=>ACTION_STRING_LITERAL + + | ('\'')=>ACTION_CHAR_LITERAL + + | ~('['|']') + )* + + ']' + ; + +// ------- +// Actions +// +// Other than making sure to distinguish between { and } embedded +// within what we have assumed to be literals in the action code, the +// job of the lexer is merely to gather the code within the action +// (delimited by {}) and pass it to the parser as a single token. +// We know that this token will be asked for its text somewhere +// in the upcoming parse, so setting the text here to exclude +// the delimiting {} is no additional overhead. +// +ACTION + : NESTED_ACTION + ( '?' {$type = SEMPRED;} + ( (WSNLCHARS* '=>') => WSNLCHARS* '=>' // v3 gated sempred + { + Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); + t.setLine(state.tokenStartLine); + t.setText(state.text); + t.setCharPositionInLine(state.tokenStartCharPositionInLine); + grammarError(ErrorType.V3_GATED_SEMPRED, t); + } + )? + )? + ; + +// ---------------- +// Action structure +// +// Many language targets use {} as block delimiters and so we +// must recursively match {} delimited blocks to balance the +// braces. Additionally, we must make some assumptions about +// literal string representation in the target language. We assume +// that they are delimited by ' or " and so consume these +// in their own alts so as not to inadvertantly match {}. +// This rule calls itself on matching a { +// +fragment +NESTED_ACTION +@init { + + // Record the start line and offsets as if we need to report an + // unterminated block, then we want to show the start of the comment + // we think is broken, not the end, where people will have to try and work + // it out themselves. 
+ // + int startLine = getLine(); + int offset = getCharPositionInLine(); +} + + : // Action and other blocks start with opening { + // + '{' + ( + // And now we can match one of a number of embedded + // elements within the action until we find a + // } that balances the opening {. If we do not find + // the balanced } then we will hit EOF and can issue + // an error message about the brace that we belive to + // be mismatched. This won't be foolproof but we will + // be able to at least report an error against the + // opening brace that we feel is in error and this will + // guide the user to the correction as best we can. + // + + + // An embedded {} block + // + NESTED_ACTION + + | // What appears to be a literal + // + ACTION_CHAR_LITERAL + + | // We have assumed that the target language has C/Java + // type comments. + // + COMMENT + + | // What appears to be a literal + // + ACTION_STRING_LITERAL + + | // What appears to be an escape sequence + // + ACTION_ESC + + | // Some other single character that is not + // handled above + // + ~('\\'|'"'|'\''|'/'|'{'|'}') + + )* + + ( + // Correctly balanced closing brace + // + '}' + + | // Looks like have an imblanced {} block, report + // with respect to the opening brace. + // + { + // TODO: Report imbalanced {} + System.out.println("Block starting at line " + startLine + " offset " + (offset+1) + " contains imbalanced {} or is missing a }"); + } + ) + ; + + +// Keywords +// -------- +// keywords used to specify ANTLR v3 grammars. Keywords may not be used as +// labels for rules or in any other context where they would be ambiguous +// with the keyword vs some other identifier +// OPTIONS, TOKENS, and CHANNELS must also consume the opening brace that captures +// their option block, as this is the easiest way to parse it separate +// to an ACTION block, despite it using the same {} delimiters. 
+// +OPTIONS : 'options' WSNLCHARS* '{' ; +TOKENS_SPEC : 'tokens' WSNLCHARS* '{' ; +CHANNELS : 'channels' WSNLCHARS* '{' ; + +IMPORT : 'import' ; +FRAGMENT : 'fragment' ; +LEXER : 'lexer' ; +PARSER : 'parser' ; +GRAMMAR : 'grammar' ; +TREE_GRAMMAR : 'tree' WSNLCHARS* 'grammar' ; +PROTECTED : 'protected' ; +PUBLIC : 'public' ; +PRIVATE : 'private' ; +RETURNS : 'returns' ; +LOCALS : 'locals' ; +THROWS : 'throws' ; +CATCH : 'catch' ; +FINALLY : 'finally' ; +MODE : 'mode' ; + +// ----------- +// Punctuation +// +// Character sequences used as separators, delimters, operators, etc +// +COLON : ':' + { + // scan backwards, looking for a RULE_REF or TOKEN_REF. + // which would indicate the start of a rule definition. + // If we see a LPAREN, then it's the start of the subrule. + // this.tokens is the token string we are pushing into, so + // just loop backwards looking for a rule definition. Then + // we set isLexerRule. + Token t = getRuleOrSubruleStartToken(); + if ( t!=null ) { + if ( t.getType()==RULE_REF ) isLexerRule = false; + else if ( t.getType()==TOKEN_REF ) isLexerRule = true; + // else must be subrule; don't alter context + } + } + ; +COLONCOLON : '::' ; +COMMA : ',' ; +SEMI : ';' ; +LPAREN : '(' ; +RPAREN : ')' ; +RARROW : '->' ; +LT : '<' ; +GT : '>' ; +ASSIGN : '=' ; +QUESTION : '?' ; +SYNPRED : '=>' + { + Token t = new CommonToken(input, state.type, state.channel, + state.tokenStartCharIndex, getCharIndex()-1); + t.setLine(state.tokenStartLine); + t.setText(state.text); + t.setCharPositionInLine(state.tokenStartCharPositionInLine); + grammarError(ErrorType.V3_SYNPRED, t); + $channel=HIDDEN; + } + ; +STAR : '*' ; +PLUS : '+' ; +PLUS_ASSIGN : '+=' ; +OR : '|' ; +DOLLAR : '$' ; +DOT : '.' ; // can be WILDCARD or DOT in qid or imported rule ref +RANGE : '..' 
; +AT : '@' ; +POUND : '#' ; +NOT : '~' ; +RBRACE : '}' ; + +/** Allow unicode rule/token names */ +ID : a=NameStartChar NameChar* + { + if ( Grammar.isTokenName($a.text) ) $type = TOKEN_REF; + else $type = RULE_REF; + } + ; + +fragment +NameChar : NameStartChar + | '0'..'9' + | '_' + | '\u00B7' + | '\u0300'..'\u036F' + | '\u203F'..'\u2040' + ; + +fragment +NameStartChar + : 'A'..'Z' | 'a'..'z' + | '\u00C0'..'\u00D6' + | '\u00D8'..'\u00F6' + | '\u00F8'..'\u02FF' + | '\u0370'..'\u037D' + | '\u037F'..'\u1FFF' + | '\u200C'..'\u200D' + | '\u2070'..'\u218F' + | '\u2C00'..'\u2FEF' + | '\u3001'..'\uD7FF' + | '\uF900'..'\uFDCF' + | '\uFDF0'..'\uFEFE' + | '\uFF00'..'\uFFFD' + ; // ignores | ['\u10000-'\uEFFFF] ; + +// ---------------------------- +// Literals embedded in actions +// +// Note that we have made the assumption that the language used within +// actions uses the fairly standard " and ' delimiters for literals and +// that within these literals, characters are escaped using the \ character. +// There are some languages which do not conform to this in all cases, such +// as by using /string/ and so on. We will have to deal with such cases if +// if they come up in targets. +// + +// Within actions, or other structures that are not part of the ANTLR +// syntax, we may encounter literal characters. Within these, we do +// not want to inadvertantly match things like '}' and so we eat them +// specifically. While this rule is called CHAR it allows for the fact that +// some languages may use/allow ' as the string delimiter. +// +fragment +ACTION_CHAR_LITERAL + : '\'' (('\\')=>ACTION_ESC | ~'\'' )* '\'' + ; + +// Within actions, or other structures that are not part of the ANTLR +// syntax, we may encounter literal strings. Within these, we do +// not want to inadvertantly match things like '}' and so we eat them +// specifically. 
+// +fragment +ACTION_STRING_LITERAL + : '"' (('\\')=>ACTION_ESC | ~'"')* '"' + ; + +// Within literal strings and characters that are not part of the ANTLR +// syntax, we must allow for escaped character sequences so that we do not +// inadvertantly recognize the end of a string or character when the terminating +// delimiter has been esacped. +// +fragment +ACTION_ESC + : '\\' . + ; + +// ------- +// Integer +// +// Obviously (I hope) match an aribtrary long sequence of digits. +// +INT : ('0'..'9')+ + ; + +// ----------- +// Source spec +// +// A fragment rule for picking up information about an origrinating +// file from which the grammar we are parsing has been generated. This allows +// ANTLR to report errors against the originating file and not the generated +// file. +// +fragment +SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT + { + // TODO: Add target specific code to change the source file name and current line number + // + } + ; + +// -------------- +// Literal string +// +// ANTLR makes no disticintion between a single character literal and a +// multi-character string. All literals are single quote delimited and +// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx}, +// where x is a valid hexadecimal number. +STRING_LITERAL + : '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )* + ( '\'' + | // Unterminated string literal + { + Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); + t.setLine(state.tokenStartLine); + t.setText(state.text); + t.setCharPositionInLine(state.tokenStartCharPositionInLine); + grammarError(ErrorType.UNTERMINATED_STRING_LITERAL, t); + } + ) + ; + +// A valid hex digit specification +// +fragment +HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ; + +// Any kind of escaped character that we can embed within ANTLR +// literal strings. +// +fragment +ESC_SEQ + : '\\' + ( + // The standard escaped character set such as tab, newline, etc... 
+ 'b'|'t'|'n'|'f'|'r'|'\''|'\\' + + | // A Java style Unicode escape sequence + UNICODE_ESC + + | // A Swift/Hack style Unicode escape sequence + UNICODE_EXTENDED_ESC + + | // An illegal escape seqeunce + ~('b'|'t'|'n'|'f'|'r'|'\''|'\\'|'u') // \x for any invalid x (make sure to match char here) + { + Token t = new CommonToken(input, state.type, state.channel, getCharIndex()-2, getCharIndex()-1); + t.setText(t.getText()); + t.setLine(input.getLine()); + t.setCharPositionInLine(input.getCharPositionInLine()-2); + grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(getCharIndex()-2,getCharIndex()-1)); + } + ) + ; + +fragment +UNICODE_ESC +@init { + + // Flag to tell us whether we have a valid number of + // hex digits in the escape sequence + // + int hCount = 0; +} + : 'u' // Leadin for unicode escape sequence + + // We now require 4 hex digits. Note though + // that we accept any number of characters + // and issue an error if we do not get 4. We cannot + // use an inifinite count such as + because this + // might consume too many, so we lay out the lexical + // options and issue an error at the invalid paths. 
+ // + ( + ( + HEX_DIGIT { hCount++; } + ( + HEX_DIGIT { hCount++; } + ( + HEX_DIGIT { hCount++; } + ( + // Four valid hex digits, we are good + // + HEX_DIGIT { hCount++; } + + | // Three valid digits + ) + + | // Two valid digits + ) + + | // One valid digit + ) + ) + | // No valid hex digits at all + ) + + // Now check the digit count and issue an error if we need to + // + { + if (hCount < 4) { + Interval badRange = Interval.of(getCharIndex()-2-hCount, getCharIndex()); + String lastChar = input.substring(badRange.b, badRange.b); + if ( lastChar.codePointAt(0)=='\'' ) { + badRange.b--; + } + String bad = input.substring(badRange.a, badRange.b); + Token t = new CommonToken(input, state.type, state.channel, badRange.a, badRange.b); + t.setLine(input.getLine()); + t.setCharPositionInLine(input.getCharPositionInLine()-hCount-2); + grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, bad); + } + } + ; + +fragment +UNICODE_EXTENDED_ESC + : 'u{' // Leadin for unicode extended escape sequence + + HEX_DIGIT+ // One or more hexadecimal digits + + '}' // Leadout for unicode extended escape sequence + + // Now check the digit count and issue an error if we need to + { + int numDigits = getCharIndex()-state.tokenStartCharIndex-6; + if (numDigits > 6) { + Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); + t.setText(t.getText()); + t.setLine(input.getLine()); + t.setCharPositionInLine(input.getCharPositionInLine()-numDigits); + grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(state.tokenStartCharIndex,getCharIndex()-1)); + } + } + ; + +// ---------- +// Whitespace +// +// Characters and character constructs that are of no import +// to the parser and are used to make the grammar easier to read +// for humans. +// +WS + : ( + ' ' + | '\t' + | '\r' + | '\n' + | '\f' + )+ + {$channel=HIDDEN;} + ; + +// A fragment rule for use in recognizing end of line in +// rules like COMMENT. 
+// +fragment +NLCHARS + : '\n' | '\r' + ; + +// A fragment rule for recognizing traditional whitespace +// characters within lexer rules. +// +fragment +WSCHARS + : ' ' | '\t' | '\f' + ; + +// A fragment rule for recognizing both traditional whitespace and +// end of line markers, when we don't care to distinguish but don't +// want any action code going on. +// +fragment +WSNLCHARS + : ' ' | '\t' | '\f' | '\n' | '\r' + ; + +// This rule allows ANTLR 4 to parse grammars using the UTF-8 encoding with a +// byte order mark. Since this Unicode character doesn't appear as a token +// anywhere else in the grammar, we can simply skip all instances of it without +// problem. This rule will not break usage of \uFEFF inside a LEXER_CHAR_SET or +// STRING_LITERAL. +UnicodeBOM + : '\uFEFF' {skip();} + ; + +// ----------------- +// Illegal Character +// +// This is an illegal character trap which is always the last rule in the +// lexer specification. It matches a single character of any value and being +// the last rule in the file will match when no other rule knows what to do +// about the character. It is reported as an error but is not passed on to the +// parser. This means that the parser to deal with the gramamr file anyway +// but we will not try to analyse or code generate from a file with lexical +// errors. +// +ERRCHAR + : . 
+ { + Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); + t.setLine(state.tokenStartLine); + t.setText(state.text); + t.setCharPositionInLine(state.tokenStartCharPositionInLine); + String msg = getTokenErrorDisplay(t) + " came as a complete surprise to me"; + grammarError(ErrorType.SYNTAX_ERROR, t, msg); + state.syntaxErrors++; + skip(); + } + ; diff --git a/examples/antlr3/ImportGenerated/src/parse/ANTLRParser.g b/examples/antlr3/ImportGenerated/src/parse/ANTLRParser.g new file mode 100644 index 0000000..3c10460 --- /dev/null +++ b/examples/antlr3/ImportGenerated/src/parse/ANTLRParser.g @@ -0,0 +1,922 @@ +/* + * [The "BSD license"] + * Copyright (c) 2012-2016 Terence Parr + * Copyright (c) 2012-2016 Sam Harwell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** The definitive ANTLR v3 grammar to parse ANTLR v4 grammars. + * The grammar builds ASTs that are sniffed by subsequent stages. + */ +parser grammar ANTLRParser; + +options { + // Target language is Java, which is the default but being specific + // here as this grammar is also meant as a good example grammar for + // for users. + language = Java; + + // The output of this grammar is going to be an AST upon which + // we run a semantic checking phase, then the rest of the analysis + // including final code generation. + output = AST; + + // The vocabulary (tokens and their int token types) we are using + // for the parser. This is generated by the lexer. The vocab will be extended + // to include the imaginary tokens below. + tokenVocab = ANTLRLexer; + + ASTLabelType = GrammarAST; +} + +// Imaginary Tokens +// +// Imaginary tokens do not exist as far as the lexer is concerned, and it cannot +// generate them. However we sometimes need additional 'tokens' to use as root +// nodes for the AST we are generating. 
The tokens section is where we +// specify any such tokens +tokens { + RULE; + PREC_RULE; // flip to this if we find that it's left-recursive + RULES; + RULEMODIFIERS; + RULEACTIONS; + BLOCK; + OPTIONAL; + CLOSURE; + POSITIVE_CLOSURE; + RANGE; + SET; + CHAR_RANGE; + EPSILON; + ALT; + ALTLIST; + ID; + ARG; + ARGLIST; + RET; + COMBINED; + INITACTION; + LABEL; // $x used in rewrite rules + TEMPLATE; + WILDCARD; + // A generic node indicating a list of something when we don't + // really need to distinguish what we have a list of as the AST + // will 'kinow' by context. + // + LIST; + ELEMENT_OPTIONS; // TOKEN + RESULT; + + // lexer action stuff + LEXER_ALT_ACTION; + LEXER_ACTION_CALL; // ID(foo) +} + +// Include the copyright in this source and also the generated source +// +@header { +/* + [The "BSD licence"] + Copyright (c) 2005-20012 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +package org.antlr.v4.parse; + +import org.antlr.v4.tool.*; +import org.antlr.v4.tool.ast.*; + +import java.util.ArrayDeque; +import java.util.Deque; +} + +@members { +Deque paraphrases = new ArrayDeque(); +public void grammarError(ErrorType etype, org.antlr.runtime.Token token, Object... args) { } +} + +// The main entry point for parsing a V3 grammar from top to toe. This is +// the method call from whence to obtain the AST for the parse. +// +grammarSpec +@after { +GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.OPTIONS); +if ( options!=null ) { + Grammar.setNodeOptions($tree, options); +} +} + : // First we should see the type and name of the grammar file that + // we are about to parse. + // + grammarType id SEMI + + // There now follows zero or more declaration sections that should + // be given to us before the rules are declared + // +// A number of things can be declared/stated before the grammar rules +// 'proper' are parsed. These include grammar imports (delegate), grammar +// options, imaginary token declarations, global scope declarations, +// and actions such as @header. In this rule we allow any number of +// these constructs in any order so that the grammar author is not +// constrained by some arbitrary order of declarations that nobody +// can remember. In the next phase of the parse, we verify that these +// constructs are valid, not repeated and so on. 
+ sync ( prequelConstruct sync )* + + // We should now see at least one ANTLR EBNF style rule + // declaration. If the rules are missing we will let the + // semantic verification phase tell the user about it. + // + rules + + modeSpec* + + // And we force ANTLR to process everything it finds in the input + // stream by specifying hte need to match End Of File before the + // parse is complete. + // + EOF + + // Having parsed everything in the file and accumulated the relevant + // subtrees, we can now rewrite everything into the main AST form + // that our tree walkers are expecting. + // + + -> ^(grammarType // The grammar type is our root AST node + id // We need to identify the grammar of course + prequelConstruct* // The set of declarations we accumulated + rules // And of course, we need the set of rules we discovered + modeSpec* + ) + ; + +grammarType +@after { + if ( $tg!=null ) throw new v3TreeGrammarException(tg); + if ( $t!=null ) ((GrammarRootAST)$tree).grammarType = $t.type; + else ((GrammarRootAST)$tree).grammarType=COMBINED; +} + : ( t=LEXER g=GRAMMAR -> GRAMMAR[$g, "LEXER_GRAMMAR", getTokenStream()] + | // A standalone parser specification + t=PARSER g=GRAMMAR -> GRAMMAR[$g, "PARSER_GRAMMAR", getTokenStream()] + + // A combined lexer and parser specification + | g=GRAMMAR -> GRAMMAR[$g, "COMBINED_GRAMMAR", getTokenStream()] + | tg=TREE_GRAMMAR + + ) + ; + +// This is the list of all constructs that can be declared before +// the set of rules that compose the grammar, and is invoked 0..n +// times by the grammarPrequel rule. 
+prequelConstruct + : // A list of options that affect analysis and/or code generation + optionsSpec + + | // A list of grammars to which this grammar will delegate certain + // parts of the parsing sequence - a set of imported grammars + delegateGrammars + + | // The declaration of any token types we need that are not already + // specified by a preceeding grammar, such as when a parser declares + // imaginary tokens with which to construct the AST, or a rewriting + // tree parser adds further imaginary tokens to ones defined in a prior + // {tree} parser. + tokensSpec + + | // A list of custom channels used by the grammar + channelsSpec + + | // A declaration of language target implemented constructs. All such + // action sections start with '@' and are given to the language target's + // StringTemplate group. For instance @parser::header and @lexer::header + // are gathered here. + action + ; + +// A list of options that affect analysis and/or code generation +optionsSpec + : OPTIONS (option SEMI)* RBRACE -> ^(OPTIONS[$OPTIONS, "OPTIONS"] option*) + ; + +option + : id ASSIGN^ optionValue + ; + +// ------------ +// Option Value +// +// The actual value of an option - Doh! +// +optionValue + : // If the option value is a single word that conforms to the + // lexical rules of token or rule names, then the user may skip quotes + // and so on. 
Many option values meet this description + qid + | STRING_LITERAL + | ACTION + | INT + ; + +// A list of grammars to which this grammar will delegate certain +// parts of the parsing sequence - a set of imported grammars +delegateGrammars + : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI -> ^(IMPORT delegateGrammar+) + ; + +// A possibly named grammar file that should be imported to this gramamr +// and delgated to for the rules it specifies +delegateGrammar + : id ASSIGN^ id + | id + ; + +tokensSpec + : TOKENS_SPEC id (COMMA id)* RBRACE -> ^(TOKENS_SPEC id+) + | TOKENS_SPEC RBRACE -> + | TOKENS_SPEC^ v3tokenSpec+ RBRACE! + {grammarError(ErrorType.V3_TOKENS_SYNTAX, $TOKENS_SPEC);} + ; + +v3tokenSpec + : id + ( ASSIGN lit=STRING_LITERAL + { + grammarError(ErrorType.V3_ASSIGN_IN_TOKENS, $id.start, + $id.text, $lit.getText()); + } + -> id // ignore assignment + | -> id + ) + SEMI + ; + +channelsSpec + : CHANNELS^ id (COMMA! id)* RBRACE! + ; + +// A declaration of a language target specifc section, +// such as @header, @includes and so on. We do not verify these +// sections, they are just passed on to the language target. +/** Match stuff like @parser::members {int i;} */ +action + : AT (actionScopeName COLONCOLON)? id ACTION -> ^(AT actionScopeName? id ACTION) + ; + +/** Sometimes the scope names will collide with keywords; allow them as + * ids for action scopes. + */ +actionScopeName + : id + | LEXER -> ID[$LEXER] + | PARSER -> ID[$PARSER] + ; + +modeSpec + : MODE id SEMI sync (lexerRule sync)* -> ^(MODE id lexerRule*) + ; + +rules + : sync (rule sync)* + // Rewrite with an enclosing node as this is good for counting + // the number of rules and an easy marker for the walker to detect + // that there are no rules. 
+ ->^(RULES rule*) + ; + +sync +@init { + BitSet followSet = computeErrorRecoverySet(); + if ( input.LA(1)!=Token.EOF && !followSet.member(input.LA(1)) ) { + reportError(new NoViableAltException("",0,0,input)); + beginResync(); + consumeUntil(input, followSet); + endResync(); + } +} : + ; + +rule: parserRule + | lexerRule + ; + +// The specification of an EBNF rule in ANTLR style, with all the +// rule level parameters, declarations, actions, rewrite specs and so +// on. +// +// Note that here we allow any number of rule declaration sections (such +// as scope, returns, etc) in any order and we let the upcoming semantic +// verification of the AST determine if things are repeated or if a +// particular functional element is not valid in the context of the +// grammar type, such as using returns in lexer rules and so on. +parserRule +@init { paraphrases.push("matching a rule"); } +@after { + paraphrases.pop(); + GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.OPTIONS); + if ( options!=null ) { + Grammar.setNodeOptions($tree, options); + } +} + : // Start with the rule name. Here we do not distinguish between + // parser or lexer rules, the semantic verification phase will + // reject any rules that make no sense, such as lexer rules in + // a pure parser or tree parser. + RULE_REF + + // Immediately following the rulename, there may be a specification + // of input parameters for the rule. We do not do anything with the + // parameters here except gather them for future phases such as + // semantic verifcation, type assignment etc. We require that + // the input parameters are the next syntactically significant element + // following the rule id. + ARG_ACTION? + + ruleReturns? + + throwsSpec? + + localsSpec? + + // Now, before the rule specification itself, which is introduced + // with a COLON, we may have zero or more configuration sections. 
+ // As usual we just accept anything that is syntactically valid for + // one form of the rule or another and let the semantic verification + // phase throw out anything that is invalid. +// At the rule level, a programmer may specify a number of sections, such +// as scope declarations, rule return elements, @ sections (which may be +// language target specific) and so on. We allow any number of these in any +// order here and as usual rely onthe semantic verification phase to reject +// anything invalid using its addinotal context information. Here we are +// context free and just accept anything that is a syntactically correct +// construct. +// + rulePrequels + + COLON + + // The rule is, at the top level, just a list of alts, with + // finer grained structure defined within the alts. + ruleBlock + + SEMI + + exceptionGroup + + -> ^( RULE RULE_REF ARG_ACTION? + ruleReturns? throwsSpec? localsSpec? rulePrequels? ruleBlock exceptionGroup* + ) + ; + +// Many language targets support exceptions and the rule will +// generally be able to throw the language target equivalent +// of a recognition exception. The grammar programmar can +// specify a list of exceptions to catch or a generic catch all +// and the target language code generation template is +// responsible for generating code that makes sense. +exceptionGroup + : exceptionHandler* finallyClause? + ; + +// Specifies a handler for a particular type of exception +// thrown by a rule +exceptionHandler + : CATCH ARG_ACTION ACTION -> ^(CATCH ARG_ACTION ACTION) + ; + +finallyClause + : FINALLY ACTION -> ^(FINALLY ACTION) + ; + +rulePrequels +@init { paraphrases.push("matching rule preamble"); } +@after { paraphrases.pop(); } + : sync (rulePrequel sync)* -> rulePrequel* + ; + +// An individual rule level configuration as referenced by the ruleActions +// rule above. +// +rulePrequel + : optionsSpec + | ruleAction + ; + +// A rule can return elements that it constructs as it executes. 
+// The return values are specified in a 'returns' prequel element, +// which contains COMMA separated declarations, where the declaration +// is target language specific. Here we see the returns declaration +// as a single lexical action element, to be processed later. +// +ruleReturns + : RETURNS^ ARG_ACTION + ; + +// -------------- +// Exception spec +// +// Some target languages, such as Java and C# support exceptions +// and they are specified as a prequel element for each rule that +// wishes to throw its own exception type. Note that the name of the +// exception is just a single word, so the header section of the grammar +// must specify the correct import statements (or language equivalent). +// Target languages that do not support exceptions just safely ignore +// them. +// +throwsSpec + : THROWS qid (COMMA qid)* -> ^(THROWS qid+) + ; + +// locals [Cat x, float g] +localsSpec : LOCALS^ ARG_ACTION ; + +// @ Sections are generally target language specific things +// such as local variable declarations, code to run before the +// rule starts and so on. Fir instance most targets support the +// @init {} section where declarations and code can be placed +// to run before the rule is entered. The C target also has +// an @declarations {} section, where local variables are declared +// in order that the generated code is C89 copmliant. +// +/** Match stuff like @init {int i;} */ +ruleAction + : AT id ACTION -> ^(AT id ACTION) + ; + +// A set of alts, rewritten as a BLOCK for generic processing +// in tree walkers. Used by the rule 'rule' so that the list of +// alts for a rule appears as a BLOCK containing the alts and +// can be processed by the generic BLOCK rule. Note that we +// use a separate rule so that the BLOCK node has start and stop +// boundaries set correctly by rule post processing of rewrites. 
+ruleBlock +@init {Token colon = input.LT(-1);} + : ruleAltList -> ^(BLOCK[colon,"BLOCK"] ruleAltList) + ; + catch [ResyncToEndOfRuleBlock e] { + // just resyncing; ignore error + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), null); + } + +ruleAltList + : labeledAlt (OR labeledAlt)* -> labeledAlt+ + ; + +labeledAlt + : alternative + ( POUND! id! {((AltAST)$alternative.tree).altLabel=$id.tree;} + )? + ; + +lexerRule +@init { paraphrases.push("matching a lexer rule"); } +@after { + paraphrases.pop(); +} + : FRAGMENT? + TOKEN_REF COLON lexerRuleBlock SEMI + -> ^( RULE TOKEN_REF + ^(RULEMODIFIERS FRAGMENT)? lexerRuleBlock + ) + ; + +lexerRuleBlock +@init {Token colon = input.LT(-1);} + : lexerAltList -> ^(BLOCK[colon,"BLOCK"] lexerAltList) + ; + catch [ResyncToEndOfRuleBlock e] { + // just resyncing; ignore error + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), null); + } + +lexerAltList + : lexerAlt (OR lexerAlt)* -> lexerAlt+ + ; + +lexerAlt + : lexerElements + ( lexerCommands -> ^(LEXER_ALT_ACTION lexerElements lexerCommands) + | -> lexerElements + ) + ; + +lexerElements + : lexerElement+ -> ^(ALT lexerElement+) + | -> ^(ALT EPSILON) // empty alt + ; + +lexerElement +@init { + paraphrases.push("looking for lexer rule element"); + int m = input.mark(); +} +@after { paraphrases.pop(); } + : labeledLexerElement + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$labeledLexerElement.start,"BLOCK"] ^(ALT labeledLexerElement) ) ) + | -> labeledLexerElement + ) + | lexerAtom + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$lexerAtom.start,"BLOCK"] ^(ALT lexerAtom) ) ) + | -> lexerAtom + ) + | lexerBlock + ( ebnfSuffix -> ^(ebnfSuffix lexerBlock) + | -> lexerBlock + ) + | actionElement // actions only allowed at end of outer alt actually, + // but preds can be anywhere + ; + catch [RecognitionException re] { + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), re); + int ttype = 
input.get(input.range()).getType(); // seems to be next token + // look for anything that really belongs at the start of the rule minus the initial ID + if ( ttype==COLON || ttype==RETURNS || ttype==CATCH || ttype==FINALLY || ttype==AT || ttype==EOF ) { + RecognitionException missingSemi = + new v4ParserException("unterminated rule (missing ';') detected at '"+ + input.LT(1).getText()+" "+input.LT(2).getText()+"'", input); + reportError(missingSemi); + if ( ttype==EOF ) { + input.seek(input.index()+1); + } + else if ( ttype==CATCH || ttype==FINALLY ) { + input.seek(input.range()); // ignore what's before rule trailer stuff + } + else if ( ttype==RETURNS || ttype==AT ) { // scan back looking for ID of rule header + int p = input.index(); + Token t = input.get(p); + while ( t.getType()!=RULE_REF && t.getType()!=TOKEN_REF ) { + p--; + t = input.get(p); + } + input.seek(p); + } + throw new ResyncToEndOfRuleBlock(); // make sure it goes back to rule block level to recover + } + reportError(re); + recover(input,re); + } + +labeledLexerElement + : id (ass=ASSIGN|ass=PLUS_ASSIGN) + ( lexerAtom -> ^($ass id lexerAtom) + | lexerBlock -> ^($ass id lexerBlock) + ) + ; + + +lexerBlock +@after { +GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.OPTIONS); +if ( options!=null ) { + Grammar.setNodeOptions($tree, options); +} +} + : LPAREN + ( optionsSpec COLON )? + lexerAltList + RPAREN + -> ^(BLOCK[$LPAREN,"BLOCK"] optionsSpec? 
lexerAltList ) + ; + +// channel=HIDDEN, skip, more, mode(INSIDE), push(INSIDE), pop +lexerCommands + : RARROW lexerCommand (COMMA lexerCommand)* -> lexerCommand+ + ; + +lexerCommand + : lexerCommandName LPAREN lexerCommandExpr RPAREN -> ^(LEXER_ACTION_CALL lexerCommandName lexerCommandExpr) + | lexerCommandName + ; + +lexerCommandExpr + : id + | INT + ; + +lexerCommandName + : id + | MODE ->ID[$MODE] + ; + +altList + : alternative (OR alternative)* -> alternative+ + ; + +// An individual alt with an optional alt option like +alternative +@init { paraphrases.push("matching alternative"); } +@after { + paraphrases.pop(); + Grammar.setNodeOptions($tree, $o.tree); +} + : o=elementOptions? + ( e+=element+ -> ^(ALT elementOptions? $e+) + | -> ^(ALT elementOptions? EPSILON) // empty alt + ) + ; + +element +@init { + paraphrases.push("looking for rule element"); + int m = input.mark(); +} +@after { paraphrases.pop(); } + : labeledElement + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$labeledElement.start,"BLOCK"] ^(ALT labeledElement ) )) + | -> labeledElement + ) + | atom + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$atom.start,"BLOCK"] ^(ALT atom) ) ) + | -> atom + ) + | ebnf + | actionElement + ; + catch [RecognitionException re] { + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), re); + int ttype = input.get(input.range()).getType(); + // look for anything that really belongs at the start of the rule minus the initial ID + if ( ttype==COLON || ttype==RETURNS || ttype==CATCH || ttype==FINALLY || ttype==AT ) { + RecognitionException missingSemi = + new v4ParserException("unterminated rule (missing ';') detected at '"+ + input.LT(1).getText()+" "+input.LT(2).getText()+"'", input); + reportError(missingSemi); + if ( ttype==CATCH || ttype==FINALLY ) { + input.seek(input.range()); // ignore what's before rule trailer stuff + } + if ( ttype==RETURNS || ttype==AT ) { // scan back looking for ID of rule header + int p = input.index(); + Token t = 
input.get(p); + while ( t.getType()!=RULE_REF && t.getType()!=TOKEN_REF ) { + p--; + t = input.get(p); + } + input.seek(p); + } + throw new ResyncToEndOfRuleBlock(); // make sure it goes back to rule block level to recover + } + reportError(re); + recover(input,re); + } + +actionElement +@after { + GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.ELEMENT_OPTIONS); + if ( options!=null ) { + Grammar.setNodeOptions($tree, options); + } +} + : ACTION + | ACTION elementOptions -> ^(ACTION elementOptions) + | SEMPRED + | SEMPRED elementOptions -> ^(SEMPRED elementOptions) + ; + +labeledElement + : id (ass=ASSIGN|ass=PLUS_ASSIGN) + ( atom -> ^($ass id atom) + | block -> ^($ass id block) + ) + ; + +// A block of gramamr structure optionally followed by standard EBNF +// notation, or ANTLR specific notation. I.E. ? + ^ and so on +ebnf + : block + // And now we see if we have any of the optional suffixs and rewrite + // the AST for this rule accordingly + ( blockSuffix -> ^(blockSuffix block) + | -> block + ) + ; + +// The standard EBNF suffixes with additional components that make +// sense only to ANTLR, in the context of a grammar block. +blockSuffix + : ebnfSuffix // Standard EBNF + ; + +ebnfSuffix + : QUESTION nongreedy=QUESTION? -> OPTIONAL[$start, $nongreedy] + | STAR nongreedy=QUESTION? -> CLOSURE[$start, $nongreedy] + | PLUS nongreedy=QUESTION? -> POSITIVE_CLOSURE[$start, $nongreedy] + ; + +lexerAtom + : range + | terminal + | RULE_REF + | notSet + | wildcard + | LEXER_CHAR_SET + ; + +atom + : // Qualified reference delegate.rule. This must be + // lexically contiguous (no spaces either side of the DOT) + // otherwise it is two references with a wildcard in between + // and not a qualified reference. + /* + { + input.LT(1).getCharPositionInLine()+input.LT(1).getText().length()== + input.LT(2).getCharPositionInLine() && + input.LT(2).getCharPositionInLine()+1==input.LT(3).getCharPositionInLine() + }? 
+ id DOT ruleref -> ^(DOT id ruleref) + + | + */ + range // Range x..y - only valid in lexers + | terminal + | ruleref + | notSet + | wildcard + ; + catch [RecognitionException re] { throw re; } // pass upwards to element + +wildcard +@after { + GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.ELEMENT_OPTIONS); + if ( options!=null ) { + Grammar.setNodeOptions($tree, options); + } +} + : // Wildcard '.' means any character in a lexer, any + // token in parser and any node or subtree in a tree parser + // Because the terminal rule is allowed to be the node + // specification for the start of a tree rule, we must + // later check that wildcard was not used for that. + DOT elementOptions? + -> ^(WILDCARD[$DOT] elementOptions?) + ; + +// -------------------- +// Inverted element set +// +// A set of characters (in a lexer) or terminal tokens, if a parser, +// that are then used to create the inverse set of them. +notSet + : NOT setElement -> ^(NOT[$NOT] ^(SET[$setElement.start,"SET"] setElement)) + | NOT blockSet -> ^(NOT[$NOT] blockSet) + ; + +blockSet +@init { + Token t; + boolean ebnf = false; +} + : LPAREN setElement (OR setElement)* RPAREN + -> ^(SET[$LPAREN,"SET"] setElement+ ) + ; + +setElement + : TOKEN_REF^ elementOptions? + | STRING_LITERAL^ elementOptions? + | range + | LEXER_CHAR_SET + ; + +// ------------- +// Grammar Block +// +// Anywhere where an element is valid, the grammar may start a new block +// of alts by surrounding that block with ( ). A new block may also have a set +// of options, which apply only to that block. +// +block +@after { +GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.OPTIONS); +if ( options!=null ) { + Grammar.setNodeOptions($tree, options); +} +} + : LPAREN + ( optionsSpec? ra+=ruleAction* COLON )? + altList + RPAREN + -> ^(BLOCK[$LPAREN,"BLOCK"] optionsSpec? 
$ra* altList ) + ; + +// ---------------- +// Parser rule ref +// +// Reference to a parser rule with optional arguments and optional +// directive to become the root node or ignore the tree produced +// +ruleref +@after { +GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.ELEMENT_OPTIONS); +if ( options!=null ) { + Grammar.setNodeOptions($tree, options); +} +} + : RULE_REF ARG_ACTION? elementOptions? -> ^(RULE_REF ARG_ACTION? elementOptions?) + ; + catch [RecognitionException re] { throw re; } // pass upwards to element + +// --------------- +// Character Range +// +// Specifies a range of characters. Valid for lexer rules only, but +// we do not check that here, the tree walkers shoudl do that. +// Note also that the parser also allows through more than just +// character literals so that we can produce a much nicer semantic +// error about any abuse of the .. operator. +// +range + : STRING_LITERAL RANGE^ STRING_LITERAL + ; + +terminal +@after { +GrammarAST options = (GrammarAST)$tree.getFirstChildWithType(ANTLRParser.ELEMENT_OPTIONS); +if ( options!=null ) { + Grammar.setNodeOptions($tree, options); +} +} + : TOKEN_REF elementOptions? -> ^(TOKEN_REF elementOptions?) + | STRING_LITERAL elementOptions? -> ^(STRING_LITERAL elementOptions?) + ; + +// Terminals may be adorned with certain options when +// reference in the grammar: TOK<,,,> +elementOptions + : LT (elementOption (COMMA elementOption)*)? 
GT + -> ^(ELEMENT_OPTIONS[$LT,"ELEMENT_OPTIONS"] elementOption*) + ; + +// When used with elements we can specify what the tree node type can +// be and also assign settings of various options (which we do not check here) +elementOption + : // This format indicates the default element option + qid + | id ASSIGN^ optionValue + ; + +// The name of the grammar, and indeed some other grammar elements may +// come through to the parser looking like a rule reference or a token +// reference, hence this rule is used to pick up whichever it is and rewrite +// it as a generic ID token. +id +@init { paraphrases.push("looking for an identifier"); } +@after { paraphrases.pop(); } + : RULE_REF ->ID[$RULE_REF] + | TOKEN_REF ->ID[$TOKEN_REF] + ; + +qid +@init { paraphrases.push("looking for a qualified identifier"); } +@after { paraphrases.pop(); } + : id (DOT id)* -> ID[$qid.start, $text] + ; + +alternativeEntry : alternative EOF ; // allow gunit to call alternative and see EOF afterwards +elementEntry : element EOF ; +ruleEntry : rule EOF ; +blockEntry : block EOF ; diff --git a/examples/antlr3/ImportGenerated/src/parse/BUILD b/examples/antlr3/ImportGenerated/src/parse/BUILD new file mode 100644 index 0000000..73c5066 --- /dev/null +++ b/examples/antlr3/ImportGenerated/src/parse/BUILD @@ -0,0 +1,7 @@ +load("@rules_antlr//antlr:antlr3.bzl", "antlr") + +antlr( + name = "parser", + srcs = glob(["*.g"]), + visibility = ["//visibility:public"], +) diff --git a/examples/antlr3/ObjC/src/BUILD b/examples/antlr3/ObjC/src/BUILD new file mode 100644 index 0000000..9f15510 --- /dev/null +++ b/examples/antlr3/ObjC/src/BUILD @@ -0,0 +1,8 @@ +load("@rules_antlr//antlr:antlr3.bzl", "antlr") + +antlr( + name = "parser", + srcs = glob(["SimpleC.g"]), + language = "ObjC", +) + diff --git a/examples/antlr3/ObjC/src/SimpleC.g b/examples/antlr3/ObjC/src/SimpleC.g new file mode 100644 index 0000000..016ded7 --- /dev/null +++ b/examples/antlr3/ObjC/src/SimpleC.g @@ -0,0 +1,107 @@ +grammar SimpleC; + 
+options { + language=ObjC; + +} + +program + : declaration+ + ; + +/** In this rule, the functionHeader left prefix on the last two + * alternatives is not LL(k) for a fixed k. However, it is + * LL(*). The LL(*) algorithm simply scans ahead until it sees + * either the ';' or the '{' of the block and then it picks + * the appropriate alternative. Lookhead can be arbitrarily + * long in theory, but is <=10 in most cases. Works great. + * Use ANTLRWorks to see the lookahead use (step by Location) + * and look for blue tokens in the input window pane. :) + */ +declaration + : variable + | functionHeader ';' + { NSLog(@"\%@ is a declaration\n", $functionHeader.name); } + | functionHeader block + { NSLog(@"\%@ is a definition\n", $functionHeader.name); } + ; + +variable + : type declarator ';' + ; + +declarator + : ID + ; + +functionHeader returns [NSString *name] +@init { + name=nil; // for now you must init here rather than in 'returns' +} + : type ID '(' ( formalParameter ( ',' formalParameter )* )? ')' + {$name = $ID.text;} + ; + +formalParameter + : type declarator + ; + +type + : 'int' + | 'char' + | 'void' + | ID + ; + +block + : '{' + variable* + stat* + '}' + ; + +stat: forStat + | expr ';' + | block + | assignStat ';' + | ';' + ; + +forStat + : 'for' '(' assignStat ';' expr ';' assignStat ')' block + ; + +assignStat + : ID '=' expr + ; + +expr: condExpr + ; + +condExpr + : aexpr ( ('==' | '<') aexpr )? 
+ ; + +aexpr + : atom ( '+' atom )* + ; + +atom + : ID + | INT + | '(' expr ')' + ; + +ID : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')* + ; + +INT : ('0'..'9')+ + ; + +WS : ( ' ' + | '\t' + | '\r' + | '\n' + )+ + { $channel=HIDDEN; } + ; diff --git a/examples/antlr4-opt/Hello/src/main/antlr4/BUILD b/examples/antlr4-opt/Hello/src/main/antlr4/BUILD new file mode 100644 index 0000000..d3e305a --- /dev/null +++ b/examples/antlr4-opt/Hello/src/main/antlr4/BUILD @@ -0,0 +1,17 @@ +load("@rules_java//java:defs.bzl", "java_library") + +package(default_visibility = ["//visibility:public"]) + +load("@rules_antlr//antlr:antlr4.bzl", "antlr") + +antlr( + name = "generated", + srcs = ["Hello.g4"], + package = "hello.world", +) + +java_library( + name = "HelloWorld", + srcs = [":generated"], + deps = ["@antlr4_runtime//jar"], +) diff --git a/examples/antlr4-opt/Hello/src/main/antlr4/Hello.g4 b/examples/antlr4-opt/Hello/src/main/antlr4/Hello.g4 new file mode 100644 index 0000000..4a9d629 --- /dev/null +++ b/examples/antlr4-opt/Hello/src/main/antlr4/Hello.g4 @@ -0,0 +1,4 @@ +grammar Hello; +r : 'hello' ID ; // match keyword hello followed by an identifier +ID : [a-z]+ ; // match lower-case identifiers +WS : [ \t\r\n]+ -> skip ; // skip spaces, tabs, newlines diff --git a/examples/antlr4-opt/WORKSPACE b/examples/antlr4-opt/WORKSPACE new file mode 100644 index 0000000..a25c000 --- /dev/null +++ b/examples/antlr4-opt/WORKSPACE @@ -0,0 +1,10 @@ +workspace(name = "antlr4_opt") + +local_repository( + name = "rules_antlr", + path = "../..", +) + +load("@rules_antlr//antlr:repositories.bzl", "rules_antlr_optimized_dependencies") + +rules_antlr_optimized_dependencies("4.7.4") diff --git a/examples/antlr4-opt/groovy/BUILD b/examples/antlr4-opt/groovy/BUILD new file mode 100644 index 0000000..13967a9 --- /dev/null +++ b/examples/antlr4-opt/groovy/BUILD @@ -0,0 +1,6 @@ +load("@rules_antlr//antlr:antlr4.bzl", "antlr") + +antlr( + name = "parser", + srcs = glob(["*.g4"]), +) diff 
--git a/examples/antlr4-opt/groovy/GroovyLexer.g4 b/examples/antlr4-opt/groovy/GroovyLexer.g4 new file mode 100644 index 0000000..016824f --- /dev/null +++ b/examples/antlr4-opt/groovy/GroovyLexer.g4 @@ -0,0 +1,955 @@ +/* + * This file is adapted from the Antlr4 Java grammar which has the following license + * + * Copyright (c) 2013 Terence Parr, Sam Harwell + * All rights reserved. + * [The "BSD licence"] + * + * http://www.opensource.org/licenses/bsd-license.php + * + * Subsequent modifications by the Groovy community have been done under the Apache License v2: + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/** + * The Groovy grammar is based on the official grammar for Java: + * https://github.com/antlr/grammars-v4/blob/master/java/Java.g4 + */ +lexer grammar GroovyLexer; + +options { + superClass = AbstractLexer; +} + +@header { + import java.util.Deque; + import java.util.ArrayDeque; + import java.util.Map; + import java.util.HashMap; + import java.util.Set; + import java.util.HashSet; + import java.util.Collections; + import java.util.Arrays; + import java.util.stream.IntStream; + import org.apache.groovy.util.Maps; + import static org.apache.groovy.parser.antlr4.SemanticPredicates.*; +} + +@members { + private long tokenIndex = 0; + private int lastTokenType = 0; + private int invalidDigitCount = 0; + + /** + * Record the index and token type of the current token while emitting tokens. + */ + @Override + public void emit(Token token) { + this.tokenIndex++; + + int tokenType = token.getType(); + if (Token.DEFAULT_CHANNEL == token.getChannel()) { + this.lastTokenType = tokenType; + } + + if (RollBackOne == tokenType) { + this.rollbackOneChar(); + } + + super.emit(token); + } + + private static final int[] REGEX_CHECK_ARRAY = + IntStream.of( + Identifier, CapitalizedIdentifier, NullLiteral, BooleanLiteral, THIS, RPAREN, RBRACK, RBRACE, + IntegerLiteral, FloatingPointLiteral, StringLiteral, GStringEnd, INC, DEC + ).sorted().toArray(); + + private boolean isRegexAllowed() { + if (Arrays.binarySearch(REGEX_CHECK_ARRAY, this.lastTokenType) >= 0) { + return false; + } + + return true; + } + + /** + * just a hook, which will be overrided by GroovyLangLexer + */ + protected void rollbackOneChar() {} + + private static class Paren { + private String text; + private int lastTokenType; + private int line; + private int column; + + public Paren(String text, int lastTokenType, int line, int column) { + this.text = text; + this.lastTokenType = lastTokenType; + this.line = line; + this.column = column; + } + + public String getText() { + return this.text; + } + + public 
int getLastTokenType() { + return this.lastTokenType; + } + + public int getLine() { + return line; + } + + public int getColumn() { + return column; + } + + @Override + public int hashCode() { + return (int) (text.hashCode() * line + column); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Paren)) { + return false; + } + + Paren other = (Paren) obj; + + return this.text.equals(other.text) && (this.line == other.line && this.column == other.column); + } + } + + private static final Map PAREN_MAP = + Maps.of( + "(", ")", + "[", "]", + "{", "}" + ); + + protected void enterParenCallback(String text) {} + + protected void exitParenCallback(String text) {} + + private final Deque parenStack = new ArrayDeque<>(32); + + private void enterParen() { + String text = getText(); + enterParenCallback(text); + parenStack.push(new Paren(text, this.lastTokenType, getLine(), getCharPositionInLine())); + } + + private void exitParen() { + Paren paren = parenStack.peek(); + String text = getText(); + + require(null != paren, "Too many '" + text + "'"); + require(text.equals(PAREN_MAP.get(paren.getText())), + "'" + paren.getText() + "'" + new PositionInfo(paren.getLine(), paren.getColumn()) + " can not match '" + text + "'", -1); + + exitParenCallback(text); + parenStack.pop(); + } + private boolean isInsideParens() { + Paren paren = parenStack.peek(); + + // We just care about "(" and "[", inside which the new lines will be ignored. + // Notice: the new lines between "{" and "}" can not be ignored. + if (null == paren) { + return false; + } + return ("(".equals(paren.getText()) && TRY != paren.getLastTokenType()) // we don't treat try-paren(i.e. 
try (....)) as parenthesis + || "[".equals(paren.getText()); + } + private void ignoreTokenInsideParens() { + if (!this.isInsideParens()) { + return; + } + + this.setChannel(Token.HIDDEN_CHANNEL); + } + private void ignoreMultiLineCommentConditionally() { + if (!this.isInsideParens() && isFollowedByWhiteSpaces(_input)) { + return; + } + + this.setChannel(Token.HIDDEN_CHANNEL); + } + + @Override + public int getSyntaxErrorSource() { + return GroovySyntaxError.LEXER; + } + + @Override + public int getErrorLine() { + return getLine(); + } + + @Override + public int getErrorColumn() { + return getCharPositionInLine() + 1; + } +} + + +// §3.10.5 String Literals +StringLiteral + : GStringQuotationMark DqStringCharacter* GStringQuotationMark + | SqStringQuotationMark SqStringCharacter* SqStringQuotationMark + + | Slash { this.isRegexAllowed() && _input.LA(1) != '*' }? + SlashyStringCharacter+ Slash + + | TdqStringQuotationMark TdqStringCharacter* TdqStringQuotationMark + | TsqStringQuotationMark TsqStringCharacter* TsqStringQuotationMark + | DollarSlashyGStringQuotationMarkBegin DollarSlashyStringCharacter+ DollarSlashyGStringQuotationMarkEnd + ; + +// Groovy gstring +GStringBegin + : GStringQuotationMark DqStringCharacter* Dollar -> pushMode(DQ_GSTRING_MODE), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +TdqGStringBegin + : TdqStringQuotationMark TdqStringCharacter* Dollar -> type(GStringBegin), pushMode(TDQ_GSTRING_MODE), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +SlashyGStringBegin + : Slash { this.isRegexAllowed() && _input.LA(1) != '*' }? SlashyStringCharacter* Dollar { isFollowedByJavaLetterInGString(_input) }? -> type(GStringBegin), pushMode(SLASHY_GSTRING_MODE), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +DollarSlashyGStringBegin + : DollarSlashyGStringQuotationMarkBegin DollarSlashyStringCharacter* Dollar { isFollowedByJavaLetterInGString(_input) }? 
-> type(GStringBegin), pushMode(DOLLAR_SLASHY_GSTRING_MODE), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; + +mode DQ_GSTRING_MODE; +GStringEnd + : GStringQuotationMark -> popMode + ; +GStringPart + : Dollar -> pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +GStringCharacter + : DqStringCharacter -> more + ; + +mode TDQ_GSTRING_MODE; +TdqGStringEnd + : TdqStringQuotationMark -> type(GStringEnd), popMode + ; +TdqGStringPart + : Dollar -> type(GStringPart), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +TdqGStringCharacter + : TdqStringCharacter -> more + ; + +mode SLASHY_GSTRING_MODE; +SlashyGStringEnd + : Dollar? Slash -> type(GStringEnd), popMode + ; +SlashyGStringPart + : Dollar { isFollowedByJavaLetterInGString(_input) }? -> type(GStringPart), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +SlashyGStringCharacter + : SlashyStringCharacter -> more + ; + +mode DOLLAR_SLASHY_GSTRING_MODE; +DollarSlashyGStringEnd + : DollarSlashyGStringQuotationMarkEnd -> type(GStringEnd), popMode + ; +DollarSlashyGStringPart + : Dollar { isFollowedByJavaLetterInGString(_input) }? -> type(GStringPart), pushMode(GSTRING_TYPE_SELECTOR_MODE) + ; +DollarSlashyGStringCharacter + : DollarSlashyStringCharacter -> more + ; + +mode GSTRING_TYPE_SELECTOR_MODE; +GStringLBrace + : '{' { this.enterParen(); } -> type(LBRACE), popMode, pushMode(DEFAULT_MODE) + ; +GStringIdentifier + : IdentifierInGString -> type(Identifier), popMode, pushMode(GSTRING_PATH_MODE) + ; + + +mode GSTRING_PATH_MODE; +GStringPathPart + : Dot IdentifierInGString + ; +RollBackOne + : . { + // a trick to handle GStrings followed by EOF properly + int readChar = _input.LA(-1); + if (EOF == _input.LA(1) && ('"' == readChar || '/' == readChar)) { + setType(GStringEnd); + } else { + setChannel(HIDDEN); + } + } -> popMode + ; + + +mode DEFAULT_MODE; +// character in the double quotation string. e.g. "a" +fragment +DqStringCharacter + : ~["\r\n\\$] + | EscapeSequence + ; + +// character in the single quotation string. e.g. 
'a' +fragment +SqStringCharacter + : ~['\r\n\\] + | EscapeSequence + ; + +// character in the triple double quotation string. e.g. """a""" +fragment TdqStringCharacter + : ~["\\$] + | GStringQuotationMark { _input.LA(1) != '"' || _input.LA(2) != '"' || _input.LA(3) == '"' && (_input.LA(4) != '"' || _input.LA(5) != '"') }? + | EscapeSequence + ; + +// character in the triple single quotation string. e.g. '''a''' +fragment TsqStringCharacter + : ~['\\] + | SqStringQuotationMark { _input.LA(1) != '\'' || _input.LA(2) != '\'' || _input.LA(3) == '\'' && (_input.LA(4) != '\'' || _input.LA(5) != '\'') }? + | EscapeSequence + ; + +// character in the slashy string. e.g. /a/ +fragment SlashyStringCharacter + : SlashEscape + | Dollar { !isFollowedByJavaLetterInGString(_input) }? + | ~[/$\u0000] + ; + +// character in the collar slashy string. e.g. $/a/$ +fragment DollarSlashyStringCharacter + : SlashEscape | DollarSlashEscape | DollarDollarEscape + | Slash { _input.LA(1) != '$' }? + | Dollar { !isFollowedByJavaLetterInGString(_input) }? 
+ | ~[/$\u0000] + ; + +// Groovy keywords +AS : 'as'; +DEF : 'def'; +IN : 'in'; +TRAIT : 'trait'; +THREADSAFE : 'threadsafe'; // reserved keyword + +// the reserved type name of Java10 +VAR : 'var'; + +// §3.9 Keywords +BuiltInPrimitiveType + : BOOLEAN + | CHAR + | BYTE + | SHORT + | INT + | LONG + | FLOAT + | DOUBLE + ; + +ABSTRACT : 'abstract'; +ASSERT : 'assert'; + +fragment +BOOLEAN : 'boolean'; + +BREAK : 'break'; + +fragment +BYTE : 'byte'; + +CASE : 'case'; +CATCH : 'catch'; + +fragment +CHAR : 'char'; + +CLASS : 'class'; +CONST : 'const'; +CONTINUE : 'continue'; +DEFAULT : 'default'; +DO : 'do'; + +fragment +DOUBLE : 'double'; + +ELSE : 'else'; +ENUM : 'enum'; +EXTENDS : 'extends'; +FINAL : 'final'; +FINALLY : 'finally'; + +fragment +FLOAT : 'float'; + + +FOR : 'for'; +IF : 'if'; +GOTO : 'goto'; +IMPLEMENTS : 'implements'; +IMPORT : 'import'; +INSTANCEOF : 'instanceof'; + +fragment +INT : 'int'; + +INTERFACE : 'interface'; + +fragment +LONG : 'long'; + +NATIVE : 'native'; +NEW : 'new'; +PACKAGE : 'package'; +PRIVATE : 'private'; +PROTECTED : 'protected'; +PUBLIC : 'public'; +RETURN : 'return'; + +fragment +SHORT : 'short'; + + +STATIC : 'static'; +STRICTFP : 'strictfp'; +SUPER : 'super'; +SWITCH : 'switch'; +SYNCHRONIZED : 'synchronized'; +THIS : 'this'; +THROW : 'throw'; +THROWS : 'throws'; +TRANSIENT : 'transient'; +TRY : 'try'; +VOID : 'void'; +VOLATILE : 'volatile'; +WHILE : 'while'; + + +// §3.10.1 Integer Literals + +IntegerLiteral + : ( DecimalIntegerLiteral + | HexIntegerLiteral + | OctalIntegerLiteral + | BinaryIntegerLiteral + ) (Underscore { require(false, "Number ending with underscores is invalid", -1, true); })? + + // !!! Error Alternative !!! + | Zero ([0-9] { invalidDigitCount++; })+ { require(false, "Invalid octal number", -(invalidDigitCount + 1), true); } IntegerTypeSuffix? + ; + +fragment +Zero + : '0' + ; + +fragment +DecimalIntegerLiteral + : DecimalNumeral IntegerTypeSuffix? 
+ ; + +fragment +HexIntegerLiteral + : HexNumeral IntegerTypeSuffix? + ; + +fragment +OctalIntegerLiteral + : OctalNumeral IntegerTypeSuffix? + ; + +fragment +BinaryIntegerLiteral + : BinaryNumeral IntegerTypeSuffix? + ; + +fragment +IntegerTypeSuffix + : [lLiIgG] + ; + +fragment +DecimalNumeral + : Zero + | NonZeroDigit (Digits? | Underscores Digits) + ; + +fragment +Digits + : Digit (DigitOrUnderscore* Digit)? + ; + +fragment +Digit + : Zero + | NonZeroDigit + ; + +fragment +NonZeroDigit + : [1-9] + ; + +fragment +DigitOrUnderscore + : Digit + | Underscore + ; + +fragment +Underscores + : Underscore+ + ; + +fragment +Underscore + : '_' + ; + +fragment +HexNumeral + : Zero [xX] HexDigits + ; + +fragment +HexDigits + : HexDigit (HexDigitOrUnderscore* HexDigit)? + ; + +fragment +HexDigit + : [0-9a-fA-F] + ; + +fragment +HexDigitOrUnderscore + : HexDigit + | Underscore + ; + +fragment +OctalNumeral + : Zero Underscores? OctalDigits + ; + +fragment +OctalDigits + : OctalDigit (OctalDigitOrUnderscore* OctalDigit)? + ; + +fragment +OctalDigit + : [0-7] + ; + +fragment +OctalDigitOrUnderscore + : OctalDigit + | Underscore + ; + +fragment +BinaryNumeral + : Zero [bB] BinaryDigits + ; + +fragment +BinaryDigits + : BinaryDigit (BinaryDigitOrUnderscore* BinaryDigit)? + ; + +fragment +BinaryDigit + : [01] + ; + +fragment +BinaryDigitOrUnderscore + : BinaryDigit + | Underscore + ; + +// §3.10.2 Floating-Point Literals + +FloatingPointLiteral + : ( DecimalFloatingPointLiteral + | HexadecimalFloatingPointLiteral + ) (Underscore { require(false, "Number ending with underscores is invalid", -1, true); })? + ; + +fragment +DecimalFloatingPointLiteral + : Digits Dot Digits ExponentPart? FloatTypeSuffix? + | Digits ExponentPart FloatTypeSuffix? + | Digits FloatTypeSuffix + ; + +fragment +ExponentPart + : ExponentIndicator SignedInteger + ; + +fragment +ExponentIndicator + : [eE] + ; + +fragment +SignedInteger + : Sign? 
Digits + ; + +fragment +Sign + : [+\-] + ; + +fragment +FloatTypeSuffix + : [fFdDgG] + ; + +fragment +HexadecimalFloatingPointLiteral + : HexSignificand BinaryExponent FloatTypeSuffix? + ; + +fragment +HexSignificand + : HexNumeral Dot? + | Zero [xX] HexDigits? Dot HexDigits + ; + +fragment +BinaryExponent + : BinaryExponentIndicator SignedInteger + ; + +fragment +BinaryExponentIndicator + : [pP] + ; + +fragment +Dot : '.' + ; + +// §3.10.3 Boolean Literals + +BooleanLiteral + : 'true' + | 'false' + ; + + +// §3.10.6 Escape Sequences for Character and String Literals + +fragment +EscapeSequence + : Backslash [btnfr"'\\] + | OctalEscape + | UnicodeEscape + | DollarEscape + | LineEscape + ; + + +fragment +OctalEscape + : Backslash OctalDigit + | Backslash OctalDigit OctalDigit + | Backslash ZeroToThree OctalDigit OctalDigit + ; + +// Groovy allows 1 or more u's after the backslash +fragment +UnicodeEscape + : Backslash 'u' HexDigit HexDigit HexDigit HexDigit + ; + +fragment +ZeroToThree + : [0-3] + ; + +// Groovy Escape Sequences + +fragment +DollarEscape + : Backslash Dollar + ; + +fragment +LineEscape + : Backslash '\r'? 
'\n' + ; + +fragment +SlashEscape + : Backslash Slash + ; + +fragment +Backslash + : '\\' + ; + +fragment +Slash + : '/' + ; + +fragment +Dollar + : '$' + ; + +fragment +GStringQuotationMark + : '"' + ; + +fragment +SqStringQuotationMark + : '\'' + ; + +fragment +TdqStringQuotationMark + : '"""' + ; + +fragment +TsqStringQuotationMark + : '\'\'\'' + ; + +fragment +DollarSlashyGStringQuotationMarkBegin + : '$/' + ; + +fragment +DollarSlashyGStringQuotationMarkEnd + : '/$' + ; + +fragment +DollarSlashEscape + : '$/$' + ; + +fragment +DollarDollarEscape + : '$$' + ; + +// §3.10.7 The Null Literal +NullLiteral + : 'null' + ; + +// Groovy Operators + +RANGE_INCLUSIVE : '..'; +RANGE_EXCLUSIVE : '..<'; +SPREAD_DOT : '*.'; +SAFE_DOT : '?.'; +SAFE_CHAIN_DOT : '??.'; +ELVIS : '?:'; +METHOD_POINTER : '.&'; +METHOD_REFERENCE : '::'; +REGEX_FIND : '=~'; +REGEX_MATCH : '==~'; +POWER : '**'; +POWER_ASSIGN : '**='; +SPACESHIP : '<=>'; +IDENTICAL : '==='; +NOT_IDENTICAL : '!=='; +ARROW : '->'; + +// !internalPromise will be parsed as !in ternalPromise, so semantic predicates are necessary +NOT_INSTANCEOF : '!instanceof' { isFollowedBy(_input, ' ', '\t', '\r', '\n') }?; +NOT_IN : '!in' { isFollowedBy(_input, ' ', '\t', '\r', '\n', '[', '(', '{') }?; + + +// §3.11 Separators + +LPAREN : '(' { this.enterParen(); } -> pushMode(DEFAULT_MODE); +RPAREN : ')' { this.exitParen(); } -> popMode; +LBRACE : '{' { this.enterParen(); } -> pushMode(DEFAULT_MODE); +RBRACE : '}' { this.exitParen(); } -> popMode; +LBRACK : '[' { this.enterParen(); } -> pushMode(DEFAULT_MODE); +RBRACK : ']' { this.exitParen(); } -> popMode; + +SEMI : ';'; +COMMA : ','; +DOT : Dot; + +// §3.12 Operators + +ASSIGN : '='; +GT : '>'; +LT : '<'; +NOT : '!'; +BITNOT : '~'; +QUESTION : '?'; +COLON : ':'; +EQUAL : '=='; +LE : '<='; +GE : '>='; +NOTEQUAL : '!='; +AND : '&&'; +OR : '||'; +INC : '++'; +DEC : '--'; +ADD : '+'; +SUB : '-'; +MUL : '*'; +DIV : Slash; +BITAND : '&'; +BITOR : '|'; +XOR : '^'; +MOD : '%'; + + 
+ADD_ASSIGN : '+='; +SUB_ASSIGN : '-='; +MUL_ASSIGN : '*='; +DIV_ASSIGN : '/='; +AND_ASSIGN : '&='; +OR_ASSIGN : '|='; +XOR_ASSIGN : '^='; +MOD_ASSIGN : '%='; +LSHIFT_ASSIGN : '<<='; +RSHIFT_ASSIGN : '>>='; +URSHIFT_ASSIGN : '>>>='; +ELVIS_ASSIGN : '?='; + + +// §3.8 Identifiers (must appear after all keywords in the grammar) +CapitalizedIdentifier + : [A-Z] JavaLetterOrDigit* + ; + +Identifier + : JavaLetter JavaLetterOrDigit* + ; + +fragment +IdentifierInGString + : JavaLetterInGString JavaLetterOrDigitInGString* + ; + +fragment +JavaLetterInGString + : [a-zA-Z_] // these are the "java letters" below 0x7F, except for $ + | // covers all characters above 0x7F which are not a surrogate + ~[\u0000-\u007F\uD800-\uDBFF] + {Character.isJavaIdentifierStart(_input.LA(-1))}? + | // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + [\uD800-\uDBFF] [\uDC00-\uDFFF] + {Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}? + ; + +fragment +JavaLetterOrDigitInGString + : [a-zA-Z0-9_] // these are the "java letters or digits" below 0x7F, except for $ + | // covers all characters above 0x7F which are not a surrogate + ~[\u0000-\u007F\uD800-\uDBFF] + {Character.isJavaIdentifierPart(_input.LA(-1))}? + | // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + [\uD800-\uDBFF] [\uDC00-\uDFFF] + {Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}? + ; + + +fragment +JavaLetter + : [a-zA-Z$_] // these are the "java letters" below 0x7F + | // covers all characters above 0x7F which are not a surrogate + ~[\u0000-\u007F\uD800-\uDBFF] + {Character.isJavaIdentifierStart(_input.LA(-1))}? + | // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + [\uD800-\uDBFF] [\uDC00-\uDFFF] + {Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}? 
+ ; + +fragment +JavaLetterOrDigit + : [a-zA-Z0-9$_] // these are the "java letters or digits" below 0x7F + | // covers all characters above 0x7F which are not a surrogate + ~[\u0000-\u007F\uD800-\uDBFF] + {Character.isJavaIdentifierPart(_input.LA(-1))}? + | // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + [\uD800-\uDBFF] [\uDC00-\uDFFF] + {Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}? + ; + +// +// Additional symbols not defined in the lexical specification +// + +AT : '@'; +ELLIPSIS : '...'; + +// +// Whitespace, line escape and comments +// +WS : ([ \t\u000C]+ | LineEscape+) -> skip + ; + + +// Inside (...) and [...] but not {...}, ignore newlines. +NL : '\r'? '\n' { this.ignoreTokenInsideParens(); } + ; + +// Multiple-line comments(including groovydoc comments) +ML_COMMENT + : '/*' .*? '*/' { this.ignoreMultiLineCommentConditionally(); } -> type(NL) + ; + +// Single-line comments +SL_COMMENT + : '//' ~[\r\n\uFFFF]* { this.ignoreTokenInsideParens(); } -> type(NL) + ; + +// Script-header comments. +// The very first characters of the file may be "#!". If so, ignore the first line. +SH_COMMENT + : '#!' { require(0 == this.tokenIndex, "Shebang comment should appear at the first line", -2, true); } ~[\r\n\uFFFF]* -> skip + ; + +// Unexpected characters will be handled by groovy parser later. +UNEXPECTED_CHAR + : . + ; diff --git a/examples/antlr4-opt/groovy/GroovyParser.g4 b/examples/antlr4-opt/groovy/GroovyParser.g4 new file mode 100644 index 0000000..7d802fa --- /dev/null +++ b/examples/antlr4-opt/groovy/GroovyParser.g4 @@ -0,0 +1,1242 @@ +/* + * This file is adapted from the Antlr4 Java grammar which has the following license + * + * Copyright (c) 2013 Terence Parr, Sam Harwell + * All rights reserved. 
+ * [The "BSD licence"] + * + * http://www.opensource.org/licenses/bsd-license.php + * + * Subsequent modifications by the Groovy community have been done under the Apache License v2: + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * The Groovy grammar is based on the official grammar for Java: + * https://github.com/antlr/grammars-v4/blob/master/java/Java.g4 + */ +parser grammar GroovyParser; + +options { + tokenVocab = GroovyLexer; + contextSuperClass = GroovyParserRuleContext; + superClass = AbstractParser; +} + +@header { + import java.util.Map; + import org.codehaus.groovy.ast.NodeMetaDataHandler; + import org.apache.groovy.parser.antlr4.SemanticPredicates; +} + +@members { + + public static class GroovyParserRuleContext extends ParserRuleContext implements NodeMetaDataHandler { + private Map metaDataMap = null; + + public GroovyParserRuleContext() {} + + public GroovyParserRuleContext(ParserRuleContext parent, int invokingStateNumber) { + super(parent, invokingStateNumber); + } + + @Override + public Map getMetaDataMap() { + return this.metaDataMap; + } + + @Override + public void setMetaDataMap(Map metaDataMap) { + this.metaDataMap = metaDataMap; + } + } + + @Override + public int 
getSyntaxErrorSource() { + return GroovySyntaxError.PARSER; + } + + @Override + public int getErrorLine() { + Token token = _input.LT(-1); + + if (null == token) { + return -1; + } + + return token.getLine(); + } + + @Override + public int getErrorColumn() { + Token token = _input.LT(-1); + + if (null == token) { + return -1; + } + + return token.getCharPositionInLine() + 1 + token.getText().length(); + } +} + +// starting point for parsing a groovy file +compilationUnit + : nls + packageDeclaration? sep? scriptStatements? EOF + ; + +scriptStatements + : scriptStatement (sep scriptStatement)* sep? + ; + +scriptStatement + : importDeclaration // Import statement. Can be used in any scope. Has "import x as y" also. + | typeDeclaration + | statement + ; + +packageDeclaration + : annotationsOpt PACKAGE qualifiedName + ; + +importDeclaration + : annotationsOpt IMPORT STATIC? qualifiedName (DOT MUL | AS alias=identifier)? + ; + + +typeDeclaration + : classOrInterfaceModifiersOpt classDeclaration + ; + +modifier + : classOrInterfaceModifier + | m=( NATIVE + | SYNCHRONIZED + | TRANSIENT + | VOLATILE + | DEF + | VAR + ) + ; + +modifiersOpt + : modifiers? + ; + +modifiers + : (modifier nls)+ + ; + +classOrInterfaceModifiersOpt + : classOrInterfaceModifiers? + ; + +classOrInterfaceModifiers + : (classOrInterfaceModifier nls)+ + ; + +classOrInterfaceModifier + : annotation // class or interface + | m=( PUBLIC // class or interface + | PROTECTED // class or interface + | PRIVATE // class or interface + | STATIC // class or interface + | ABSTRACT // class or interface + | FINAL // class only -- does not apply to interfaces + | STRICTFP // class or interface + | DEFAULT // interface only -- does not apply to classes + ) + ; + +variableModifier + : annotation + | m=( FINAL + | DEF + | VAR + // Groovy supports declaring local variables as instance/class fields, + // e.g. import groovy.transform.*; @Field static List awe = [1, 2, 3] + // e.g. 
import groovy.transform.*; def a = { @Field public List awe = [1, 2, 3] } + // Notice: Groovy 2.4.7 just allows to declare local variables with the following modifiers when using annotations(e.g. @Field) + // TODO check whether the following modifiers accompany annotations or not. Because the legacy codes(e.g. benchmark/bench/heapsort.groovy) allow to declare the special instance/class fields without annotations, we leave it as it is for the time being + | PUBLIC + | PROTECTED + | PRIVATE + | STATIC + | ABSTRACT + | STRICTFP + ) + ; + +variableModifiersOpt + : variableModifiers? + ; + +variableModifiers + : (variableModifier nls)+ + ; + +typeParameters + : LT nls typeParameter (COMMA nls typeParameter)* nls GT + ; + +typeParameter + : className (EXTENDS nls typeBound)? + ; + +typeBound + : type (BITAND nls type)* + ; + +typeList + : type (COMMA nls type)* + ; + + +/** + * t 0: class; 1: interface; 2: enum; 3: annotation; 4: trait + */ +classDeclaration +locals[ int t ] + : ( CLASS { $t = 0; } + | INTERFACE { $t = 1; } + | ENUM { $t = 2; } + | AT INTERFACE { $t = 3; } + | TRAIT { $t = 4; } + ) + identifier nls + + ( + { 3 != $t }? + typeParameters? nls + ( + { 2 != $t }? + (EXTENDS nls + ( + // Only interface can extend more than one super class + {1 == $t}? scs=typeList + | + sc=type + ) + nls)? + | + /* enum should not have type parameters and extends */ + ) + + ( + {1 != $t}? + (IMPLEMENTS nls is=typeList nls)? + | + /* interface should not implement other interfaces */ + ) + | + /* annotation should not have implements and extends*/ + ) + + classBody[$t] + ; + +// t see the comment of classDeclaration +classBody[int t] + : LBRACE nls + ( + /* Only enum can have enum constants */ + { 2 == $t }? + enumConstants? sep? + | + + ) + classBodyDeclaration[$t]? (sep classBodyDeclaration[$t])* sep? RBRACE + ; + +enumConstants + : enumConstant (nls COMMA nls enumConstant)* (nls COMMA)? + ; + +enumConstant + : annotationsOpt identifier arguments? 
anonymousInnerClassDeclaration[1]? + ; + +classBodyDeclaration[int t] + : SEMI + | (STATIC nls)? block + | memberDeclaration[$t] + ; + +memberDeclaration[int t] + : methodDeclaration[0, $t] + | fieldDeclaration + | modifiersOpt classDeclaration + ; + +/** + * t 0: *class member* all kinds of method declaration AND constructor declaration, + * 1: normal method declaration, 2: abstract method declaration + * 3: normal method declaration OR abstract method declaration + * ct 9: script, other see the comment of classDeclaration + */ +methodDeclaration[int t, int ct] + : { 3 == $ct }? + returnType[$ct] methodName LPAREN rparen (DEFAULT nls elementValue)? + | + modifiersOpt typeParameters? returnType[$ct]? + methodName formalParameters (nls THROWS nls qualifiedClassNameList)? + nls methodBody? + ; + +methodName + : identifier + | stringLiteral + ; + +returnType[int ct] + : + standardType + | + // annotation method can not have void return type + { 3 != $ct }? VOID + ; + +fieldDeclaration + : variableDeclaration[1] + ; + +variableDeclarators + : variableDeclarator (COMMA nls variableDeclarator)* + ; + +variableDeclarator + : variableDeclaratorId (nls ASSIGN nls variableInitializer)? + ; + +variableDeclaratorId + : identifier + ; + +variableInitializer + : enhancedStatementExpression + ; + +variableInitializers + : variableInitializer nls (COMMA nls variableInitializer nls)* nls COMMA? + ; + +dims + : (annotationsOpt LBRACK RBRACK)+ + ; + +dimsOpt + : dims? + ; + +standardType +options { baseContext = type; } + : annotationsOpt + ( + primitiveType + | + standardClassOrInterfaceType + ) + dimsOpt + ; + +type + : annotationsOpt + ( + ( + primitiveType + | + // !!! Error Alternative !!! + VOID + ) + | + generalClassOrInterfaceType + ) + dimsOpt + ; + +classOrInterfaceType + : ( qualifiedClassName + | qualifiedStandardClassName + ) typeArguments? + ; + +generalClassOrInterfaceType +options { baseContext = classOrInterfaceType; } + : qualifiedClassName typeArguments? 
+ ; + +standardClassOrInterfaceType +options { baseContext = classOrInterfaceType; } + : qualifiedStandardClassName typeArguments? + ; + +primitiveType + : BuiltInPrimitiveType + ; + +typeArguments + : LT nls typeArgument (COMMA nls typeArgument)* nls GT + ; + +typeArgument + : type + | annotationsOpt QUESTION ((EXTENDS | SUPER) nls type)? + ; + +annotatedQualifiedClassName + : annotationsOpt qualifiedClassName + ; + +qualifiedClassNameList + : annotatedQualifiedClassName (COMMA nls annotatedQualifiedClassName)* + ; + +formalParameters + : LPAREN formalParameterList? rparen + ; + +formalParameterList + : (formalParameter | thisFormalParameter) (COMMA nls formalParameter)* + ; + +thisFormalParameter + : type THIS + ; + +formalParameter + : variableModifiersOpt type? ELLIPSIS? variableDeclaratorId (nls ASSIGN nls expression)? + ; + +methodBody + : block + ; + +qualifiedName + : qualifiedNameElement (DOT qualifiedNameElement)* + ; + +/** + * Java doesn't have the keywords 'as', 'in', 'def', 'trait' so we make some allowances + * for them in package names for better integration with existing Java packages + */ +qualifiedNameElement + : identifier + | DEF + | IN + | AS + | TRAIT + ; + +qualifiedNameElements + : (qualifiedNameElement DOT)* + ; + +qualifiedClassName + : qualifiedNameElements identifier + ; + +qualifiedStandardClassName + : qualifiedNameElements className (DOT className)* + ; + +literal + : IntegerLiteral #integerLiteralAlt + | FloatingPointLiteral #floatingPointLiteralAlt + | stringLiteral #stringLiteralAlt + | BooleanLiteral #booleanLiteralAlt + | NullLiteral #nullLiteralAlt + ; + +// GSTRING + +gstring + : GStringBegin gstringValue (GStringPart gstringValue)* GStringEnd + ; + +gstringValue + : gstringPath + | LBRACE statementExpression? 
RBRACE + | closure + ; + +gstringPath + : identifier GStringPathPart* + ; + + +// LAMBDA EXPRESSION +lambdaExpression +options { baseContext = standardLambdaExpression; } + : lambdaParameters nls ARROW nls lambdaBody + ; + +// JAVA STANDARD LAMBDA EXPRESSION +standardLambdaExpression + : standardLambdaParameters nls ARROW nls lambdaBody + ; + +lambdaParameters +options { baseContext = standardLambdaParameters; } + : formalParameters + + // { a -> a * 2 } can be parsed as a lambda expression in a block, but we expect a closure. + // So it is better to put parameters in the parentheses and the following single parameter without parentheses is limited +// | variableDeclaratorId + ; + +standardLambdaParameters + : formalParameters + | variableDeclaratorId + ; + +lambdaBody + : block + | statementExpression + ; + +// CLOSURE +closure + : LBRACE nls (formalParameterList? nls ARROW nls)? blockStatementsOpt RBRACE + ; + +// GROOVY-8991: Difference in behaviour with closure and lambda +closureOrLambdaExpression + : closure + | lambdaExpression + ; + +blockStatementsOpt + : blockStatements? + ; + +blockStatements + : blockStatement (sep blockStatement)* sep? + ; + +// ANNOTATIONS + +annotationsOpt + : (annotation nls)* + ; + +annotation + : AT annotationName ( LPAREN elementValues? rparen )? + ; + +elementValues + : elementValuePairs + | elementValue + ; + +annotationName : qualifiedClassName ; + +elementValuePairs + : elementValuePair (COMMA elementValuePair)* + ; + +elementValuePair + : elementValuePairName nls ASSIGN nls elementValue + ; + +elementValuePairName + : identifier + | keywords + ; + +// TODO verify the potential performance issue because rule expression contains sub-rule assignments(https://github.com/antlr/grammars-v4/issues/215) +elementValue + : elementValueArrayInitializer + | annotation + | expression + ; + +elementValueArrayInitializer + : LBRACK (elementValue (COMMA elementValue)*)? (COMMA)? 
RBRACK + ; + +// STATEMENTS / BLOCKS + +block + : LBRACE (nls | sep*) blockStatementsOpt RBRACE + ; + +blockStatement + : localVariableDeclaration + | statement + ; + +localVariableDeclaration + : { !SemanticPredicates.isInvalidLocalVariableDeclaration(_input) }? + variableDeclaration[0] + ; + +classifiedModifiers[int t] + : { 0 == $t }? variableModifiers + | { 1 == $t }? modifiers + ; + + +/** + * t 0: local variable declaration; 1: field declaration + */ +variableDeclaration[int t] +@leftfactor { classifiedModifiers } + : classifiedModifiers[$t] + ( type? variableDeclarators + | typeNamePairs nls ASSIGN nls variableInitializer + ) + | + classifiedModifiers[$t]? + type variableDeclarators + ; + +typeNamePairs + : LPAREN typeNamePair (COMMA typeNamePair)* rparen + ; + +typeNamePair + : type? variableDeclaratorId + ; + +variableNames + : LPAREN variableDeclaratorId (COMMA variableDeclaratorId)+ rparen + ; + +conditionalStatement + : ifElseStatement + | switchStatement + ; + +ifElseStatement + : IF expressionInPar nls tb=statement ((nls | sep) ELSE nls fb=statement)? + ; + +switchStatement + : SWITCH expressionInPar nls LBRACE nls switchBlockStatementGroup* nls RBRACE + ; + +loopStatement + : FOR LPAREN forControl rparen nls statement #forStmtAlt + | WHILE expressionInPar nls statement #whileStmtAlt + | DO nls statement nls WHILE expressionInPar #doWhileStmtAlt + ; + +continueStatement + : CONTINUE + identifier? + ; + +breakStatement + : BREAK + identifier? + ; + +tryCatchStatement + : TRY resources? nls block + (nls catchClause)* + (nls finallyBlock)? + ; + +assertStatement + : ASSERT ce=expression (nls (COLON | COMMA) nls me=expression)? + ; + +statement + : block #blockStmtAlt + | conditionalStatement #conditionalStmtAlt + | loopStatement #loopStmtAlt + + | tryCatchStatement #tryCatchStmtAlt + + | SYNCHRONIZED expressionInPar nls block #synchronizedStmtAlt + | RETURN expression? 
#returnStmtAlt + | THROW expression #throwStmtAlt + + | breakStatement #breakStmtAlt + | continueStatement #continueStmtAlt + + | identifier COLON nls statement #labeledStmtAlt + + | assertStatement #assertStmtAlt + + | localVariableDeclaration #localVariableDeclarationStmtAlt + + // validate the method in the AstBuilder#visitMethodDeclaration, e.g. method without method body is not allowed + | { !SemanticPredicates.isInvalidMethodDeclaration(_input) }? + methodDeclaration[3, 9] #methodDeclarationStmtAlt + + | statementExpression #expressionStmtAlt + + | SEMI #emptyStmtAlt + ; + +catchClause + : CATCH LPAREN variableModifiersOpt catchType? identifier rparen nls block + ; + +catchType + : qualifiedClassName (BITOR qualifiedClassName)* + ; + +finallyBlock + : FINALLY nls block + ; + +resources + : LPAREN nls resourceList sep? rparen + ; + +resourceList + : resource (sep resource)* + ; + +resource + : localVariableDeclaration + | expression + ; + + +/** Matches cases then statements, both of which are mandatory. + * To handle empty cases at the end, we add switchLabel* to statement. + */ +switchBlockStatementGroup + : (switchLabel nls)+ blockStatements + ; + +switchLabel + : CASE expression COLON + | DEFAULT COLON + ; + +forControl + : enhancedForControl + | classicalForControl + ; + +enhancedForControl + : variableModifiersOpt type? variableDeclaratorId (COLON | IN) expression + ; + +classicalForControl + : forInit? SEMI expression? SEMI forUpdate? 
+ ; + +forInit + : localVariableDeclaration + | expressionList[false] + ; + +forUpdate + : expressionList[false] + ; + + +// EXPRESSIONS + +castParExpression + : LPAREN type rparen + ; + +parExpression + : expressionInPar + ; + +expressionInPar + : LPAREN enhancedStatementExpression rparen + ; + +expressionList[boolean canSpread] + : expressionListElement[$canSpread] (COMMA expressionListElement[$canSpread])* + ; + +expressionListElement[boolean canSpread] + : ( MUL { require($canSpread, "spread operator is not allowed here", -1); } + | + ) expression + ; + +enhancedStatementExpression + : statementExpression + | standardLambdaExpression + ; + +statementExpression + : commandExpression #commandExprAlt + ; + +postfixExpression + : pathExpression op=(INC | DEC)? + ; + +expression + // qualified names, array expressions, method invocation, post inc/dec, type casting (level 1) + // The cast expression must be put before pathExpression to resolve the ambiguities between type casting and call on parentheses expression, e.g. 
(int)(1 / 2) + : castParExpression castOperandExpression #castExprAlt + | postfixExpression #postfixExprAlt + + // ~(BNOT)/!(LNOT) (level 1) + | (BITNOT | NOT) nls expression #unaryNotExprAlt + + // math power operator (**) (level 2) + | left=expression op=POWER nls right=expression #powerExprAlt + + // ++(prefix)/--(prefix)/+(unary)/-(unary) (level 3) + | op=(INC | DEC | ADD | SUB) expression #unaryAddExprAlt + + // multiplication/division/modulo (level 4) + | left=expression nls op=(MUL | DIV | MOD) nls right=expression #multiplicativeExprAlt + + // binary addition/subtraction (level 5) + | left=expression op=(ADD | SUB) nls right=expression #additiveExprAlt + + // bit shift expressions (level 6) + | left=expression nls + ( ( dlOp=LT LT + | tgOp=GT GT GT + | dgOp=GT GT + ) + | rangeOp=( RANGE_INCLUSIVE + | RANGE_EXCLUSIVE + ) + ) nls + right=expression #shiftExprAlt + + // boolean relational expressions (level 7) + | left=expression nls op=(AS | INSTANCEOF | NOT_INSTANCEOF) nls type #relationalExprAlt + | left=expression nls op=(LE | GE | GT | LT | IN | NOT_IN) nls right=expression #relationalExprAlt + + // equality/inequality (==/!=) (level 8) + | left=expression nls + op=( IDENTICAL + | NOT_IDENTICAL + | EQUAL + | NOTEQUAL + | SPACESHIP + ) nls + right=expression #equalityExprAlt + + // regex find and match (=~ and ==~) (level 8.5) + // jez: moved =~ closer to precedence of == etc, as... 
+ + // 'if (foo =~ "a.c")' is very close in intent to 'if (foo == "abc")' + | left=expression nls op=(REGEX_FIND | REGEX_MATCH) nls right=expression #regexExprAlt + + // bitwise or non-short-circuiting and (&) (level 9) + | left=expression nls op=BITAND nls right=expression #andExprAlt + + // exclusive or (^) (level 10) + | left=expression nls op=XOR nls right=expression #exclusiveOrExprAlt + + // bitwise or non-short-circuiting or (|) (level 11) + | left=expression nls op=BITOR nls right=expression #inclusiveOrExprAlt + + // logical and (&&) (level 12) + | left=expression nls op=AND nls right=expression #logicalAndExprAlt + + // logical or (||) (level 13) + | left=expression nls op=OR nls right=expression #logicalOrExprAlt + + // conditional test (level 14) + | con=expression nls + ( QUESTION nls tb=expression nls COLON nls + | ELVIS nls + ) + fb=expression #conditionalExprAlt + + // assignment expression (level 15) + // "(a) = [1]" is a special case of multipleAssignmentExprAlt, it will be handled by assignmentExprAlt + | left=variableNames nls op=ASSIGN nls right=statementExpression #multipleAssignmentExprAlt + | left=expression nls + op=( ASSIGN + | ADD_ASSIGN + | SUB_ASSIGN + | MUL_ASSIGN + | DIV_ASSIGN + | AND_ASSIGN + | OR_ASSIGN + | XOR_ASSIGN + | RSHIFT_ASSIGN + | URSHIFT_ASSIGN + | LSHIFT_ASSIGN + | MOD_ASSIGN + | POWER_ASSIGN + | ELVIS_ASSIGN + ) nls + enhancedStatementExpression #assignmentExprAlt + ; + + +castOperandExpression +options { baseContext = expression; } + : castParExpression castOperandExpression #castExprAlt + | postfixExpression #postfixExprAlt + // ~(BNOT)/!(LNOT) (level 1) + | (BITNOT | NOT) nls castOperandExpression #unaryNotExprAlt + // ++(prefix)/--(prefix)/+(unary)/-(unary) (level 3) + | op=(INC | DEC | ADD | SUB) castOperandExpression #unaryAddExprAlt + ; + + +/* +enhancedExpression + : expression + | standardLambdaExpression + ; +*/ + +commandExpression + : expression + ( + { 
!SemanticPredicates.isFollowingArgumentsOrClosure($expression.ctx) }? + argumentList + | + /* if pathExpression is a method call, no need to have any more arguments */ + ) + + commandArgument* + ; + +commandArgument + : primary + // what follows is either a normal argument, parens, + // an appended block, an index operation, or nothing + // parens (a b already processed): + // a b c() d e -> a(b).c().d(e) + // a b c()() d e -> a(b).c().call().d(e) + // index (a b already processed): + // a b c[x] d e -> a(b).c[x].d(e) + // a b c[x][y] d e -> a(b).c[x][y].d(e) + // block (a b already processed): + // a b c {x} d e -> a(b).c({x}).d(e) + // + // parens/block completes method call + // index makes method call to property get with index + // + ( pathElement+ + | argumentList + )? + ; + +/** + * A "path expression" is a name or other primary, possibly qualified by various + * forms of dot, and/or followed by various kinds of brackets. + * It can be used for value or assigned to, or else further qualified, indexed, or called. + * It is called a "path" because it looks like a linear path through a data structure. + * Examples: x.y, x?.y, x*.y, x.@y; x[], x[y], x[y,z]; x(), x(y), x(y,z); x{s}; a.b[n].c(x).d{s} + * (Compare to a C lvalue, or LeftHandSide in the JLS section 15.26.) + * General expressions are built up from path expressions, using operators like '+' and '='. + * + * t 0: primary, 1: namePart, 2: arguments, 3: closureOrLambdaExpression, 4: indexPropertyArgs, 5: namedPropertyArgs, + * 6: non-static inner class creator + */ +pathExpression returns [int t] + : primary (pathElement { $t = $pathElement.t; })* + ; + +pathElement returns [int t] + : nls + + // AT: foo.@bar selects the field (or attribute), not property + ( + ( DOT // The all-powerful dot. 
+ | SPREAD_DOT // Spread operator: x*.y === x?.collect{it.y} + | SAFE_DOT // Optional-null operator: x?.y === (x==null)?null:x.y + | SAFE_CHAIN_DOT // Optional-null chain operator: x??.y.z === x?.y?.z + ) nls (AT | nonWildcardTypeArguments)? + | + METHOD_POINTER nls // Method pointer operator: foo.&y == foo.metaClass.getMethodPointer(foo, "y") + | + METHOD_REFERENCE nls // Method reference: System.out::println + ) + namePart + { $t = 1; } + | + nls DOT nls NEW creator[1] + { $t = 6; } + | arguments + { $t = 2; } + + // Can always append a block, as foo{bar} + | nls closureOrLambdaExpression + { $t = 3; } + + // Element selection is always an option, too. + // In Groovy, the stuff between brackets is a general argument list, + // since the bracket operator is transformed into a method call. + | indexPropertyArgs + { $t = 4; } + + | namedPropertyArgs + { $t = 5; } + ; + +/** + * This is the grammar for what can follow a dot: x.a, x.@a, x.&a, x.'a', etc. + */ +namePart + : + ( identifier + + // foo.'bar' is in all ways same as foo.bar, except that bar can have an arbitrary spelling + | stringLiteral + + | dynamicMemberName + + /* just a PROPOSAL, which has not been implemented yet! + // PROPOSAL, DECIDE: Is this inline form of the 'with' statement useful? + // Definition: a.{foo} === {with(a) {foo}} + // May cover some path expression use-cases previously handled by dynamic scoping (closure delegates). + | block + */ + + // let's allow common keywords as property names + | keywords + ) + ; + +/** + * If a dot is followed by a parenthesized or quoted expression, the member is computed dynamically, + * and the member selection is done only at runtime. This forces a statically unchecked member access. + */ +dynamicMemberName + : parExpression + | gstring + ; + +/** An expression may be followed by [...]. + * Unlike Java, these brackets may contain a general argument list, + * which is passed to the array element operator, which can make of it what it wants. 
+ * The brackets may also be empty, as in T[]. This is how Groovy names array types. + */ +indexPropertyArgs + : QUESTION? LBRACK expressionList[true]? RBRACK + ; + +namedPropertyArgs + : QUESTION? LBRACK (mapEntryList | COLON) RBRACK + ; + +primary + : + // Append `typeArguments?` to `identifier` to support constructor reference with generics, e.g. HashMap::new + // Though this is not a graceful solution, it is much faster than replacing `builtInType` with `type` + identifier typeArguments? #identifierPrmrAlt + | literal #literalPrmrAlt + | gstring #gstringPrmrAlt + | NEW nls creator[0] #newPrmrAlt + | THIS #thisPrmrAlt + | SUPER #superPrmrAlt + | parExpression #parenPrmrAlt + | closureOrLambdaExpression #closureOrLambdaExpressionPrmrAlt + | list #listPrmrAlt + | map #mapPrmrAlt + | builtInType #builtInTypePrmrAlt + ; + +list + : LBRACK expressionList[true]? COMMA? RBRACK + ; + +map + : LBRACK + ( mapEntryList COMMA? + | COLON + ) + RBRACK + ; + +mapEntryList + : mapEntry (COMMA mapEntry)* + ; + +mapEntry + : mapEntryLabel COLON nls expression + | MUL COLON nls expression + ; + +mapEntryLabel + : keywords + | primary + ; + +/** + * t 0: general creation; 1: non-static inner class creation + */ +creator[int t] + : createdName + ( {0 == $t || 1 == $t}? nls arguments anonymousInnerClassDeclaration[0]? + | {0 == $t}? (annotationsOpt LBRACK expression RBRACK)+ dimsOpt + | {0 == $t}? dims nls arrayInitializer + ) + ; + +arrayInitializer + : LBRACE nls variableInitializers? nls RBRACE + ; + +/** + * t 0: anonymous inner class; 1: anonymous enum + */ +anonymousInnerClassDeclaration[int t] + : classBody[0] + ; + +createdName + : annotationsOpt + ( primitiveType + | qualifiedClassName typeArgumentsOrDiamond? + ) + ; + +nonWildcardTypeArguments + : LT nls typeList nls GT + ; + +typeArgumentsOrDiamond + : LT GT + | typeArguments + ; + +arguments + : LPAREN enhancedArgumentList? COMMA? 
rparen + ; + +argumentList +options { baseContext = enhancedArgumentList; } + : argumentListElement + ( COMMA nls + argumentListElement + )* + ; + +enhancedArgumentList + : enhancedArgumentListElement + ( COMMA nls + enhancedArgumentListElement + )* + ; + +argumentListElement +options { baseContext = enhancedArgumentListElement; } + : expressionListElement[true] + | mapEntry + ; + +enhancedArgumentListElement + : expressionListElement[true] + | standardLambdaExpression + | mapEntry + ; + +stringLiteral + : StringLiteral + ; + +className + : CapitalizedIdentifier + ; + +identifier + : Identifier + | CapitalizedIdentifier + | VAR + | + // if 'static' followed by DOT, we can treat them as identifiers, e.g. static.unused = { -> } + { DOT == _input.LT(2).getType() }? + STATIC + | IN +// | DEF + | TRAIT + | AS + ; + +builtInType + : BuiltInPrimitiveType + | VOID + ; + +keywords + : ABSTRACT + | AS + | ASSERT + | BREAK + | CASE + | CATCH + | CLASS + | CONST + | CONTINUE + | DEF + | DEFAULT + | DO + | ELSE + | ENUM + | EXTENDS + | FINAL + | FINALLY + | FOR + | GOTO + | IF + | IMPLEMENTS + | IMPORT + | IN + | INSTANCEOF + | INTERFACE + | NATIVE + | NEW + | PACKAGE + | RETURN + | STATIC + | STRICTFP + | SUPER + | SWITCH + | SYNCHRONIZED + | THIS + | THROW + | THROWS + | TRANSIENT + | TRAIT + | THREADSAFE + | TRY + | VAR + | VOLATILE + | WHILE + + | NullLiteral + | BooleanLiteral + + | BuiltInPrimitiveType + | VOID + + | PUBLIC + | PROTECTED + | PRIVATE + ; + +rparen + : RPAREN + | + // !!!Error Alternative, impact the performance of parsing + { require(false, "Missing ')'"); } + ; + +nls + : NL* + ; + +sep : (NL | SEMI)+ + ; diff --git a/src/it/java/org/antlr/bazel/Antlr3Test.java b/src/it/java/org/antlr/bazel/Antlr3Test.java index c76234b..6ca5423 100644 --- a/src/it/java/org/antlr/bazel/Antlr3Test.java +++ b/src/it/java/org/antlr/bazel/Antlr3Test.java @@ -177,6 +177,29 @@ public void singleError() throws Exception } + @Test + public void objC() throws Exception + { + try 
(TestProject project = TestProject.create("examples/antlr3/ObjC")) + { + AntlrRules.create(project.root()) + .srcjar(project.srcjar().toString()) + .version("3") + .classpath(classpath()) + .outputDirectory(project.outputDirectory().toString()) + .grammars(project.grammars()) + .args(project.args()) + .generate(); + + project.validate("SimpleCLexer.h", + "SimpleCLexer.m", + "SimpleCParser.h", + "SimpleCParser.m", + "SimpleC.tokens"); + } + } + + private String[] classpath() throws Exception { Path root = Paths.get(System.getenv().get("RUNFILES_DIR")); diff --git a/src/it/java/org/antlr/bazel/BUILD b/src/it/java/org/antlr/bazel/BUILD index cc5faec..1064c49 100644 --- a/src/it/java/org/antlr/bazel/BUILD +++ b/src/it/java/org/antlr/bazel/BUILD @@ -1,18 +1,23 @@ load("@rules_java//java:defs.bzl", "java_library") load("//tools:gen_test_rules.bzl", "java_tests") -filegroup( +java_library( name = "support", srcs = glob( ["*.java"], exclude = ["*Test.java"], ), + javacopts = ["--release 11"], + deps = [ + "//src/test/java/org/antlr/bazel:tests", + "@junit//jar", + ], ) java_library( name = "antlr2_tests", testonly = True, - srcs = glob(["Antlr2Test.java"]) + [":support"], + srcs = glob(["Antlr2Test.java"]), data = [ "//:srcs", "//antlr:srcs", @@ -22,6 +27,7 @@ java_library( "@examples//antlr2:srcs", ], deps = [ + ":support", "//src/main/java/org/antlr/bazel", "//src/test/java/org/antlr/bazel:tests", "@antlr2//jar", @@ -41,7 +47,7 @@ java_tests( java_library( name = "antlr3_tests", testonly = True, - srcs = glob(["Antlr3Test.java"]) + [":support"], + srcs = glob(["Antlr3Test.java"]), data = [ "//:srcs", "//antlr:srcs", @@ -50,6 +56,7 @@ java_library( "@examples//antlr3:srcs", ], deps = [ + ":support", "//src/main/java/org/antlr/bazel", "//src/test/java/org/antlr/bazel:tests", "@antlr3_runtime//jar", @@ -71,7 +78,7 @@ java_tests( java_library( name = "antlr4_tests", testonly = True, - srcs = glob(["Antlr4Test.java"]) + [":support"], + srcs = glob(["Antlr4Test.java"]), data = 
[ "//:srcs", "//antlr:srcs", @@ -80,6 +87,7 @@ java_library( "@examples//antlr4:srcs", ], deps = [ + ":support", "//src/main/java/org/antlr/bazel", "//src/test/java/org/antlr/bazel:tests", "@antlr3_runtime//jar", @@ -103,7 +111,7 @@ java_tests( java_library( name = "repository_tests", testonly = True, - srcs = glob(["RepositoriesTest.java"]) + [":support"], + srcs = glob(["RepositoriesTest.java"]), data = [ "//:srcs", "//antlr:srcs", @@ -113,6 +121,7 @@ java_library( "@examples//antlr2:srcs", ], deps = [ + ":support", "//src/main/java/org/antlr/bazel", "//src/test/java/org/antlr/bazel:tests", "@antlr2//jar", diff --git a/src/main/java/org/antlr/bazel/AntlrRules.java b/src/main/java/org/antlr/bazel/AntlrRules.java index 68768a1..decd3f5 100644 --- a/src/main/java/org/antlr/bazel/AntlrRules.java +++ b/src/main/java/org/antlr/bazel/AntlrRules.java @@ -38,6 +38,11 @@ */ public class AntlrRules { + private final static CopyOption[] COPY_OPTIONS = { + StandardCopyOption.COPY_ATTRIBUTES, + StandardCopyOption.REPLACE_EXISTING + }; + private String[] args; private String[] classpath; private Charset encoding = Charset.defaultCharset(); @@ -50,6 +55,7 @@ public class AntlrRules private Path outputDirectory; private final Path sandbox; private Path srcjar; + private String target; private Version version; private Output output; private boolean split = true; @@ -88,6 +94,7 @@ public static void main(String[] args) throws Exception .namespace(env.get("PACKAGE_NAME")) .language(env.get("TARGET_LANGUAGE")) .layout(env.get("DIRECTORY_LAYOUT")) + .target(env.get("TARGET")) .args(args) .generate(); } @@ -128,6 +135,8 @@ AntlrRules encoding(String encoding) void generate() throws Exception { + expandSrcJarImports(); + Map> namespaces = groupByNamespace(grammars); // use reflection so we are not tied to a specific ANTLR version @@ -187,28 +196,15 @@ void generate() throws Exception Path other = Files.createDirectories( outputDirectory .getParent() - .resolve( - outputDirectory - 
.getFileName() - .toString() - .replace(".cc", ".antlr") - .replace(".go", ".antlr"))); + .resolve(target + ".antlr")); Path headers = Files.createDirectories( outputDirectory .getParent() - .resolve( - outputDirectory - .getFileName() - .toString() - .replace(".cc", ".inc"))); + .resolve(target + ".inc")); Path includes = Files.createDirectories( outputDirectory .getParent() - .resolve( - outputDirectory - .getFileName() - .toString() - .replace(".cc", ".inc"))); + .resolve(target + ".inc")); Files.createDirectories(includes); List files = new ArrayList<>(); @@ -218,9 +214,9 @@ void generate() throws Exception PathMatcher expanded = outputDirectory.getFileSystem() .getPathMatcher("glob:**/expanded*.g"); PathMatcher csources = outputDirectory.getFileSystem() - .getPathMatcher("glob:**.{c,cc,cpp,cxx,c++,C}"); + .getPathMatcher("glob:**.{c,cc,cpp,cxx,c++,C,m,mm}"); PathMatcher cheaders = outputDirectory.getFileSystem() - .getPathMatcher("glob:**.{h,hh,hpp,hxx,inc,inl,H}"); + .getPathMatcher("glob:**.{h,hh,hpp,hxx,h++,inc,inl,ipp,pch,tlh,tli,H}"); PathMatcher gosources = outputDirectory.getFileSystem() .getPathMatcher("glob:**.{go}"); @@ -259,6 +255,7 @@ void generate() throws Exception { case C : case CPP : + case OBJC: { if (cheaders.matches(entry)) { @@ -269,7 +266,6 @@ void generate() throws Exception .resolve(entry.getFileName()); Files.createDirectories(target.getParent()); Files.move(entry, target); - continue; } } @@ -326,12 +322,6 @@ else if (!csources.matches(entry)) Files.walkFileTree(outputDirectory, new SimpleFileVisitor() { - CopyOption[] options = - { - StandardCopyOption.COPY_ATTRIBUTES, - StandardCopyOption.REPLACE_EXISTING - }; - @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attr) throws IOException @@ -368,7 +358,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attr) } Files.createDirectories(target.getParent()); - Files.copy(file, target, options); + Files.copy(file, target, COPY_OPTIONS); return 
CONTINUE; } @@ -428,7 +418,17 @@ AntlrRules outputDirectory(String directory) AntlrRules srcjar(String srcjar) { this.srcjar = sandbox.resolve(srcjar); - this.output = srcjar.isBlank() ? Output.FOLDER : Output.SRCJAR; + this.output = srcjar.trim().isEmpty() ? Output.FOLDER : Output.SRCJAR; + + return this; + } + + + AntlrRules target(String target) + { + if (target == null) throw new NullPointerException("target must not be null"); + + this.target = target; return this; } @@ -539,6 +539,52 @@ private URLClassLoader classloader(String[] classpath) throws IOException } + private void expandSrcJarImports() throws IOException + { + for (int i = 0; i < args.length; i++) + { + // ANTLR can't handle imports in an archive. We therefore expand it and alter + // the lib path accordingly + if (args[i].equals("-lib") && args[i + 1].endsWith(".srcjar")) + { + Path srcjar = sandbox.resolve(args[i + 1]); + URI uri = URI.create("jar:file:" + srcjar.toUri().getPath()); + + try (FileSystem fs = FileSystems.newFileSystem(uri, new HashMap())) + { + Path root = fs.getPath("/"); + Path target = sandbox.resolve(this.target + ".imports"); + + Files.createDirectories(target); + Files.walkFileTree(root, new SimpleFileVisitor() + { + @Override + public FileVisitResult preVisitDirectory(Path dir, + BasicFileAttributes attrs) throws IOException + { + Files.createDirectories(target.resolve(root.relativize(dir).toString())); + + return FileVisitResult.CONTINUE; + } + + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException + { + Files.copy(file, target.resolve(file.getFileName().toString()), COPY_OPTIONS); + + return FileVisitResult.CONTINUE; + } + }); + + args[i + 1] = sandbox.relativize(target).toString(); + } + } + } + } + + /** * Finds the grammar that corresponds to the given generated file. 
* diff --git a/src/main/java/org/antlr/bazel/Language.java b/src/main/java/org/antlr/bazel/Language.java index ca85b6f..050a8f3 100644 --- a/src/main/java/org/antlr/bazel/Language.java +++ b/src/main/java/org/antlr/bazel/Language.java @@ -541,7 +541,8 @@ public static Language of(String name) */ private static String header(String grammar) { - Matcher header = HEADER.matcher(grammar); + // remove comments to avoid erroneous matches + Matcher header = HEADER.matcher(grammar.replaceAll("(?m://.*$)|(?s:/\\*.*?\\*/)","")); return header.find() ? header.group(1) : null; } diff --git a/src/test/java/org/antlr/bazel/LanguageTest.java b/src/test/java/org/antlr/bazel/LanguageTest.java index 763e913..0f77406 100644 --- a/src/test/java/org/antlr/bazel/LanguageTest.java +++ b/src/test/java/org/antlr/bazel/LanguageTest.java @@ -116,6 +116,9 @@ public void namespace() JAVA.detectNamespace("header {package foo.bar;}").toString()); assertEquals("foo.bar", JAVA.detectNamespace("header {package\nfoo.bar;}").toString()); + assertEquals("org.antlr.v4.parse", JAVA.detectNamespace("// @header test { comment }\n@lexer::header {\npackage org.antlr.v4.parse;\n}").toString()); + assertEquals("org.antlr.v4.parse", JAVA.detectNamespace("/* @header test { comment } */\n@lexer::header {\npackage org.antlr.v4.parse;\n}").toString()); + assertEquals("org.antlr.v4.parse", JAVA.detectNamespace("/*\n * @header test { comment }\n */\n@lexer::header {\npackage org.antlr.v4.parse;\n}").toString()); assertEquals(null, OBJC.detectNamespace(""));