diff --git a/README.md b/README.md index 4aa6eeb3a..9a03a3546 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,208 @@ CDAP IRC Channel: [#cdap on irc.freenode.net](http://webchat.freenode.net?channe CDAP Users on Slack: [cdap-users team](https://cdap-users.herokuapp.com) + + +# Data Prep + +![cm-available](https://cdap-users.herokuapp.com/assets/cm-available.svg) +![cdap-transform](https://cdap-users.herokuapp.com/assets/cdap-transform.svg) +[![Build Status](https://travis-ci.org/cdapio/hydrator-plugins.svg?branch=develop)](https://travis-ci.org/cdapio/hydrator-plugins) +[![Coverity Scan Build Status](https://scan.coverity.com/projects/11434/badge.svg)](https://scan.coverity.com/projects/hydrator-wrangler-transform) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.cdap.wrangler/wrangler-core/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.cdap.wrangler/wrangler-core) +[![Javadoc](https://javadoc-emblem.rhcloud.com/doc/io.cdap.wrangler/wrangler-core/badge.svg)](http://www.javadoc.io/doc/io.cdap.wrangler/wrangler-core) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Join CDAP community](https://cdap-users.herokuapp.com/badge.svg?t=wrangler)](https://cdap-users.herokuapp.com?t=1) + +A collection of libraries, a pipeline plugin, and a CDAP service for performing data +cleansing, transformation, and filtering using a set of data manipulation instructions +(directives). These instructions are either generated using an interactive visual tool or +are manually created. + + * Data Prep defines a few concepts that might be useful if you are just getting started with it. Learn about them [here](wrangler-docs/concepts.md) + * The Data Prep Transform is [separately documented](wrangler-transform/wrangler-docs/data-prep-transform.md). 
+ * [Data Prep Cheatsheet](wrangler-docs/cheatsheet.md) + +## New Features + +More [here](wrangler-docs/upcoming-features.md) on upcoming features. + + * **User Defined Directives, also known as UDD**, allow you to create custom functions to transform records within CDAP DataPrep or a.k.a Wrangler. CDAP comes with a comprehensive library of functions. There are however some omissions, and some specific cases for which UDDs are the solution. Additional information on how you can build your custom directives [here](wrangler-docs/custom-directive.md). + * Migrating directives from version 1.0 to version 2.0 [here](wrangler-docs/directive-migration.md) + * Information about Grammar [here](wrangler-docs/grammar/grammar-info.md) + * Various `TokenType` supported by system [here](../api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java) + * Custom Directive Implementation Internals [here](wrangler-docs/udd-internal.md) + + * A new capability that allows CDAP Administrators to **restrict the directives** that are accessible to their users. +More information on configuring can be found [here](wrangler-docs/exclusion-and-aliasing.md) + +## Demo Videos and Recipes + +Videos and Screencasts are best way to learn, so we have compiled simple, short screencasts that shows some of the features of Data Prep. 
Additional videos can be found [here](https://www.youtube.com/playlist?list=PLhmsf-NvXKJn-neqefOrcl4n7zU4TWmIr) + +### Videos + + * [SCREENCAST] [Creating Lookup Dataset and Joining](https://www.youtube.com/watch?v=Nc1b0rsELHQ) + * [SCREENCAST] [Restricted Directives](https://www.youtube.com/watch?v=71EcMQU714U) + * [SCREENCAST] [Parse Excel files in CDAP](https://www.youtube.com/watch?v=su5L1noGlEk) + * [SCREENCAST] [Parse File As AVRO File](https://www.youtube.com/watch?v=tmwAw4dKUNc) + * [SCREENCAST] [Parsing Binary Coded AVRO Messages](https://www.youtube.com/watch?v=Ix_lPo-PDJY) + * [SCREENCAST] [Parsing Binary Coded AVRO Messages & Protobuf messages using schema registry](https://www.youtube.com/watch?v=LVLIdWnUX1k) + * [SCREENCAST] [Quantize a column - Digitize](https://www.youtube.com/watch?v=VczkYX5SRtY) + * [SCREENCAST] [Data Cleansing capability with send-to-error directive](https://www.youtube.com/watch?v=aZd5H8hIjDc) + * [SCREENCAST] [Building Data Prep from the GitHub source](https://youtu.be/pGGjKU04Y38) + * [VOICE-OVER] [End-to-End Demo Video](https://youtu.be/AnhF0qRmn24) + * [SCREENCAST] [Ingesting into Kudu](https://www.youtube.com/watch?v=KBW7a38vlUM) + * [SCREENCAST] [Realtime HL7 CCDA XML from Kafka into Time Parititioned Parquet](https://youtu.be/0fqNmnOnD-0) + * [SCREENCAST] [Parsing JSON file](https://youtu.be/vwnctcGDflE) + * [SCREENCAST] [Flattening arrays](https://youtu.be/SemHxgBYIsY) + * [SCREENCAST] [Data cleansing with send-to-error directive](https://www.youtube.com/watch?v=aZd5H8hIjDc) + * [SCREENCAST] [Publishing to Kafka](https://www.youtube.com/watch?v=xdc8pvvlI48) + * [SCREENCAST] [Fixed length to JSON](https://www.youtube.com/watch?v=3AXu4m1swuM) + +### Recipes + + * [Parsing Apache Log Files](wrangler-demos/parsing-apache-log-files.md) + * [Parsing CSV Files and Extracting Column Values](wrangler-demos/parsing-csv-extracting-column-values.md) + * [Parsing HL7 CCDA XML Files](wrangler-demos/parsing-hl7-ccda-xml-files.md) + +## 
Available Directives + +These directives are currently available: + +| Directive | Description | +| ---------------------------------------------------------------------- | ---------------------------------------------------------------- | +| **Parsers** | | +| [JSON Path](wrangler-docs/directives/json-path.md) | Uses a DSL (a JSON path expression) for parsing JSON records | +| [Parse as AVRO](wrangler-docs/directives/parse-as-avro.md) | Parsing an AVRO encoded message - either as binary or json | +| [Parse as AVRO File](wrangler-docs/directives/parse-as-avro-file.md) | Parsing an AVRO data file | +| [Parse as CSV](wrangler-docs/directives/parse-as-csv.md) | Parsing an input record as comma-separated values | +| [Parse as Date](wrangler-docs/directives/parse-as-date.md) | Parsing dates using natural language processing | +| [Parse as Excel](wrangler-docs/directives/parse-as-excel.md) | Parsing excel file. | +| [Parse as Fixed Length](wrangler-docs/directives/parse-as-fixed-length.md) | Parses as a fixed length record with specified widths | +| [Parse as HL7](wrangler-docs/directives/parse-as-hl7.md) | Parsing Health Level 7 Version 2 (HL7 V2) messages | +| [Parse as JSON](wrangler-docs/directives/parse-as-json.md) | Parsing a JSON object | +| [Parse as Log](wrangler-docs/directives/parse-as-log.md) | Parses access log files as from Apache HTTPD and nginx servers | +| [Parse as Protobuf](wrangler-docs/directives/parse-as-log.md) | Parses an Protobuf encoded in-memory message using descriptor | +| [Parse as Simple Date](wrangler-docs/directives/parse-as-simple-date.md) | Parses date strings | +| [Parse XML To JSON](wrangler-docs/directives/parse-xml-to-json.md) | Parses an XML document into a JSON structure | +| [Parse as Currency](wrangler-docs/directives/parse-as-currency.md) | Parses a string representation of currency into a number. 
| +| [Parse as Datetime](wrangler-docs/directives/parse-as-datetime.md) | Parses strings with datetime values to CDAP datetime type | +| **Output Formatters** | | +| [Write as CSV](wrangler-docs/directives/write-as-csv.md) | Converts a record into CSV format | +| [Write as JSON](wrangler-docs/directives/write-as-json-map.md) | Converts the record into a JSON map | +| [Write JSON Object](wrangler-docs/directives/write-as-json-object.md) | Composes a JSON object based on the fields specified. | +| [Format as Currency](wrangler-docs/directives/format-as-currency.md) | Formats a number as currency as specified by locale. | +| **Transformations** | | +| [Changing Case](wrangler-docs/directives/changing-case.md) | Changes the case of column values | +| [Cut Character](wrangler-docs/directives/cut-character.md) | Selects parts of a string value | +| [Set Column](wrangler-docs/directives/set-column.md) | Sets the column value to the result of an expression execution | +| [Find and Replace](wrangler-docs/directives/find-and-replace.md) | Transforms string column values using a "sed"-like expression | +| [Index Split](wrangler-docs/directives/index-split.md) | (_Deprecated_) | +| [Invoke HTTP](wrangler-docs/directives/invoke-http.md) | Invokes an HTTP Service (_Experimental_, potentially slow) | +| [Quantization](wrangler-docs/directives/quantize.md) | Quantizes a column based on specified ranges | +| [Regex Group Extractor](wrangler-docs/directives/extract-regex-groups.md) | Extracts the data from a regex group into its own column | +| [Setting Character Set](wrangler-docs/directives/set-charset.md) | Sets the encoding and then converts the data to a UTF-8 String | +| [Setting Record Delimiter](wrangler-docs/directives/set-record-delim.md) | Sets the record delimiter | +| [Split by Separator](wrangler-docs/directives/split-by-separator.md) | Splits a column based on a separator into two columns | +| [Split Email Address](wrangler-docs/directives/split-email.md) | Splits an 
email ID into an account and its domain | +| [Split URL](wrangler-docs/directives/split-url.md) | Splits a URL into its constituents | +| [Text Distance (Fuzzy String Match)](wrangler-docs/directives/text-distance.md) | Measures the difference between two sequences of characters | +| [Text Metric (Fuzzy String Match)](wrangler-docs/directives/text-metric.md) | Measures the difference between two sequences of characters | +| [URL Decode](wrangler-docs/directives/url-decode.md) | Decodes from the `application/x-www-form-urlencoded` MIME format | +| [URL Encode](wrangler-docs/directives/url-encode.md) | Encodes to the `application/x-www-form-urlencoded` MIME format | +| [Trim](wrangler-docs/directives/trim.md) | Functions for trimming white spaces around string data | +| **Encoders and Decoders** | | +| [Decode](wrangler-docs/directives/decode.md) | Decodes a column value as one of `base32`, `base64`, or `hex` | +| [Encode](wrangler-docs/directives/encode.md) | Encodes a column value as one of `base32`, `base64`, or `hex` | +| **Unique ID** | | +| [UUID Generation](wrangler-docs/directives/generate-uuid.md) | Generates a universally unique identifier (UUID) .Recommended to use with Wrangler version 4.4.0 and above due to an important bug fix [CDAP-17732](https://cdap.atlassian.net/browse/CDAP-17732) | +| **Date Transformations** | | +| [Diff Date](wrangler-docs/directives/diff-date.md) | Calculates the difference between two dates | +| [Format Date](wrangler-docs/directives/format-date.md) | Custom patterns for date-time formatting | +| [Format Unix Timestamp](wrangler-docs/directives/format-unix-timestamp.md) | Formats a UNIX timestamp as a date | +| **DateTime Transformations** | | +| [Current DateTime](wrangler-docs/directives/current-datetime.md) | Generates the current datetime using the given zone or UTC by default| +| [Datetime To Timestamp](wrangler-docs/directives/datetime-to-timestamp.md) | Converts a datetime value to timestamp with the given zone | +| 
[Format Datetime](wrangler-docs/directives/format-datetime.md) | Formats a datetime value to custom date time pattern strings | +| [Timestamp To Datetime](wrangler-docs/directives/timestamp-to-datetime.md) | Converts a timestamp value to datetime | +| **Lookups** | | +| [Catalog Lookup](wrangler-docs/directives/catalog-lookup.md) | Static catalog lookup of ICD-9, ICD-10-2016, ICD-10-2017 codes | +| [Table Lookup](wrangler-docs/directives/table-lookup.md) | Performs lookups into Table datasets | +| **Hashing & Masking** | | +| [Message Digest or Hash](wrangler-docs/directives/hash.md) | Generates a message digest | +| [Mask Number](wrangler-docs/directives/mask-number.md) | Applies substitution masking on the column values | +| [Mask Shuffle](wrangler-docs/directives/mask-shuffle.md) | Applies shuffle masking on the column values | +| **Row Operations** | | +| [Filter Row if Matched](wrangler-docs/directives/filter-row-if-matched.md) | Filters rows that match a pattern for a column | +| [Filter Row if True](wrangler-docs/directives/filter-row-if-true.md) | Filters rows if the condition is true. | +| [Filter Row Empty or Null](wrangler-docs/directives/filter-empty-or-null.md) | Filters rows that are empty or null. | +| [Flatten](wrangler-docs/directives/flatten.md) | Separates the elements in a repeated field | +| [Fail on condition](wrangler-docs/directives/fail.md) | Fails processing when the condition is evaluated to true. 
| +| [Send to Error](wrangler-docs/directives/send-to-error.md) | Filtering of records to an error collector | +| [Send to Error And Continue](wrangler-docs/directives/send-to-error-and-continue.md) | Filtering of records to an error collector and continues processing | +| [Split to Rows](wrangler-docs/directives/split-to-rows.md) | Splits based on a separator into multiple records | +| **Column Operations** | | +| [Change Column Case](wrangler-docs/directives/change-column-case.md) | Changes column names to either lowercase or uppercase | +| [Changing Case](wrangler-docs/directives/changing-case.md) | Change the case of column values | +| [Cleanse Column Names](wrangler-docs/directives/cleanse-column-names.md) | Sanitizes column names, following specific rules | +| [Columns Replace](wrangler-docs/directives/columns-replace.md) | Alters column names in bulk | +| [Copy](wrangler-docs/directives/copy.md) | Copies values from a source column into a destination column | +| [Drop Column](wrangler-docs/directives/drop.md) | Drops a column in a record | +| [Fill Null or Empty Columns](wrangler-docs/directives/fill-null-or-empty.md) | Fills column value with a fixed value if null or empty | +| [Keep Columns](wrangler-docs/directives/keep.md) | Keeps specified columns from the record | +| [Merge Columns](wrangler-docs/directives/merge.md) | Merges two columns by inserting a third column | +| [Rename Column](wrangler-docs/directives/rename.md) | Renames an existing column in the record | +| [Set Column Header](wrangler-docs/directives/set-headers.md) | Sets the names of columns, in the order they are specified | +| [Split to Columns](wrangler-docs/directives/split-to-columns.md) | Splits a column based on a separator into multiple columns | +| [Swap Columns](wrangler-docs/directives/swap.md) | Swaps column names of two columns | +| [Set Column Data Type](wrangler-docs/directives/set-type.md) | Convert data type of a column | +| **NLP** | | +| [Stemming Tokenized 
Words](wrangler-docs/directives/stemming.md) | Applies the Porter stemmer algorithm for English words | +| **Transient Aggregators & Setters** | | +| [Increment Variable](wrangler-docs/directives/increment-variable.md) | Increments a transient variable with a record of processing. | +| [Set Variable](wrangler-docs/directives/set-variable.md) | Sets a transient variable with a record of processing. | +| **Functions** | | +| [Data Quality](wrangler-docs/functions/dq-functions.md) | Data quality check functions. Checks for date, time, etc. | +| [Date Manipulations](wrangler-docs/functions/date-functions.md) | Functions that can manipulate date | +| [DDL](wrangler-docs/functions/ddl-functions.md) | Functions that can manipulate definition of data | +| [JSON](wrangler-docs/functions/json-functions.md) | Functions that can be useful in transforming your data | +| [Types](wrangler-docs/functions/type-functions.md) | Functions for detecting the type of data | + +## Performance + +Initial performance tests show that with a set of directives of high complexity for +transforming data, *DataPrep* is able to process at about ~106K records per second. The +rates below are specified as *records/second*. + +| Directive Complexity | Column Count | Records | Size | Mean Rate | +| -------------------- | :----------: | ---------: | -------------: | --------: | +| High (167 Directives) | 426 | 127,946,398 | 82,677,845,324 | 106,367.27 | +| High (167 Directives) | 426 | 511,785,592 | 330,711,381,296 | 105,768.93 | + + +## Contact + +### Mailing Lists + +CDAP User Group and Development Discussions: + +* [cdap-user@googlegroups.com](https://groups.google.com/d/forum/cdap-user) + +The *cdap-user* mailing list is primarily for users using the product to develop +applications or building plugins for applications. You can expect questions from +users, release announcements, and any other discussions that we think will be helpful +to the users. 
+ +### IRC Channel + +CDAP IRC Channel: [#cdap on irc.freenode.net](http://webchat.freenode.net?channels=%23cdap) + +### Slack Team + +CDAP Users on Slack: [cdap-users team](https://cdap-users.herokuapp.com) + + ## License and Trademarks Copyright © 2016-2019 Cask Data, Inc. @@ -216,3 +418,4 @@ Cask is a trademark of Cask Data, Inc. All rights reserved. Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with permission. No endorsement by The Apache Software Foundation is implied by the use of these marks. + diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/DirectiveContext.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/DirectiveContext.java index 78df981d6..f57b1af72 100644 --- a/wrangler-api/src/main/java/io/cdap/wrangler/api/DirectiveContext.java +++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/DirectiveContext.java @@ -18,7 +18,7 @@ /** * {@link DirectiveContext} provides the context object to the processing of - * directives. + */ public interface DirectiveContext extends DirectiveEnforcer, DirectiveAlias { } diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java new file mode 100644 index 000000000..22e1ebd04 --- /dev/null +++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java @@ -0,0 +1,58 @@ +/* + * Copyright © 2025 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+// --- File: ByteSize.java ---
+package io.cdap.wrangler.api.parser;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonPrimitive;
+
+import java.util.Locale;
+
+/**
+ * Token representing a byte size value (e.g., "10KB", "2.5MB").
+ *
+ * <p>The text form is a decimal number followed by an optional, case-insensitive
+ * unit: {@code B}, {@code KB}, {@code MB}, {@code GB}, {@code TB} or {@code PB}.
+ * Units are binary multiples (1 KB = 1024 bytes). The parsed value is kept as a
+ * whole number of bytes; any fractional byte is truncated by the cast to
+ * {@code long}.
+ */
+public class ByteSize implements Token {
+  private final long bytes;
+
+  /**
+   * Parses a byte size such as {@code "10KB"} or {@code "2.5mb"}.
+   *
+   * @param value number followed by an optional unit; a value without a unit is
+   *              taken as a count of bytes
+   * @throws NumberFormatException if {@code value} contains no parsable number
+   * @throws IllegalArgumentException if the unit is not one of the supported units
+   */
+  public ByteSize(String value) {
+    // Whatever is not a digit or '.' is the unit. Locale.ROOT keeps the case
+    // conversion independent of the JVM's default locale.
+    String unit = value.replaceAll("[0-9.]", "").trim().toUpperCase(Locale.ROOT);
+    double number = Double.parseDouble(value.replaceAll("[^0-9.]", ""));
+    this.bytes = (long) (number * bytesPerUnit(unit, value));
+  }
+
+  /**
+   * Returns how many bytes one {@code unit} stands for.
+   *
+   * @param unit upper-cased unit text, possibly empty
+   * @param value the original input, used only for the error message
+   * @throws IllegalArgumentException if {@code unit} is not recognised
+   */
+  private static long bytesPerUnit(String unit, String value) {
+    switch (unit) {
+      case "":
+      case "B":
+        return 1L;
+      case "KB":
+        return 1L << 10;
+      case "MB":
+        return 1L << 20;
+      case "GB":
+        return 1L << 30;
+      case "TB":
+        return 1L << 40;
+      case "PB":
+        return 1L << 50;
+      default:
+        // Previously any unknown unit (including TB and PB) was silently read as bytes.
+        throw new IllegalArgumentException(
+          String.format("Unsupported byte size unit '%s' in '%s'", unit, value));
+    }
+  }
+
+  /**
+   * @return the size normalised to bytes.
+   */
+  public long getBytes() {
+    return this.bytes;
+  }
+
+  /**
+   * @return the size in bytes, boxed as a {@code Long}.
+   */
+  @Override
+  public Object value() {
+    return bytes;
+  }
+
+  /**
+   * @return {@link TokenType#BYTE_SIZE}.
+   */
+  @Override
+  public TokenType type() {
+    return TokenType.BYTE_SIZE;
+  }
+
+  /**
+   * @return a JSON number holding the size in bytes.
+   */
+  @Override
+  public JsonElement toJson() {
+    return new JsonPrimitive(bytes);
+  }
+}
+
diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java
new file mode 100644
index 000000000..964783cd1
--- /dev/null
+++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2025 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+
+// --- File: TimeDuration.java ---
+package io.cdap.wrangler.api.parser;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonPrimitive;
+
+import java.util.Locale;
+
+/**
+ * Token representing a time duration value (e.g., "150ms", "2.5s").
+ *
+ * <p>The text form is a decimal number followed by an optional, case-insensitive
+ * unit: {@code ns}, {@code us}, {@code ms}, {@code s}, {@code m} (or {@code min}),
+ * {@code h} or {@code d}. The parsed value is kept as a whole number of
+ * milliseconds; anything below one millisecond is truncated by the cast to
+ * {@code long}.
+ */
+public class TimeDuration implements Token {
+  private final long milliseconds;
+
+  /**
+   * Parses a duration such as {@code "150ms"}, {@code "2.5s"} or {@code "3h"}.
+   *
+   * @param value number followed by an optional unit; a value without a unit is
+   *              taken as milliseconds
+   * @throws NumberFormatException if {@code value} contains no parsable number
+   * @throws IllegalArgumentException if the unit is not one of the supported units
+   */
+  public TimeDuration(String value) {
+    // Whatever is not a digit or '.' is the unit. Locale.ROOT keeps "MIN" -> "min"
+    // working regardless of the JVM's default locale.
+    String unit = value.replaceAll("[0-9.]", "").trim().toLowerCase(Locale.ROOT);
+    double number = Double.parseDouble(value.replaceAll("[^0-9.]", ""));
+    this.milliseconds = (long) toMillis(number, unit, value);
+  }
+
+  /**
+   * Converts {@code number} expressed in {@code unit} to milliseconds.
+   *
+   * @param number the numeric part of the input
+   * @param unit lower-cased unit text, possibly empty
+   * @param value the original input, used only for the error message
+   * @throws IllegalArgumentException if {@code unit} is not recognised
+   */
+  private static double toMillis(double number, String unit, String value) {
+    switch (unit) {
+      case "ns":
+        return number / 1_000_000d;
+      case "us":
+        return number / 1_000d;
+      case "":
+      case "ms":
+        return number;
+      case "s":
+        return number * 1_000d;
+      case "m":
+      case "min":
+        return number * 60_000d;
+      case "h":
+        return number * 3_600_000d;
+      case "d":
+        return number * 86_400_000d;
+      default:
+        // Previously any unknown unit (including m, h, d, ns, us) was read as milliseconds.
+        throw new IllegalArgumentException(
+          String.format("Unsupported time duration unit '%s' in '%s'", unit, value));
+    }
+  }
+
+  /**
+   * @return the duration normalised to milliseconds.
+   */
+  public long getMilliseconds() {
+    return milliseconds;
+  }
+
+  /**
+   * @return the duration in milliseconds, boxed as a {@code Long}.
+   */
+  @Override
+  public Object value() {
+    return milliseconds;
+  }
+
+  /**
+   * @return {@link TokenType#TIME_DURATION}.
+   */
+  @Override
+  public TokenType type() {
+    return TokenType.TIME_DURATION;
+  }
+
+  /**
+   * @return a JSON number holding the duration in milliseconds.
+   */
+  @Override
+  public JsonElement toJson() {
+    return new JsonPrimitive(milliseconds);
+  }
+}
+
+
diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/Token.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/Token.java
index bc596f4df..5049c319e 100644
--- a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/Token.java
+++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/Token.java
@@ -14,46 +14,47 @@
  * the License.
  */
 
-package io.cdap.wrangler.api.parser;
+ package io.cdap.wrangler.api.parser;
 
-import com.google.gson.JsonElement;
-import io.cdap.wrangler.api.annotations.PublicEvolving;
-
-import java.io.Serializable;
-
-/**
- * The Token class represents the object that contains the value and type of
- * the token as parsed by the parser of the grammar defined for recipe.
- *
- *

This class provides methods for retrieving the wrapped value of token parsed - * as well the type of token the implementation of this interface represents.

- * - *

It also provides method for providing the {@code JsonElement} of implementation - * of this interface.

- */ -@PublicEvolving -public interface Token extends Serializable { - /** - * Returns the {@code value} of the object wrapped by the - * implementation of this interface. - * - * @return {@code value} wrapped by the implementation of this interface. - */ - Object value(); - - /** - * Returns the {@code TokenType} of the object represented by the - * implementation of this interface. - * - * @return {@code TokenType} of the implementation object. - */ - TokenType type(); - - /** - * The class implementing this interface will return the {@code JsonElement} - * instance including the values of the object. - * - * @return {@code JsonElement} object containing members of implementing class. - */ - JsonElement toJson(); -} + import com.google.gson.JsonElement; + import io.cdap.wrangler.api.annotations.PublicEvolving; + + import java.io.Serializable; + + /** + * The Token class represents the object that contains the value and type of + * the token as parsed by the parser of the grammar defined for recipe. + * + *

This class provides methods for retrieving the wrapped value of token parsed + * as well the type of token the implementation of this interface represents.

+ * + *

It also provides method for providing the {@code JsonElement} of implementation + * of this interface.

+ */ + @PublicEvolving + public interface Token extends Serializable { + /** + * Returns the {@code value} of the object wrapped by the + * implementation of this interface. + * + * @return {@code value} wrapped by the implementation of this interface. + */ + Object value(); + + /** + * Returns the {@code TokenType} of the object represented by the + * implementation of this interface. + * + * @return {@code TokenType} of the implementation object. + */ + TokenType type(); + + /** + * The class implementing this interface will return the {@code JsonElement} + * instance including the values of the object. + * + * @return {@code JsonElement} object containing members of implementing class. + */ + JsonElement toJson(); + } + \ No newline at end of file diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java index 8c93b0e6a..219e2a9d2 100644 --- a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java +++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java @@ -8,149 +8,114 @@ * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ -package io.cdap.wrangler.api.parser; - -import io.cdap.wrangler.api.annotations.PublicEvolving; - -import java.io.Serializable; - -/** - * The TokenType class provides the enumerated types for different types of - * tokens that are supported by the grammar. - * - * Each of the enumerated types specified in this class also has associated - * object representing it. e.g. {@code DIRECTIVE_NAME} is represented by the - * object {@code DirectiveName}. - * - * @see Bool - * @see BoolList - * @see ColumnName - * @see ColumnNameList - * @see DirectiveName - * @see Numeric - * @see NumericList - * @see Properties - * @see Ranges - * @see Expression - * @see Text - * @see TextList - */ -@PublicEvolving -public enum TokenType implements Serializable { - /** - * Represents the enumerated type for the object {@code DirectiveName} type. - * This type is associated with the token that is recognized as a directive - * name within the recipe. - */ - DIRECTIVE_NAME, - - /** - * Represents the enumerated type for the object of {@code ColumnName} type. - * This type is associated with token that represents the column as defined - * by the grammar as :. - */ - COLUMN_NAME, - - /** - * Represents the enumerated type for the object of {@code Text} type. - * This type is associated with the token that is either enclosed within a single quote(') - * or a double quote (") as string. - */ - TEXT, - - /** - * Represents the enumerated type for the object of {@code Numeric} type. - * This type is associated with the token that is either a integer or real number. - */ - NUMERIC, - - /** - * Represents the enumerated type for the object of {@code Bool} type. - * This type is associated with the token that either represents string 'true' or 'false'. - */ - BOOLEAN, - - /** - * Represents the enumerated type for the object of type {@code BoolList} type. - * This type is associated with the rule that is a collection of {@code Boolean} values - * separated by comman(,). E.g. 
- * - * ColumnName[,ColumnName]* - * - */ - COLUMN_NAME_LIST, - - /** - * Represents the enumerated type for the object of type {@code TextList} type. - * This type is associated with the comma separated text represented were each text - * is enclosed within a single quote (') or double quote (") and each text is separated - * by comma (,). E.g. - * - * Text[,Text]* - * - */ - TEXT_LIST, - - /** - * Represents the enumerated type for the object of type {@code NumericList} type. - * This type is associated with the collection of {@code Numeric} values separated by - * comma(,). E.g. - * - * Numeric[,Numeric]* - * - * - */ - NUMERIC_LIST, - - /** - * Represents the enumerated type for the object of type {@code BoolList} type. - * This type is associated with the collection of {@code Bool} values separated by - * comma(,). E.g. - * - * Boolean[,Boolean]* - * - */ - BOOLEAN_LIST, - - /** - * Represents the enumerated type for the object of type {@code Expression} type. - * This type is associated with code block that either represents a condition or - * an expression. E.g. - * - * exp:{ } - * - */ - EXPRESSION, - - /** - * Represents the enumerated type for the object of type {@code Properties} type. - * This type is associated with a collection of key and value pairs all separated - * by a comma(,). E.g. - * - * prop:{ =[,=]*} - * - */ - PROPERTIES, - - /** - * Represents the enumerated type for the object of type {@code Ranges} types. - * This type is associated with a collection of range represented in the form shown - * below - * - * :=value[,:=value]* - * - */ - RANGES, - - /** - * Represents the enumerated type for the object of type {@code String} with restrictions - * on characters that can be present in a string. 
- */ - IDENTIFIER -} + package io.cdap.wrangler.api.parser; + + import io.cdap.wrangler.api.annotations.PublicEvolving; + + import java.io.Serializable; + + /** + * The TokenType class provides the enumerated types for different types of + * tokens that are supported by the grammar. + * + * Each of the enumerated types specified in this class also has associated + * object representing it. e.g. {@code DIRECTIVE_NAME} is represented by the + * object {@code DirectiveName}. + * + * @see Bool + * @see BoolList + * @see ColumnName + * @see ColumnNameList + * @see DirectiveName + * @see Numeric + * @see NumericList + * @see Properties + * @see Ranges + * @see Expression + * @see Text + * @see TextList + */ + @PublicEvolving + public enum TokenType implements Serializable { + /** + * Represents the enumerated type for the object {@code DirectiveName} type. + */ + DIRECTIVE_NAME, + + /** + * Represents the enumerated type for the object of {@code ColumnName} type. + */ + COLUMN_NAME, + + /** + * Represents the enumerated type for the object of {@code Text} type. + */ + TEXT, + + /** + * Represents the enumerated type for the object of {@code Numeric} type. + */ + NUMERIC, + + /** + * Represents the enumerated type for the object of {@code Bool} type. + */ + BOOLEAN, + + /** + * Represents the enumerated type for a list of column names. + */ + COLUMN_NAME_LIST, + + /** + * Represents the enumerated type for a list of text values. + */ + TEXT_LIST, + + /** + * Represents the enumerated type for a list of numeric values. + */ + NUMERIC_LIST, + + /** + * Represents the enumerated type for a list of boolean values. + */ + BOOLEAN_LIST, + + /** + * Represents the enumerated type for the object of type {@code Expression}. + */ + EXPRESSION, + + /** + * Represents the enumerated type for the object of type {@code Properties}. + */ + PROPERTIES, + + /** + * Represents the enumerated type for the object of type {@code Ranges}. 
+ */ + RANGES, + + /** + * Represents the enumerated type for the object of type {@code String} with restrictions. + */ + IDENTIFIER, + + /** + * Represents byte size tokens (e.g., "10KB", "2MB"). + */ + BYTE_SIZE, + + /** + * Represents time duration tokens (e.g., "500ms", "2s"). + */ + TIME_DURATION + } + diff --git a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 index 7c517ed6a..a1123eee8 100644 --- a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 +++ b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 @@ -5,7 +5,7 @@ * use this file except in compliance with the License. You may obtain a copy of * the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT @@ -28,7 +28,7 @@ options { * use this file except in compliance with the License. You may obtain a copy of * the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT @@ -38,57 +38,54 @@ options { */ } -/** - * Parser Grammar for recognizing tokens and constructs of the directives language. - */ recipe : statements EOF ; statements - : ( Comment | macro | directive ';' | pragma ';' | ifStatement)* + : ( Comment | macro | directive ';' | pragma ';' | ifStatement )* ; directive : command - ( codeblock - | identifier - | macro - | text - | number - | bool - | column - | colList - | numberList - | boolList - | stringList - | numberRanges - | properties - )*? 
- ; + ( codeblock + | identifier + | macro + | text + | number + | bool + | column + | colList + | numberList + | boolList + | stringList + | numberRanges + | properties + )*? + ; ifStatement - : ifStat elseIfStat* elseStat? '}' - ; + : ifStat elseIfStat* elseStat? '}' + ; ifStat - : 'if' expression '{' statements - ; + : 'if' expression '{' statements + ; elseIfStat - : '}' 'else' 'if' expression '{' statements - ; + : '}' 'else' 'if' expression '{' statements + ; elseStat - : '}' 'else' '{' statements - ; + : '}' 'else' '{' statements + ; expression - : '(' (~'(' | expression)* ')' - ; + : '(' (~'(' | expression)* ')' + ; forStatement - : 'for' '(' Identifier '=' expression ';' expression ';' expression ')' '{' statements '}' + : 'for' '(' Identifier '=' expression ';' expression ';' expression ')' '{' statements '}' ; macro @@ -116,11 +113,11 @@ identifier ; properties - : 'prop' ':' OBrace (propertyList)+ CBrace + : 'prop' ':' OBrace (propertyList)+ CBrace | 'prop' ':' OBrace OBrace (propertyList)+ CBrace { notifyErrorListeners("Too many start paranthesis"); } | 'prop' ':' OBrace (propertyList)+ CBrace CBrace { notifyErrorListeners("Too many start paranthesis"); } | 'prop' ':' (propertyList)+ CBrace { notifyErrorListeners("Missing opening brace"); } - | 'prop' ':' OBrace (propertyList)+ { notifyErrorListeners("Missing closing brace"); } + | 'prop' ':' OBrace (propertyList)+ { notifyErrorListeners("Missing closing brace"); } ; propertyList @@ -128,7 +125,7 @@ propertyList ; property - : Identifier '=' ( text | number | bool ) + : Identifier '=' value ; numberRanges @@ -140,9 +137,21 @@ numberRange ; value - : String | Number | Column | Bool + : stringValue + | numberValue + | boolValue + | columnValue + | byteSizeValue + | timeDurationValue ; +stringValue : String; +numberValue : Number; +boolValue : Bool; +columnValue : Column; +byteSizeValue : BYTE_SIZE; +timeDurationValue : TIME_DURATION; + ecommand : '!' 
Identifier ; @@ -176,7 +185,7 @@ command ; colList - : Column (',' Column)+ + : Column (',' Column)+ ; numberList @@ -195,10 +204,6 @@ identifierList : Identifier (',' Identifier)* ; - -/* - * Following are the Lexer Rules used for tokenizing the recipe. - */ OBrace : '{'; CBrace : '}'; SColon : ';'; @@ -247,7 +252,6 @@ BackSlash: '\\'; Dollar : '$'; Tilde : '~'; - Bool : 'true' | 'false' @@ -275,28 +279,25 @@ String ; EscapeSequence - : '\\' ('b'|'t'|'n'|'f'|'r'|'"'|'\''|'\\') - | UnicodeEscape - | OctalEscape - ; - -fragment -OctalEscape - : '\\' ('0'..'3') ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') - ; - -fragment -UnicodeEscape - : '\\' 'u' HexDigit HexDigit HexDigit HexDigit - ; - -fragment - HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; + : '\\' ('b'|'t'|'n'|'f'|'r'|'"'|'\''|'\\') + | UnicodeEscape + | OctalEscape + ; + +fragment OctalEscape + : '\\' ('0'..'3') ('0'..'7') ('0'..'7') + | '\\' ('0'..'7') ('0'..'7') + | '\\' ('0'..'7') + ; + +fragment UnicodeEscape + : '\\' 'u' HexDigit HexDigit HexDigit HexDigit + ; + +fragment HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; Comment - : ('//' ~[\r\n]* | '/*' .*? '*/' | '--' ~[\r\n]* ) -> skip + : ('//' ~[\r\n]* | '/*' .*? '*/' | '--' ~[\r\n]* ) -> skip ; Space @@ -311,3 +312,10 @@ fragment Int fragment Digit : [0-9] ; + +BYTE_SIZE : DIGITS BYTE_UNIT ; +TIME_DURATION : DIGITS TIME_UNIT ; + +fragment DIGITS : [0-9]+ ('.' [0-9]+)? ; +fragment BYTE_UNIT : [kK]?[bB] | [mM][bB] | [gG][bB] | [tT][bB] | [pP][bB] ; +fragment TIME_UNIT : 'ns' | 'us' | 'ms' | 's' | 'm' | 'h' | 'd' ; \ No newline at end of file diff --git a/wrangler-core/src/main/java/io/cdap/directives/aggregates/AggregateStats.java b/wrangler-core/src/main/java/io/cdap/directives/aggregates/AggregateStats.java new file mode 100644 index 000000000..17ee4cba2 --- /dev/null +++ b/wrangler-core/src/main/java/io/cdap/directives/aggregates/AggregateStats.java @@ -0,0 +1,56 @@ +/* + * Copyright © 2025 Cask Data, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific + * language governing permissions and limitations under the License. + */ + + package io.cdap.directives.aggregates; + +import io.cdap.wrangler.api.Directive; +import io.cdap.wrangler.api.Row; +import io.cdap.wrangler.api.parser.Text; +import io.cdap.wrangler.api.parser.Token; + +import java.util.List; +import java.util.stream.Collectors; + +public class AggregateStats implements Directive { + + private String operation; + + @Override + public void initialize(List args) { + for (Token token : args) { + if (token instanceof Text) { + Object val = ((Text) token).value(); + if (val instanceof String) { + operation = ((String) val).toLowerCase(); + } + } + } + } + + @Override + public List execute(List rows) { + // Dummy implementation + return rows; + } + + @Override + public void destroy() { + // No cleanup needed + } + + // Optionally implement define() if required by your Directive interface + // If the Directive interface changed, add the appropriate method override +} \ No newline at end of file diff --git a/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java b/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java index ac35e7a5e..111dc7e2c 100644 --- a/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java +++ b/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java @@ -14,316 +14,324 @@ * the License. 
*/ -package io.cdap.wrangler.parser; + package io.cdap.wrangler.parser; -import io.cdap.wrangler.api.LazyNumber; -import io.cdap.wrangler.api.RecipeSymbol; -import io.cdap.wrangler.api.SourceInfo; -import io.cdap.wrangler.api.Triplet; -import io.cdap.wrangler.api.parser.Bool; -import io.cdap.wrangler.api.parser.BoolList; -import io.cdap.wrangler.api.parser.ColumnName; -import io.cdap.wrangler.api.parser.ColumnNameList; -import io.cdap.wrangler.api.parser.DirectiveName; -import io.cdap.wrangler.api.parser.Expression; -import io.cdap.wrangler.api.parser.Identifier; -import io.cdap.wrangler.api.parser.Numeric; -import io.cdap.wrangler.api.parser.NumericList; -import io.cdap.wrangler.api.parser.Properties; -import io.cdap.wrangler.api.parser.Ranges; -import io.cdap.wrangler.api.parser.Text; -import io.cdap.wrangler.api.parser.TextList; -import io.cdap.wrangler.api.parser.Token; -import org.antlr.v4.runtime.ParserRuleContext; -import org.antlr.v4.runtime.misc.Interval; -import org.antlr.v4.runtime.tree.ParseTree; -import org.antlr.v4.runtime.tree.TerminalNode; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * This class RecipeVisitor implements the visitor pattern - * used during traversal of the AST tree. The ParserTree#Walker - * invokes appropriate methods as call backs with information about the node. - * - *

In order to understand what's being invoked, please look at the grammar file - * Directive.g4

. - * - *

This class exposes a getTokenGroups method for retrieving the - * RecipeSymbol after visiting. The RecipeSymbol represents - * all the TokenGroup for all directives in a recipe. Each directive - * will create a TokenGroup

- * - *

As the ParseTree is walking through the call graph, it generates - * one TokenGroup for each directive in the recipe. Each TokenGroup - * contains parsed Tokens for that directive along with more information like - * SourceInfo. A collection of TokenGroup consistutes a RecipeSymbol - * that is returned by this function.

- */ -public final class RecipeVisitor extends DirectivesBaseVisitor { - private RecipeSymbol.Builder builder = new RecipeSymbol.Builder(); - - /** - * Returns a RecipeSymbol for the recipe being parsed. This - * object has all the tokens that were successfully parsed along with source - * information for each directive in the recipe. - * - * @return An compiled object after parsing the recipe. - */ - public RecipeSymbol getCompiledUnit() { - return builder.build(); - } - - /** - * A Recipe is made up of Directives and Directives is made up of each individual - * Directive. This method is invoked on every visit to a new directive in the recipe. - */ - @Override - public RecipeSymbol.Builder visitDirective(DirectivesParser.DirectiveContext ctx) { - builder.createTokenGroup(getOriginalSource(ctx)); - return super.visitDirective(ctx); - } - - /** - * A Directive can include identifiers, this method extracts that token that is being - * identified as token of type Identifier. - */ - @Override - public RecipeSymbol.Builder visitIdentifier(DirectivesParser.IdentifierContext ctx) { - builder.addToken(new Identifier(ctx.Identifier().getText())); - return super.visitIdentifier(ctx); - } - - /** - * A Directive can include properties (which are a collection of key and value pairs), - * this method extracts that token that is being identified as token of type Properties. 
- */ - @Override - public RecipeSymbol.Builder visitPropertyList(DirectivesParser.PropertyListContext ctx) { - Map props = new HashMap<>(); - List properties = ctx.property(); - for (DirectivesParser.PropertyContext property : properties) { - String identifier = property.Identifier().getText(); - Token token; - if (property.number() != null) { - token = new Numeric(new LazyNumber(property.number().getText())); - } else if (property.bool() != null) { - token = new Bool(Boolean.valueOf(property.bool().getText())); - } else { - String text = property.text().getText(); - token = new Text(text.substring(1, text.length() - 1)); - } - props.put(identifier, token); - } - builder.addToken(new Properties(props)); - return builder; - } - - /** - * A Pragma is an instruction to the compiler to dynamically load the directives being specified - * from the DirectiveRegistry. These do not affect the data flow. - * - *

E.g. #pragma load-directives test1, test2, test3; will collect the tokens - * test1, test2 and test3 as dynamically loadable directives.

- */ - @Override - public RecipeSymbol.Builder visitPragmaLoadDirective(DirectivesParser.PragmaLoadDirectiveContext ctx) { - List identifiers = ctx.identifierList().Identifier(); - for (TerminalNode identifier : identifiers) { - builder.addLoadableDirective(identifier.getText()); - } - return builder; - } - - /** - * A Pragma version is a informational directive to notify compiler about the grammar that is should - * be using to parse the directives below. - */ - @Override - public RecipeSymbol.Builder visitPragmaVersion(DirectivesParser.PragmaVersionContext ctx) { - builder.addVersion(ctx.Number().getText()); - return builder; - } - - /** - * A Directive can include number ranges like start:end=value[,start:end=value]*. This - * visitor method allows you to collect all the number ranges and create a token type - * Ranges. - */ - @Override - public RecipeSymbol.Builder visitNumberRanges(DirectivesParser.NumberRangesContext ctx) { - List> output = new ArrayList<>(); - List ranges = ctx.numberRange(); - for (DirectivesParser.NumberRangeContext range : ranges) { - List numbers = range.Number(); - String text = range.value().getText(); - if (text.startsWith("'") && text.endsWith("'")) { - text = text.substring(1, text.length() - 1); + import io.cdap.wrangler.api.LazyNumber; + import io.cdap.wrangler.api.RecipeSymbol; + import io.cdap.wrangler.api.SourceInfo; + import io.cdap.wrangler.api.Triplet; + import io.cdap.wrangler.api.parser.Bool; + import io.cdap.wrangler.api.parser.BoolList; + import io.cdap.wrangler.api.parser.ColumnName; + import io.cdap.wrangler.api.parser.ColumnNameList; + import io.cdap.wrangler.api.parser.DirectiveName; + import io.cdap.wrangler.api.parser.Expression; + import io.cdap.wrangler.api.parser.Identifier; + import io.cdap.wrangler.api.parser.Numeric; + import io.cdap.wrangler.api.parser.NumericList; + import io.cdap.wrangler.api.parser.Properties; + import io.cdap.wrangler.api.parser.Ranges; + import io.cdap.wrangler.api.parser.Text; + import 
io.cdap.wrangler.api.parser.TextList; + import io.cdap.wrangler.api.parser.Token; + import org.antlr.v4.runtime.ParserRuleContext; + import org.antlr.v4.runtime.misc.Interval; + import org.antlr.v4.runtime.tree.ParseTree; + import org.antlr.v4.runtime.tree.TerminalNode; + + import java.util.ArrayList; + import java.util.HashMap; + import java.util.List; + import java.util.Map; + + /** + * This class RecipeVisitor implements the visitor pattern + * used during traversal of the AST tree. The ParserTree#Walker + * invokes appropriate methods as call backs with information about the node. + * + *

In order to understand what's being invoked, please look at the grammar file + * Directives.g4

. + * + *

This class exposes a getCompiledUnit method for retrieving the + * RecipeSymbol after visiting. The RecipeSymbol represents + * all the TokenGroups for all directives in a recipe. Each directive + * will create a TokenGroup

+ * + *

As the ParseTree is walking through the call graph, it generates + * one TokenGroup for each directive in the recipe. Each TokenGroup + * contains parsed Tokens for that directive along with more information like + * SourceInfo. A collection of TokenGroup constitutes a RecipeSymbol + * that is returned by this function.

+ */ + public final class RecipeVisitor extends DirectivesBaseVisitor { + private RecipeSymbol.Builder builder = new RecipeSymbol.Builder(); + + /** + * Returns a RecipeSymbol for the recipe being parsed. This + * object has all the tokens that were successfully parsed along with source + * information for each directive in the recipe. + * + * @return An compiled object after parsing the recipe. + */ + public RecipeSymbol getCompiledUnit() { + return builder.build(); + } + + /** + * A Recipe is made up of Directives and Directives is made up of each individual + * Directive. This method is invoked on every visit to a new directive in the recipe. + */ + @Override + public RecipeSymbol.Builder visitDirective(DirectivesParser.DirectiveContext ctx) { + builder.createTokenGroup(getOriginalSource(ctx)); + return super.visitDirective(ctx); + } + + /** + * A Directive can include identifiers, this method extracts that token that is being + * identified as token of type Identifier. + */ + @Override + public RecipeSymbol.Builder visitIdentifier(DirectivesParser.IdentifierContext ctx) { + builder.addToken(new Identifier(ctx.Identifier().getText())); + return super.visitIdentifier(ctx); + } + + /** + * A Directive can include properties (which are a collection of key and value pairs), + * this method extracts that token that is being identified as token of type Properties. 
+ */ + @Override + public RecipeSymbol.Builder visitPropertyList(DirectivesParser.PropertyListContext ctx) { + Map props = new HashMap<>(); + List properties = ctx.property(); + for (DirectivesParser.PropertyContext property : properties) { + String identifier = property.Identifier().getText(); + Token token; + if (property.value().numberValue() != null) { + token = new Numeric(new LazyNumber(property.value().numberValue().getText())); + } else if (property.value().boolValue() != null) { + token = new Bool(Boolean.parseBoolean(property.value().boolValue().getText())); + } else if (property.value().stringValue() != null) { + String text = property.value().stringValue().getText(); + token = new Text(text.substring(1, text.length() - 1)); + } else if (property.value().columnValue() != null) { + token = new Text(property.value().columnValue().getText()); + } else if (property.value().byteSizeValue() != null) { + token = new Text(property.value().byteSizeValue().getText()); + } else if (property.value().timeDurationValue() != null) { + token = new Text(property.value().timeDurationValue().getText()); + } else { + token = new Text(""); + } + props.put(identifier, token); } - Triplet val = - new Triplet<>(new Numeric(new LazyNumber(numbers.get(0).getText())), - new Numeric(new LazyNumber(numbers.get(1).getText())), - text - ); - output.add(val); - } - builder.addToken(new Ranges(output)); - return builder; - } - - /** - * This visitor method extracts the custom directive name specified. The custom - * directives are specified with a bang (!) at the start. - */ - @Override - public RecipeSymbol.Builder visitEcommand(DirectivesParser.EcommandContext ctx) { - builder.addToken(new DirectiveName(ctx.Identifier().getText())); - return builder; - } - - /** - * A Directive can consist of column specifiers. These are columns that the directive - * would operate on. When a token of type column is visited, it would generate a token - * type of type ColumnName. 
- */ - @Override - public RecipeSymbol.Builder visitColumn(DirectivesParser.ColumnContext ctx) { - builder.addToken(new ColumnName(ctx.Column().getText().substring(1))); - return builder; - } - - /** - * A Directive can consist of text field. These type of fields are enclosed within - * a single-quote or a double-quote. This visitor method extracts the string value - * within the quotes and creates a token type Text. - */ - @Override - public RecipeSymbol.Builder visitText(DirectivesParser.TextContext ctx) { - String value = ctx.String().getText(); - builder.addToken(new Text(value.substring(1, value.length() - 1))); - return builder; - } - - /** - * A Directive can consist of numeric field. This visitor method extracts the - * numeric value Numeric. - */ - @Override - public RecipeSymbol.Builder visitNumber(DirectivesParser.NumberContext ctx) { - LazyNumber number = new LazyNumber(ctx.Number().getText()); - builder.addToken(new Numeric(number)); - return builder; - } - - /** - * A Directive can consist of Bool field. The Bool field is represented as - * either true or false. This visitor method extract the bool value into a - * token type Bool. - */ - @Override - public RecipeSymbol.Builder visitBool(DirectivesParser.BoolContext ctx) { - builder.addToken(new Bool(Boolean.valueOf(ctx.Bool().getText()))); - return builder; - } - - /** - * A Directive can include a expression or a condition to be evaluated. 
When - * such a token type is found, the visitor extracts the expression and generates - * a token type Expression to be added to the TokenGroup - */ - @Override - public RecipeSymbol.Builder visitCondition(DirectivesParser.ConditionContext ctx) { - int childCount = ctx.getChildCount(); - StringBuilder sb = new StringBuilder(); - for (int i = 1; i < childCount - 1; ++i) { - ParseTree child = ctx.getChild(i); - sb.append(child.getText()).append(" "); - } - builder.addToken(new Expression(sb.toString())); - return builder; - } - - /** - * A Directive has name and in the parsing context it's called a command. - * This visitor methods extracts the command and creates a toke type DirectiveName - */ - @Override - public RecipeSymbol.Builder visitCommand(DirectivesParser.CommandContext ctx) { - builder.addToken(new DirectiveName(ctx.Identifier().getText())); - return builder; - } - - /** - * This visitor methods extracts the list of columns specified. It creates a token - * type ColumnNameList to be added to TokenGroup. - */ - @Override - public RecipeSymbol.Builder visitColList(DirectivesParser.ColListContext ctx) { - List columns = ctx.Column(); - List names = new ArrayList<>(); - for (TerminalNode column : columns) { - names.add(column.getText().substring(1)); - } - builder.addToken(new ColumnNameList(names)); - return builder; - } - - /** - * This visitor methods extracts the list of numeric specified. It creates a token - * type NumericList to be added to TokenGroup. - */ - @Override - public RecipeSymbol.Builder visitNumberList(DirectivesParser.NumberListContext ctx) { - List numbers = ctx.Number(); - List numerics = new ArrayList<>(); - for (TerminalNode number : numbers) { - numerics.add(new LazyNumber(number.getText())); - } - builder.addToken(new NumericList(numerics)); - return builder; - } - - /** - * This visitor methods extracts the list of booleans specified. It creates a token - * type BoolList to be added to TokenGroup. 
- */ - @Override - public RecipeSymbol.Builder visitBoolList(DirectivesParser.BoolListContext ctx) { - List bools = ctx.Bool(); - List booleans = new ArrayList<>(); - for (TerminalNode bool : bools) { - booleans.add(Boolean.parseBoolean(bool.getText())); - } - builder.addToken(new BoolList(booleans)); - return builder; - } - - /** - * This visitor methods extracts the list of strings specified. It creates a token - * type StringList to be added to TokenGroup. - */ - @Override - public RecipeSymbol.Builder visitStringList(DirectivesParser.StringListContext ctx) { - List strings = ctx.String(); - List strs = new ArrayList<>(); - for (TerminalNode string : strings) { - String text = string.getText(); - strs.add(text.substring(1, text.length() - 1)); - } - builder.addToken(new TextList(strs)); - return builder; - } - - private SourceInfo getOriginalSource(ParserRuleContext ctx) { - int a = ctx.getStart().getStartIndex(); - int b = ctx.getStop().getStopIndex(); - Interval interval = new Interval(a, b); - String text = ctx.start.getInputStream().getText(interval); - int lineno = ctx.getStart().getLine(); - int column = ctx.getStart().getCharPositionInLine(); - return new SourceInfo(lineno, column, text); - } -} + builder.addToken(new Properties(props)); + return builder; + } // ✅ <-- This is the correct and only closing brace for this method + + /** + * A Pragma is an instruction to the compiler to dynamically load the directives being specified + * from the DirectiveRegistry. These do not affect the data flow. + * + *

E.g. #pragma load-directives test1, test2, test3; will collect the tokens + * test1, test2 and test3 as dynamically loadable directives.

+ */ + @Override + public RecipeSymbol.Builder visitPragmaLoadDirective(DirectivesParser.PragmaLoadDirectiveContext ctx) { + List identifiers = ctx.identifierList().Identifier(); + for (TerminalNode identifier : identifiers) { + builder.addLoadableDirective(identifier.getText()); + } + return builder; + } + + /** + * A Pragma version is a informational directive to notify compiler about the grammar that is should + * be using to parse the directives below. + */ + @Override + public RecipeSymbol.Builder visitPragmaVersion(DirectivesParser.PragmaVersionContext ctx) { + builder.addVersion(ctx.Number().getText()); + return builder; + } + + /** + * A Directive can include number ranges like start:end=value[,start:end=value]*. This + * visitor method allows you to collect all the number ranges and create a token type + * Ranges. + */ + @Override + public RecipeSymbol.Builder visitNumberRanges(DirectivesParser.NumberRangesContext ctx) { + List> output = new ArrayList<>(); + List ranges = ctx.numberRange(); + for (DirectivesParser.NumberRangeContext range : ranges) { + List numbers = range.Number(); + String text = range.value().getText(); + if (text.startsWith("'") && text.endsWith("'")) { + text = text.substring(1, text.length() - 1); + } + Triplet val = + new Triplet<>(new Numeric(new LazyNumber(numbers.get(0).getText())), + new Numeric(new LazyNumber(numbers.get(1).getText())), + text + ); + output.add(val); + } + builder.addToken(new Ranges(output)); + return builder; + } + + /** + * This visitor method extracts the custom directive name specified. The custom + * directives are specified with a bang (!) at the start. + */ + @Override + public RecipeSymbol.Builder visitEcommand(DirectivesParser.EcommandContext ctx) { + builder.addToken(new DirectiveName(ctx.Identifier().getText())); + return builder; + } + + /** + * A Directive can consist of column specifiers. These are columns that the directive + * would operate on. 
When a token of type column is visited, it would generate a token + * type of type ColumnName. + */ + @Override + public RecipeSymbol.Builder visitColumn(DirectivesParser.ColumnContext ctx) { + builder.addToken(new ColumnName(ctx.Column().getText().substring(1))); + return builder; + } + + /** + * A Directive can consist of text field. These type of fields are enclosed within + * a single-quote or a double-quote. This visitor method extracts the string value + * within the quotes and creates a token type Text. + */ + @Override + public RecipeSymbol.Builder visitText(DirectivesParser.TextContext ctx) { + String value = ctx.String().getText(); + builder.addToken(new Text(value.substring(1, value.length() - 1))); + return builder; + } + + /** + * A Directive can consist of numeric field. This visitor method extracts the + * numeric value Numeric. + */ + @Override + public RecipeSymbol.Builder visitNumber(DirectivesParser.NumberContext ctx) { + LazyNumber number = new LazyNumber(ctx.Number().getText()); + builder.addToken(new Numeric(number)); + return builder; + } + + /** + * A Directive can consist of Bool field. The Bool field is represented as + * either true or false. This visitor method extract the bool value into a + * token type Bool. + */ + @Override + public RecipeSymbol.Builder visitBool(DirectivesParser.BoolContext ctx) { + builder.addToken(new Bool(Boolean.valueOf(ctx.Bool().getText()))); + return builder; + } + + /** + * A Directive can include a expression or a condition to be evaluated. 
When + * such a token type is found, the visitor extracts the expression and generates + * a token type Expression to be added to the TokenGroup + */ + @Override + public RecipeSymbol.Builder visitCondition(DirectivesParser.ConditionContext ctx) { + int childCount = ctx.getChildCount(); + StringBuilder sb = new StringBuilder(); + for (int i = 1; i < childCount - 1; ++i) { + ParseTree child = ctx.getChild(i); + sb.append(child.getText()).append(" "); + } + builder.addToken(new Expression(sb.toString())); + return builder; + } + + /** + * A Directive has name and in the parsing context it's called a command. + * This visitor methods extracts the command and creates a toke type DirectiveName + */ + @Override + public RecipeSymbol.Builder visitCommand(DirectivesParser.CommandContext ctx) { + builder.addToken(new DirectiveName(ctx.Identifier().getText())); + return builder; + } + + /** + * This visitor methods extracts the list of columns specified. It creates a token + * type ColumnNameList to be added to TokenGroup. + */ + @Override + public RecipeSymbol.Builder visitColList(DirectivesParser.ColListContext ctx) { + List columns = ctx.Column(); + List names = new ArrayList<>(); + for (TerminalNode column : columns) { + names.add(column.getText().substring(1)); + } + builder.addToken(new ColumnNameList(names)); + return builder; + } + + /** + * This visitor methods extracts the list of numeric specified. It creates a token + * type NumericList to be added to TokenGroup. + */ + @Override + public RecipeSymbol.Builder visitNumberList(DirectivesParser.NumberListContext ctx) { + List numbers = ctx.Number(); + List numerics = new ArrayList<>(); + for (TerminalNode number : numbers) { + numerics.add(new LazyNumber(number.getText())); + } + builder.addToken(new NumericList(numerics)); + return builder; + } + + /** + * This visitor methods extracts the list of booleans specified. It creates a token + * type BoolList to be added to TokenGroup. 
+ */ + @Override + public RecipeSymbol.Builder visitBoolList(DirectivesParser.BoolListContext ctx) { + List bools = ctx.Bool(); + List booleans = new ArrayList<>(); + for (TerminalNode bool : bools) { + booleans.add(Boolean.parseBoolean(bool.getText())); + } + builder.addToken(new BoolList(booleans)); + return builder; + } + + /** + * This visitor methods extracts the list of strings specified. It creates a token + * type StringList to be added to TokenGroup. + */ + @Override + public RecipeSymbol.Builder visitStringList(DirectivesParser.StringListContext ctx) { + List strings = ctx.String(); + List strs = new ArrayList<>(); + for (TerminalNode string : strings) { + String text = string.getText(); + strs.add(text.substring(1, text.length() - 1)); + } + builder.addToken(new TextList(strs)); + return builder; + } + + private SourceInfo getOriginalSource(ParserRuleContext ctx) { + int a = ctx.getStart().getStartIndex(); + int b = ctx.getStop().getStopIndex(); + Interval interval = new Interval(a, b); + String text = ctx.start.getInputStream().getText(interval); + int lineno = ctx.getStart().getLine(); + int column = ctx.getStart().getCharPositionInLine(); + return new SourceInfo(lineno, column, text); + } + }