From a06496d6f57485b71b8eb3cb4c50b5bb6c44eb83 Mon Sep 17 00:00:00 2001 From: Clay McLeod Date: Wed, 22 Nov 2023 12:30:52 -0600 Subject: [PATCH] fix: fixes invalid string parsing --- Gauntlet.toml | 36 +--- wdl-grammar/src/commands/gauntlet.rs | 6 +- wdl-grammar/src/main.rs | 11 + wdl-grammar/src/v1/lint/whitespace.rs | 42 +++- wdl-grammar/src/v1/tests/expression/core.rs | 78 +++---- .../v1/tests/expression/core/array_literal.rs | 192 +++++++++++------- wdl-grammar/src/v1/tests/literal.rs | 56 ++++- wdl-grammar/src/v1/tests/primitives/string.rs | 60 +++++- .../primitives/string/double_quoted_string.rs | 58 +++--- .../primitives/string/single_quoted_string.rs | 62 +++--- .../v1/validation/invalid_escape_character.rs | 2 +- wdl-grammar/src/v1/wdl.pest | 119 +++++------ 12 files changed, 457 insertions(+), 265 deletions(-) diff --git a/Gauntlet.toml b/Gauntlet.toml index c3f855682..46d9ca21d 100644 --- a/Gauntlet.toml +++ b/Gauntlet.toml @@ -1,13 +1,5 @@ version = "v1" -[[repositories]] -organization = "biowdl" -name = "tasks" - -[[repositories]] -organization = "chanzuckerberg" -name = "czid-workflows" - [[repositories]] organization = "stjudecloud" name = "workflows" @@ -16,33 +8,25 @@ name = "workflows" organization = "PacificBiosciences" name = "HiFi-human-WGS-WDL" +[[repositories]] +organization = "chanzuckerberg" +name = "czid-workflows" + +[[repositories]] +organization = "biowdl" +name = "tasks" + [[ignored_errors]] document = "biowdl/tasks:bcftools.wdl" error = '''validation error: [v1::001] invalid escape character '\_' in string at line 114:75''' [[ignored_errors]] document = "biowdl/tasks:bedtools.wdl" -error = """ -parse error: - - --> 29:67 - | -29 | String memory = \"~{512 + ceil(size([inputBed, faidx], \"MiB\"))}MiB\" - | ^--- - | - = expected WHITESPACE or OPTION""" +error = '''validation error: [v1::001] invalid escape character '\.' in string at line 27:48''' [[ignored_errors]] document = "biowdl/tasks:bowtie.wdl" -error = """ -parse error: - - --> 40:58 - | -40 | String memory = \"~{5 + ceil(size(indexFiles, \"GiB\"))}GiB\" - | ^--- - | - = expected WHITESPACE or OPTION""" +error = '''validation error: [v1::001] invalid escape character '\.' in string at line 63:32''' [[ignored_errors]] document = "biowdl/tasks:centrifuge.wdl" diff --git a/wdl-grammar/src/commands/gauntlet.rs b/wdl-grammar/src/commands/gauntlet.rs index d9b27b501..bd2f36940 100644 --- a/wdl-grammar/src/commands/gauntlet.rs +++ b/wdl-grammar/src/commands/gauntlet.rs @@ -89,7 +89,7 @@ pub struct Args { quiet: bool, /// Overwrites the configuration file. - #[arg(short, long, global = true)] + #[arg(long, global = true)] save_config: bool, /// Silences printing detailed error information. @@ -101,7 +101,7 @@ pub struct Args { skip_remote: bool, /// Displays warnings as part of the report output. - #[arg(short, long, global = true)] + #[arg(long, global = true)] show_warnings: bool, /// The Workflow Description Language (WDL) specification version to use. @@ -307,7 +307,7 @@ pub async fn gauntlet(args: Args) -> Result<()> { println!( "\n{}\n", "Undetected expected errors: you should remove these from your \ - Config.toml or run this command with the `-s` option!" + Config.toml or run this command with the `--save-config` option!" .red() .bold() ); diff --git a/wdl-grammar/src/main.rs b/wdl-grammar/src/main.rs index a650d4de8..67e997298 100644 --- a/wdl-grammar/src/main.rs +++ b/wdl-grammar/src/main.rs @@ -106,3 +106,14 @@ async fn main() { Err(err) => eprintln!("error: {}", err), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn verify_arguments() { + use clap::CommandFactory; + Args::command().debug_assert() + } +} diff --git a/wdl-grammar/src/v1/lint/whitespace.rs b/wdl-grammar/src/v1/lint/whitespace.rs index a56325f5d..a4d9556db 100644 --- a/wdl-grammar/src/v1/lint/whitespace.rs +++ b/wdl-grammar/src/v1/lint/whitespace.rs @@ -46,6 +46,21 @@ impl Whitespace { .try_build() .unwrap() } + + /// Creates an error corresponding to a line with a trailing tab. + fn trailing_tab(&self, line_no: NonZeroUsize) -> lint::Warning + where + Self: Rule, + { + // SAFETY: this error is written so that it will always unwrap. + lint::warning::Builder::default() + .code(self.code()) + .level(lint::Level::Low) + .group(lint::Group::Style) + .message(format!("trailing tab at the end of line {}", line_no)) + .try_build() + .unwrap() + } } impl Rule for Whitespace { @@ -73,6 +88,8 @@ impl Rule for Whitespace { results.push(self.empty_line(line_no)); } else if line.ends_with(' ') { results.push(self.trailing_space(line_no)); + } else if line.ends_with('\t') { + results.push(self.trailing_tab(line_no)); } } @@ -122,7 +139,21 @@ mod tests { } #[test] - fn it_unwraps_a_trailing_whitespace_error() { + fn it_catches_a_trailing_tab() -> Result<(), Box> { + let tree = Parser::parse(Rule::document, "version 1.1\t")?; + let warning = Whitespace.check(tree)?.unwrap(); + + assert_eq!(warning.len(), 1); + assert_eq!( + warning.first().unwrap().to_string(), + "[v1::001::Style/Low] trailing tab at the end of line 1" + ); + + Ok(()) + } + + #[test] + fn it_unwraps_a_trailing_space_error() { let warning = Whitespace.trailing_space(NonZeroUsize::try_from(1).unwrap()); assert_eq!( warning.to_string(), @@ -130,6 +161,15 @@ mod tests { ) } + #[test] + fn it_unwraps_a_trailing_tab_error() { + let warning = Whitespace.trailing_tab(NonZeroUsize::try_from(1).unwrap()); + assert_eq!( + warning.to_string(), + "[v1::001::Style/Low] trailing tab at the end of line 1" + ) + } + #[test] fn it_unwraps_an_empty_line_error() { let warning = Whitespace.empty_line(NonZeroUsize::try_from(1).unwrap()); diff --git a/wdl-grammar/src/v1/tests/expression/core.rs b/wdl-grammar/src/v1/tests/expression/core.rs index 17ca84516..1a6443c1d 100644 --- a/wdl-grammar/src/v1/tests/expression/core.rs +++ b/wdl-grammar/src/v1/tests/expression/core.rs @@ -18,45 +18,49 @@ fn it_successfully_parses_an_array_literal_with_spaces_inside() { parser: WdlParser, input: "[if a then b else c, \"Hello, world!\"]", rule: Rule::core, - tokens: [array_literal(0, 37, [ - expression(1, 19, [ - r#if(1, 19, [ - WHITESPACE(3, 4, [ - SPACE(3, 4), - ]), - expression(4, 5, [ - identifier(4, 5), - ]), - WHITESPACE(5, 6, [ - SPACE(5, 6), - ]), - WHITESPACE(10, 11, [ - SPACE(10, 11), - ]), - expression(11, 12, [ - identifier(11, 12), - ]), - WHITESPACE(12, 13, [ - SPACE(12, 13), - ]), - WHITESPACE(17, 18, [ - SPACE(17, 18), + tokens: [ + // `[if a then b else c, "Hello, world!"]` + array_literal(0, 37, [ + // `if a then b else c` + expression(1, 19, [ + // `if a then b else c` + r#if(1, 19, [ + WHITESPACE(3, 4, [SPACE(3, 4)]), + // `a` + expression(4, 5, [ + // `a` + identifier(4, 5), + ]), + WHITESPACE(5, 6, [SPACE(5, 6)]), + WHITESPACE(10, 11, [SPACE(10, 11)]), + // `b` + expression(11, 12, [ + // `b` + identifier(11, 12), + ]), + WHITESPACE(12, 13, [SPACE(12, 13)]), + WHITESPACE(17, 18, [SPACE(17, 18)]), + // `c` + expression(18, 19, [ + // `c` + identifier(18, 19), + ]), + ]), ]), - expression(18, 19, [ - identifier(18, 19), + // `,` + COMMA(19, 20), + WHITESPACE(20, 21, [SPACE(20, 21)]), + // `"Hello, world!"` + expression(21, 36, [ + // `"Hello, world!"` + string(21, 36, [ + // `"` + double_quote(21, 22), + // `Hello, world!` + string_literal_contents(22, 35), + ]), ]), - ]), - ]), - COMMA(19, 20), - WHITESPACE(20, 21, [ - SPACE(20, 21), - ]), - expression(21, 36, [ - string(21, 36, [ - double_quoted_string(21, 36), - ]), - ]), - ]) + ]) ] } } diff --git a/wdl-grammar/src/v1/tests/expression/core/array_literal.rs b/wdl-grammar/src/v1/tests/expression/core/array_literal.rs index d19bc4046..8753ceaf4 100644 --- a/wdl-grammar/src/v1/tests/expression/core/array_literal.rs +++ b/wdl-grammar/src/v1/tests/expression/core/array_literal.rs @@ -33,44 +33,60 @@ fn it_fails_to_parse_an_array_literal_with_spaces_outside_the_input() { fn it_successfully_parses_an_array_literal() { parses_to! { parser: WdlParser, - input: "[if a then b else c,\"Hello, world!\"] ", + input: "[if a then b else c,\"Hello, world!\"]", rule: Rule::array_literal, - tokens: [array_literal(0, 36, [ - expression(1, 19, [ - r#if(1, 19, [ - WHITESPACE(3, 4, [ - SPACE(3, 4), - ]), - expression(4, 5, [ - identifier(4, 5), - ]), - WHITESPACE(5, 6, [ - SPACE(5, 6), - ]), - WHITESPACE(10, 11, [ - SPACE(10, 11), - ]), - expression(11, 12, [ - identifier(11, 12), - ]), - WHITESPACE(12, 13, [ - SPACE(12, 13), - ]), - WHITESPACE(17, 18, [ - SPACE(17, 18), - ]), - expression(18, 19, [ - identifier(18, 19), + tokens: [ + // `[if a then b else c,"Hello, world!"]` + array_literal(0, 36, [ + // `if a then b else c` + expression(1, 19, [ + // `if a then b else c` + r#if(1, 19, [ + WHITESPACE(3, 4, [ + SPACE(3, 4), + ]), + // `a` + expression(4, 5, [ + // `a` + identifier(4, 5), + ]), + WHITESPACE(5, 6, [ + SPACE(5, 6), + ]), + WHITESPACE(10, 11, [ + SPACE(10, 11), + ]), + // `b` + expression(11, 12, [ + // `b` + identifier(11, 12), + ]), + WHITESPACE(12, 13, [ + SPACE(12, 13), + ]), + WHITESPACE(17, 18, [ + SPACE(17, 18), + ]), + // `c` + expression(18, 19, [ + // `c` + identifier(18, 19), + ]), ]), ]), - ]), - COMMA(19, 20), - expression(20, 35, [ - string(20, 35, [ - double_quoted_string(20, 35), + // `,` + COMMA(19, 20), + // `"Hello, world!"` + expression(20, 35, [ + // `"Hello, world!"` + string(20, 35, [ + // `"` + double_quote(20, 21), + // `Hello, world!` + string_literal_contents(21, 34), + ]), ]), - ]), - ]) + ]) ] } } @@ -81,13 +97,19 @@ fn it_successfully_parses_an_array_literal_without_the_trailing_space() { parser: WdlParser, input: "[if a then b else c, \"Hello, world!\"] ", rule: Rule::array_literal, - tokens: [array_literal(0, 37, [ + tokens: [ + // `[if a then b else c, "Hello, world!"]` + array_literal(0, 37, [ + // `if a then b else c` expression(1, 19, [ + // `if a then b else c` r#if(1, 19, [ WHITESPACE(3, 4, [ SPACE(3, 4), ]), + // `a` expression(4, 5, [ + // `a` identifier(4, 5), ]), WHITESPACE(5, 6, [ @@ -96,7 +118,9 @@ fn it_successfully_parses_an_array_literal_without_the_trailing_space() { WHITESPACE(10, 11, [ SPACE(10, 11), ]), + // `b` expression(11, 12, [ + // `b` identifier(11, 12), ]), WHITESPACE(12, 13, [ @@ -105,18 +129,26 @@ fn it_successfully_parses_an_array_literal_without_the_trailing_space() { WHITESPACE(17, 18, [ SPACE(17, 18), ]), + // `c` expression(18, 19, [ + // `c` identifier(18, 19), ]), ]), ]), + // `,` COMMA(19, 20), WHITESPACE(20, 21, [ SPACE(20, 21), ]), + // `"Hello, world!"` expression(21, 36, [ + // `"Hello, world!"` string(21, 36, [ - double_quoted_string(21, 36), + // `"` + double_quote(21, 22), + // `Hello, world!` + string_literal_contents(22, 35), ]), ]), ]) @@ -130,45 +162,61 @@ fn it_successfully_parses_an_array_literal_with_spaces_inside() { parser: WdlParser, input: "[if a then b else c, \"Hello, world!\"]", rule: Rule::array_literal, - tokens: [array_literal(0, 37, [ - expression(1, 19, [ - r#if(1, 19, [ - WHITESPACE(3, 4, [ - SPACE(3, 4), - ]), - expression(4, 5, [ - identifier(4, 5), - ]), - WHITESPACE(5, 6, [ - SPACE(5, 6), - ]), - WHITESPACE(10, 11, [ - SPACE(10, 11), - ]), - expression(11, 12, [ - identifier(11, 12), - ]), - WHITESPACE(12, 13, [ - SPACE(12, 13), - ]), - WHITESPACE(17, 18, [ - SPACE(17, 18), - ]), - expression(18, 19, [ - identifier(18, 19), + tokens: [ + // `[if a then b else c, "Hello, world!"]` + array_literal(0, 37, [ + // `if a then b else c` + expression(1, 19, [ + // `if a then b else c` + r#if(1, 19, [ + WHITESPACE(3, 4, [ + SPACE(3, 4), + ]), + // `a` + expression(4, 5, [ + // `a` + identifier(4, 5), + ]), + WHITESPACE(5, 6, [ + SPACE(5, 6), + ]), + WHITESPACE(10, 11, [ + SPACE(10, 11), + ]), + // `b` + expression(11, 12, [ + // `b` + identifier(11, 12), + ]), + WHITESPACE(12, 13, [ + SPACE(12, 13), + ]), + WHITESPACE(17, 18, [ + SPACE(17, 18), + ]), + // `c` + expression(18, 19, [ + // `c` + identifier(18, 19), + ]), ]), ]), - ]), - COMMA(19, 20), - WHITESPACE(20, 21, [ - SPACE(20, 21), - ]), - expression(21, 36, [ - string(21, 36, [ - double_quoted_string(21, 36), + // `,` + COMMA(19, 20), + WHITESPACE(20, 21, [ + SPACE(20, 21), ]), - ]), - ]) + // `"Hello, world!"` + expression(21, 36, [ + // `"Hello, world!"` + string(21, 36, [ + // `"` + double_quote(21, 22), + // `Hello, world!` + string_literal_contents(22, 35), + ]), + ]), + ]) ] } } diff --git a/wdl-grammar/src/v1/tests/literal.rs b/wdl-grammar/src/v1/tests/literal.rs index d93c1192b..51f2841a9 100644 --- a/wdl-grammar/src/v1/tests/literal.rs +++ b/wdl-grammar/src/v1/tests/literal.rs @@ -134,7 +134,13 @@ fn it_successfully_parses_an_empty_double_quoted_string() { parser: WdlParser, input: "\"\"", rule: Rule::literal, - tokens: [string(0, 2, [double_quoted_string(0, 2)])] + tokens: [ + // `""` + string(0, 2, [ + // `"` + double_quote(0, 1), + ]) + ] } } @@ -144,7 +150,13 @@ fn it_successfully_parses_an_empty_single_quoted_string() { parser: WdlParser, input: "''", rule: Rule::literal, - tokens: [string(0, 2, [single_quoted_string(0, 2)])] + tokens: [ + // `''` + string(0, 2, [ + // `'` + single_quote(0, 1), + ]) + ] } } @@ -154,7 +166,15 @@ fn it_successfully_parses_a_double_quoted_string_with_a_unicode_character() { parser: WdlParser, input: "\"πŸ˜€\"", rule: Rule::literal, - tokens: [string(0, 6, [double_quoted_string(0, 6)])] + tokens: [ + // `"πŸ˜€"` + string(0, 6, [ + // `"` + double_quote(0, 1), + // `πŸ˜€` + string_literal_contents(1, 5), + ]) + ] } } @@ -164,7 +184,15 @@ fn it_successfully_parses_a_single_quoted_string_with_a_unicode_character() { parser: WdlParser, input: "'πŸ˜€'", rule: Rule::literal, - tokens: [string(0, 6, [single_quoted_string(0, 6)])] + tokens: [ + // `'πŸ˜€'` + string(0, 6, [ + // `'` + single_quote(0, 1), + // `πŸ˜€` + string_literal_contents(1, 5), + ]) + ] } } @@ -174,7 +202,15 @@ fn it_successfully_parses_a_double_quoted_string() { parser: WdlParser, input: "\"Hello, world!\"", rule: Rule::literal, - tokens: [string(0, 15, [double_quoted_string(0, 15)])] + tokens: [ + // `"Hello, world!"` + string(0, 15, [ + // `"` + double_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } @@ -184,7 +220,15 @@ fn it_successfully_parses_a_single_quoted_string() { parser: WdlParser, input: "'Hello, world!'", rule: Rule::literal, - tokens: [string(0, 15, [single_quoted_string(0, 15)])] + tokens: [ + // `'Hello, world!'` + string(0, 15, [ + // `'` + single_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } diff --git a/wdl-grammar/src/v1/tests/primitives/string.rs b/wdl-grammar/src/v1/tests/primitives/string.rs index ff7c0345a..a6e601e67 100644 --- a/wdl-grammar/src/v1/tests/primitives/string.rs +++ b/wdl-grammar/src/v1/tests/primitives/string.rs @@ -38,7 +38,7 @@ fn it_fails_to_parse_a_single_double_quote() { parser: WdlParser, input: "\"", rule: Rule::string, - positives: vec![Rule::char_special], + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 1 } @@ -50,7 +50,7 @@ fn it_fails_to_parse_a_single_single_quote() { parser: WdlParser, input: "\'", rule: Rule::string, - positives: vec![Rule::char_special], + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 1 } @@ -62,7 +62,13 @@ fn it_successfully_parses_an_empty_double_quoted_string() { parser: WdlParser, input: "\"\"", rule: Rule::string, - tokens: [string(0, 2, [double_quoted_string(0, 2)])] + tokens: [ + // `""` + string(0, 2, [ + // `"` + double_quote(0, 1), + ]) + ] } } @@ -72,7 +78,13 @@ fn it_successfully_parses_an_empty_single_quoted_string() { parser: WdlParser, input: "''", rule: Rule::string, - tokens: [string(0, 2, [single_quoted_string(0, 2)])] + tokens: [ + // `''` + string(0, 2, [ + // `'` + single_quote(0, 1), + ]) + ] } } @@ -82,7 +94,15 @@ fn it_successfully_parses_a_double_quoted_string_with_a_unicode_character() { parser: WdlParser, input: "\"πŸ˜€\"", rule: Rule::string, - tokens: [string(0, 6, [double_quoted_string(0, 6)])] + tokens: [ + // `"πŸ˜€"` + string(0, 6, [ + // `"` + double_quote(0, 1), + // `πŸ˜€` + string_literal_contents(1, 5), + ]) + ] } } @@ -92,7 +112,15 @@ fn it_successfully_parses_a_single_quoted_string_with_a_unicode_character() { parser: WdlParser, input: "'πŸ˜€'", rule: Rule::string, - tokens: [string(0, 6, [single_quoted_string(0, 6)])] + tokens: [ + // `'πŸ˜€'` + string(0, 6, [ + // `'` + single_quote(0, 1), + // `πŸ˜€` + string_literal_contents(1, 5), + ]) + ] } } @@ -102,7 +130,15 @@ fn it_successfully_parses_a_double_quoted_string() { parser: WdlParser, input: "\"Hello, world!\"", rule: Rule::string, - tokens: [string(0, 15, [double_quoted_string(0, 15)])] + tokens: [ + // `"Hello, world!"` + string(0, 15, [ + // `"` + double_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } @@ -112,6 +148,14 @@ fn it_successfully_parses_a_single_quoted_string() { parser: WdlParser, input: "'Hello, world!'", rule: Rule::string, - tokens: [string(0, 15, [single_quoted_string(0, 15)])] + tokens: [ + // `'Hello, world!'` + string(0, 15, [ + // `'` + single_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } diff --git a/wdl-grammar/src/v1/tests/primitives/string/double_quoted_string.rs b/wdl-grammar/src/v1/tests/primitives/string/double_quoted_string.rs index 708cd0b85..4f404f920 100644 --- a/wdl-grammar/src/v1/tests/primitives/string/double_quoted_string.rs +++ b/wdl-grammar/src/v1/tests/primitives/string/double_quoted_string.rs @@ -10,20 +10,8 @@ fn it_fails_to_parse_an_empty_double_quoted_string() { fails_with! { parser: WdlParser, input: "", - rule: Rule::double_quoted_string, - positives: vec![Rule::double_quoted_string], - negatives: vec![], - pos: 0 - } -} - -#[test] -fn it_fails_to_parse_single_quoted_string() { - fails_with! { - parser: WdlParser, - input: "'hello, world'", - rule: Rule::double_quoted_string, - positives: vec![Rule::double_quoted_string], + rule: Rule::string, + positives: vec![Rule::string], negatives: vec![], pos: 0 } @@ -34,8 +22,8 @@ fn it_fails_to_parse_a_single_double_quote() { fails_with! { parser: WdlParser, input: "\"", - rule: Rule::double_quoted_string, - positives: vec![Rule::char_special], + rule: Rule::string, + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 1 } @@ -46,8 +34,8 @@ fn it_fails_to_parse_a_string_with_a_newline() { fails_with! { parser: WdlParser, input: "\"Hello,\nworld!\"", - rule: Rule::double_quoted_string, - positives: vec![Rule::char_special], + rule: Rule::string, + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 7 } @@ -58,22 +46,34 @@ fn it_parses_an_empty_double_quoted_string() { parses_to! { parser: WdlParser, input: "\"\"", - rule: Rule::double_quoted_string, - tokens: [double_quoted_string(0, 2)] + rule: Rule::string, + tokens: [ + // `""` + string(0, 2, [ + // `"` + double_quote(0, 1), + ]) + ] } } #[test] fn it_successfully_parses_the_first_two_double_quotes() { - // This test will succeed, as `""`` matches the pattern, but the last double + // This test will succeed, as `""` matches the pattern, but the last double // quote will not be included. This is fine for parsing, as the now // unmatched `"` will throw an error. parses_to! { parser: WdlParser, input: "\"\"\"", - rule: Rule::double_quoted_string, - tokens: [double_quoted_string(0, 2)] + rule: Rule::string, + tokens: [ + // `""` + string(0, 2, [ + // `"` + double_quote(0, 1), + ]) + ] } } @@ -82,7 +82,15 @@ fn it_parses_a_double_quoted_string() { parses_to! { parser: WdlParser, input: "\"Hello, world!\"", - rule: Rule::double_quoted_string, - tokens: [double_quoted_string(0, 15)] + rule: Rule::string, + tokens: [ + // `"Hello, world!"` + string(0, 15, [ + // `"` + double_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } diff --git a/wdl-grammar/src/v1/tests/primitives/string/single_quoted_string.rs b/wdl-grammar/src/v1/tests/primitives/string/single_quoted_string.rs index f87bdb5fb..6c8fd97cc 100644 --- a/wdl-grammar/src/v1/tests/primitives/string/single_quoted_string.rs +++ b/wdl-grammar/src/v1/tests/primitives/string/single_quoted_string.rs @@ -10,32 +10,20 @@ fn it_fails_to_parse_an_empty_single_quoted_string() { fails_with! { parser: WdlParser, input: "", - rule: Rule::single_quoted_string, - positives: vec![Rule::single_quoted_string], + rule: Rule::string, + positives: vec![Rule::string], negatives: vec![], pos: 0 } } #[test] -fn it_fails_to_parse_single_quoted_string() { - fails_with! { - parser: WdlParser, - input: "\"hello, world\"", - rule: Rule::single_quoted_string, - positives: vec![Rule::single_quoted_string], - negatives: vec![], - pos: 0 - } -} - -#[test] -fn it_fails_to_parse_a_single_double_quote() { +fn it_fails_to_parse_a_single_single_quote() { fails_with! { parser: WdlParser, input: "\'", - rule: Rule::single_quoted_string, - positives: vec![Rule::char_special], + rule: Rule::string, + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 1 } @@ -46,8 +34,8 @@ fn it_fails_to_parse_a_string_with_a_newline() { fails_with! { parser: WdlParser, input: "'Hello,\nworld!'", - rule: Rule::single_quoted_string, - positives: vec![Rule::char_special], + rule: Rule::string, + positives: vec![Rule::char_special, Rule::string_placeholder_start], negatives: vec![], pos: 7 } @@ -58,22 +46,34 @@ fn it_parses_an_empty_single_quoted_string() { parses_to! { parser: WdlParser, input: "''", - rule: Rule::single_quoted_string, - tokens: [single_quoted_string(0, 2)] + rule: Rule::string, + tokens: [ + // `''` + string(0, 2, [ + // `'` + single_quote(0, 1), + ]) + ] } } #[test] fn it_successfully_parses_the_first_two_double_quotes() { - // This test will succeed, as `""`` matches the pattern, but the last double + // This test will succeed, as `''` matches the pattern, but the last single // quote will not be included. This is fine for parsing, as the now - // unmatched `"` will throw an error. + // unmatched `'` will throw an error. parses_to! { parser: WdlParser, input: "'''", - rule: Rule::single_quoted_string, - tokens: [single_quoted_string(0, 2)] + rule: Rule::string, + tokens: [ + // `''` + string(0, 2, [ + // `'` + single_quote(0, 1), + ]) + ] } } @@ -82,7 +82,15 @@ fn it_parses_a_single_quoted_string() { parses_to! { parser: WdlParser, input: "'Hello, world!'", - rule: Rule::single_quoted_string, - tokens: [single_quoted_string(0, 15)] + rule: Rule::string, + tokens: [ + // `'Hello, world!'` + string(0, 15, [ + // `'` + single_quote(0, 1), + // `Hello, world!` + string_literal_contents(1, 14), + ]) + ] } } diff --git a/wdl-grammar/src/v1/validation/invalid_escape_character.rs b/wdl-grammar/src/v1/validation/invalid_escape_character.rs index ca00af300..4eb364e1d 100644 --- a/wdl-grammar/src/v1/validation/invalid_escape_character.rs +++ b/wdl-grammar/src/v1/validation/invalid_escape_character.rs @@ -50,7 +50,7 @@ mod tests { #[test] fn it_catches_an_invalid_escape_character() -> Result<(), Box> { - let tree = Parser::parse(Rule::double_quoted_string, "\"\\.\"")?; + let tree = Parser::parse(Rule::string, "\"\\.\"")?; let error = InvalidEscapeCharacter.validate(tree).unwrap_err(); assert_eq!( diff --git a/wdl-grammar/src/v1/wdl.pest b/wdl-grammar/src/v1/wdl.pest index 36bade575..2e3091075 100644 --- a/wdl-grammar/src/v1/wdl.pest +++ b/wdl-grammar/src/v1/wdl.pest @@ -28,9 +28,9 @@ COMMENT = { "#" ~ (!LINE_ENDING ~ ANY)* } // Atoms // // =======// -OPTION = { "?" } -ONE_OR_MORE = { "+" } -COMMA = { "," } +OPTION = { "?" } +ONE_OR_MORE = { "+" } +COMMA = { "," } // ==========// // Literals // @@ -74,37 +74,47 @@ number = _{ float | integer } // parsing of these invalid string characters so that we can return lint errors // for them (rather than fail parsing, which returns a relatively unhelpful // error message at the time of writing). -// +// // If you wish to remove this leniency, you can remove the // `char_escaped_invalid` rule and its inclusion in the `char_escaped` rule. -char_escaped_invalid = @{ "\\" ~ ANY } -char_escaped = ${ "\\" ~ ("\\" | "\"" | "\'" | "n" | "r" | "b" | "t" | "f" | "a" | "v" | "?") } -char_octal = @{ "\\" ~ ASCII_OCT_DIGIT{1, 3} ~ !ASCII_OCT_DIGIT } -char_hex = @{ "\\x" ~ ASCII_HEX_DIGIT+ } -char_unicode_four = @{ "\\" ~ ("u" | "U") ~ ASCII_HEX_DIGIT{4} ~ !ASCII_HEX_DIGIT } -char_unicode_eight = @{ "\\" ~ ("u" | "U") ~ ASCII_HEX_DIGIT{8} ~ !ASCII_HEX_DIGIT } -char_unicode = ${ char_unicode_four | char_unicode_eight } -char_special = ${ char_escaped | char_hex | char_unicode | char_octal | char_escaped_invalid } -char_other_double_quote = @{ !("\\" | "\"" | "\n") ~ ANY } -char_other_single_quote = @{ !("\\" | "\'" | "\n") ~ ANY } +char_escaped_invalid = @{ "\\" ~ ANY } +char_escaped = ${ "\\" ~ ("\\" | "\"" | "\'" | "n" | "r" | "b" | "t" | "f" | "a" | "v" | "?") } +char_octal = @{ "\\" ~ ASCII_OCT_DIGIT{1, 3} ~ !ASCII_OCT_DIGIT } +char_hex = @{ "\\x" ~ ASCII_HEX_DIGIT+ } +char_unicode_four = @{ "\\" ~ ("u" | "U") ~ ASCII_HEX_DIGIT{4} ~ !ASCII_HEX_DIGIT } +char_unicode_eight = @{ "\\" ~ ("u" | "U") ~ ASCII_HEX_DIGIT{8} ~ !ASCII_HEX_DIGIT } +char_unicode = ${ char_unicode_four | char_unicode_eight } +char_special = ${ char_escaped | char_hex | char_unicode | char_octal | char_escaped_invalid } +char_other = @{ !("\\" | "\n") ~ ANY } // String. -double_quoted_string = @{ - "\"" ~ (char_special | char_other_double_quote)* ~ "\"" + +double_quote = { "\"" } +single_quote = { "\'" } +string_placeholder_start = { "~{" | "${" } +string_placeholder_end = { "}" } + +string_literal_contents = @{ + // NOTE: the `PEEK` here looks at what quoting is being used (double or + // single) and denies any use of the character in literal contents. The + // presence of `PEEK` implies that this rule must be embedded within a rule + // that `PUSH`es these tokens on the stack. + (!string_placeholder_start ~ !PEEK ~ (char_special | char_other))+ } -single_quoted_string = @{ - "\'" ~ (char_special | char_other_single_quote)* ~ "\'" + +string_placeholder_expression = { string_placeholder_contents | expression } + +string_placeholder_contents = ${ + string_placeholder_start ~ expression_placeholder_options* ~ string_placeholder_expression ~ string_placeholder_end } -// For strings, whether the string is double-quoted or single-quote is not -// important context when parsing. HOWEVER, we would like to return whether the -// strings are single or double quoted for linting purposes. Thus, the type of -// quoting is retained. -// // NOTE: all rules included in `string` must be marked as atomic (`@`) or -// compound-atomic (`$`). This is not checked by the compiler, so you must +// compound-atomic (`$`). This is because we don't want rules eating up +// whitespace within a string. This is not checked by the compiler, so you must // ensure it remains true. -string = { (double_quoted_string | single_quoted_string) } +string = ${ + PUSH(double_quote | single_quote) ~ (string_placeholder_contents | string_literal_contents)* ~ POP +} // Identifier. identifier = @{ ASCII_ALPHA ~ (ASCII_ALPHANUMERIC | "_")* } @@ -196,10 +206,7 @@ core = _{ // As such, you will see that none of the permutations of the rule below end in // an optional token. That is by design to avoid the problem above. expression = ${ - prefix* ~ (WHITESPACE | COMMENT)* ~ core ~ (WHITESPACE | COMMENT)* ~ postfix* ~ ( - (WHITESPACE | COMMENT)* ~ infix ~ (WHITESPACE | COMMENT)* ~ prefix* ~ (WHITESPACE | COMMENT)* ~ core ~ (WHITESPACE | COMMENT)* ~ postfix+ | - (WHITESPACE | COMMENT)* ~ infix ~ (WHITESPACE | COMMENT)* ~ prefix* ~ (WHITESPACE | COMMENT)* ~ core - )+ + prefix* ~ (WHITESPACE | COMMENT)* ~ core ~ (WHITESPACE | COMMENT)* ~ postfix* ~ ((WHITESPACE | COMMENT)* ~ infix ~ (WHITESPACE | COMMENT)* ~ prefix* ~ (WHITESPACE | COMMENT)* ~ core ~ (WHITESPACE | COMMENT)* ~ postfix+ | (WHITESPACE | COMMENT)* ~ infix ~ (WHITESPACE | COMMENT)* ~ prefix* ~ (WHITESPACE | COMMENT)* ~ core)+ | prefix* ~ (WHITESPACE | COMMENT)* ~ core ~ (WHITESPACE | COMMENT)* ~ postfix+ | prefix* ~ (WHITESPACE | COMMENT)* ~ core } @@ -209,19 +216,14 @@ expression = ${ // NOTE: techically the spec calls the optional `+` the "non-empty" operator. // Since we have already defined this as the "one or more" operator and they // mean effectively the same thing, I've just kept this nomeclature. -array_type = ${ - "Array" ~ - (WHITESPACE | COMMENT)* ~ - ( - ("[" ~ (WHITESPACE | COMMENT)* ~ wdl_type_inner ~ (WHITESPACE | COMMENT)* ~ "]" ~ ONE_OR_MORE) | - ("[" ~ (WHITESPACE | COMMENT)* ~ wdl_type_inner ~ (WHITESPACE | COMMENT)* ~ "]") - ) +array_type = ${ + "Array" ~ (WHITESPACE | COMMENT)* ~ (("[" ~ (WHITESPACE | COMMENT)* ~ wdl_type_inner ~ (WHITESPACE | COMMENT)* ~ "]" ~ ONE_OR_MORE) | ("[" ~ (WHITESPACE | COMMENT)* ~ wdl_type_inner ~ (WHITESPACE | COMMENT)* ~ "]")) } // NOTE: The `map_type` and `pair_type` rules **must** be marked as non-atomic, as the // `unbound_declaration` and `bound_declaration` rules that use them are marked // as compound-atomic. -map_type = !{ "Map" ~ "[" ~ wdl_type_inner ~ COMMA ~ wdl_type_inner ~ "]" } -pair_type = !{ "Pair" ~ "[" ~ wdl_type_inner ~ COMMA ~ wdl_type_inner ~ "]" } +map_type = !{ "Map" ~ "[" ~ wdl_type_inner ~ COMMA ~ wdl_type_inner ~ "]" } +pair_type = !{ "Pair" ~ "[" ~ wdl_type_inner ~ COMMA ~ wdl_type_inner ~ "]" } string_type = { "String" } file_type = { "File" } bool_type = { "Boolean" } @@ -260,7 +262,7 @@ wdl_type_inner = ${ // **must** be a whitespace between the `wdl_type` and the `identifier` for // `bound_declaration`s and `unbound_declaration`s. Else, you get weird things // happening in these rules. -// +// // For example, when considering `IntermediateFiles`, `Int` matching the integer // `wdl_type` and `ermediateFiles` matching the `identifier`. wdl_type = ${ @@ -379,28 +381,33 @@ task_input = { common_input } // Task output. task_output = { common_output } -// Task expression placeholders. -// +// Expression placeholder options. +// +// Expression placeholder options can be used anywhere where expression +// placeholders are evaluated, including (but not limited to): +// +// * Strings +// // DIVERGE: the specification states that any expression placeholder conforms to // the pattern `option="value"`. However, it is clear from the examples in the // spec that single quoted strings are also allowed. Thus, we allow for either // single or double quoted strings hereβ€”we will leave the selection of which to // use up to a linting question. -// +// // LENIENT: the specification is pretty clear that no spaces are allowed between // the option and the equals sign or the equals sign and the value. However, // many tools choose to allow spaces here. As such, we will allow spaces in // between these elements, but we will throw a lint warning for these cases. -expression_placeholder_sep = { "sep" ~ "=" ~ string } -expression_placeholder_boolean = { boolean ~ "=" ~ string } -expression_placeholder_default = { "default" ~ "=" ~ string } - -expression_placeholder = { - expression_placeholder_sep | - expression_placeholder_boolean | - expression_placeholder_default +expression_placeholder_option_sep = { "sep" ~ "=" ~ string } +expression_placeholder_option_boolean = { boolean ~ "=" ~ string } +expression_placeholder_option_default = { "default" ~ "=" ~ string } + +expression_placeholder_option = { + expression_placeholder_option_sep + | expression_placeholder_option_boolean + | expression_placeholder_option_default } -expression_placeholders = { expression_placeholder+ } +expression_placeholder_options = { expression_placeholder_option+ } // Task commands, curly. command_curly_begin = { "command" ~ "{" } @@ -416,10 +423,7 @@ command_curly_literal_contents = _{ command_curly_interpolated_expression = { command_curly_interpolated_contents | expression } command_curly_interpolated_contents = { - command_curly_interpolation_start ~ - expression_placeholders* ~ - command_curly_interpolated_expression ~ - command_curly_interpolation_end + command_curly_interpolation_start ~ expression_placeholder_options* ~ command_curly_interpolated_expression ~ command_curly_interpolation_end } command_curly = { @@ -439,10 +443,7 @@ command_heredoc_literal_contents = _{ command_heredoc_interpolated_expression = { command_heredoc_interpolated_contents | expression } command_heredoc_interpolated_contents = { - command_heredoc_interpolation_start ~ - expression_placeholders* ~ - command_heredoc_interpolated_expression ~ - command_heredoc_interpolation_end + command_heredoc_interpolation_start ~ expression_placeholder_options* ~ command_heredoc_interpolated_expression ~ command_heredoc_interpolation_end } command_heredoc = {