
Commit 717067c
Parent: 202e79c
Committed: Mar 18, 2025

refactor(js_parser): re-scan template literal parts and regex tokens
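From the tokenizer's point of view, both constructs touched by this commit are ambiguous: '/a/g' can be a chain of divisions or a single regex literal, and '}' can close a block or resume a template literal after an interpolated '${...}' expression. Only the parser has the context to tell. Previously the parser flipped mode flags on the tokenizer (assume_bslash_starts_regex, assume_rbrace_is_template_part) before requesting the next token; this commit removes the flags and instead lets the parser ask the tokenizer to rewind and re-scan after the fact. A minimal sketch of the new flow, assuming `Tokenizer` is the struct defined by src/js/tokenize.zig (the import path is illustrative):

const std = @import("std");
const Tokenizer = @import("js/tokenize.zig"); // hypothetical import path

test "re-scan promotes '/' to a regex literal" {
    var tokenizer = try Tokenizer.init("/a/g", .{});

    // First pass: the tokenizer picks the simple interpretation.
    const div_token = try tokenizer.next();
    try std.testing.expectEqual(.@"/", div_token.tag);

    // The parser, expecting an expression here, requests a re-scan:
    // the tokenizer rewinds to the '/' and lexes a regex literal instead.
    const regex_token = try tokenizer.reScanRegexLiteral(&div_token);
    try std.testing.expectEqual(.regex_literal, regex_token.tag);
}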

File tree: 2 files changed (+105, -74 lines)

 

src/js/parser.zig (+62, -35)
@@ -2846,8 +2846,6 @@ fn next(self: *Self) Error!TokenWithId {
 }
 
 /// Consume the next JSX token from the lexer, skipping all comments and whitespaces.
-///
-/// [is_inside_jsx_tags]: Whether we are currently inside JSX tags (i.e. between '<' and '>').
 fn nextJsx(self: *Self) Error!TokenWithId {
     var next_token = try self.tokenizer.nextJsxChild();
     while (next_token.tag == .comment or next_token.tag == .whitespace) {
@@ -2858,19 +2856,49 @@ fn nextJsx(self: *Self) Error!TokenWithId {
     return self.advanceToToken(next_token);
 }
 
+/// Discard the '/' or '/=' token that was just scanned,
+/// and re-tokenize it as a regex literal.
+///
+/// Mutates `self.current` and `self.tokens`.
+fn reScanRegexLiteral(self: *Self) Error!void {
+    const token = &self.current.token;
+    assert(token.tag == .@"/" or token.tag == .@"/=");
+
+    const regex_token = try self.tokenizer.reScanRegexLiteral(token);
+    assert(regex_token.tag == .regex_literal);
+
+    self.tokens.items[@intFromEnum(self.current.id)] = regex_token;
+    self.current.token = regex_token;
+}
+
+/// Discard the '}' token that was just scanned, and replace it
+/// with a re-tokenized `.template_literal_part` token.
+///
+/// Mutates `self.current` and `self.tokens`.
+fn reScanTemplatePart(self: *Self) Error!void {
+    const cur = &self.current.token;
+    assert(cur.tag == .@"}");
+
+    const template_part = try self.tokenizer.reScanTemplatePart(cur);
+    assert(template_part.tag == .template_literal_part);
+
+    // Replace the '}' token in the buffer with the template part token.
+    self.tokens.items[@intFromEnum(self.current.id)] = template_part;
+    self.current.token = template_part;
+}
+
 /// Set [next_token] as the current token, and update `self.current` and `self.prev_token_line`.
-/// Returns the newly saved token along with its ID.
+/// Returns the old value of `self.current`.
 fn advanceToToken(self: *Self, next_token: Token) error{OutOfMemory}!TokenWithId {
     try self.saveToken(next_token);
 
-    const current = self.current.token;
-    const current_id = self.current.id;
+    const prev = self.current;
 
     self.current.token = next_token;
     self.current.id = @enumFromInt(self.tokens.items.len - 1);
-    self.prev_token_line = current.line;
+    self.prev_token_line = prev.token.line;
 
-    return TokenWithId{ .token = current, .id = current_id };
+    return prev;
 }
 
 /// Initialize `self.current` by consuming the first token.
@@ -4285,26 +4313,24 @@ fn completeComputedMemberExpression(self: *Self, object: Node.Index) Error!Node.
 }
 
 fn primaryExpression(self: *Self) Error!Node.Index {
-    // If we're currently at a '/' or '/=' token,
-    // we have probably mistaken a regex literal's opening '/' for an operator.
-    // We'll rewind the tokenizer and try to parse a regex literal instead.
     const cur = &self.current.token;
     if (cur.tag == .@"/" or cur.tag == .@"/=") {
-        // TODO: separate this re-lexing out into a separate function.
-        // Go back to the beginning of '/'
-        self.tokenizer.rewind(cur.start, cur.line);
-        self.tokenizer.assume_bslash_starts_regex = true;
-
-        // re-tokenize the regex literal
-        self.current.token = try self.tokenizer.next();
-        self.current.id = @enumFromInt(self.tokens.items.len);
-        try self.saveToken(self.current.token);
+        // If we're currently at a '/' or '/=' token,
+        // we have probably mistaken a regex literal's opening '/' for an operator.
+        // We'll rewind the tokenizer and try to parse a regex literal instead.
+        try self.reScanRegexLiteral();
+        assert(self.current.token.tag == .regex_literal);
 
-        self.tokenizer.assume_bslash_starts_regex = false;
+        const regex_token = try self.next();
+        return self.addNode(
+            .{ .regex_literal = regex_token.id },
+            regex_token.id,
+            regex_token.id,
+        );
+    } else if (cur.tag == .template_literal_part) {
+        return self.templateLiteral();
     }
 
-    if (cur.tag == .template_literal_part) return self.templateLiteral();
-
     switch (cur.tag) {
         .kw_class => return self.classExpression(),
         .kw_this => {
@@ -4399,6 +4425,11 @@ fn jsxFragmentOrElement(self: *Self) Error!Node.Index {
     return self.jsxElement(lt_token.id);
 }
 
+/// JSXElement:
+///     JSXOpeningElement JSXChildren? JSXClosingElement
+///     JSXSelfClosingElement
+///
+/// https://facebook.github.io/jsx/#prod-JSXElement
 fn jsxElement(self: *Self, lt_token: Token.Index) Error!Node.Index {
     const opening_element = try self.jsxOpeningElement(lt_token);
     const children: Node.Index = blk: {
@@ -4990,20 +5021,16 @@ fn templateLiteral(self: *Self) Error!Node.Index {
         // parse an interpolation expression.
         try self.scratch.append(try self.expression());
 
-        // The most recently processed (but unconsumed) token should be a '}'.
-        // We want to rewind back one character, and make the tokenizer treat the '}'
-        // as a part of a template literal.
-        self.tokenizer.rewind(self.current.token.start, self.current.token.line);
-        self.tokenizer.assume_rbrace_is_template_part = true;
-
-        // TODO: separate this re-lexing out into its own function.
-        self.current.token = try self.tokenizer.next();
-        try self.saveToken(self.current.token);
-        self.current.id = @enumFromInt(self.tokens.items.len - 1);
-
-        self.tokenizer.assume_rbrace_is_template_part = false;
+        // After parsing the interpolated expression,
+        // the current token should be a '}'. Now, we re-scan starting
+        // from '}' to the next '${', or the end of the template literal.
+        if (self.current.token.tag != .@"}") {
+            try self.emitBadTokenDiagnostic("'}}' after template expression", &self.current.token);
+            return Error.UnexpectedToken;
+        }
 
-        template_token = try self.expect(.template_literal_part);
+        try self.reScanTemplatePart();
+        template_token = try self.next();
 
         // Now, parse the template part that follows
        try self.scratch.append(try self.addNode(
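Note that the parser-side wrappers do more than delegate to the tokenizer: by the time the parser notices the mistake, the '/' or '}' token has already been saved into `self.tokens`, and AST nodes refer to tokens by index (e.g. `.{ .regex_literal = regex_token.id }` above), so the wrapper also overwrites the saved slot in place. A condensed sketch of the regex path, using only names that appear in the diff above (the function name is hypothetical, not a verbatim excerpt):

// Condensed sketch of the regex path through primaryExpression.
fn regexPrimaryExpression(self: *Self) Error!Node.Index {
    // 1. A '/' was scanned where an expression is expected.
    assert(self.current.token.tag == .@"/" or self.current.token.tag == .@"/=");

    // 2. Re-scan from the '/' and patch the already-saved token in place, so any
    //    Token.Index handed out earlier now resolves to the regex literal.
    const regex_token = try self.tokenizer.reScanRegexLiteral(&self.current.token);
    self.tokens.items[@intFromEnum(self.current.id)] = regex_token;
    self.current.token = regex_token;

    // 3. Consume it. advanceToToken now returns the old value of `self.current`,
    //    which is exactly the re-scanned regex token.
    const consumed = try self.next();
    return self.addNode(.{ .regex_literal = consumed.id }, consumed.id, consumed.id);
}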

src/js/tokenize.zig (+43, -39)
@@ -70,12 +70,6 @@ source: []const u8,
 index: u32 = 0,
 /// Current line number (0 indexed).
 line: u32 = 0,
-/// When `true`, the tokenizer will attempt to parse any remaining input
-/// starting with '/' (that isn't a comment starter) as a regex literal (e.g: '/[a-zA-Z0-9]/').
-/// Otherwise, '/' (when not starting a comment) is assumed to be either the '/' or '/=' operator.
-/// This property is used to disambiguate between division operators and regex literals.
-assume_bslash_starts_regex: bool = false,
-
 /// When `true`, the tokenizer assumes that a '}' character is part of a template
 /// literal after an interpolated expression, and not a "}" token.
 /// e.g:
@@ -120,12 +114,38 @@ pub fn next(self: *Self) Error!Token {
     return token;
 }
 
-/// Returns the next JSX token.
-/// A JSX token is one of: '<', '>', '{', '}', or JSX text.
-/// To tokenize JS expressions inside JSX (e.g: prop values), the 'next' function should be used instead.
-/// The caller (parser) must know when to call `nextJsx` or `next` based on surrounding context.
+/// Regex literals can be ambiguous with "/" or "/=" from the tokenizer's perspective,
+/// as "/a/g" can mean either ['/', 'a', '/', 'g'], or ['/a/g' (regex)].
+///
+/// When the parser sees a "/" or "/=" token, but it expects an expression, it should
+/// call this function to re-scan the source code starting from the '/' character.
+///
+/// [div_token]: The token that was previously returned by the tokenizer when it saw the '/' character
+/// (must be a "/" or "/=" token).
+pub fn reScanRegexLiteral(self: *Self, div_token: *const Token) Error!Token {
+    assert(div_token.tag == .@"/" or div_token.tag == .@"/=");
+    self.rewind(div_token.start, div_token.line);
+    return self.regexLiteral();
+}
+
+/// Inside template literals, a "}" should be treated as part of a template string,
+/// instead of a lone '}' token.
+/// So '`foo${bar}baz`' should be tokenized as: '`foo${', 'bar', '}baz`'.
 ///
-/// [is_inside_jsx_tags] is `true` when the tokenizer is inside a '<' and '>' pair.
+/// When the parser receives a '}' token after 'baz', it should call this function
+/// to re-scan the source code starting from the '}' character, and tokenize it as a template part.
+///
+/// [rbrace_token]: The token that was previously returned by the tokenizer when it saw the '}' character.
+pub fn reScanTemplatePart(self: *Self, rbrace_token: *const Token) Error!Token {
+    assert(rbrace_token.tag == .@"}");
+    self.rewind(rbrace_token.start, rbrace_token.line);
+    return self.templateAfterInterpolation();
+}
+
+/// Returns the next token that starts a JSX child.
+/// The token returned is one of: '<', '{', or JSX text.
+/// To tokenize JS expressions inside JSX (e.g: prop values), the 'next' function should be used instead.
+/// The caller (parser) must know when to call `nextJsxChild` or `next` based on surrounding context.
 pub fn nextJsxChild(self: *Self) Error!Token {
     const byte = self.peekByte() orelse {
         return Token{
@@ -248,17 +268,6 @@ fn consumeToken(self: *Self) Error!Token {
         '/' => {
             if (try self.comment()) |tok|
                 return tok;
-
-            // Parsing regex literals is awkward.
-            // A '/abc' can either be the start of a regex literal,
-            // or a '/' (division) token followed by an 'abc' (identifier) token.
-            //
-            // The parser has to tell the tokenizer what it expects
-            // to see next. If it expects to see a literal, then
-            // we want to try tokenizing a regex literal.
-            // Otherwise, we look for '/' or '/='.
-            if (self.assume_bslash_starts_regex)
-                return try self.regexLiteral();
             return try self.punctuator();
         },
         ' ',
@@ -270,9 +279,6 @@ fn consumeToken(self: *Self) Error!Token {
         '\u{000C}',
         => return self.whiteSpaces(),
         '}' => {
-            if (self.assume_rbrace_is_template_part)
-                return try self.templateAfterInterpolation();
-
             self.index += 1;
             return Token{
                 .start = self.index - 1,
@@ -1875,19 +1881,16 @@ test Self {
 
     {
         var tokenizer = try Self.init(" /a\\(bc[some_character_class]/g //foo", .{});
-        tokenizer.assume_bslash_starts_regex = true; // '/' is now interpreted as regex literal start marker.
-        try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.regex_literal, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.comment, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.eof, (try tokenizer.next()).tag);
-    }
 
-    {
-        var tokenizer = try Self.init(" /a\\(bc[some_character_class]/g //foo", .{});
-        tokenizer.assume_bslash_starts_regex = true; // '/' is now interpreted as regex literal start marker.
-        try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.regex_literal, (try tokenizer.next()).tag);
+        // by default '/' is interpreted as a division operator.
+        const div_token = try tokenizer.next();
+        try t.expectEqual(.@"/", div_token.tag);
+
+        // Then it can be re-scanned as a regex literal.
+        const regex_token = try tokenizer.reScanRegexLiteral(&div_token);
+        try t.expectEqual(.regex_literal, regex_token.tag);
+
         try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.comment, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.eof, (try tokenizer.next()).tag);
@@ -1897,9 +1900,10 @@ test Self {
         var tokenizer = try Self.init("`hello ${'world'}`", .{});
         try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag);
         try t.expectEqual(.string_literal, (try tokenizer.next()).tag);
-        tokenizer.assume_rbrace_is_template_part = true;
-        try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag);
-        tokenizer.assume_rbrace_is_template_part = false;
+        const rb_token = try tokenizer.next();
+        try t.expectEqual(.@"}", rb_token.tag);
+        const template_part = try tokenizer.reScanTemplatePart(&rb_token);
+        try t.expectEqual(.template_literal_part, template_part.tag);
         try t.expectEqual(.eof, (try tokenizer.next()).tag);
     }
 }
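A template with several interpolations exercises reScanTemplatePart once per closing '}'. As a sketch (not part of the commit, reusing the same test style, where `t` is the testing namespace from the existing tests), '`a${'x'}b${'y'}c`' would be driven like this:

{
    var tokenizer = try Self.init("`a${'x'}b${'y'}c`", .{});
    try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag); // `a${
    try t.expectEqual(.string_literal, (try tokenizer.next()).tag); // 'x'

    // Each '}' first comes out as a plain token...
    const rb1 = try tokenizer.next();
    try t.expectEqual(.@"}", rb1.tag);
    // ...and is then re-scanned into the template part that follows it.
    const part1 = try tokenizer.reScanTemplatePart(&rb1); // }b${
    try t.expectEqual(.template_literal_part, part1.tag);

    try t.expectEqual(.string_literal, (try tokenizer.next()).tag); // 'y'
    const rb2 = try tokenizer.next();
    try t.expectEqual(.@"}", rb2.tag);
    const part2 = try tokenizer.reScanTemplatePart(&rb2); // }c`
    try t.expectEqual(.template_literal_part, part2.tag);

    try t.expectEqual(.eof, (try tokenizer.next()).tag);
}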
