
Commit 717067c
Parent: 202e79c
Committed: Mar 18, 2025

refactor(js_parser): re-scan template literal parts and regex tokens
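From the tokenizer's point of view, both constructs touched by this commit are ambiguous: '/a/g' can be a chain of divisions or a single regex literal, and '}' can close a block or resume a template literal after an interpolated '${...}' expression. Only the parser has the context to tell. Previously the parser flipped mode flags on the tokenizer (assume_bslash_starts_regex, assume_rbrace_is_template_part) before requesting the next token; this commit removes the flags and instead lets the parser ask the tokenizer to rewind and re-scan after the fact. A minimal sketch of the new flow, assuming `Tokenizer` is the struct defined by src/js/tokenize.zig (the import path is illustrative):

const std = @import("std");
const Tokenizer = @import("js/tokenize.zig"); // hypothetical import path

test "re-scan promotes '/' to a regex literal" {
    var tokenizer = try Tokenizer.init("/a/g", .{});

    // First pass: the tokenizer picks the simple interpretation.
    const div_token = try tokenizer.next();
    try std.testing.expectEqual(.@"/", div_token.tag);

    // The parser, expecting an expression here, requests a re-scan:
    // the tokenizer rewinds to the '/' and lexes a regex literal instead.
    const regex_token = try tokenizer.reScanRegexLiteral(&div_token);
    try std.testing.expectEqual(.regex_literal, regex_token.tag);
}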

File tree: 2 files changed (+105, -74 lines)

 

src/js/parser.zig (+62, -35)
@@ -2846,8 +2846,6 @@ fn next(self: *Self) Error!TokenWithId {
 }
 
 /// Consume the next JSX token from the lexer, skipping all comments and whitespaces.
-///
-/// [is_inside_jsx_tags]: Whether we are currently inside JSX tags (i.e. between '<' and '>').
 fn nextJsx(self: *Self) Error!TokenWithId {
     var next_token = try self.tokenizer.nextJsxChild();
     while (next_token.tag == .comment or next_token.tag == .whitespace) {
@@ -2858,19 +2856,49 @@ fn nextJsx(self: *Self) Error!TokenWithId {
     return self.advanceToToken(next_token);
 }
 
+/// Discard the '/' or '/=' token that was just scanned,
+/// and re-tokenize it as a regex literal.
+///
+/// Mutates `self.current` and `self.tokens`.
+fn reScanRegexLiteral(self: *Self) Error!void {
+    const token = &self.current.token;
+    assert(token.tag == .@"/" or token.tag == .@"/=");
+
+    const regex_token = try self.tokenizer.reScanRegexLiteral(token);
+    assert(regex_token.tag == .regex_literal);
+
+    self.tokens.items[@intFromEnum(self.current.id)] = regex_token;
+    self.current.token = regex_token;
+}
+
+/// Discard the '}' token that was just scanned, and replace it
+/// with a re-tokenized `.template_literal_part` token.
+///
+/// Mutates `self.current` and `self.tokens`.
+fn reScanTemplatePart(self: *Self) Error!void {
+    const cur = &self.current.token;
+    assert(cur.tag == .@"}");
+
+    const template_part = try self.tokenizer.reScanTemplatePart(cur);
+    assert(template_part.tag == .template_literal_part);
+
+    // Replace the '}' token in the buffer with the template part token.
+    self.tokens.items[@intFromEnum(self.current.id)] = template_part;
+    self.current.token = template_part;
+}
+
 /// Set [next_token] as the current token, and update `self.current` and `self.prev_token_line`.
-/// Returns the newly saved token along with its ID.
+/// Returns the old value of `self.current`.
 fn advanceToToken(self: *Self, next_token: Token) error{OutOfMemory}!TokenWithId {
     try self.saveToken(next_token);
 
-    const current = self.current.token;
-    const current_id = self.current.id;
+    const prev = self.current;
 
     self.current.token = next_token;
     self.current.id = @enumFromInt(self.tokens.items.len - 1);
-    self.prev_token_line = current.line;
+    self.prev_token_line = prev.token.line;
 
-    return TokenWithId{ .token = current, .id = current_id };
+    return prev;
 }
 
 /// Initialize `self.current` by consuming the first token.
@@ -4285,26 +4313,24 @@ fn completeComputedMemberExpression(self: *Self, object: Node.Index) Error!Node.
 }
 
 fn primaryExpression(self: *Self) Error!Node.Index {
-    // If we're currently at a '/' or '/=' token,
-    // we have probably mistaken a regex literal's opening '/' for an operator.
-    // We'll rewind the tokenizer and try to parse a regex literal instead.
     const cur = &self.current.token;
     if (cur.tag == .@"/" or cur.tag == .@"/=") {
-        // TODO: separate this re-lexing out into a separate function.
-        // Go back to the beginning of '/'
-        self.tokenizer.rewind(cur.start, cur.line);
-        self.tokenizer.assume_bslash_starts_regex = true;
-
-        // re-tokenize the regex literal
-        self.current.token = try self.tokenizer.next();
-        self.current.id = @enumFromInt(self.tokens.items.len);
-        try self.saveToken(self.current.token);
+        // If we're currently at a '/' or '/=' token,
+        // we have probably mistaken a regex literal's opening '/' for an operator.
+        // We'll rewind the tokenizer and try to parse a regex literal instead.
+        try self.reScanRegexLiteral();
+        assert(self.current.token.tag == .regex_literal);
 
-        self.tokenizer.assume_bslash_starts_regex = false;
+        const regex_token = try self.next();
+        return self.addNode(
+            .{ .regex_literal = regex_token.id },
+            regex_token.id,
+            regex_token.id,
+        );
+    } else if (cur.tag == .template_literal_part) {
+        return self.templateLiteral();
     }
 
-    if (cur.tag == .template_literal_part) return self.templateLiteral();
-
     switch (cur.tag) {
         .kw_class => return self.classExpression(),
         .kw_this => {
@@ -4399,6 +4425,11 @@ fn jsxFragmentOrElement(self: *Self) Error!Node.Index {
     return self.jsxElement(lt_token.id);
 }
 
+/// JSXElement:
+///     JSXOpeningElement JSXChildren? JSXClosingElement
+///     JSXSelfClosingElement
+///
+/// https://facebook.github.io/jsx/#prod-JSXElement
 fn jsxElement(self: *Self, lt_token: Token.Index) Error!Node.Index {
     const opening_element = try self.jsxOpeningElement(lt_token);
     const children: Node.Index = blk: {
@@ -4990,20 +5021,16 @@ fn templateLiteral(self: *Self) Error!Node.Index {
         // parse an interpolation expression.
         try self.scratch.append(try self.expression());
 
-        // The most recently processed (but unconsumed) token should be a '}'.
-        // We want to rewind back one character, and make the tokenizer treat the '}'
-        // as a part of a template literal.
-        self.tokenizer.rewind(self.current.token.start, self.current.token.line);
-        self.tokenizer.assume_rbrace_is_template_part = true;
-
-        // TODO: separate this re-lexing out into its own function.
-        self.current.token = try self.tokenizer.next();
-        try self.saveToken(self.current.token);
-        self.current.id = @enumFromInt(self.tokens.items.len - 1);
-
-        self.tokenizer.assume_rbrace_is_template_part = false;
+        // After parsing the interpolated expression,
+        // the current token should be a '}'. Now, we re-scan starting
+        // from '}' to the next '${', or the end of the template literal.
+        if (self.current.token.tag != .@"}") {
+            try self.emitBadTokenDiagnostic("'}}' after template expression", &self.current.token);
+            return Error.UnexpectedToken;
+        }
 
-        template_token = try self.expect(.template_literal_part);
+        try self.reScanTemplatePart();
+        template_token = try self.next();
 
         // Now, parse the template part that follows
        try self.scratch.append(try self.addNode(
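Note that the parser-side wrappers do more than delegate to the tokenizer: by the time the parser notices the mistake, the '/' or '}' token has already been saved into `self.tokens`, and AST nodes refer to tokens by index (e.g. `.{ .regex_literal = regex_token.id }` above), so the wrapper also overwrites the saved slot in place. A condensed sketch of the regex path, using only names that appear in the diff above (the function name is hypothetical, not a verbatim excerpt):

// Condensed sketch of the regex path through primaryExpression.
fn regexPrimaryExpression(self: *Self) Error!Node.Index {
    // 1. A '/' was scanned where an expression is expected.
    assert(self.current.token.tag == .@"/" or self.current.token.tag == .@"/=");

    // 2. Re-scan from the '/' and patch the already-saved token in place, so any
    //    Token.Index handed out earlier now resolves to the regex literal.
    const regex_token = try self.tokenizer.reScanRegexLiteral(&self.current.token);
    self.tokens.items[@intFromEnum(self.current.id)] = regex_token;
    self.current.token = regex_token;

    // 3. Consume it. advanceToToken now returns the old value of `self.current`,
    //    which is exactly the re-scanned regex token.
    const consumed = try self.next();
    return self.addNode(.{ .regex_literal = consumed.id }, consumed.id, consumed.id);
}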

src/js/tokenize.zig (+43, -39)
@@ -70,12 +70,6 @@ source: []const u8,
 index: u32 = 0,
 /// Current line number (0 indexed).
 line: u32 = 0,
-/// When `true`, the tokenizer will attempt to parse any remaining input
-/// starting with '/' (that isn't a comment starter) as a regex literal (e.g: '/[a-zA-Z0-9]/').
-/// Otherwise, '/' (when not starting a comment) is assumed to be either the '/' or '/=' operator.
-/// This property is used to disambiguate between division operators and regex literals.
-assume_bslash_starts_regex: bool = false,
-
 /// When `true`, the tokenizer assumes that a '}' character is part of a template
 /// literal after an interpolated expression, and not a "}" token.
 /// e.g:
@@ -120,12 +114,38 @@ pub fn next(self: *Self) Error!Token {
     return token;
 }
 
-/// Returns the next JSX token.
-/// A JSX token is one of: '<', '>', '{', '}', or JSX text.
-/// To tokenize JS expressions inside JSX (e.g: prop values), the 'next' function should be used instead.
-/// The caller (parser) must know when to call `nextJsx` or `next` based on surrounding context.
+/// Regex literals can be ambiguous with "/" or "/=" from the tokenizer's perspective,
+/// as "/a/g" can mean either ['/', 'a', '/', 'g'], or ['/a/g' (regex)].
+///
+/// When the parser sees a "/" or "/=" token, but it expects an expression, it should
+/// call this function to re-scan the source code starting from the '/' character.
+///
+/// [div_token]: The token that was previously returned by the tokenizer when it saw the '/' character
+/// (must be a "/" or "/=" token).
+pub fn reScanRegexLiteral(self: *Self, div_token: *const Token) Error!Token {
+    assert(div_token.tag == .@"/" or div_token.tag == .@"/=");
+    self.rewind(div_token.start, div_token.line);
+    return self.regexLiteral();
+}
+
+/// Inside template literals, a "}" should be treated as part of a template string,
+/// instead of a lone '}' token.
+/// So '`foo${bar}baz`' should be tokenized as: '`foo${', 'bar', '}baz`'.
 ///
-/// [is_inside_jsx_tags] is `true` when the tokenizer is inside a '<' and '>' pair.
+/// When the parser receives a '}' token after 'baz', it should call this function
+/// to re-scan the source code starting from the '}' character, and tokenize it as a template part.
+///
+/// [rbrace_token]: The token that was previously returned by the tokenizer when it saw the '}' character.
+pub fn reScanTemplatePart(self: *Self, rbrace_token: *const Token) Error!Token {
+    assert(rbrace_token.tag == .@"}");
+    self.rewind(rbrace_token.start, rbrace_token.line);
+    return self.templateAfterInterpolation();
+}
+
+/// Returns the next token that starts a JSX child.
+/// The token returned is one of: '<', '{', or JSX text.
+/// To tokenize JS expressions inside JSX (e.g: prop values), the 'next' function should be used instead.
+/// The caller (parser) must know when to call `nextJsxChild` or `next` based on surrounding context.
 pub fn nextJsxChild(self: *Self) Error!Token {
     const byte = self.peekByte() orelse {
         return Token{
@@ -248,17 +268,6 @@ fn consumeToken(self: *Self) Error!Token {
         '/' => {
             if (try self.comment()) |tok|
                 return tok;
-
-            // Parsing regex literals is awkward.
-            // A '/abc' can either be the start of a regex literal,
-            // or a '/' (division) token followed by an 'abc' (identifier) token.
-            //
-            // The parser has to tell the tokenizer what it expects
-            // to see next. If it expects to see a literal, then
-            // we want to try tokenizing a regex literal.
-            // Otherwise, we look for '/' or '/='.
-            if (self.assume_bslash_starts_regex)
-                return try self.regexLiteral();
             return try self.punctuator();
         },
         ' ',
@@ -270,9 +279,6 @@ fn consumeToken(self: *Self) Error!Token {
         '\u{000C}',
         => return self.whiteSpaces(),
         '}' => {
-            if (self.assume_rbrace_is_template_part)
-                return try self.templateAfterInterpolation();
-
             self.index += 1;
             return Token{
                 .start = self.index - 1,
@@ -1875,19 +1881,16 @@ test Self {
 
     {
         var tokenizer = try Self.init(" /a\\(bc[some_character_class]/g //foo", .{});
-        tokenizer.assume_bslash_starts_regex = true; // '/' is now interpreted as regex literal start marker.
-        try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.regex_literal, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.comment, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.eof, (try tokenizer.next()).tag);
-    }
 
-    {
-        var tokenizer = try Self.init(" /a\\(bc[some_character_class]/g //foo", .{});
-        tokenizer.assume_bslash_starts_regex = true; // '/' is now interpreted as regex literal start marker.
-        try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
-        try t.expectEqual(Token.Tag.regex_literal, (try tokenizer.next()).tag);
+        // by default '/' is interpreted as a division operator.
+        const div_token = try tokenizer.next();
+        try t.expectEqual(.@"/", div_token.tag);
+
+        // Then it can be re-scanned as a regex literal.
+        const regex_token = try tokenizer.reScanRegexLiteral(&div_token);
+        try t.expectEqual(.regex_literal, regex_token.tag);
+
         try t.expectEqual(Token.Tag.whitespace, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.comment, (try tokenizer.next()).tag);
         try t.expectEqual(Token.Tag.eof, (try tokenizer.next()).tag);
@@ -1897,9 +1900,10 @@ test Self {
         var tokenizer = try Self.init("`hello ${'world'}`", .{});
         try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag);
         try t.expectEqual(.string_literal, (try tokenizer.next()).tag);
-        tokenizer.assume_rbrace_is_template_part = true;
-        try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag);
-        tokenizer.assume_rbrace_is_template_part = false;
+        const rb_token = try tokenizer.next();
+        try t.expectEqual(.@"}", rb_token.tag);
+        const template_part = try tokenizer.reScanTemplatePart(&rb_token);
+        try t.expectEqual(.template_literal_part, template_part.tag);
         try t.expectEqual(.eof, (try tokenizer.next()).tag);
     }
 }
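A template with several interpolations exercises reScanTemplatePart once per closing '}'. As a sketch (not part of the commit, reusing the same test style, where `t` is the testing namespace from the existing tests), '`a${'x'}b${'y'}c`' would be driven like this:

{
    var tokenizer = try Self.init("`a${'x'}b${'y'}c`", .{});
    try t.expectEqual(.template_literal_part, (try tokenizer.next()).tag); // `a${
    try t.expectEqual(.string_literal, (try tokenizer.next()).tag); // 'x'

    // Each '}' first comes out as a plain token...
    const rb1 = try tokenizer.next();
    try t.expectEqual(.@"}", rb1.tag);
    // ...and is then re-scanned into the template part that follows it.
    const part1 = try tokenizer.reScanTemplatePart(&rb1); // }b${
    try t.expectEqual(.template_literal_part, part1.tag);

    try t.expectEqual(.string_literal, (try tokenizer.next()).tag); // 'y'
    const rb2 = try tokenizer.next();
    try t.expectEqual(.@"}", rb2.tag);
    const part2 = try tokenizer.reScanTemplatePart(&rb2); // }c`
    try t.expectEqual(.template_literal_part, part2.tag);

    try t.expectEqual(.eof, (try tokenizer.next()).tag);
}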
