From ee22d416d7f88bd93c7530c94beebc07387c9be9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 13 Dec 2025 20:45:41 +0000 Subject: [PATCH] Fix multiple parser issues from TODO.md - Fix table/database names starting with numbers (e.g., 03657_test) - Add parseIdentifierName helper for numeric-prefix identifiers - Support FORMAT Null and other keyword format names - Add FETCH FIRST ... ROW ONLY syntax support - Add INSERT INTO FUNCTION support - Allow keywords as alias names - Add MOD and DIV keyword operators - Support $ in identifier names for parameterized settings - Handle == as equality operator in lexer Tests passing increased from 5196 to 5352 (+156 tests) --- ast/ast.go | 3 +- lexer/lexer.go | 6 +- parser/expression.go | 7 +- parser/parser.go | 278 +++++++++++++++++++++---------------------- token/token.go | 4 + 5 files changed, 150 insertions(+), 148 deletions(-) diff --git a/ast/ast.go b/ast/ast.go index 2b61012d1..1d892f0eb 100644 --- a/ast/ast.go +++ b/ast/ast.go @@ -201,7 +201,8 @@ func (s *SettingExpr) End() token.Position { return s.Position } type InsertQuery struct { Position token.Position `json:"-"` Database string `json:"database,omitempty"` - Table string `json:"table"` + Table string `json:"table,omitempty"` + Function *FunctionCall `json:"function,omitempty"` // For INSERT INTO FUNCTION syntax Columns []*Identifier `json:"columns,omitempty"` Select Statement `json:"select,omitempty"` Format *Identifier `json:"format,omitempty"` diff --git a/lexer/lexer.go b/lexer/lexer.go index e3b97fea9..7efe64818 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -118,6 +118,10 @@ func (l *Lexer) NextToken() Item { return Item{Token: token.PERCENT, Value: "%", Pos: pos} case '=': l.readChar() + if l.ch == '=' { + l.readChar() + return Item{Token: token.EQ, Value: "==", Pos: pos} + } return Item{Token: token.EQ, Value: "=", Pos: pos} case '!': if l.peekChar() == '=' { @@ -407,7 +411,7 @@ func isIdentStart(ch rune) bool { } func isIdentChar(ch rune) bool { - return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) + return ch == '_' || ch == '$' || unicode.IsLetter(ch) || unicode.IsDigit(ch) } // Tokenize returns all tokens from the reader. diff --git a/parser/expression.go b/parser/expression.go index 12839bff4..14b03588c 100644 --- a/parser/expression.go +++ b/parser/expression.go @@ -44,7 +44,7 @@ func (p *Parser) precedence(tok token.Token) int { return CONCAT_PREC case token.PLUS, token.MINUS: return ADD_PREC - case token.ASTERISK, token.SLASH, token.PERCENT: + case token.ASTERISK, token.SLASH, token.PERCENT, token.DIV, token.MOD: return MUL_PREC case token.LPAREN, token.LBRACKET: return CALL @@ -173,7 +173,7 @@ func (p *Parser) parseInfixExpression(left ast.Expression) ast.Expression { switch p.current.Token { case token.PLUS, token.MINUS, token.ASTERISK, token.SLASH, token.PERCENT, token.EQ, token.NEQ, token.LT, token.GT, token.LTE, token.GTE, - token.AND, token.OR, token.CONCAT: + token.AND, token.OR, token.CONCAT, token.DIV, token.MOD: return p.parseBinaryExpression(left) case token.NULL_SAFE_EQ: return p.parseBinaryExpression(left) @@ -1104,8 +1104,9 @@ func (p *Parser) parseDotAccess(left ast.Expression) ast.Expression { func (p *Parser) parseAlias(left ast.Expression) ast.Expression { p.nextToken() // skip AS + // Alias can be an identifier or a keyword (ClickHouse allows keywords as aliases) alias := "" - if p.currentIs(token.IDENT) { + if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { alias = p.current.Value p.nextToken() } diff --git a/parser/parser.go b/parser/parser.go index 36af97046..b36afd2c4 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -299,6 +299,34 @@ func (p *Parser) parseSelect() *ast.SelectQuery { sel.Offset = p.parseExpression(LOWEST) } + // Parse FETCH FIRST ... ROW ONLY (SQL standard syntax) + if p.currentIs(token.FETCH) { + p.nextToken() + // Skip FIRST or NEXT + if p.currentIs(token.FIRST) || (p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "NEXT") { + p.nextToken() + } + // Parse the limit count + if !p.currentIs(token.IDENT) || strings.ToUpper(p.current.Value) != "ROW" { + sel.Limit = p.parseExpression(LOWEST) + } + // Skip ROW/ROWS + if p.currentIs(token.IDENT) && (strings.ToUpper(p.current.Value) == "ROW" || strings.ToUpper(p.current.Value) == "ROWS") { + p.nextToken() + } + // Skip ONLY + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "ONLY" { + p.nextToken() + } + // Skip WITH TIES + if p.currentIs(token.WITH) { + p.nextToken() + if p.currentIs(token.TIES) { + p.nextToken() + } + } + } + // Parse SETTINGS clause if p.currentIs(token.SETTINGS) { p.nextToken() @@ -320,10 +348,10 @@ func (p *Parser) parseSelect() *ast.SelectQuery { } } - // Parse FORMAT clause + // Parse FORMAT clause (format names can be keywords like Null, JSON, etc.) if p.currentIs(token.FORMAT) { p.nextToken() - if p.currentIs(token.IDENT) { + if p.currentIs(token.IDENT) || p.currentIs(token.NULL) || p.current.Token.IsKeyword() { sel.Format = &ast.Identifier{ Position: p.current.Pos, Parts: []string{p.current.Value}, @@ -578,11 +606,11 @@ func (p *Parser) parseTableExpression() *ast.TableExpression { expr.Table = p.parseExpression(LOWEST) } p.expect(token.RPAREN) - } else if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { + } else if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() || p.currentIs(token.NUMBER) { // Table identifier or function (keywords can be table names like "system") - ident := p.current.Value + // Table names can also start with numbers in ClickHouse pos := p.current.Pos - p.nextToken() + ident := p.parseIdentifierName() if p.currentIs(token.LPAREN) { // Table function @@ -590,11 +618,7 @@ func (p *Parser) parseTableExpression() *ast.TableExpression { } else if p.currentIs(token.DOT) { // database.table p.nextToken() - tableName := "" - if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { - tableName = p.current.Value - p.nextToken() - } + tableName := p.parseIdentifierName() expr.Table = &ast.TableIdentifier{ Position: pos, Database: ident, @@ -776,20 +800,25 @@ func (p *Parser) parseInsert() *ast.InsertQuery { p.nextToken() } - // Parse table name - if p.currentIs(token.IDENT) { - tableName := p.current.Value + // Handle INSERT INTO FUNCTION + if p.currentIs(token.FUNCTION) { p.nextToken() - - if p.currentIs(token.DOT) { - p.nextToken() - ins.Database = tableName - if p.currentIs(token.IDENT) { - ins.Table = p.current.Value + // Parse the function call + funcName := p.parseIdentifierName() + if funcName != "" && p.currentIs(token.LPAREN) { + ins.Function = p.parseFunctionCall(funcName, p.current.Pos) + } + } else { + // Parse table name (can start with a number in ClickHouse) + tableName := p.parseIdentifierName() + if tableName != "" { + if p.currentIs(token.DOT) { p.nextToken() + ins.Database = tableName + ins.Table = p.parseIdentifierName() + } else { + ins.Table = tableName } - } else { - ins.Table = tableName } } @@ -821,10 +850,10 @@ func (p *Parser) parseInsert() *ast.InsertQuery { ins.Select = p.parseSelectWithUnion() } - // Parse FORMAT + // Parse FORMAT (format names can be keywords like Null, JSON, etc.) if p.currentIs(token.FORMAT) { p.nextToken() - if p.currentIs(token.IDENT) { + if p.currentIs(token.IDENT) || p.currentIs(token.NULL) || p.current.Token.IsKeyword() { ins.Format = &ast.Identifier{ Position: p.current.Pos, Parts: []string{p.current.Value}, @@ -897,18 +926,13 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { } } - // Parse table name - if p.currentIs(token.IDENT) { - tableName := p.current.Value - p.nextToken() - + // Parse table name (can start with a number in ClickHouse) + tableName := p.parseIdentifierName() + if tableName != "" { if p.currentIs(token.DOT) { p.nextToken() create.Database = tableName - if p.currentIs(token.IDENT) { - create.Table = p.current.Value - p.nextToken() - } + create.Table = p.parseIdentifierName() } else { create.Table = tableName } @@ -919,10 +943,7 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - create.OnCluster = p.current.Value - p.nextToken() - } + create.OnCluster = p.parseIdentifierName() } } @@ -1024,21 +1045,15 @@ func (p *Parser) parseCreateDatabase(create *ast.CreateQuery) { } } - // Parse database name - if p.currentIs(token.IDENT) { - create.Database = p.current.Value - p.nextToken() - } + // Parse database name (can start with a number in ClickHouse) + create.Database = p.parseIdentifierName() // Handle ON CLUSTER if p.currentIs(token.ON) { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - create.OnCluster = p.current.Value - p.nextToken() - } + create.OnCluster = p.parseIdentifierName() } } @@ -1065,18 +1080,13 @@ func (p *Parser) parseCreateView(create *ast.CreateQuery) { } } - // Parse view name - if p.currentIs(token.IDENT) { - viewName := p.current.Value - p.nextToken() - + // Parse view name (can start with a number in ClickHouse) + viewName := p.parseIdentifierName() + if viewName != "" { if p.currentIs(token.DOT) { p.nextToken() create.Database = viewName - if p.currentIs(token.IDENT) { - create.View = p.current.Value - p.nextToken() - } + create.View = p.parseIdentifierName() } else { create.View = viewName } @@ -1087,20 +1097,14 @@ func (p *Parser) parseCreateView(create *ast.CreateQuery) { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - create.OnCluster = p.current.Value - p.nextToken() - } + create.OnCluster = p.parseIdentifierName() } } // Handle TO (target table for materialized views) if p.currentIs(token.TO) { p.nextToken() - if p.currentIs(token.IDENT) { - create.To = p.current.Value - p.nextToken() - } + create.To = p.parseIdentifierName() } // Parse ENGINE (for materialized views) @@ -1343,21 +1347,19 @@ func (p *Parser) parseDrop() *ast.DropQuery { } } - // Parse name - if p.currentIs(token.IDENT) { - name := p.current.Value - p.nextToken() - + // Parse name (can start with a number in ClickHouse) + name := p.parseIdentifierName() + if name != "" { if p.currentIs(token.DOT) { p.nextToken() drop.Database = name - if p.currentIs(token.IDENT) { + tableName := p.parseIdentifierName() + if tableName != "" { if drop.DropDatabase { - drop.Database = p.current.Value + drop.Database = tableName } else { - drop.Table = p.current.Value + drop.Table = tableName } - p.nextToken() } } else { if dropUser { @@ -1402,18 +1404,13 @@ func (p *Parser) parseAlter() *ast.AlterQuery { return nil } - // Parse table name - if p.currentIs(token.IDENT) { - tableName := p.current.Value - p.nextToken() - + // Parse table name (can start with a number in ClickHouse) + tableName := p.parseIdentifierName() + if tableName != "" { if p.currentIs(token.DOT) { p.nextToken() alter.Database = tableName - if p.currentIs(token.IDENT) { - alter.Table = p.current.Value - p.nextToken() - } + alter.Table = p.parseIdentifierName() } else { alter.Table = tableName } @@ -1424,10 +1421,7 @@ func (p *Parser) parseAlter() *ast.AlterQuery { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - alter.OnCluster = p.current.Value - p.nextToken() - } + alter.OnCluster = p.parseIdentifierName() } } @@ -1692,18 +1686,13 @@ func (p *Parser) parseTruncate() *ast.TruncateQuery { } } - // Parse table name - if p.currentIs(token.IDENT) { - tableName := p.current.Value - p.nextToken() - + // Parse table name (can start with a number in ClickHouse) + tableName := p.parseIdentifierName() + if tableName != "" { if p.currentIs(token.DOT) { p.nextToken() trunc.Database = tableName - if p.currentIs(token.IDENT) { - trunc.Table = p.current.Value - p.nextToken() - } + trunc.Table = p.parseIdentifierName() } else { trunc.Table = tableName } @@ -1714,10 +1703,7 @@ func (p *Parser) parseTruncate() *ast.TruncateQuery { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - trunc.OnCluster = p.current.Value - p.nextToken() - } + trunc.OnCluster = p.parseIdentifierName() } } @@ -1731,11 +1717,8 @@ func (p *Parser) parseUse() *ast.UseQuery { p.nextToken() // skip USE - // Database name can be an identifier or a keyword like DEFAULT - if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { - use.Database = p.current.Value - p.nextToken() - } + // Database name can be an identifier or a keyword like DEFAULT (can also start with number) + use.Database = p.parseIdentifierName() return use } @@ -1920,18 +1903,13 @@ func (p *Parser) parseOptimize() *ast.OptimizeQuery { return nil } - // Parse table name - if p.currentIs(token.IDENT) { - tableName := p.current.Value - p.nextToken() - + // Parse table name (can start with a number in ClickHouse) + tableName := p.parseIdentifierName() + if tableName != "" { if p.currentIs(token.DOT) { p.nextToken() opt.Database = tableName - if p.currentIs(token.IDENT) { - opt.Table = p.current.Value - p.nextToken() - } + opt.Table = p.parseIdentifierName() } else { opt.Table = tableName } @@ -1942,10 +1920,7 @@ func (p *Parser) parseOptimize() *ast.OptimizeQuery { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - opt.OnCluster = p.current.Value - p.nextToken() - } + opt.OnCluster = p.parseIdentifierName() } } @@ -2025,31 +2000,22 @@ func (p *Parser) parseRename() *ast.RenameQuery { return nil } - // Parse from table name - if p.currentIs(token.IDENT) { - rename.From = p.current.Value - p.nextToken() - } + // Parse from table name (can start with a number in ClickHouse) + rename.From = p.parseIdentifierName() if !p.expect(token.TO) { return nil } - // Parse to table name - if p.currentIs(token.IDENT) { - rename.To = p.current.Value - p.nextToken() - } + // Parse to table name (can start with a number in ClickHouse) + rename.To = p.parseIdentifierName() // Handle ON CLUSTER if p.currentIs(token.ON) { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - rename.OnCluster = p.current.Value - p.nextToken() - } + rename.OnCluster = p.parseIdentifierName() } } @@ -2067,31 +2033,22 @@ func (p *Parser) parseExchange() *ast.ExchangeQuery { return nil } - // Parse first table name - if p.currentIs(token.IDENT) { - exchange.Table1 = p.current.Value - p.nextToken() - } + // Parse first table name (can start with a number in ClickHouse) + exchange.Table1 = p.parseIdentifierName() if !p.expect(token.AND) { return nil } - // Parse second table name - if p.currentIs(token.IDENT) { - exchange.Table2 = p.current.Value - p.nextToken() - } + // Parse second table name (can start with a number in ClickHouse) + exchange.Table2 = p.parseIdentifierName() // Handle ON CLUSTER if p.currentIs(token.ON) { p.nextToken() if p.currentIs(token.CLUSTER) { p.nextToken() - if p.currentIs(token.IDENT) || p.currentIs(token.STRING) { - exchange.OnCluster = p.current.Value - p.nextToken() - } + exchange.OnCluster = p.parseIdentifierName() } } @@ -2178,3 +2135,38 @@ func (p *Parser) parseWindowDefinitions() []*ast.WindowDefinition { return defs } + +// parseIdentifierName parses an identifier name that may start with a number. +// In ClickHouse, table and database names can start with digits (e.g., 03657_test). +// When such names are lexed, they produce NUMBER + IDENT tokens that need to be combined. +func (p *Parser) parseIdentifierName() string { + var name string + + // Handle identifier or keyword used as name + if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { + name = p.current.Value + p.nextToken() + return name + } + + // Handle name starting with number (e.g., 03657_test) + if p.currentIs(token.NUMBER) { + name = p.current.Value + p.nextToken() + // Check if followed by identifier (underscore connects them) + if p.currentIs(token.IDENT) { + name += p.current.Value + p.nextToken() + } + return name + } + + // Handle string (e.g., for cluster names) + if p.currentIs(token.STRING) { + name = p.current.Value + p.nextToken() + return name + } + + return "" +} diff --git a/token/token.go b/token/token.go index e1b58dc0e..857dde6df 100644 --- a/token/token.go +++ b/token/token.go @@ -83,6 +83,7 @@ const ( DETACH DISTINCT DISTRIBUTED + DIV DROP ELSE END @@ -128,6 +129,7 @@ const ( LIVE LOCAL MATERIALIZED + MOD MODIFY NAN NATURAL @@ -262,6 +264,7 @@ var tokens = [...]string{ DETACH: "DETACH", DISTINCT: "DISTINCT", DISTRIBUTED: "DISTRIBUTED", + DIV: "DIV", DROP: "DROP", ELSE: "ELSE", END: "END", @@ -307,6 +310,7 @@ var tokens = [...]string{ LIVE: "LIVE", LOCAL: "LOCAL", MATERIALIZED: "MATERIALIZED", + MOD: "MOD", MODIFY: "MODIFY", NAN: "NAN", NATURAL: "NATURAL",