diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000000..68a8afb145 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,276 @@ +# Comprehensive Plan: Fix Remaining Tests + +## Current Status +- **Tests passing:** 6,005 (88.0%) +- **Tests skipped:** 819 (12.0%) + - Parser failures: 173 tests + - Explain mismatches: 331 tests + - Other (metadata skip/explain=false): ~315 tests + +## Phase 1: Parser Fixes (High Impact) + +### 1.1 `view()` Table Function (~50 tests) +**Problem:** The `view(SELECT ...)` table function with inline subquery fails to parse. +```sql +SELECT * FROM view(SELECT 1 as id); +``` +**Files:** `parser/parser.go` (parseTableExpression, parseFunctionCall) +**Solution:** When parsing a function call and the function name is `view`, check if the first argument starts with SELECT/WITH and parse it as a subquery instead of expression list. + +### 1.2 Complex Type Casts with Named Parameters (~30 tests) +**Problem:** `::Tuple(a UInt32, b String)` with named fields fails +```sql +SELECT tuple(42, 42)::Tuple(a UInt32, b UInt32); +``` +**Files:** `parser/expression.go` (parseDataType) +**Solution:** Extend parseDataType to handle named parameters in type constructors like `Tuple(name Type, ...)`. + +### 1.3 DESCRIBE on Table Functions (~20 tests) +**Problem:** `desc format()`, `desc url()`, `desc s3Cluster()` fail +```sql +desc format(CSV, '"value"'); +``` +**Files:** `parser/parser.go` (parseDescribe) +**Solution:** Handle table function after DESC/DESCRIBE by calling parseTableExpression. + +### 1.4 INSERT INTO FUNCTION (~15 tests) +**Problem:** INSERT INTO FUNCTION with file paths and settings fails +```sql +insert into function file(02458_data.jsonl) select * settings engine_file_truncate_on_insert=1; +``` +**Files:** `parser/parser.go` (parseInsert) +**Solution:** Handle TABLE FUNCTION keyword and parse function call with settings. 
+ +### 1.5 CREATE USER / FUNCTION / DICTIONARY (~10 tests) +**Problem:** These CREATE variants are not supported +```sql +CREATE USER test_user GRANTEES ...; +CREATE DICTIONARY d0 (c1 UInt64) PRIMARY KEY c1; +``` +**Files:** `parser/parser.go` (parseCreate) +**Solution:** Add cases for USER, FUNCTION, DICTIONARY in parseCreate switch. + +### 1.6 SHOW SETTINGS (~5 tests) +**Problem:** SHOW SETTINGS LIKE syntax not supported +```sql +show settings like 'send_timeout'; +``` +**Files:** `parser/parser.go` (parseShow) +**Solution:** Handle SETTINGS keyword after SHOW. + +### 1.7 PASTE JOIN (~3 tests) +**Problem:** PASTE JOIN is not recognized +```sql +SELECT * FROM t1 PASTE JOIN t2; +``` +**Files:** `parser/parser.go` (parseTableExpression or join parsing) +**Solution:** Add PASTE as a valid join type. + +### 1.8 `any()` Subquery Syntax (~2 tests) +**Problem:** `== any (SELECT ...)` syntax not supported +```sql +select 1 == any (select number from numbers(10)); +``` +**Files:** `parser/expression.go` +**Solution:** Handle `any(subquery)` as a special expression form after comparison operators. + +--- + +## Phase 2: Explain Layer Fixes (Medium Impact) + +### 2.1 INDEX Clause in CREATE TABLE (~50 tests) +**Problem:** INDEX definitions are skipped but should produce explain output +```sql +CREATE TABLE t (x UInt8, INDEX i x TYPE hypothesis GRANULARITY 100); +``` +**Files:** `parser/parser.go` (parseCreateTable), `internal/explain/statements.go` +**Solution:** +1. Parse INDEX into an ast.IndexDefinition struct +2. Add explain output for index definitions + +### 2.2 SETTINGS Inside Function Arguments (~40 tests) +**Problem:** SETTINGS in table functions should create a Set child +```sql +SELECT * FROM icebergS3(s3_conn, SETTINGS key='value'); +``` +**Files:** `parser/expression.go` (parseFunctionCall), `internal/explain/functions.go` +**Solution:** Capture SETTINGS as a Set node attached to the function call, output in explain. 
+ +### 2.3 WITH FILL Clause (~30 tests) +**Problem:** ORDER BY ... WITH FILL is not captured +```sql +SELECT nan ORDER BY 1 WITH FILL; +``` +**Files:** `parser/parser.go` (parseOrderByItem), `internal/explain/select.go` +**Solution:** Add WithFill field to `ast.OrderByElement`, parse WITH FILL, output in explain. + +### 2.4 Column CODEC Clause (~20 tests) +**Problem:** CODEC(GCD, LZ4) in columns not captured +```sql +CREATE TABLE t (col UInt32 CODEC(GCD, LZ4)); +``` +**Files:** `parser/parser.go` (parseColumnDeclaration), `internal/explain/statements.go` +**Solution:** Parse CODEC clause into ColumnDeclaration, output in explain. + +### 2.5 Column EPHEMERAL Modifier (~15 tests) +**Problem:** EPHEMERAL keyword not captured +```sql +CREATE TABLE t (a Int EPHEMERAL); +``` +**Files:** `parser/parser.go` (parseColumnDeclaration) +**Solution:** Add Ephemeral field to ColumnDeclaration, parse and explain. + +### 2.6 CREATE TABLE ... AS function() (~15 tests) +**Problem:** CREATE TABLE AS s3Cluster(...) should have Function child +```sql +CREATE TABLE test AS s3Cluster('cluster', 'url'); +``` +**Files:** `parser/parser.go` (parseCreateTable), `internal/explain/statements.go` +**Solution:** Parse AS clause when followed by function call, store as TableFunction field. + +### 2.7 WithElement Wrapper for CTEs (~20 tests) +**Problem:** Some CTEs need WithElement wrapper in output +```sql +WITH sub AS (SELECT ...) SELECT ...; +``` +**Files:** `internal/explain/select.go` +**Solution:** Output WithElement wrapper when appropriate for CTE definitions. + +### 2.8 Float Scientific Notation (~15 tests) +**Problem:** Very small/large floats should use scientific notation +```sql +SELECT 2.2250738585072014e-308; +``` +**Files:** `internal/explain/format.go` +**Solution:** Format floats using scientific notation when appropriate. 
+ +### 2.9 Negative Literals in Arrays (~10 tests) +**Problem:** Arrays with negatives may output Function instead of Literal +```sql +SELECT [-10000, 5750]; +``` +**Files:** `internal/explain/expressions.go` +**Solution:** Properly detect and format negative integer literals in arrays. + +### 2.10 Parameterized View Placeholders (~10 tests) +**Problem:** `{name:Type}` parameters in views +```sql +create view v as select number where number%2={parity:Int8}; +``` +**Files:** `internal/explain/expressions.go` +**Solution:** Output Parameter nodes correctly with type info. + +### 2.11 Column TTL (~10 tests) +**Problem:** TTL expression on columns not captured +```sql +CREATE TABLE t (c Int TTL expr()); +``` +**Files:** `parser/parser.go` (parseColumnDeclaration) +**Solution:** Parse TTL clause into ColumnDeclaration. + +--- + +## Phase 3: Lower Priority Fixes + +### 3.1 GROUPING SETS (~5 tests) +```sql +SELECT ... GROUP BY GROUPING SETS ((a), (b)); +``` + +### 3.2 QUALIFY Clause (~5 tests) +```sql +SELECT x QUALIFY row_number() OVER () = 1; +``` + +### 3.3 INTO OUTFILE TRUNCATE (~3 tests) +```sql +SELECT 1 INTO OUTFILE '/dev/null' TRUNCATE FORMAT Npy; +``` + +### 3.4 INTERVAL with Dynamic Type (~3 tests) +```sql +SELECT INTERVAL c0::Dynamic DAY; +``` + +### 3.5 ALTER TABLE with Multiple Operations (~3 tests) +```sql +ALTER TABLE t (DELETE WHERE ...), (UPDATE ... WHERE ...); +``` + +### 3.6 EXPLAIN SYNTAX for SYSTEM commands (~2 tests) +```sql +explain syntax system drop schema cache for hdfs; +``` + +--- + +## Implementation Order (Recommended) + +1. **Week 1: Parser Fundamentals** + - 1.2 Complex Type Casts (unlocks many tests) + - 1.1 view() Table Function (high impact) + - 1.3 DESCRIBE on Table Functions + +2. **Week 2: Parser Completeness** + - 1.4 INSERT INTO FUNCTION + - 1.5 CREATE USER/FUNCTION/DICTIONARY + - 1.6 SHOW SETTINGS + - 1.7 PASTE JOIN + - 1.8 any() Subquery + +3. 
**Week 3: Explain Layer - CREATE TABLE** + - 2.1 INDEX Clause + - 2.4 CODEC Clause + - 2.5 EPHEMERAL Modifier + - 2.6 CREATE TABLE AS function() + - 2.11 Column TTL + +4. **Week 4: Explain Layer - SELECT** + - 2.2 SETTINGS in Functions + - 2.3 WITH FILL + - 2.7 WithElement for CTEs + - 2.10 Parameterized View Placeholders + +5. **Week 5: Explain Layer - Formatting** + - 2.8 Float Scientific Notation + - 2.9 Negative Literals in Arrays + +6. **Week 6: Remaining Items** + - Phase 3 lower priority items + +--- + +## Estimated Impact + +| Phase | Tests Fixed | New Pass Rate | +|-------|-------------|---------------| +| 1.1-1.4 | ~115 | ~90% | +| 1.5-1.8 | ~20 | ~90.5% | +| 2.1-2.6 | ~170 | ~93% | +| 2.7-2.11 | ~65 | ~94% | +| Phase 3 | ~20 | ~94.5% | + +--- + +## Files to Modify + +### Parser Layer +- `parser/parser.go` - Main parser (CREATE, INSERT, DESCRIBE, SHOW, joins) +- `parser/expression.go` - Expression parsing (type casts, functions, special syntax) +- `ast/ast.go` - AST node definitions (IndexDefinition, new fields) + +### Explain Layer +- `internal/explain/statements.go` - CREATE TABLE explain +- `internal/explain/select.go` - SELECT explain (WITH FILL, CTEs) +- `internal/explain/functions.go` - Function explain (SETTINGS) +- `internal/explain/expressions.go` - Expression explain (literals, parameters) +- `internal/explain/format.go` - Output formatting (scientific notation) + +--- + +## Testing Strategy + +1. Run tests frequently: `go test ./parser -timeout 5s` +2. After each fix, verify no regressions: compare PASS count +3. Check specific test cases: `go test ./parser -v -run "TestParser/test_name"` +4. 
Monitor for infinite loops (timeout protection already in place) diff --git a/ast/ast.go b/ast/ast.go index 0f8898baeb..47cfa5da4d 100644 --- a/ast/ast.go +++ b/ast/ast.go @@ -57,6 +57,7 @@ type SelectQuery struct { WithCube bool `json:"with_cube,omitempty"` WithTotals bool `json:"with_totals,omitempty"` Having Expression `json:"having,omitempty"` + Qualify Expression `json:"qualify,omitempty"` Window []*WindowDefinition `json:"window,omitempty"` OrderBy []*OrderByElement `json:"order_by,omitempty"` Limit Expression `json:"limit,omitempty"` @@ -90,6 +91,7 @@ func (w *WindowDefinition) End() token.Position { return w.Position } type IntoOutfileClause struct { Position token.Position `json:"-"` Filename string `json:"filename"` + Truncate bool `json:"truncate,omitempty"` } func (i *IntoOutfileClause) Pos() token.Position { return i.Position } @@ -162,6 +164,7 @@ const ( JoinRight JoinType = "RIGHT" JoinFull JoinType = "FULL" JoinCross JoinType = "CROSS" + JoinPaste JoinType = "PASTE" ) // JoinStrictness represents the join strictness. @@ -208,6 +211,7 @@ type InsertQuery struct { Table string `json:"table,omitempty"` Function *FunctionCall `json:"function,omitempty"` // For INSERT INTO FUNCTION syntax Columns []*Identifier `json:"columns,omitempty"` + PartitionBy Expression `json:"partition_by,omitempty"` // For PARTITION BY clause Select Statement `json:"select,omitempty"` Format *Identifier `json:"format,omitempty"` HasSettings bool `json:"has_settings,omitempty"` // For SETTINGS clause @@ -219,29 +223,36 @@ func (i *InsertQuery) statementNode() {} // CreateQuery represents a CREATE statement. 
type CreateQuery struct { - Position token.Position `json:"-"` - OrReplace bool `json:"or_replace,omitempty"` - IfNotExists bool `json:"if_not_exists,omitempty"` - Temporary bool `json:"temporary,omitempty"` - Database string `json:"database,omitempty"` - Table string `json:"table,omitempty"` - View string `json:"view,omitempty"` - Materialized bool `json:"materialized,omitempty"` - To string `json:"to,omitempty"` // Target table for materialized views - Populate bool `json:"populate,omitempty"` // POPULATE for materialized views - Columns []*ColumnDeclaration `json:"columns,omitempty"` - Constraints []*Constraint `json:"constraints,omitempty"` - Engine *EngineClause `json:"engine,omitempty"` - OrderBy []Expression `json:"order_by,omitempty"` - PartitionBy Expression `json:"partition_by,omitempty"` - PrimaryKey []Expression `json:"primary_key,omitempty"` - SampleBy Expression `json:"sample_by,omitempty"` - TTL *TTLClause `json:"ttl,omitempty"` - Settings []*SettingExpr `json:"settings,omitempty"` - AsSelect Statement `json:"as_select,omitempty"` - Comment string `json:"comment,omitempty"` - OnCluster string `json:"on_cluster,omitempty"` - CreateDatabase bool `json:"create_database,omitempty"` + Position token.Position `json:"-"` + OrReplace bool `json:"or_replace,omitempty"` + IfNotExists bool `json:"if_not_exists,omitempty"` + Temporary bool `json:"temporary,omitempty"` + Database string `json:"database,omitempty"` + Table string `json:"table,omitempty"` + View string `json:"view,omitempty"` + Materialized bool `json:"materialized,omitempty"` + To string `json:"to,omitempty"` // Target table for materialized views + Populate bool `json:"populate,omitempty"` // POPULATE for materialized views + Columns []*ColumnDeclaration `json:"columns,omitempty"` + Indexes []*IndexDefinition `json:"indexes,omitempty"` + Constraints []*Constraint `json:"constraints,omitempty"` + Engine *EngineClause `json:"engine,omitempty"` + OrderBy []Expression `json:"order_by,omitempty"` + 
PartitionBy Expression `json:"partition_by,omitempty"` + PrimaryKey []Expression `json:"primary_key,omitempty"` + SampleBy Expression `json:"sample_by,omitempty"` + TTL *TTLClause `json:"ttl,omitempty"` + Settings []*SettingExpr `json:"settings,omitempty"` + AsSelect Statement `json:"as_select,omitempty"` + Comment string `json:"comment,omitempty"` + OnCluster string `json:"on_cluster,omitempty"` + CreateDatabase bool `json:"create_database,omitempty"` + CreateFunction bool `json:"create_function,omitempty"` + CreateUser bool `json:"create_user,omitempty"` + CreateDictionary bool `json:"create_dictionary,omitempty"` + FunctionName string `json:"function_name,omitempty"` + FunctionBody Expression `json:"function_body,omitempty"` + UserName string `json:"user_name,omitempty"` } func (c *CreateQuery) Pos() token.Position { return c.Position } @@ -258,6 +269,7 @@ type ColumnDeclaration struct { DefaultKind string `json:"default_kind,omitempty"` // DEFAULT, MATERIALIZED, ALIAS, EPHEMERAL Codec *CodecExpr `json:"codec,omitempty"` TTL Expression `json:"ttl,omitempty"` + PrimaryKey bool `json:"primary_key,omitempty"` // PRIMARY KEY constraint Comment string `json:"comment,omitempty"` } @@ -289,13 +301,26 @@ func (n *NameTypePair) expressionNode() {} // CodecExpr represents a CODEC expression. type CodecExpr struct { - Position token.Position `json:"-"` - Codecs []*FunctionCall `json:"codecs"` + Position token.Position `json:"-"` + Codecs []*FunctionCall `json:"codecs"` } func (c *CodecExpr) Pos() token.Position { return c.Position } func (c *CodecExpr) End() token.Position { return c.Position } +// IndexDefinition represents an INDEX definition in CREATE TABLE. 
type IndexDefinition struct {
	// Position is where the definition starts; it is excluded from JSON.
	Position token.Position `json:"-"`
	// Name is the index name (the `i` in `INDEX i x TYPE hypothesis GRANULARITY 100`).
	Name string `json:"name"`
	// Expression is the indexed expression (the `x` in the example above).
	Expression Expression `json:"expression"`
	// Type is the index type, held as a function call so that it can carry
	// arguments (e.g. minmax, bloom_filter).
	Type *FunctionCall `json:"type"`
	// Granularity is the value given after GRANULARITY; omitted from JSON when nil.
	Granularity Expression `json:"granularity,omitempty"`
}

// Pos returns the position at which the index definition starts.
func (i *IndexDefinition) Pos() token.Position { return i.Position }

// End returns the same position as Pos; no separate end position is recorded.
func (i *IndexDefinition) End() token.Position { return i.Position }

// expressionNode is an empty marker method.
func (i *IndexDefinition) expressionNode() {}
// RenamePair represents a single rename pair in RENAME TABLE: one source
// table and the name it is renamed to.
type RenamePair struct {
	// FromDatabase is the database qualifier of the source table; it is
	// omitted from JSON when empty.
	FromDatabase string `json:"from_database,omitempty"`
	// FromTable is the source table name.
	FromTable string `json:"from_table"`
	// ToDatabase is the database qualifier of the target table; it is
	// omitted from JSON when empty.
	ToDatabase string `json:"to_database,omitempty"`
	// ToTable is the target table name.
	ToTable string `json:"to_table"`
}
// Index writes the explain-tree entry for a single index definition to sb.
//
// The entry is an "Index" header line carrying the number of children,
// followed by the indexed expression and then the index type, each emitted
// only when it is non-nil. idx.Name and idx.Granularity are not printed.
// depth determines the indentation of the header line; children are written
// one level deeper.
func Index(sb *strings.Builder, idx *ast.IndexDefinition, depth int) {
	indent := strings.Repeat(" ", depth)
	// Count only the children that will actually be printed below.
	children := 0
	if idx.Expression != nil {
		children++
	}
	if idx.Type != nil {
		children++
	}
	fmt.Fprintf(sb, "%sIndex (children %d)\n", indent, children)
	if idx.Expression != nil {
		// Expression is typically an identifier; that case is printed
		// directly here instead of being dispatched through Node.
		if ident, ok := idx.Expression.(*ast.Identifier); ok {
			fmt.Fprintf(sb, "%s Identifier %s\n", indent, ident.Name())
		} else {
			Node(sb, idx.Expression, depth+1)
		}
	}
	if idx.Type != nil {
		// Type is a function like minmax, bloom_filter, etc.
		explainFunctionCall(sb, idx.Type, indent+" ", depth+1)
	}
}
fmt.Fprintf(sb, "%s ExpressionList (children %d)\n", indent, len(n.Parameters)) diff --git a/internal/explain/select.go b/internal/explain/select.go index b26cfedfa1..697b567516 100644 --- a/internal/explain/select.go +++ b/internal/explain/select.go @@ -15,6 +15,13 @@ func explainSelectWithUnionQuery(sb *strings.Builder, n *ast.SelectWithUnionQuer for _, sel := range n.Selects { Node(sb, sel, depth+2) } + // INTO OUTFILE clause - check if any SelectQuery has IntoOutfile set + for _, sel := range n.Selects { + if sq, ok := sel.(*ast.SelectQuery); ok && sq.IntoOutfile != nil { + fmt.Fprintf(sb, "%s Literal \\'%s\\'\n", indent, sq.IntoOutfile.Filename) + break + } + } // FORMAT clause - check if any SelectQuery has Format set var hasFormat bool for _, sel := range n.Selects { @@ -95,12 +102,49 @@ func explainSelectQuery(sb *strings.Builder, n *ast.SelectQuery, indent string, } func explainOrderByElement(sb *strings.Builder, n *ast.OrderByElement, indent string, depth int) { - fmt.Fprintf(sb, "%sOrderByElement (children %d)\n", indent, 1) + children := 1 // expression + if n.WithFill { + children++ // FillModifier + } + fmt.Fprintf(sb, "%sOrderByElement (children %d)\n", indent, children) Node(sb, n.Expression, depth+1) + if n.WithFill { + fillChildren := 0 + if n.FillFrom != nil { + fillChildren++ + } + if n.FillTo != nil { + fillChildren++ + } + if n.FillStep != nil { + fillChildren++ + } + if fillChildren > 0 { + fmt.Fprintf(sb, "%s FillModifier (children %d)\n", indent, fillChildren) + if n.FillFrom != nil { + Node(sb, n.FillFrom, depth+2) + } + if n.FillTo != nil { + Node(sb, n.FillTo, depth+2) + } + if n.FillStep != nil { + Node(sb, n.FillStep, depth+2) + } + } else { + fmt.Fprintf(sb, "%s FillModifier\n", indent) + } + } } func countSelectUnionChildren(n *ast.SelectWithUnionQuery) int { count := 1 // ExpressionList of selects + // Check if any SelectQuery has IntoOutfile set + for _, sel := range n.Selects { + if sq, ok := sel.(*ast.SelectQuery); ok && 
sq.IntoOutfile != nil { + count++ + break + } + } // Check if any SelectQuery has Format set var hasFormat bool for _, sel := range n.Selects { diff --git a/internal/explain/statements.go b/internal/explain/statements.go index 74e60bf74e..3a4cfa4f3c 100644 --- a/internal/explain/statements.go +++ b/internal/explain/statements.go @@ -44,6 +44,25 @@ func explainInsertQuery(sb *strings.Builder, n *ast.InsertQuery, indent string, } func explainCreateQuery(sb *strings.Builder, n *ast.CreateQuery, indent string, depth int) { + // Handle special CREATE types + if n.CreateFunction { + children := 1 // lambda + fmt.Fprintf(sb, "%sCreateFunctionQuery %s (children %d)\n", indent, n.FunctionName, children) + if n.FunctionBody != nil { + Node(sb, n.FunctionBody, depth+1) + } + return + } + if n.CreateUser { + fmt.Fprintf(sb, "%sCreateUserQuery %s\n", indent, n.UserName) + return + } + if n.CreateDictionary { + fmt.Fprintf(sb, "%sCreateDictionaryQuery %s (children 1)\n", indent, n.Table) + fmt.Fprintf(sb, "%s Identifier %s\n", indent, n.Table) + return + } + name := n.Table if n.View != "" { name = n.View @@ -69,11 +88,26 @@ func explainCreateQuery(sb *strings.Builder, n *ast.CreateQuery, indent string, fmt.Fprintf(sb, "%sCreateQuery %s (children %d)\n", indent, name, children) } fmt.Fprintf(sb, "%s Identifier %s\n", indent, name) - if len(n.Columns) > 0 { - fmt.Fprintf(sb, "%s Columns definition (children %d)\n", indent, 1) - fmt.Fprintf(sb, "%s ExpressionList (children %d)\n", indent, len(n.Columns)) - for _, col := range n.Columns { - Column(sb, col, depth+3) + if len(n.Columns) > 0 || len(n.Indexes) > 0 { + childrenCount := 0 + if len(n.Columns) > 0 { + childrenCount++ + } + if len(n.Indexes) > 0 { + childrenCount++ + } + fmt.Fprintf(sb, "%s Columns definition (children %d)\n", indent, childrenCount) + if len(n.Columns) > 0 { + fmt.Fprintf(sb, "%s ExpressionList (children %d)\n", indent, len(n.Columns)) + for _, col := range n.Columns { + Column(sb, col, depth+3) + } + } + 
if len(n.Indexes) > 0 { + fmt.Fprintf(sb, "%s ExpressionList (children %d)\n", indent, len(n.Indexes)) + for _, idx := range n.Indexes { + Index(sb, idx, depth+3) + } } } if n.Engine != nil || len(n.OrderBy) > 0 || len(n.PrimaryKey) > 0 || n.PartitionBy != nil || len(n.Settings) > 0 { @@ -237,11 +271,24 @@ func explainUseQuery(sb *strings.Builder, n *ast.UseQuery, indent string) { } func explainDescribeQuery(sb *strings.Builder, n *ast.DescribeQuery, indent string) { - name := n.Table - if n.Database != "" { - name = n.Database + "." + n.Table + if n.TableFunction != nil { + // DESCRIBE on a table function + children := 1 + if len(n.Settings) > 0 { + children++ + } + fmt.Fprintf(sb, "%sDescribeQuery (children %d)\n", indent, children) + explainFunctionCall(sb, n.TableFunction, indent+" ", 1) + if len(n.Settings) > 0 { + fmt.Fprintf(sb, "%s Set\n", indent) + } + } else { + name := n.Table + if n.Database != "" { + name = n.Database + "." + n.Table + } + fmt.Fprintf(sb, "%sDescribe %s\n", indent, name) } - fmt.Fprintf(sb, "%sDescribe %s\n", indent, name) } func explainDataType(sb *strings.Builder, n *ast.DataType, indent string, depth int) { diff --git a/internal/explain/tables.go b/internal/explain/tables.go index b9b8d22897..30cde1b5fb 100644 --- a/internal/explain/tables.go +++ b/internal/explain/tables.go @@ -84,7 +84,11 @@ func explainTableJoin(sb *strings.Builder, n *ast.TableJoin, indent string, dept if len(n.Using) > 0 { children++ } - fmt.Fprintf(sb, "%sTableJoin (children %d)\n", indent, children) + if children > 0 { + fmt.Fprintf(sb, "%sTableJoin (children %d)\n", indent, children) + } else { + fmt.Fprintf(sb, "%sTableJoin\n", indent) + } if n.On != nil { Node(sb, n.On, depth+1) } diff --git a/lexer/lexer.go b/lexer/lexer.go index ca33357139..34bf79139a 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -205,15 +205,47 @@ func (l *Lexer) NextToken() Item { case '?': l.readChar() return Item{Token: token.QUESTION, Value: "?", Pos: pos} + case '^': + 
l.readChar() + return Item{Token: token.CARET, Value: "^", Pos: pos} case '\'': return l.readString('\'') + case '\u2018', '\u2019': // Unicode curly single quotes ' ' + return l.readUnicodeString(l.ch) case '"': return l.readQuotedIdentifier() + case '\u201C', '\u201D': // Unicode curly double quotes " " + return l.readUnicodeQuotedIdentifier(l.ch) + case '\u2212': // Unicode minus sign − + l.readChar() + return Item{Token: token.MINUS, Value: "−", Pos: pos} case '`': return l.readBacktickIdentifier() + case '@': + // Handle @@ system variables and @ for user@host syntax + if l.peekChar() == '@' { + l.readChar() // skip first @ + l.readChar() // skip second @ + // Read the variable name + if isIdentStart(l.ch) || unicode.IsDigit(l.ch) { + var sb strings.Builder + sb.WriteString("@@") + for isIdentChar(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + } + return Item{Token: token.IDENT, Value: sb.String(), Pos: pos} + } + return Item{Token: token.IDENT, Value: "@@", Pos: pos} + } + // Single @ - used in user@host syntax, return as IDENT + l.readChar() + return Item{Token: token.IDENT, Value: "@", Pos: pos} default: if unicode.IsDigit(l.ch) { - return l.readNumber() + // Check if this is a number or an identifier starting with digits + // In ClickHouse, identifiers like "02422_data" start with digits + return l.readNumberOrIdent() } if isIdentStart(l.ch) { return l.readIdentifier() @@ -357,6 +389,50 @@ func (l *Lexer) readQuotedIdentifier() Item { return Item{Token: token.IDENT, Value: sb.String(), Pos: pos} } +// readUnicodeString reads a string enclosed in Unicode curly quotes (' or ') +func (l *Lexer) readUnicodeString(openQuote rune) Item { + pos := l.pos + var sb strings.Builder + l.readChar() // skip opening quote + + // Unicode curly quotes: ' (U+2018) opens, ' (U+2019) closes + closeQuote := '\u2019' // ' + if openQuote == '\u2019' { + closeQuote = '\u2019' + } + + for !l.eof && l.ch != closeQuote { + sb.WriteRune(l.ch) + l.readChar() + } + if l.ch == closeQuote 
{ + l.readChar() // skip closing quote + } + return Item{Token: token.STRING, Value: sb.String(), Pos: pos} +} + +// readUnicodeQuotedIdentifier reads an identifier enclosed in Unicode curly double quotes (" or ") +func (l *Lexer) readUnicodeQuotedIdentifier(openQuote rune) Item { + pos := l.pos + var sb strings.Builder + l.readChar() // skip opening quote + + // Unicode curly double quotes: " (U+201C) opens, " (U+201D) closes + closeQuote := '\u201D' // " + if openQuote == '\u201D' { + closeQuote = '\u201D' + } + + for !l.eof && l.ch != closeQuote { + sb.WriteRune(l.ch) + l.readChar() + } + if l.ch == closeQuote { + l.readChar() // skip closing quote + } + return Item{Token: token.IDENT, Value: sb.String(), Pos: pos} +} + func (l *Lexer) readBacktickIdentifier() Item { pos := l.pos var sb strings.Builder @@ -462,6 +538,119 @@ func (l *Lexer) readNumber() Item { return Item{Token: token.NUMBER, Value: sb.String(), Pos: pos} } +// readNumberOrIdent handles tokens that start with digits. +// In ClickHouse, identifiers can start with digits if followed by underscore and letters +// e.g., "02422_data" is a valid identifier +func (l *Lexer) readNumberOrIdent() Item { + pos := l.pos + var sb strings.Builder + + // Peek ahead to see if this will become an identifier + // We need to look for pattern: digits followed by underscore followed by letter + // Save position for potential rollback + startCh := l.ch + + // Read initial digits + for unicode.IsDigit(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + } + + // Check if followed by underscore and then letter (identifier pattern) + if l.ch == '_' { + // Peek to see what follows the underscore + nextCh := l.peekChar() + if unicode.IsLetter(nextCh) || nextCh == '_' { + // This is an identifier that starts with digits + sb.WriteRune(l.ch) + l.readChar() + // Continue reading as identifier + for isIdentChar(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + } + return Item{Token: token.IDENT, Value: sb.String(), Pos: pos} + } + } + + 
// Not an identifier, continue as number + // But we already consumed the digits, so continue from here + // Handle underscore separators in numbers (only if followed by a digit) + for l.ch == '_' && unicode.IsDigit(l.peekChar()) { + l.readChar() // skip underscore + for unicode.IsDigit(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + } + } + + // Check for decimal point + if l.ch == '.' && unicode.IsDigit(l.peekChar()) { + sb.WriteRune(l.ch) + l.readChar() + for unicode.IsDigit(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + for l.ch == '_' && unicode.IsDigit(l.peekChar()) { + l.readChar() + } + } + } + + // Check for exponent + if l.ch == 'e' || l.ch == 'E' { + sb.WriteRune(l.ch) + l.readChar() + if l.ch == '+' || l.ch == '-' { + sb.WriteRune(l.ch) + l.readChar() + } + for unicode.IsDigit(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + for l.ch == '_' && unicode.IsDigit(l.peekChar()) { + l.readChar() + } + } + } + + // Special case: if the token was just "0" and current char is 'x', 'b', or 'o', + // this might be a hex/binary/octal number that we need to handle specially + val := sb.String() + if val == "0" && (l.ch == 'x' || l.ch == 'X') { + sb.WriteRune(l.ch) + l.readChar() + for isHexDigit(l.ch) { + sb.WriteRune(l.ch) + l.readChar() + } + } else if val == "0" && (l.ch == 'b' || l.ch == 'B') && (l.peekChar() == '0' || l.peekChar() == '1') { + sb.WriteRune(l.ch) + l.readChar() + for l.ch == '0' || l.ch == '1' { + sb.WriteRune(l.ch) + l.readChar() + } + } + + // Handle special case where number starts with 0 but we're inside readNumberOrIdent + // and the number already consumed is just the leading zero (checking for 0x, 0b, 0o) + if startCh == '0' && len(sb.String()) == 1 { + // Already handled above for 0x, 0b + // Handle 0o for octal + if l.ch == 'o' || l.ch == 'O' { + sb.WriteRune(l.ch) + l.readChar() + for l.ch >= '0' && l.ch <= '7' { + sb.WriteRune(l.ch) + l.readChar() + } + } + } + + return Item{Token: token.NUMBER, Value: sb.String(), Pos: pos} +} + func 
isHexDigit(ch rune) bool { return unicode.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') } diff --git a/parser/expression.go b/parser/expression.go index 599b0ba3cd..055feb9875 100644 --- a/parser/expression.go +++ b/parser/expression.go @@ -36,7 +36,7 @@ func (p *Parser) precedence(tok token.Token) int { case token.NOT: return NOT_PREC case token.EQ, token.NEQ, token.LT, token.GT, token.LTE, token.GTE, - token.LIKE, token.ILIKE, token.IN, token.BETWEEN, token.IS, + token.LIKE, token.ILIKE, token.REGEXP, token.IN, token.BETWEEN, token.IS, token.NULL_SAFE_EQ, token.GLOBAL: return COMPARE case token.QUESTION: @@ -56,7 +56,7 @@ func (p *Parser) precedence(tok token.Token) int { case token.DOT: return HIGHEST // Dot access case token.ARROW: - return ALIAS_PREC // Lambda arrow (low precedence) + return OR_PREC // Lambda arrow (just above ALIAS_PREC to allow parsing before AS) case token.NUMBER: // Handle .1 as tuple access (number starting with dot) return LOWEST @@ -101,6 +101,38 @@ func (p *Parser) parseExpressionList() []ast.Expression { return exprs } +// parseGroupingSets parses GROUPING SETS ((a), (b), (a, b)) +func (p *Parser) parseGroupingSets() []ast.Expression { + var exprs []ast.Expression + + if !p.expect(token.LPAREN) { + return exprs + } + + for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { + // Each element in GROUPING SETS is a tuple or a single expression + if p.currentIs(token.LPAREN) { + // Parse as tuple + tuple := p.parseGroupedOrTuple() + exprs = append(exprs, tuple) + } else { + // Single expression + expr := p.parseExpression(LOWEST) + if expr != nil { + exprs = append(exprs, expr) + } + } + + // Skip comma if present + if p.currentIs(token.COMMA) { + p.nextToken() + } + } + + p.expect(token.RPAREN) + return exprs +} + // parseFunctionArgumentList parses arguments for function calls, stopping at SETTINGS func (p *Parser) parseFunctionArgumentList() []ast.Expression { var exprs []ast.Expression @@ -133,7 +165,13 @@ 
func (p *Parser) parseFunctionArgumentList() []ast.Expression { func (p *Parser) parseImplicitAlias(expr ast.Expression) ast.Expression { // If next token is a plain identifier (not a keyword), treat as implicit alias // Keywords like FROM, WHERE etc. are tokenized as their own token types, not IDENT + // INTERSECT is not a keyword but should not be treated as an alias if p.currentIs(token.IDENT) { + upper := strings.ToUpper(p.current.Value) + // Don't consume SQL set operation keywords that aren't tokens + if upper == "INTERSECT" { + return expr + } alias := p.current.Value p.nextToken() @@ -191,6 +229,8 @@ func (p *Parser) parsePrefixExpression() ast.Expression { return p.parseSpecialNumber() case token.MINUS: return p.parseUnaryMinus() + case token.PLUS: + return p.parseUnaryPlus() case token.NOT: return p.parseNot() case token.LPAREN: @@ -208,7 +248,7 @@ func (p *Parser) parsePrefixExpression() ast.Expression { case token.INTERVAL: // INTERVAL can be a literal (INTERVAL 1 DAY) or identifier reference // Check if next token can start an interval value - if p.peekIs(token.NUMBER) || p.peekIs(token.LPAREN) || p.peekIs(token.MINUS) || p.peekIs(token.STRING) { + if p.peekIs(token.NUMBER) || p.peekIs(token.LPAREN) || p.peekIs(token.MINUS) || p.peekIs(token.STRING) || p.peekIs(token.IDENT) { return p.parseInterval() } // Otherwise treat as identifier @@ -263,8 +303,10 @@ func (p *Parser) parseInfixExpression(left ast.Expression) ast.Expression { return p.parseTernary(left) case token.LIKE, token.ILIKE: return p.parseLikeExpression(left, false) + case token.REGEXP: + return p.parseRegexpExpression(left, false) case token.NOT: - // NOT IN, NOT LIKE, NOT BETWEEN, IS NOT + // NOT IN, NOT LIKE, NOT BETWEEN, NOT REGEXP, IS NOT p.nextToken() switch p.current.Token { case token.IN: @@ -273,6 +315,8 @@ func (p *Parser) parseInfixExpression(left ast.Expression) ast.Expression { return p.parseLikeExpression(left, true) case token.ILIKE: return p.parseLikeExpression(left, true) + 
case token.REGEXP: + return p.parseRegexpExpression(left, true) case token.BETWEEN: return p.parseBetweenExpression(left, true) default: @@ -358,7 +402,8 @@ func (p *Parser) parseIdentifierOrFunction() ast.Expression { parts := []string{name} for p.currentIs(token.DOT) { p.nextToken() - if p.currentIs(token.IDENT) { + if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { + // Keywords can be used as column/field names (e.g., l_t.key, t.index) parts = append(parts, p.current.Value) p.nextToken() } else if p.currentIs(token.ASTERISK) { @@ -398,31 +443,35 @@ func (p *Parser) parseFunctionCall(name string, pos token.Position) *ast.Functio p.nextToken() } - // Parse arguments - if !p.currentIs(token.RPAREN) && !p.currentIs(token.SETTINGS) { + // Handle view() and similar functions that take a subquery as argument + // view(SELECT ...) should parse SELECT as a subquery, not expression + if strings.ToLower(name) == "view" && (p.currentIs(token.SELECT) || p.currentIs(token.WITH)) { + subquery := p.parseSelectWithUnion() + fn.Arguments = []ast.Expression{&ast.Subquery{Position: pos, Query: subquery}} + } else if !p.currentIs(token.RPAREN) && !p.currentIs(token.SETTINGS) { + // Parse arguments fn.Arguments = p.parseFunctionArgumentList() } // Handle SETTINGS inside function call (table functions) if p.currentIs(token.SETTINGS) { p.nextToken() - // Parse settings as key=value pairs until ) - for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { - // Just skip the settings for now - p.nextToken() - } + fn.Settings = p.parseSettingsList() } p.expect(token.RPAREN) // Handle IGNORE NULLS / RESPECT NULLS (window function modifiers) - if p.currentIs(token.IDENT) { + // Can appear multiple times (e.g., RESPECT NULLS IGNORE NULLS) + for p.currentIs(token.IDENT) { upper := strings.ToUpper(p.current.Value) if upper == "IGNORE" || upper == "RESPECT" { p.nextToken() if p.currentIs(token.NULLS) { p.nextToken() } + } else { + break } } @@ -649,6 +698,16 @@ func (p *Parser) 
parseUnaryMinus() ast.Expression { return expr } +func (p *Parser) parseUnaryPlus() ast.Expression { + expr := &ast.UnaryExpr{ + Position: p.current.Pos, + Op: "+", + } + p.nextToken() + expr.Operand = p.parseExpression(UNARY) + return expr +} + func (p *Parser) parseNot() ast.Expression { expr := &ast.UnaryExpr{ Position: p.current.Pos, @@ -673,7 +732,7 @@ func (p *Parser) parseGroupedOrTuple() ast.Expression { } } - // Check for subquery + // Check for subquery (SELECT, WITH, or EXPLAIN) if p.currentIs(token.SELECT) || p.currentIs(token.WITH) { subquery := p.parseSelectWithUnion() p.expect(token.RPAREN) @@ -682,6 +741,15 @@ func (p *Parser) parseGroupedOrTuple() ast.Expression { Query: subquery, } } + // EXPLAIN as subquery + if p.currentIs(token.EXPLAIN) { + explain := p.parseExplain() + p.expect(token.RPAREN) + return &ast.Subquery{ + Position: pos, + Query: explain, + } + } // Parse first expression first := p.parseExpression(LOWEST) @@ -791,19 +859,22 @@ func (p *Parser) parseCast() ast.Expression { // Use ALIAS_PREC to avoid consuming AS as an alias operator expr.Expr = p.parseExpression(ALIAS_PREC) - // Handle both CAST(x AS Type) and CAST(x, 'Type') syntax + // Handle both CAST(x AS Type) and CAST(x, 'Type') or CAST(x, expr) syntax if p.currentIs(token.AS) { p.nextToken() expr.Type = p.parseDataType() } else if p.currentIs(token.COMMA) { p.nextToken() - // Type is given as a string literal + // Type can be given as a string literal or an expression (e.g., if(cond, 'Type1', 'Type2')) if p.currentIs(token.STRING) { expr.Type = &ast.DataType{ Position: p.current.Pos, Name: p.current.Value, } p.nextToken() + } else { + // Parse as expression for dynamic type casting + expr.TypeExpr = p.parseExpression(LOWEST) } } @@ -820,49 +891,56 @@ func (p *Parser) parseExtract() ast.Expression { return nil } - // Check if it's EXTRACT(field FROM expr) or extract(str, pattern) form - if p.currentIs(token.IDENT) { + // Check if it's EXTRACT(field FROM expr) form + // The 
field must be a known date/time field identifier followed by FROM + if p.currentIs(token.IDENT) && !p.peekIs(token.LPAREN) { field := strings.ToUpper(p.current.Value) - p.nextToken() - - // Check for FROM keyword - if present, it's the EXTRACT(field FROM expr) form - if p.currentIs(token.FROM) { - p.nextToken() - from := p.parseExpression(LOWEST) - p.expect(token.RPAREN) - return &ast.ExtractExpr{ - Position: pos, - Field: field, - From: from, - } - } - - // Not FROM, so backtrack and parse as regular function call - // This is the extract(str, pattern) regex form - // We need to re-parse as a function call - args := []ast.Expression{ - &ast.Identifier{Position: pos, Parts: []string{strings.ToLower(field)}}, + // Check if it's a known date/time field + dateTimeFields := map[string]bool{ + "YEAR": true, "QUARTER": true, "MONTH": true, "WEEK": true, + "DAY": true, "DAYOFWEEK": true, "DAYOFYEAR": true, + "HOUR": true, "MINUTE": true, "SECOND": true, + "TIMEZONE_HOUR": true, "TIMEZONE_MINUTE": true, } - if p.currentIs(token.COMMA) { + if dateTimeFields[field] { p.nextToken() - for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { - args = append(args, p.parseExpression(LOWEST)) - if p.currentIs(token.COMMA) { - p.nextToken() - } else { - break + // Check for FROM keyword - if present, it's the EXTRACT(field FROM expr) form + if p.currentIs(token.FROM) { + p.nextToken() + from := p.parseExpression(LOWEST) + p.expect(token.RPAREN) + return &ast.ExtractExpr{ + Position: pos, + Field: field, + From: from, } } - } - p.expect(token.RPAREN) - return &ast.FunctionCall{ - Position: pos, - Name: "extract", - Arguments: args, + // Not FROM, so create args starting with the field as identifier + args := []ast.Expression{ + &ast.Identifier{Position: pos, Parts: []string{strings.ToLower(field)}}, + } + if p.currentIs(token.COMMA) { + p.nextToken() + for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { + args = append(args, p.parseExpression(LOWEST)) + if 
p.currentIs(token.COMMA) { + p.nextToken() + } else { + break + } + } + } + p.expect(token.RPAREN) + return &ast.FunctionCall{ + Position: pos, + Name: "extract", + Arguments: args, + } } } - // If first token is a string, it's the regex form extract(str, pattern) + // Parse as regular function call - extract(str, pattern) regex form + // or extract(expr, pattern) where expr can be any expression var args []ast.Expression for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { args = append(args, p.parseExpression(LOWEST)) @@ -887,9 +965,10 @@ func (p *Parser) parseInterval() ast.Expression { } p.nextToken() // skip INTERVAL - expr.Value = p.parseExpression(LOWEST) + // Use ALIAS_PREC to prevent consuming the unit as an alias + expr.Value = p.parseExpression(ALIAS_PREC) - // Parse unit + // Parse unit (interval units are identifiers like DAY, MONTH, etc.) if p.currentIs(token.IDENT) { expr.Unit = strings.ToUpper(p.current.Value) p.nextToken() @@ -1053,6 +1132,42 @@ func (p *Parser) parseBinaryExpression(left ast.Expression) ast.Expression { prec := p.precedence(p.current.Token) p.nextToken() + // Check for ANY/ALL subquery comparison modifier: expr >= ANY(subquery) + if p.currentIs(token.ANY) || p.currentIs(token.ALL) { + modifier := strings.ToUpper(p.current.Value) + p.nextToken() + if p.currentIs(token.LPAREN) { + p.nextToken() + // Parse the subquery + if p.currentIs(token.SELECT) || p.currentIs(token.WITH) { + subquery := p.parseSelectWithUnion() + p.expect(token.RPAREN) + // Wrap the comparison in a function call representing ANY/ALL + return &ast.FunctionCall{ + Position: expr.Position, + Name: strings.ToLower(modifier) + "Match", + Arguments: []ast.Expression{ + left, + &ast.Subquery{Position: expr.Position, Query: subquery}, + }, + } + } + // Not a subquery, parse as expression list + args := p.parseExpressionList() + p.expect(token.RPAREN) + return &ast.BinaryExpr{ + Position: expr.Position, + Left: left, + Op: expr.Op, + Right: &ast.FunctionCall{ + 
Position: expr.Position, + Name: strings.ToLower(modifier), + Arguments: args, + }, + } + } + } + expr.Right = p.parseExpression(prec) return expr } @@ -1074,6 +1189,30 @@ func (p *Parser) parseLikeExpression(left ast.Expression, not bool) ast.Expressi return expr } +func (p *Parser) parseRegexpExpression(left ast.Expression, not bool) ast.Expression { + pos := p.current.Pos + p.nextToken() // skip REGEXP + + pattern := p.parseExpression(COMPARE) + + // REGEXP translates to match(expr, pattern) function + fnCall := &ast.FunctionCall{ + Position: pos, + Name: "match", + Arguments: []ast.Expression{left, pattern}, + } + + if not { + // NOT REGEXP uses NOT match(...) + return &ast.UnaryExpr{ + Position: pos, + Op: "NOT", + Operand: fnCall, + } + } + return fnCall +} + func (p *Parser) parseInExpression(left ast.Expression, not bool) ast.Expression { expr := &ast.InExpr{ Position: p.current.Pos, @@ -1177,6 +1316,26 @@ func (p *Parser) parseIsExpression(left ast.Expression) ast.Expression { } } + // IS [NOT] DISTINCT FROM expr + if p.currentIs(token.DISTINCT) { + p.nextToken() // skip DISTINCT + if p.currentIs(token.FROM) { + p.nextToken() // skip FROM + right := p.parseExpression(COMPARE) + // IS NOT DISTINCT FROM is same as =, IS DISTINCT FROM is same as != + op := "=" + if not { + op = "!=" + } + return &ast.BinaryExpr{ + Position: pos, + Left: left, + Op: op, + Right: right, + } + } + } + return left } @@ -1219,6 +1378,24 @@ func (p *Parser) parseTupleAccessFromNumber(left ast.Expression) ast.Expression func (p *Parser) parseDotAccess(left ast.Expression) ast.Expression { p.nextToken() // skip . 
+ // Check for JSON path parent access with ^ (e.g., x.^c0) + if p.currentIs(token.CARET) { + p.nextToken() // skip ^ + if p.currentIs(token.IDENT) { + pathPart := "^" + p.current.Value + p.nextToken() + if ident, ok := left.(*ast.Identifier); ok { + ident.Parts = append(ident.Parts, pathPart) + return ident + } + // Create new identifier with JSON path + return &ast.Identifier{ + Position: left.Pos(), + Parts: []string{pathPart}, + } + } + } + // Check for tuple access with number if p.currentIs(token.NUMBER) { expr := &ast.TupleAccess{ @@ -1229,8 +1406,8 @@ func (p *Parser) parseDotAccess(left ast.Expression) ast.Expression { return expr } - // Regular identifier access - if p.currentIs(token.IDENT) { + // Regular identifier access (keywords can also be column/field names after DOT) + if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { if ident, ok := left.(*ast.Identifier); ok { ident.Parts = append(ident.Parts, p.current.Value) p.nextToken() @@ -1323,7 +1500,9 @@ func (p *Parser) parseLambda(left ast.Expression) ast.Expression { p.nextToken() // skip -> - lambda.Body = p.parseExpression(LOWEST) + // Use ALIAS_PREC to prevent consuming AS keyword that might belong to containing context + // e.g., WITH x -> toString(x) AS lambda_1 SELECT... + lambda.Body = p.parseExpression(ALIAS_PREC) return lambda } @@ -1335,13 +1514,16 @@ func (p *Parser) parseTernary(condition ast.Expression) ast.Expression { p.nextToken() // skip ? - ternary.Then = p.parseExpression(LOWEST) + // Use ALIAS_PREC to prevent consuming AS keyword, but still allow nested ternaries + ternary.Then = p.parseExpression(ALIAS_PREC) if !p.expect(token.COLON) { return nil } - ternary.Else = p.parseExpression(LOWEST) + // Use ALIAS_PREC to prevent consuming AS keyword that might belong to containing context + // e.g., WITH cond ? a : b AS x SELECT... 
+ ternary.Else = p.parseExpression(ALIAS_PREC) return ternary } @@ -1364,6 +1546,20 @@ func (p *Parser) parseParametricFunctionCall(fn *ast.FunctionCall) *ast.Function p.expect(token.RPAREN) + // Handle IGNORE NULLS / RESPECT NULLS (aggregate function modifiers) + // Can appear multiple times (e.g., RESPECT NULLS IGNORE NULLS) + for p.currentIs(token.IDENT) { + upper := strings.ToUpper(p.current.Value) + if upper == "IGNORE" || upper == "RESPECT" { + p.nextToken() + if p.currentIs(token.NULLS) { + p.nextToken() + } + } else { + break + } + } + // Handle OVER clause for window functions if p.currentIs(token.OVER) { p.nextToken() @@ -1466,7 +1662,11 @@ func (p *Parser) parseKeywordAsFunction() ast.Expression { } var args []ast.Expression - if !p.currentIs(token.RPAREN) { + // Handle view() and similar functions that take a subquery as argument + if name == "view" && (p.currentIs(token.SELECT) || p.currentIs(token.WITH)) { + subquery := p.parseSelectWithUnion() + args = []ast.Expression{&ast.Subquery{Position: pos, Query: subquery}} + } else if !p.currentIs(token.RPAREN) { args = p.parseExpressionList() } @@ -1515,16 +1715,29 @@ func (p *Parser) parseAsteriskExcept(asterisk *ast.Asterisk) ast.Expression { func (p *Parser) parseAsteriskReplace(asterisk *ast.Asterisk) ast.Expression { p.nextToken() // skip REPLACE - if !p.expect(token.LPAREN) { - return asterisk + // REPLACE can have optional parentheses: REPLACE (expr AS col) or REPLACE expr AS col + hasParens := p.currentIs(token.LPAREN) + if hasParens { + p.nextToken() } - for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { + for { + // Stop conditions based on context + if hasParens && p.currentIs(token.RPAREN) { + break + } + if !hasParens && (p.currentIs(token.FROM) || p.currentIs(token.WHERE) || p.currentIs(token.EOF) || + p.currentIs(token.GROUP) || p.currentIs(token.ORDER) || p.currentIs(token.HAVING) || + p.currentIs(token.LIMIT) || p.currentIs(token.SETTINGS) || p.currentIs(token.FORMAT) || + 
p.currentIs(token.UNION) || p.currentIs(token.EXCEPT) || p.currentIs(token.COMMA)) { + break + } + replace := &ast.ReplaceExpr{ Position: p.current.Pos, } - replace.Expr = p.parseExpression(LOWEST) + replace.Expr = p.parseExpression(ALIAS_PREC) if p.currentIs(token.AS) { p.nextToken() @@ -1538,10 +1751,18 @@ func (p *Parser) parseAsteriskReplace(asterisk *ast.Asterisk) ast.Expression { if p.currentIs(token.COMMA) { p.nextToken() + // If no parens and we see comma, might be end of select column + if !hasParens { + break + } + } else if !hasParens { + break } } - p.expect(token.RPAREN) + if hasParens { + p.expect(token.RPAREN) + } return asterisk } diff --git a/parser/parser.go b/parser/parser.go index 2825ad6a7c..3d42aaaae3 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -147,22 +147,42 @@ func (p *Parser) parseStatement() ast.Statement { } } -// parseSelectWithUnion parses SELECT ... UNION ... queries +// parseSelectWithUnion parses SELECT ... UNION/INTERSECT/EXCEPT ... queries func (p *Parser) parseSelectWithUnion() *ast.SelectWithUnionQuery { query := &ast.SelectWithUnionQuery{ Position: p.current.Pos, } - // Parse first SELECT - sel := p.parseSelect() - if sel == nil { - return nil + // Handle parenthesized start: (SELECT 1) UNION (SELECT 2) + if p.currentIs(token.LPAREN) { + p.nextToken() // skip ( + nested := p.parseSelectWithUnion() + p.expect(token.RPAREN) + for _, s := range nested.Selects { + query.Selects = append(query.Selects, s) + } + } else { + // Parse first SELECT + sel := p.parseSelect() + if sel == nil { + return nil + } + query.Selects = append(query.Selects, sel) } - query.Selects = append(query.Selects, sel) - // Parse UNION clauses - for p.currentIs(token.UNION) { - p.nextToken() // skip UNION + // Parse UNION/INTERSECT/EXCEPT clauses + for p.currentIs(token.UNION) || p.currentIs(token.EXCEPT) || + (p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "INTERSECT") { + var setOp string + if p.currentIs(token.UNION) { + setOp 
= "UNION" + } else if p.currentIs(token.EXCEPT) { + setOp = "EXCEPT" + } else { + setOp = "INTERSECT" + } + p.nextToken() // skip UNION/INTERSECT/EXCEPT + var mode string if p.currentIs(token.ALL) { query.UnionAll = true @@ -172,7 +192,7 @@ func (p *Parser) parseSelectWithUnion() *ast.SelectWithUnionQuery { mode = "DISTINCT" p.nextToken() } - query.UnionModes = append(query.UnionModes, mode) + query.UnionModes = append(query.UnionModes, setOp+" "+mode) // Handle parenthesized subqueries: UNION ALL (SELECT ... UNION ALL SELECT ...) if p.currentIs(token.LPAREN) { @@ -254,7 +274,30 @@ func (p *Parser) parseSelect() *ast.SelectQuery { if !p.expect(token.BY) { return nil } - sel.GroupBy = p.parseExpressionList() + + // Handle GROUPING SETS, ROLLUP(...), CUBE(...) as special expressions + if p.currentIs(token.GROUPING) && p.peekIs(token.SETS) { + // GROUPING SETS ((a), (b), (a, b)) + p.nextToken() // skip GROUPING + p.nextToken() // skip SETS + sel.GroupBy = p.parseGroupingSets() + } else if p.currentIs(token.ROLLUP) && p.peekIs(token.LPAREN) { + // ROLLUP(a, b, c) + p.nextToken() // skip ROLLUP + p.nextToken() // skip ( + sel.GroupBy = p.parseExpressionList() + p.expect(token.RPAREN) + sel.WithRollup = true + } else if p.currentIs(token.CUBE) && p.peekIs(token.LPAREN) { + // CUBE(a, b, c) + p.nextToken() // skip CUBE + p.nextToken() // skip ( + sel.GroupBy = p.parseExpressionList() + p.expect(token.RPAREN) + sel.WithCube = true + } else { + sel.GroupBy = p.parseExpressionList() + } // WITH ROLLUP if p.currentIs(token.WITH) && p.peekIs(token.ROLLUP) { @@ -284,6 +327,12 @@ func (p *Parser) parseSelect() *ast.SelectQuery { sel.Having = p.parseExpression(LOWEST) } + // Parse QUALIFY clause (window function filter) + if p.currentIs(token.QUALIFY) { + p.nextToken() + sel.Qualify = p.parseExpression(LOWEST) + } + // Parse WINDOW clause for named windows if p.currentIs(token.WINDOW) { p.nextToken() @@ -336,6 +385,10 @@ func (p *Parser) parseSelect() *ast.SelectQuery { if 
p.currentIs(token.OFFSET) { p.nextToken() sel.Offset = p.parseExpression(LOWEST) + // Skip optional ROWS keyword + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "ROWS" { + p.nextToken() + } } // Parse FETCH FIRST ... ROW ONLY (SQL standard syntax) @@ -390,6 +443,11 @@ func (p *Parser) parseSelect() *ast.SelectQuery { Filename: p.current.Value, } p.nextToken() + // Parse optional TRUNCATE + if p.currentIs(token.TRUNCATE) { + sel.IntoOutfile.Truncate = true + p.nextToken() + } } } } @@ -474,6 +532,8 @@ func (p *Parser) parseWithClause() []ast.Expression { } else { // Scalar WITH: expr AS name (ClickHouse style) // Examples: WITH 1 AS x, WITH 'hello' AS s, WITH func() AS f + // Also handles lambda: WITH x -> toString(x) AS lambda_1 + // Arrow has OR_PREC precedence, so it gets parsed with ALIAS_PREC elem.Query = p.parseExpression(ALIAS_PREC) // Use ALIAS_PREC to stop before AS if !p.expect(token.AS) { @@ -528,7 +588,7 @@ func (p *Parser) isJoinKeyword() bool { } switch p.current.Token { case token.JOIN, token.INNER, token.LEFT, token.RIGHT, token.FULL, token.CROSS, - token.GLOBAL, token.ANY, token.ALL, token.ASOF, token.SEMI, token.ANTI: + token.GLOBAL, token.ANY, token.ALL, token.ASOF, token.SEMI, token.ANTI, token.PASTE: return true case token.COMMA: return true @@ -613,6 +673,9 @@ func (p *Parser) parseTableElementWithJoin() *ast.TablesInSelectQueryElement { case token.CROSS: join.Type = ast.JoinCross p.nextToken() + case token.PASTE: + join.Type = ast.JoinPaste + p.nextToken() default: join.Type = ast.JoinInner } @@ -653,6 +716,10 @@ func (p *Parser) parseTableExpression() *ast.TableExpression { if p.currentIs(token.SELECT) || p.currentIs(token.WITH) { subquery := p.parseSelectWithUnion() expr.Table = &ast.Subquery{Query: subquery} + } else if p.currentIs(token.EXPLAIN) { + // EXPLAIN as subquery in FROM clause + explain := p.parseExplain() + expr.Table = &ast.Subquery{Query: explain} } else { // Table function or expression expr.Table = 
p.parseExpression(LOWEST) @@ -720,10 +787,10 @@ func (p *Parser) parseTableExpression() *ast.TableExpression { func (p *Parser) isKeywordForClause() bool { switch p.current.Token { - case token.WHERE, token.GROUP, token.HAVING, token.ORDER, token.LIMIT, + case token.WHERE, token.GROUP, token.HAVING, token.QUALIFY, token.ORDER, token.LIMIT, token.OFFSET, token.UNION, token.EXCEPT, token.SETTINGS, token.FORMAT, token.PREWHERE, token.JOIN, token.LEFT, token.RIGHT, token.INNER, - token.FULL, token.CROSS, token.ON, token.USING, token.GLOBAL, + token.FULL, token.CROSS, token.PASTE, token.ON, token.USING, token.GLOBAL, token.ANY, token.ALL, token.SEMI, token.ANTI, token.ASOF: return true } @@ -912,6 +979,15 @@ func (p *Parser) parseInsert() *ast.InsertQuery { p.expect(token.RPAREN) } + // Parse PARTITION BY (for INSERT INTO FUNCTION) + if p.currentIs(token.PARTITION) { + p.nextToken() + if p.currentIs(token.BY) { + p.nextToken() + ins.PartitionBy = p.parseExpression(LOWEST) + } + } + // Parse SETTINGS before VALUES if p.currentIs(token.SETTINGS) { ins.HasSettings = true @@ -920,6 +996,18 @@ func (p *Parser) parseInsert() *ast.InsertQuery { p.parseSettingsList() } + // Parse FROM INFILE clause (for INSERT ... FROM INFILE '...') + if p.currentIs(token.FROM) { + p.nextToken() + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "INFILE" { + p.nextToken() + // Skip the file path + if p.currentIs(token.STRING) { + p.nextToken() + } + } + } + // Parse VALUES or SELECT if p.currentIs(token.VALUES) { p.nextToken() @@ -997,8 +1085,33 @@ func (p *Parser) parseCreate() *ast.CreateQuery { case token.VIEW: p.nextToken() p.parseCreateView(create) + case token.FUNCTION: + // CREATE FUNCTION name AS lambda_expr + create.CreateFunction = true + p.nextToken() + p.parseCreateFunction(create) + case token.USER: + // CREATE USER name ... 
+ create.CreateUser = true + p.nextToken() + p.parseCreateUser(create) + case token.IDENT: + // Handle CREATE DICTIONARY, CREATE RESOURCE, CREATE WORKLOAD, etc. + identUpper := strings.ToUpper(p.current.Value) + switch identUpper { + case "DICTIONARY": + create.CreateDictionary = true + p.nextToken() + p.parseCreateGeneric(create) + case "RESOURCE", "WORKLOAD", "POLICY", "ROLE", "QUOTA", "PROFILE": + // Skip these statements - just consume tokens until semicolon + p.parseCreateGeneric(create) + default: + p.errors = append(p.errors, fmt.Errorf("expected TABLE, DATABASE, VIEW, FUNCTION, USER after CREATE")) + return nil + } default: - p.errors = append(p.errors, fmt.Errorf("expected TABLE, DATABASE, or VIEW after CREATE")) + p.errors = append(p.errors, fmt.Errorf("expected TABLE, DATABASE, VIEW, FUNCTION, USER after CREATE")) return nil } @@ -1045,12 +1158,26 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { // Handle INDEX definition if p.currentIs(token.INDEX) { - p.nextToken() - // Skip index definition: INDEX name expr TYPE type GRANULARITY n - p.parseIdentifierName() // index name - // Skip expression and other index parts - for !p.currentIs(token.COMMA) && !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { + idx := p.parseIndexDefinition() + if idx != nil { + create.Indexes = append(create.Indexes, idx) + } + } else if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "PROJECTION" { + // Skip PROJECTION definitions: PROJECTION name (SELECT ...) + p.nextToken() // skip PROJECTION + p.parseIdentifierName() // projection name + // Skip the (SELECT ...) 
part + if p.currentIs(token.LPAREN) { + depth := 1 p.nextToken() + for depth > 0 && !p.currentIs(token.EOF) { + if p.currentIs(token.LPAREN) { + depth++ + } else if p.currentIs(token.RPAREN) { + depth-- + } + p.nextToken() + } } } else if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "CONSTRAINT" { // Skip CONSTRAINT definitions @@ -1089,7 +1216,8 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { case p.currentIs(token.PARTITION): p.nextToken() if p.expect(token.BY) { - create.PartitionBy = p.parseExpression(LOWEST) + // Use ALIAS_PREC to avoid consuming AS keyword (for AS SELECT) + create.PartitionBy = p.parseExpression(ALIAS_PREC) } case p.currentIs(token.ORDER): p.nextToken() @@ -1111,7 +1239,8 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { create.OrderBy = exprs } } else { - create.OrderBy = []ast.Expression{p.parseExpression(LOWEST)} + // Use ALIAS_PREC to avoid consuming AS keyword (for AS SELECT) + create.OrderBy = []ast.Expression{p.parseExpression(ALIAS_PREC)} } } case p.currentIs(token.PRIMARY): @@ -1134,19 +1263,21 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { create.PrimaryKey = exprs } } else { - create.PrimaryKey = []ast.Expression{p.parseExpression(LOWEST)} + // Use ALIAS_PREC to avoid consuming AS keyword (for AS SELECT) + create.PrimaryKey = []ast.Expression{p.parseExpression(ALIAS_PREC)} } } case p.currentIs(token.SAMPLE): p.nextToken() if p.expect(token.BY) { - create.SampleBy = p.parseExpression(LOWEST) + // Use ALIAS_PREC to avoid consuming AS keyword (for AS SELECT) + create.SampleBy = p.parseExpression(ALIAS_PREC) } case p.currentIs(token.TTL): p.nextToken() create.TTL = &ast.TTLClause{ Position: p.current.Pos, - Expression: p.parseExpression(LOWEST), + Expression: p.parseExpression(ALIAS_PREC), // Use ALIAS_PREC for AS SELECT } case p.currentIs(token.SETTINGS): p.nextToken() @@ -1157,16 +1288,21 @@ func (p *Parser) parseCreateTable(create *ast.CreateQuery) { } 
done_table_options: - // Parse AS SELECT or AS table_function() + // Parse AS SELECT or AS (subquery) or AS table_function() or AS database.table if p.currentIs(token.AS) { p.nextToken() - if p.currentIs(token.SELECT) || p.currentIs(token.WITH) { + if p.currentIs(token.SELECT) || p.currentIs(token.WITH) || p.currentIs(token.LPAREN) { + // AS SELECT... or AS (SELECT...) INTERSECT ... create.AsSelect = p.parseSelectWithUnion() - } else if p.currentIs(token.IDENT) { - // AS table_function(...) like "AS s3Cluster(...)" - // Skip the function call for now - p.parseIdentifierName() - if p.currentIs(token.LPAREN) { + } else if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { + // AS table_function(...) or AS database.table + name := p.parseIdentifierName() + if p.currentIs(token.DOT) { + // AS database.table - skip the table name + p.nextToken() + p.parseIdentifierName() + } else if p.currentIs(token.LPAREN) { + // AS function(...) - skip the function call depth := 1 p.nextToken() for depth > 0 && !p.currentIs(token.EOF) { @@ -1178,8 +1314,18 @@ done_table_options: p.nextToken() } } + _ = name // Use name for future AS table support } } + + // Parse ENGINE after AS (for CREATE TABLE x AS y ENGINE=z syntax) + if create.Engine == nil && p.currentIs(token.ENGINE) { + p.nextToken() + if p.currentIs(token.EQ) { + p.nextToken() + } + create.Engine = p.parseEngineClause() + } } func (p *Parser) parseCreateDatabase(create *ast.CreateQuery) { @@ -1272,15 +1418,126 @@ func (p *Parser) parseCreateView(create *ast.CreateQuery) { p.nextToken() } - // Parse AS SELECT + // Parse AS SELECT or AS (subquery) INTERSECT/UNION (subquery) if p.currentIs(token.AS) { p.nextToken() - if p.currentIs(token.SELECT) || p.currentIs(token.WITH) { + if p.currentIs(token.SELECT) || p.currentIs(token.WITH) || p.currentIs(token.LPAREN) { create.AsSelect = p.parseSelectWithUnion() } } } +func (p *Parser) parseCreateFunction(create *ast.CreateQuery) { + // Handle IF NOT EXISTS + if 
p.currentIs(token.IF) { + p.nextToken() + if p.currentIs(token.NOT) { + p.nextToken() + if p.currentIs(token.EXISTS) { + create.IfNotExists = true + p.nextToken() + } + } + } + + // Parse function name + create.FunctionName = p.parseIdentifierName() + + // Handle ON CLUSTER + if p.currentIs(token.ON) { + p.nextToken() + if p.currentIs(token.CLUSTER) { + p.nextToken() + create.OnCluster = p.parseIdentifierName() + } + } + + // Parse AS lambda_expression + if p.currentIs(token.AS) { + p.nextToken() + create.FunctionBody = p.parseExpression(LOWEST) + } +} + +func (p *Parser) parseCreateUser(create *ast.CreateQuery) { + // Handle IF NOT EXISTS + if p.currentIs(token.IF) { + p.nextToken() + if p.currentIs(token.NOT) { + p.nextToken() + if p.currentIs(token.EXISTS) { + create.IfNotExists = true + p.nextToken() + } + } + } + + // Parse user name + create.UserName = p.parseIdentifierName() + + // Skip the rest of the user definition (complex syntax) + for !p.currentIs(token.EOF) && !p.currentIs(token.SEMICOLON) { + p.nextToken() + } +} + +func (p *Parser) parseCreateGeneric(create *ast.CreateQuery) { + // Parse name + name := p.parseIdentifierName() + if name != "" { + create.Table = name // Reuse Table field for generic name + } + + // Skip the rest of the statement + for !p.currentIs(token.EOF) && !p.currentIs(token.SEMICOLON) { + p.nextToken() + } +} + +func (p *Parser) parseIndexDefinition() *ast.IndexDefinition { + idx := &ast.IndexDefinition{ + Position: p.current.Pos, + } + + p.nextToken() // skip INDEX + + // Parse index name + idx.Name = p.parseIdentifierName() + + // Parse expression (the column or expression being indexed) + idx.Expression = p.parseExpression(LOWEST) + + // Parse TYPE + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "TYPE" { + p.nextToken() + // Type is a function call like bloom_filter(0.025) or minmax + pos := p.current.Pos + typeName := p.parseIdentifierName() + if typeName != "" { + idx.Type = &ast.FunctionCall{ + 
Position: pos, + Name: typeName, + } + // Check for parentheses (type parameters) + if p.currentIs(token.LPAREN) { + p.nextToken() + if !p.currentIs(token.RPAREN) { + idx.Type.Arguments = p.parseExpressionList() + } + p.expect(token.RPAREN) + } + } + } + + // Parse GRANULARITY + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "GRANULARITY" { + p.nextToken() + idx.Granularity = p.parseExpression(LOWEST) + } + + return idx +} + func (p *Parser) parseColumnDeclaration() *ast.ColumnDeclaration { col := &ast.ColumnDeclaration{ Position: p.current.Pos, @@ -1294,8 +1551,14 @@ func (p *Parser) parseColumnDeclaration() *ast.ColumnDeclaration { return nil } - // Parse data type - col.Type = p.parseDataType() + // Check if next token is DEFAULT/MATERIALIZED/ALIAS (type omitted) + // These keywords indicate the type is omitted and we go straight to default expression + if p.currentIs(token.DEFAULT) || p.currentIs(token.MATERIALIZED) || p.currentIs(token.ALIAS) { + // Type is omitted, skip to default parsing below + } else { + // Parse data type + col.Type = p.parseDataType() + } // Parse DEFAULT/MATERIALIZED/ALIAS/EPHEMERAL switch p.current.Token { @@ -1335,6 +1598,15 @@ func (p *Parser) parseColumnDeclaration() *ast.ColumnDeclaration { col.TTL = p.parseExpression(LOWEST) } + // Parse PRIMARY KEY (column constraint) + if p.currentIs(token.PRIMARY) { + p.nextToken() + if p.currentIs(token.KEY) { + col.PrimaryKey = true + p.nextToken() + } + } + // Parse COMMENT if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "COMMENT" { p.nextToken() @@ -1364,46 +1636,62 @@ func (p *Parser) parseDataType() *ast.DataType { dt.HasParentheses = true p.nextToken() - // Special handling for Nested type - it contains column declarations, not just types - if strings.ToUpper(dt.Name) == "NESTED" { - for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { - // Parse as column name + type - if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { - pos := 
p.current.Pos - colName := p.current.Value - p.nextToken() - // Parse the type for this column - colType := p.parseDataType() - if colType != nil { - // Use NameTypePair for Nested column declarations - ntp := &ast.NameTypePair{ - Position: pos, - Name: colName, - Type: colType, + // Determine if this type uses named parameters (Nested, Tuple, JSON) + upperName := strings.ToUpper(dt.Name) + usesNamedParams := upperName == "NESTED" || upperName == "TUPLE" || upperName == "JSON" + + for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { + // Check if this is a named parameter: identifier followed by a type name + // e.g., "a UInt32" where "a" is the name and "UInt32" is the type + isNamedParam := false + if usesNamedParams && (p.currentIs(token.IDENT) || p.current.Token.IsKeyword()) { + // Check if current is NOT a type name and peek IS a type name or LPAREN follows for complex types + if !p.isDataTypeName(p.current.Value) { + // Current is a name (not a type), next should be a type + isNamedParam = true + } else if p.peekIs(token.IDENT) || p.peekIs(token.LPAREN) { + // Current looks like a type name but is followed by another identifier + // This happens with things like "a Tuple(...)" where "a" looks like it could be a type + // Check if peek is a known type name + if p.peekIs(token.IDENT) && p.isDataTypeName(p.peek.Value) { + isNamedParam = true + } else if p.peekIs(token.LPAREN) { + // Could be a function-like type or named with parenthesized type + // Check if current is a valid type name - if so, it's a type, not a name + if !p.isDataTypeName(p.current.Value) { + isNamedParam = true } - dt.Parameters = append(dt.Parameters, ntp) } } - if p.currentIs(token.COMMA) { - p.nextToken() - } else { - break - } } - } else { - for !p.currentIs(token.RPAREN) && !p.currentIs(token.EOF) { - // Could be another data type or an expression - // Type names can be identifiers or keywords (Array, Nested, etc.) 
- if (p.currentIs(token.IDENT) || p.current.Token.IsKeyword()) && p.isDataTypeName(p.current.Value) { - dt.Parameters = append(dt.Parameters, p.parseDataType()) - } else { - dt.Parameters = append(dt.Parameters, p.parseExpression(LOWEST)) - } - if p.currentIs(token.COMMA) { - p.nextToken() - } else { - break + + if isNamedParam { + // Parse as name + type pair + pos := p.current.Pos + paramName := p.current.Value + p.nextToken() + // Parse the type for this parameter + paramType := p.parseDataType() + if paramType != nil { + ntp := &ast.NameTypePair{ + Position: pos, + Name: paramName, + Type: paramType, + } + dt.Parameters = append(dt.Parameters, ntp) } + } else if (p.currentIs(token.IDENT) || p.current.Token.IsKeyword()) && p.isDataTypeName(p.current.Value) { + // It's a type name, parse as data type + dt.Parameters = append(dt.Parameters, p.parseDataType()) + } else { + // Parse as expression (for things like Decimal(10, 2)) + dt.Parameters = append(dt.Parameters, p.parseExpression(LOWEST)) + } + + if p.currentIs(token.COMMA) { + p.nextToken() + } else { + break } } p.expect(token.RPAREN) @@ -2008,12 +2296,17 @@ func (p *Parser) parseDescribe() *ast.DescribeQuery { p.nextToken() } - // Parse table name (can be identifier or keyword used as table name like "system") + // Parse table name or table function + // Table functions look like: format(CSV, '...'), url('...'), s3Cluster(...) 
if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { + pos := p.current.Pos tableName := p.current.Value p.nextToken() - if p.currentIs(token.DOT) { + // Check if this is a function call (table function) + if p.currentIs(token.LPAREN) { + desc.TableFunction = p.parseFunctionCall(tableName, pos) + } else if p.currentIs(token.DOT) { p.nextToken() desc.Database = tableName if p.currentIs(token.IDENT) || p.current.Token.IsKeyword() { @@ -2025,6 +2318,21 @@ func (p *Parser) parseDescribe() *ast.DescribeQuery { } } + // Parse SETTINGS clause + if p.currentIs(token.SETTINGS) { + p.nextToken() + desc.Settings = p.parseSettingsList() + } + + // Parse FORMAT clause + if p.currentIs(token.FORMAT) { + p.nextToken() + if p.currentIs(token.IDENT) || p.currentIs(token.NULL) || p.current.Token.IsKeyword() { + desc.Format = p.current.Value + p.nextToken() + } + } + return desc } @@ -2056,6 +2364,9 @@ func (p *Parser) parseShow() *ast.ShowQuery { p.nextToken() } } + case token.SETTINGS: + show.ShowType = ast.ShowSettings + p.nextToken() default: // Handle SHOW PROCESSLIST, SHOW DICTIONARIES, SHOW FUNCTIONS, etc. 
if p.currentIs(token.IDENT) { @@ -2094,8 +2405,8 @@ func (p *Parser) parseShow() *ast.ShowQuery { } } - // Parse LIKE clause - if p.currentIs(token.LIKE) { + // Parse LIKE or ILIKE clause + if p.currentIs(token.LIKE) || p.currentIs(token.ILIKE) { p.nextToken() if p.currentIs(token.STRING) { show.Like = p.current.Value @@ -2143,11 +2454,36 @@ func (p *Parser) parseExplain() *ast.ExplainQuery { case "ESTIMATE": explain.ExplainType = ast.ExplainEstimate p.nextToken() + case "CURRENT": + // EXPLAIN CURRENT TRANSACTION + p.nextToken() + if p.currentIs(token.IDENT) && strings.ToUpper(p.current.Value) == "TRANSACTION" { + p.nextToken() + } + explain.ExplainType = ast.ExplainCurrentTransaction + return explain // No statement follows CURRENT TRANSACTION default: explain.ExplainType = ast.ExplainPlan } } + // Parse EXPLAIN options (e.g., header = 1, input_headers = 1) + // These come before the actual statement + for p.currentIs(token.IDENT) && !p.currentIs(token.SELECT) && !p.currentIs(token.WITH) { + // Check if it looks like an option (ident = value) + if p.peekIs(token.EQ) { + p.nextToken() // skip option name + p.nextToken() // skip = + p.parseExpression(LOWEST) // skip value + // Skip comma if present + if p.currentIs(token.COMMA) { + p.nextToken() + } + } else { + break + } + } + // Parse the statement being explained explain.Statement = p.parseStatement() @@ -2274,15 +2610,58 @@ func (p *Parser) parseRename() *ast.RenameQuery { return nil } - // Parse from table name (can start with a number in ClickHouse) - rename.From = p.parseIdentifierName() + // Parse rename pairs (can have multiple: t1 TO t2, t3 TO t4, ...) 
+ for { + pair := &ast.RenamePair{} - if !p.expect(token.TO) { - return nil + // Parse from table name (can be qualified: database.table) + fromName := p.parseIdentifierName() + if p.currentIs(token.DOT) { + p.nextToken() + pair.FromDatabase = fromName + pair.FromTable = p.parseIdentifierName() + } else { + pair.FromTable = fromName + } + + if !p.expect(token.TO) { + break + } + + // Parse to table name (can be qualified: database.table) + toName := p.parseIdentifierName() + if p.currentIs(token.DOT) { + p.nextToken() + pair.ToDatabase = toName + pair.ToTable = p.parseIdentifierName() + } else { + pair.ToTable = toName + } + + rename.Pairs = append(rename.Pairs, pair) + + // Check for more pairs + if p.currentIs(token.COMMA) { + p.nextToken() + } else { + break + } } - // Parse to table name (can start with a number in ClickHouse) - rename.To = p.parseIdentifierName() + // Set legacy From/To fields for backward compatibility (first pair) + if len(rename.Pairs) > 0 { + first := rename.Pairs[0] + if first.FromDatabase != "" { + rename.From = first.FromDatabase + "." + first.FromTable + } else { + rename.From = first.FromTable + } + if first.ToDatabase != "" { + rename.To = first.ToDatabase + "." 
+ first.ToTable + } else { + rename.To = first.ToTable + } + } // Handle ON CLUSTER if p.currentIs(token.ON) { diff --git a/token/token.go b/token/token.go index 7e63c8751f..d945d39579 100644 --- a/token/token.go +++ b/token/token.go @@ -33,6 +33,7 @@ const ( ARROW // -> COLONCOLON // :: NULL_SAFE_EQ // <=> + CARET // ^ // Delimiters LPAREN // ( @@ -108,6 +109,7 @@ const ( GLOBAL GRANT GROUP + GROUPING HAVING IF ILIKE @@ -146,9 +148,12 @@ const ( OUTFILE OVER PARTITION + PASTE POPULATE PREWHERE PRIMARY + QUALIFY + REGEXP RENAME REPLACE REVOKE @@ -158,6 +163,7 @@ const ( SELECT SEMI SET + SETS SETTINGS SHOW STEP @@ -290,6 +296,7 @@ var tokens = [...]string{ GLOBAL: "GLOBAL", GRANT: "GRANT", GROUP: "GROUP", + GROUPING: "GROUPING", HAVING: "HAVING", IF: "IF", ILIKE: "ILIKE", @@ -328,9 +335,12 @@ var tokens = [...]string{ OUTFILE: "OUTFILE", OVER: "OVER", PARTITION: "PARTITION", + PASTE: "PASTE", POPULATE: "POPULATE", PREWHERE: "PREWHERE", PRIMARY: "PRIMARY", + QUALIFY: "QUALIFY", + REGEXP: "REGEXP", RENAME: "RENAME", REPLACE: "REPLACE", REVOKE: "REVOKE", @@ -340,6 +350,7 @@ var tokens = [...]string{ SELECT: "SELECT", SEMI: "SEMI", SET: "SET", + SETS: "SETS", SETTINGS: "SETTINGS", SHOW: "SHOW", STEP: "STEP",