From fe5ac1747a8a16055daf88a16d3a2eb07059c2cc Mon Sep 17 00:00:00 2001
From: apstndb <803393+apstndb@users.noreply.github.com>
Date: Sat, 21 Sep 2024 21:57:20 +0900
Subject: [PATCH] Add SplitRawStatements() (#102)

* Add statement separator

* Update separator*.go

* Add test cases

* Fix ineffassign lint error

* Add test cases for separator_test.go

* Update test desc

* Rename to splitter and make some changes

* Simplify control flow

* Apply review comments

* Fix test function name
---
 split.go      | 48 +++++++++++++++++++++++++++++++++++++++++++
 split_test.go | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 split.go
 create mode 100644 split_test.go
diff --git a/split.go b/split.go
new file mode 100644
index 00000000..9f0193ce
--- /dev/null
+++ b/split.go
@@ -0,0 +1,48 @@
+package memefish
+
+import "github.com/cloudspannerecosystem/memefish/token"
+
+// SplitRawStatements splits the input string s into raw statement strings at terminating semicolons, using only the lexer (no parsing).
+// Statements are terminated by `;`, `<eof>` or `;<eof>`, and the minimum output is []string{""}.
+// See [terminating semicolons].
+// This function does not panic; it returns a nil slice and the lexer's error if the lexer enters an error state.
+// filepath is passed to the lexer as the file path and can be used in error messages.
+//
+// [terminating semicolons]: https://cloud.google.com/spanner/docs/reference/standard-sql/lexical#terminating_semicolons
+func SplitRawStatements(filepath, s string) ([]string, error) {
+	lex := &Lexer{
+		File: &token.File{
+			FilePath: filepath,
+			Buffer:   s,
+		},
+	}
+
+	var result []string
+	var firstPos token.Pos // start offset in s of the statement currently being scanned
+	for {
+		if lex.Token.Kind == ";" {
+			result = append(result, s[firstPos:lex.Token.Pos]) // statement text, excluding the semicolon itself
+			if err := lex.NextToken(); err != nil {
+				return nil, err
+			}
+			firstPos = lex.Token.Pos // the next statement starts at the token that follows the semicolon
+			continue
+		}
+
+		err := lex.NextToken() // advance; on the first iteration lex.Token is still unset, so this reads the first token
+		if err != nil {
+			return nil, err
+		}
+
+		if lex.Token.Kind == token.TokenEOF {
+			if lex.Token.Pos != firstPos { // emit a trailing statement only when EOF is not where the current statement starts
+				result = append(result, s[firstPos:lex.Token.Pos])
+			}
+			break
+		}
+	}
+	if len(result) == 0 { // nothing was collected (e.g. empty input): still return one empty statement
+		return []string{""}, nil
+	}
+	return result, nil
+}
diff --git a/split_test.go b/split_test.go
new file mode 100644
index 00000000..d9b8a5b6
--- /dev/null
+++ b/split_test.go
@@ -0,0 +1,57 @@
+package memefish_test
+
+import (
+	"github.com/cloudspannerecosystem/memefish"
+	"github.com/google/go-cmp/cmp"
+	"regexp"
+	"testing"
+)
+
+func TestSplitRawStatements(t *testing.T) {
+	for _, test := range []struct {
+		desc  string         // sub-test name passed to t.Run
+		input string         // raw text handed to SplitRawStatements
+		errRe *regexp.Regexp // when non-nil, the call must fail with a message matching this pattern
+		want  []string       // expected statements; left unset for error cases
+	}{
+		// SplitRawStatements only looks at lexical structure, so the test cases need not be valid statements.
+		{desc: "empty input", input: "", want: []string{""}},
+		{desc: "single statement ends with semicolon", input: `SELECT "123";`, want: []string{`SELECT "123"`}},
+		{desc: "single statement ends with EOF", input: `SELECT "123"`, want: []string{`SELECT "123"`}},
+		{desc: "two statement ends with semicolon", input: `SELECT "123"; SELECT "456";`, want: []string{`SELECT "123"`, `SELECT "456"`}},
+		{desc: "two statement ends with EOF", input: `SELECT "123"; SELECT "456"`, want: []string{`SELECT "123"`, `SELECT "456"`}},
+		{desc: "second statement is empty", input: `SELECT 1; ;`, want: []string{`SELECT 1`, ``}},
+		{desc: "two statement with new lines", input: "SELECT 1;\n SELECT 2;\n", want: []string{"SELECT 1", "SELECT 2"}},
+		{desc: "single statement with line comment", input: `SELECT 1//
+`, want: []string{"SELECT 1//\n"}},
+		{desc: "semicolon in line comment", input: "SELECT 1 //;\n + 2", want: []string{"SELECT 1 //;\n + 2"}},
+		{desc: "semicolon in multi-line comment", input: "SELECT 1 /*;\n*/ + 2", want: []string{"SELECT 1 /*;\n*/ + 2"}},
+		{desc: "semicolon in double-quoted string", input: `SELECT "1;2;3";`, want: []string{`SELECT "1;2;3"`}},
+		{desc: "semicolon in single-quoted string", input: `SELECT '1;2;3';`, want: []string{`SELECT '1;2;3'`}},
+		{desc: "semicolon in back-quote", input: "SELECT `1;2;3`;", want: []string{"SELECT `1;2;3`"}},
+		// `$` may become a valid token in the future, but it's reasonable to check its current behavior.
+		{desc: "unknown token", input: "SELECT $;", errRe: regexp.MustCompile(`illegal input character: '\$'`)},
+	} {
+		t.Run(test.desc, func(t *testing.T) {
+			stmts, err := memefish.SplitRawStatements("", test.input)
+			if err != nil {
+				if test.errRe == nil { // an error occurred but none was expected
+					t.Errorf("should success, but %v", err)
+					return
+				}
+				if !test.errRe.MatchString(err.Error()) { // an error was expected, but with a different message
+					t.Errorf("error message should match %q, but %q", test.errRe, err)
+					return
+				}
+			}
+			if err == nil && test.errRe != nil { // an error was expected but the call succeeded
+				t.Errorf("success, but should fail %q", test.errRe)
+				return
+			}
+			if diff := cmp.Diff(stmts, test.want); diff != "" { // also reached after a matched expected error
+				t.Errorf("differs: %v", diff)
+				return
+			}
+		})
+	}
+}