Add SplitRawStatements() (#102)

* Add statement separator * Update separator*.go * Add test cases * Fix ineffassign lint error * Add test cases for separator_test.go * Update test desc * Rename to splitter and some change * Simplify control flow * Apply review comments * Fix test function name
cloudspannerecosystem · Sep 21, 2024 · fe5ac17 · fe5ac17
1 parent 64f802f
commit fe5ac17
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 0 deletions.
diff --git a/split.go b/split.go
@@ -0,0 +1,48 @@
+package memefish
+
+import "github.com/cloudspannerecosystem/memefish/token"
+
+// SplitRawStatements cuts s into raw statement strings at terminating semicolons, using only the lexer (no parsing).
+// Each statement ends at `;`, `<eof>` or `;<eof>`, and the smallest possible result is []string{""}.
+// See [terminating semicolons].
+// It does not panic when the lexer fails; the lexer error is returned with a nil slice instead.
+// filepath is recorded in the token.File handed to the lexer so that error messages can refer to it.
+//
+// [terminating semicolons]: https://cloud.google.com/spanner/docs/reference/standard-sql/lexical#terminating_semicolons
+func SplitRawStatements(filepath, s string) ([]string, error) {
+	file := &token.File{FilePath: filepath, Buffer: s}
+	lex := &Lexer{File: file}
+
+	var (
+		stmts []string  // statements collected so far
+		begin token.Pos // offset in s where the current statement starts
+	)
+	for {
+		if lex.Token.Kind != ";" {
+			// Not at a separator: advance by one token and stop once EOF is reached.
+			if err := lex.NextToken(); err != nil {
+				return nil, err
+			}
+			if lex.Token.Kind != token.TokenEOF {
+				continue
+			}
+			// Emit the trailing statement unless EOF sits exactly where it would start.
+			if end := lex.Token.Pos; end != begin {
+				stmts = append(stmts, s[begin:end])
+			}
+			break
+		}
+
+		// At a separator: emit everything before it, then restart at the next token.
+		stmts = append(stmts, s[begin:lex.Token.Pos])
+		if err := lex.NextToken(); err != nil {
+			return nil, err
+		}
+		begin = lex.Token.Pos
+	}
+
+	if len(stmts) > 0 {
+		return stmts, nil
+	}
+	return []string{""}, nil
+}
diff --git a/split_test.go b/split_test.go
@@ -0,0 +1,57 @@
+package memefish_test
+
+import (
+	"github.com/cloudspannerecosystem/memefish"
+	"github.com/google/go-cmp/cmp"
+	"regexp"
+	"testing"
+)
+
+func TestSplitRawStatements(t *testing.T) {
+	for _, test := range []struct {
+		desc  string
+		input string
+		errRe *regexp.Regexp
+		want  []string
+	}{
+		// SplitRawStatements treats only lexical structures, so the test cases can be invalid statements.
+		{desc: "empty input", input: "", want: []string{""}},
+		{desc: "single statement ends with semicolon", input: `SELECT "123";`, want: []string{`SELECT "123"`}},
+		{desc: "single statement ends with EOF", input: `SELECT "123"`, want: []string{`SELECT "123"`}},
+		{desc: "two statement ends with semicolon", input: `SELECT "123"; SELECT "456";`, want: []string{`SELECT "123"`, `SELECT "456"`}},
+		{desc: "two statement ends with EOF", input: `SELECT "123"; SELECT "456"`, want: []string{`SELECT "123"`, `SELECT "456"`}},
+		{desc: "second statement is empty", input: `SELECT 1; ;`, want: []string{`SELECT 1`, ``}},
+		{desc: "two statement with new lines", input: "SELECT 1;\n SELECT 2;\n", want: []string{"SELECT 1", "SELECT 2"}},
+		{desc: "single statement with line comment", input: `SELECT 1//
+`, want: []string{"SELECT 1//\n"}},
+		{desc: "semicolon in line comment", input: "SELECT 1 //;\n + 2", want: []string{"SELECT 1 //;\n + 2"}},
+		{desc: "semicolon in multi-line comment", input: "SELECT 1 /*;\n*/ + 2", want: []string{"SELECT 1 /*;\n*/ + 2"}},
+		{desc: "semicolon in double-quoted string", input: `SELECT "1;2;3";`, want: []string{`SELECT "1;2;3"`}},
+		{desc: "semicolon in single-quoted string", input: `SELECT '1;2;3';`, want: []string{`SELECT '1;2;3'`}},
+		{desc: "semicolon in back-quote", input: "SELECT `1;2;3`;", want: []string{"SELECT `1;2;3`"}},
+		// $` may become a valid token in the future, but it's reasonable to check its current behavior.
+		{desc: "unknown token", input: "SELECT $;", errRe: regexp.MustCompile(`illegal input character: '\$'`)},
+	} {
+		t.Run(test.desc, func(t *testing.T) {
+			stmts, err := memefish.SplitRawStatements("", test.input)
+			if err != nil {
+				if test.errRe == nil {
+					t.Errorf("should success, but %v", err)
+					return
+				}
+				if !test.errRe.MatchString(err.Error()) {
+					t.Errorf("error message should match %q, but %q", test.errRe, err)
+					return
+				}
+			}
+			if err == nil && test.errRe != nil {
+				t.Errorf("success, but should fail %q", test.errRe)
+				return
+			}
+			if diff := cmp.Diff(stmts, test.want); diff != "" {
+				t.Errorf("differs: %v", diff)
+				return
+			}
+		})
+	}
+}