From 3b5aaf93b1d2b2a36899a79fe2718480929ccc5c Mon Sep 17 00:00:00 2001
From: name
Date: Thu, 12 Jun 2025 09:52:19 -0700
Subject: [PATCH 1/3] failing test

---
 cmd/scip/convert_test.go | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/cmd/scip/convert_test.go b/cmd/scip/convert_test.go
index a3e85ca3..ede258ef 100644
--- a/cmd/scip/convert_test.go
+++ b/cmd/scip/convert_test.go
@@ -40,6 +40,7 @@ func TestConvert_SmokeTest(t *testing.T) {
 		{"documents", checkDocuments},
 		{"symbols", checkSymbols},
 		{"occurrences", checkOccurrences},
+		{"query_symbol_at_position", checkQuerySymbolAtPosition},
 	}
 
 	for _, check := range checks {
@@ -173,3 +174,45 @@ type occurrenceData struct {
 	Role  int32
 	Range scip.Range
 }
+
+func checkQuerySymbolAtPosition(t *testing.T, index *scip.Index, db *sqlite.Conn) {
+	// Query for the symbol at line 10, character 3 in document "a.go"
+	// This should return pkg1S1Sym according to our test data
+	expectedSymbol := "scip-go go . . pkg1/S1#"
+	targetLine := int32(10)
+	targetChar := int32(3)
+	targetDoc := "a.go"
+
+	// This query attempts to use JSON operators on the occurrences column
+	// which should fail since it's currently a compressed binary blob
+	query := `
+		SELECT gs.symbol
+		FROM documents d
+		JOIN chunks c ON c.document_id = d.id
+		JOIN global_symbols gs ON gs.id IN (
+			SELECT json_extract(occ.value, '$.symbol_id')
+			FROM json_each(c.occurrences) AS occ
+			WHERE json_extract(occ.value, '$.range[0]') = ?
+			AND json_extract(occ.value, '$.range[1]') = ?
+		)
+		WHERE d.relative_path = ?
+		AND ? BETWEEN c.start_line AND c.end_line
+	`
+
+	var foundSymbol string
+	err := sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
+		Args: []any{targetLine, targetChar, targetDoc, targetLine},
+		ResultFunc: func(stmt *sqlite.Stmt) error {
+			foundSymbol = stmt.ColumnText(0)
+			return nil
+		},
+	})
+
+	// This test should fail because the occurrences column is currently
+	// a compressed binary blob, not JSON. This is a failing test that demonstrates
+	// the need to change the storage format to enable JSON queries.
+	require.NoError(t, err, "Query should succeed once occurrences are stored as JSON")
+	require.Equal(t, expectedSymbol, foundSymbol,
+		"Expected to find symbol %s at position %d:%d in document %s",
+		expectedSymbol, targetLine, targetChar, targetDoc)
+}

From 913b8e637d23c5614d92900f9267610abb7840fe Mon Sep 17 00:00:00 2001
From: name
Date: Thu, 12 Jun 2025 11:58:01 -0700
Subject: [PATCH 2/3] basic impl

---
 cmd/scip/convert.go      | 72 ++++++++++++++++++++++++----------------
 cmd/scip/convert_test.go | 35 ++++++++-----------
 2 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/cmd/scip/convert.go b/cmd/scip/convert.go
index f276026d..ebf81b0c 100644
--- a/cmd/scip/convert.go
+++ b/cmd/scip/convert.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
 	"io"
 	"log"
@@ -14,7 +15,7 @@ import (
 	"github.com/cockroachdb/errors"
 	"github.com/klauspost/compress/zstd"
 	"github.com/urfave/cli/v2"
-	"google.golang.org/protobuf/proto"
+	"google.golang.org/protobuf/encoding/protojson"
 	"zombiezen.com/go/sqlite"
 	"zombiezen.com/go/sqlite/sqlitex"
 
@@ -36,7 +37,7 @@ func convertCommand() cli.Command {
 For inspecting the data, use the SQLite CLI.
 For inspecting the schema, use .schema.
 
-Occurrences are stored opaquely as a blob to prevent the DB size from growing very quickly.`,
+Occurrences are stored as a JSON array of serialized Occurrence messages.`,
 		Flags: []cli.Flag{
 			&cli.StringFlag{
 				Name: "output",
@@ -67,7 +68,7 @@ Occurrences are stored opaquely as a blob to prevent the DB size from growing ve
 	return command
 }
 
-func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int, out io.Writer) (err error) {
+func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int, _ io.Writer) (err error) {
 	index, err := readFromOption(indexPath)
 	if err != nil {
 		return err
 	}
@@ -182,7 +183,7 @@ func createSQLiteDatabase(path string) (conn *sqlite.Conn, err error) {
 			chunk_index INTEGER NOT NULL,
 			start_line INTEGER NOT NULL,
 			end_line INTEGER NOT NULL,
-			occurrences BLOB NOT NULL,
+			occurrences TEXT NOT NULL,
 			FOREIGN KEY (document_id) REFERENCES documents(id)
 		);`,
 		`CREATE TABLE global_symbols (
@@ -460,44 +461,57 @@ func (c *Converter) insertGlobalSymbols(symbol *scip.SymbolInformation) (symbolI
 	return symbolID, err
 }
 
-func (c *Chunk) toDBFormat(encoder *zstd.Encoder) ([]byte, error) {
-	occurrencesBlob, err := proto.Marshal(&scip.Document{
-		Occurrences: c.Occurrences,
-	})
-	if err != nil {
-		return nil, errors.Wrap(err, "failed to serialize occurrences")
+func (c *Chunk) toDBFormat(_ *zstd.Encoder) (string, error) {
+	// Serialize the occurrences slice directly as a JSON array
+	// We'll marshal each occurrence individually using protobuf JSON and combine them
+	marshaler := protojson.MarshalOptions{
+		UseProtoNames:   true,
+		EmitUnpopulated: false,
 	}
-	var buf bytes.Buffer
-	encoder.Reset(&buf)
-	if _, err = encoder.Write(occurrencesBlob); err != nil {
-		return nil, errors.Wrap(err, "compression error")
+	if len(c.Occurrences) == 0 {
+		return "[]", nil
 	}
-	if err = encoder.Close(); err != nil {
-		return nil, errors.Wrap(err, "flushing encoder")
+
+	var occurrenceJSONs []string
+	for _, occ := range c.Occurrences {
+		jsonData, err := marshaler.Marshal(occ)
+		if err != nil {
+			return "", errors.Wrap(err, "failed to serialize occurrence as protobuf JSON")
+		}
+		occurrenceJSONs = append(occurrenceJSONs, string(jsonData))
 	}
-	return buf.Bytes(), nil
+
+	// Combine into a JSON array
+	return "[" + strings.Join(occurrenceJSONs, ",") + "]", nil
 }
 
-func (c *Chunk) fromDBFormat(reader *bytes.Reader, decoder *zstd.Decoder) error {
-	if err := decoder.Reset(reader); err != nil {
-		return errors.Wrap(err, "resetting zstd Decoder")
+func (c *Chunk) fromDBFormat(jsonData string) error {
+	// Parse the JSON array directly
+	var rawArray []json.RawMessage
+	if err := json.Unmarshal([]byte(jsonData), &rawArray); err != nil {
+		return errors.Wrap(err, "failed to parse JSON array")
 	}
-	protoBytes, err := io.ReadAll(decoder)
-	if err != nil {
-		return errors.Wrap(err, "reading compressed data")
+
+	// Use protobuf JSON unmarshaling for each occurrence
+	unmarshaler := protojson.UnmarshalOptions{
+		DiscardUnknown: true,
 	}
-	var tmpDoc scip.Document
-	if err = proto.Unmarshal(protoBytes, &tmpDoc); err != nil {
-		return errors.Wrap(err, "failed to unmarshal occurrences")
+	c.Occurrences = make([]*scip.Occurrence, len(rawArray))
+	for i, rawOcc := range rawArray {
+		var occ scip.Occurrence
+		if err := unmarshaler.Unmarshal(rawOcc, &occ); err != nil {
+			return errors.Wrapf(err, "failed to unmarshal occurrence at index %d", i)
+		}
+		c.Occurrences[i] = &occ
 	}
-	c.Occurrences = tmpDoc.Occurrences
+
 	return nil
 }
 
 func (c *Converter) insertChunk(chunk Chunk, docID int64, chunkIndex int) (chunkID int64, err error) {
-	compressedOccurrences, err := chunk.toDBFormat(c.zstdWriter)
+	jsonOccurrences, err := chunk.toDBFormat(c.zstdWriter)
 	if err != nil {
 		return 0, errors.Wrap(err, "failed to serialize chunk")
 	}
@@ -513,7 +527,7 @@ func (c *Converter) insertChunk(chunk Chunk, docID int64, chunkIndex int) (chunkID int64, err error) {
 	chunkStmt.BindInt64(2, int64(chunkIndex))
 	chunkStmt.BindInt64(3, int64(chunk.StartLine))
 	chunkStmt.BindInt64(4, int64(chunk.EndLine))
-	chunkStmt.BindBytes(5, compressedOccurrences)
+	chunkStmt.BindText(5, jsonOccurrences)
 
 	_, err = chunkStmt.Step()
 	if err != nil {
diff --git a/cmd/scip/convert_test.go b/cmd/scip/convert_test.go
index ede258ef..df423955 100644
--- a/cmd/scip/convert_test.go
+++ b/cmd/scip/convert_test.go
@@ -1,7 +1,6 @@
 package main
 
 import (
-	"bytes"
 	"cmp"
 	"path/filepath"
 	"testing"
@@ -122,17 +121,14 @@ func checkSymbols(t *testing.T, index *scip.Index, db *sqlite.Conn) {
 }
 
 func checkOccurrences(t *testing.T, index *scip.Index, db *sqlite.Conn) {
-	zstdReader, err := zstd.NewReader(bytes.NewBuffer(nil))
-	require.NoError(t, err)
-
 	query := `SELECT d.relative_path, occurrences
 		FROM documents d
 		JOIN chunks c ON c.document_id = d.id`
 
 	dbOccurrences := []occurrenceData{}
-	err = sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
+	err := sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
 		ResultFunc: func(stmt *sqlite.Stmt) error {
 			var c Chunk
-			err = c.fromDBFormat(stmt.ColumnReader(1), zstdReader)
+			err := c.fromDBFormat(stmt.ColumnText(1))
 			require.NoError(t, err)
 			for _, occ := range c.Occurrences {
 				dbOccurrences = append(dbOccurrences, occurrenceData{
@@ -183,36 +179,31 @@ func checkQuerySymbolAtPosition(t *testing.T, index *scip.Index, db *sqlite.Conn
 	targetChar := int32(3)
 	targetDoc := "a.go"
 
-	// This query attempts to use JSON operators on the occurrences column
-	// which should fail since it's currently a compressed binary blob
+	// This query uses -> / ->> operators on the occurrences column to find the symbol at the specified position
+	// The occurrences column now contains a direct JSON array of occurrence objects
+	// Uses -> for object access and ->> for final text extraction
 	query := `
-		SELECT gs.symbol
+		SELECT occ.value ->> 'symbol' as symbol
 		FROM documents d
-		JOIN chunks c ON c.document_id = d.id
-		JOIN global_symbols gs ON gs.id IN (
-			SELECT json_extract(occ.value, '$.symbol_id')
-			FROM json_each(c.occurrences) AS occ
-			WHERE json_extract(occ.value, '$.range[0]') = ?
-			AND json_extract(occ.value, '$.range[1]') = ?
-		)
+		JOIN chunks c ON c.document_id = d.id,
+			json_each(c.occurrences) AS occ
 		WHERE d.relative_path = ?
+		AND json_extract(occ.value -> 'range', '$[0]') = ?
+		AND json_extract(occ.value -> 'range', '$[1]') = ?
 		AND ? BETWEEN c.start_line AND c.end_line
 	`
 
 	var foundSymbol string
 	err := sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
-		Args: []any{targetLine, targetChar, targetDoc, targetLine},
+		Args: []any{targetDoc, targetLine, targetChar, targetLine},
 		ResultFunc: func(stmt *sqlite.Stmt) error {
 			foundSymbol = stmt.ColumnText(0)
 			return nil
 		},
 	})
 
-	// This test should fail because the occurrences column is currently
-	// a compressed binary blob, not JSON. This is a failing test that demonstrates
-	// the need to change the storage format to enable JSON queries.
 	require.NoError(t, err, "Query should succeed once occurrences are stored as JSON")
-	require.Equal(t, expectedSymbol, foundSymbol,
-		"Expected to find symbol %s at position %d:%d in document %s",
+	require.Equal(t, expectedSymbol, foundSymbol,
+		"Expected to find symbol %s at position %d:%d in document %s",
 		expectedSymbol, targetLine, targetChar, targetDoc)
 }

From c5e3169b45f21a6ee0479f09d63511bd0f5a07ad Mon Sep 17 00:00:00 2001
From: name
Date: Thu, 12 Jun 2025 13:12:22 -0700
Subject: [PATCH 3/3] cleanup

---
 cmd/scip/convert.go      | 24 ++++++++----------------
 cmd/scip/convert_test.go | 16 +++++-----------
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/cmd/scip/convert.go b/cmd/scip/convert.go
index ebf81b0c..f8964be9 100644
--- a/cmd/scip/convert.go
+++ b/cmd/scip/convert.go
@@ -1,7 +1,6 @@
 package main
 
 import (
-	"bytes"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -13,7 +12,6 @@ import (
 	"strings"
 
 	"github.com/cockroachdb/errors"
-	"github.com/klauspost/compress/zstd"
 	"github.com/urfave/cli/v2"
 	"google.golang.org/protobuf/encoding/protojson"
 	"zombiezen.com/go/sqlite"
 	"zombiezen.com/go/sqlite/sqlitex"
@@ -109,12 +107,8 @@ func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int,
 		err = errors.CombineErrors(err, db.Close())
 	}()
 
-	writer, err := zstd.NewWriter(bytes.NewBuffer(nil))
-	if err != nil {
-		return errors.Wrap(err, "zstd writer creation")
-	}
 	// Convert the SCIP index to the SQLite database
-	converter := NewConverter(db, chunkSize, writer)
+	converter := NewConverter(db, chunkSize)
 	if err := converter.Convert(index); err != nil {
 		return err
 	}
@@ -232,17 +226,15 @@ func executeAll(conn *sqlite.Conn, statements []string) error {
 
 // Converter handles the conversion from SCIP to SQLite
 type Converter struct {
-	conn       *sqlite.Conn
-	chunkSize  int
-	zstdWriter *zstd.Encoder
+	conn      *sqlite.Conn
+	chunkSize int
 }
 
 // NewConverter creates a new converter instance
-func NewConverter(conn *sqlite.Conn, chunkSize int, writer *zstd.Encoder) *Converter {
+func NewConverter(conn *sqlite.Conn, chunkSize int) *Converter {
 	return &Converter{
-		conn,
-		chunkSize,
-		writer,
+		conn:      conn,
+		chunkSize: chunkSize,
 	}
 }
 
@@ -460,7 +453,7 @@ func (c *Converter) insertGlobalSymbols(symbol *scip.SymbolInformation) (symbolI
 	return symbolID, err
 }
 
-func (c *Chunk) toDBFormat(_ *zstd.Encoder) (string, error) {
+func (c *Chunk) toDBFormat() (string, error) {
 	// Serialize the occurrences slice directly as a JSON array
 	// We'll marshal each occurrence individually using protobuf JSON and combine them
 	marshaler := protojson.MarshalOptions{
 		UseProtoNames:   true,
@@ -511,7 +503,7 @@ func (c *Chunk) fromDBFormat(jsonData string) error {
 }
 
 func (c *Converter) insertChunk(chunk Chunk, docID int64, chunkIndex int) (chunkID int64, err error) {
-	jsonOccurrences, err := chunk.toDBFormat(c.zstdWriter)
+	jsonOccurrences, err := chunk.toDBFormat()
 	if err != nil {
 		return 0, errors.Wrap(err, "failed to serialize chunk")
 	}
diff --git a/cmd/scip/convert_test.go b/cmd/scip/convert_test.go
index df423955..cb2b9857 100644
--- a/cmd/scip/convert_test.go
+++ b/cmd/scip/convert_test.go
@@ -5,7 +5,6 @@ import (
 	"cmp"
 	"path/filepath"
 	"testing"
 
-	"github.com/klauspost/compress/zstd"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/exp/slices"
 	"zombiezen.com/go/sqlite"
@@ -26,9 +25,7 @@ func TestConvert_SmokeTest(t *testing.T) {
 	require.NoError(t, err)
 	defer func() { require.NoError(t, db.Close()) }()
 
-	writer, err := zstd.NewWriter(nil)
-	require.NoError(t, err)
-	converter := NewConverter(db, chunkSizeHint, writer)
+	converter := NewConverter(db, chunkSizeHint)
 	err = converter.Convert(index)
 	require.NoError(t, err)
 
@@ -49,8 +46,9 @@
 	}
 }
 
+const pkg1S1Sym = "scip-go go . . pkg1/S1#"
+
 func testIndex1() *scip.Index {
-	pkg1S1Sym := "scip-go go . . pkg1/S1#"
 	return &scip.Index{
 		Documents: []*scip.Document{
 			{
@@ -174,14 +172,10 @@ type occurrenceData struct {
 func checkQuerySymbolAtPosition(t *testing.T, index *scip.Index, db *sqlite.Conn) {
 	// Query for the symbol at line 10, character 3 in document "a.go"
 	// This should return pkg1S1Sym according to our test data
-	expectedSymbol := "scip-go go . . pkg1/S1#"
 	targetLine := int32(10)
 	targetChar := int32(3)
 	targetDoc := "a.go"
 
-	// This query uses -> / ->> operators on the occurrences column to find the symbol at the specified position
-	// The occurrences column now contains a direct JSON array of occurrence objects
-	// Uses -> for object access and ->> for final text extraction
 	query := `
 		SELECT occ.value ->> 'symbol' as symbol
 		FROM documents d
@@ -203,7 +197,7 @@ func checkQuerySymbolAtPosition(t *testing.T, index *scip.Index, db *sqlite.Conn
 	})
 
 	require.NoError(t, err, "Query should succeed once occurrences are stored as JSON")
-	require.Equal(t, expectedSymbol, foundSymbol,
+	require.Equal(t, pkg1S1Sym, foundSymbol,
 		"Expected to find symbol %s at position %d:%d in document %s",
-		expectedSymbol, targetLine, targetChar, targetDoc)
+		pkg1S1Sym, targetLine, targetChar, targetDoc)
 }
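
Note (not part of the patches above): with this series applied, the position lookup that checkQuerySymbolAtPosition exercises can also be run by hand against a converted database. The sketch below is illustrative only; the database name index.db, the document path a.go, and the position 10:3 are example values mirroring the test data, and it assumes a SQLite build with the -> / ->> JSON operators (3.38+). Run inside `sqlite3 index.db`:

    -- Find the symbol whose occurrence starts at line 10, character 3 in a.go.
    -- json_each() expands the JSON array of occurrences stored for each chunk.
    SELECT occ.value ->> 'symbol'
    FROM documents d
    JOIN chunks c ON c.document_id = d.id,
         json_each(c.occurrences) AS occ
    WHERE d.relative_path = 'a.go'
      AND json_extract(occ.value -> 'range', '$[0]') = 10
      AND json_extract(occ.value -> 'range', '$[1]') = 3
      AND 10 BETWEEN c.start_line AND c.end_line;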