Skip to content

feat: Store JSON occurrences in sqlite #321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 49 additions & 43 deletions cmd/scip/convert.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package main

import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
Expand All @@ -12,9 +12,8 @@ import (
"strings"

"github.com/cockroachdb/errors"
"github.com/klauspost/compress/zstd"
"github.com/urfave/cli/v2"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/encoding/protojson"
"zombiezen.com/go/sqlite"
"zombiezen.com/go/sqlite/sqlitex"

Expand All @@ -36,7 +35,7 @@ func convertCommand() cli.Command {
For inspecting the data, use the SQLite CLI.
For inspecting the schema, use .schema.

Occurrences are stored opaquely as a blob to prevent the DB size from growing very quickly.`,
Occurrences are stored as a JSON array of serialized Occurrence messages.`,
Flags: []cli.Flag{
&cli.StringFlag{
Name: "output",
Expand Down Expand Up @@ -67,7 +66,7 @@ Occurrences are stored opaquely as a blob to prevent the DB size from growing ve
return command
}

func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int, out io.Writer) (err error) {
func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int, _ io.Writer) (err error) {
index, err := readFromOption(indexPath)
if err != nil {
return err
Expand Down Expand Up @@ -108,12 +107,8 @@ func convertMain(indexPath, sqliteDBPath, cpuProfilePath string, chunkSize int,
err = errors.CombineErrors(err, db.Close())
}()

writer, err := zstd.NewWriter(bytes.NewBuffer(nil))
if err != nil {
return errors.Wrap(err, "zstd writer creation")
}
// Convert the SCIP index to the SQLite database
converter := NewConverter(db, chunkSize, writer)
converter := NewConverter(db, chunkSize)
if err := converter.Convert(index); err != nil {
return err
}
Expand Down Expand Up @@ -182,7 +177,7 @@ func createSQLiteDatabase(path string) (conn *sqlite.Conn, err error) {
chunk_index INTEGER NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
occurrences BLOB NOT NULL,
occurrences TEXT NOT NULL,
FOREIGN KEY (document_id) REFERENCES documents(id)
);`,
`CREATE TABLE global_symbols (
Expand Down Expand Up @@ -231,17 +226,15 @@ func executeAll(conn *sqlite.Conn, statements []string) error {

// Converter handles the conversion from SCIP to SQLite
type Converter struct {
conn *sqlite.Conn
chunkSize int
zstdWriter *zstd.Encoder
conn *sqlite.Conn
chunkSize int
}

// NewConverter creates a new converter instance
func NewConverter(conn *sqlite.Conn, chunkSize int, writer *zstd.Encoder) *Converter {
func NewConverter(conn *sqlite.Conn, chunkSize int) *Converter {
return &Converter{
conn,
chunkSize,
writer,
conn: conn,
chunkSize: chunkSize,
}
}

Expand Down Expand Up @@ -460,44 +453,57 @@ func (c *Converter) insertGlobalSymbols(symbol *scip.SymbolInformation) (symbolI
return symbolID, err
}

func (c *Chunk) toDBFormat(encoder *zstd.Encoder) ([]byte, error) {
occurrencesBlob, err := proto.Marshal(&scip.Document{
Occurrences: c.Occurrences,
})
if err != nil {
return nil, errors.Wrap(err, "failed to serialize occurrences")
func (c *Chunk) toDBFormat() (string, error) {
// Serialize the occurrences slice directly as a JSON array
// We'll marshal each occurrence individually using protobuf JSON and combine them
marshaler := protojson.MarshalOptions{
UseProtoNames: true,
EmitUnpopulated: false,
}

var buf bytes.Buffer
encoder.Reset(&buf)
if _, err = encoder.Write(occurrencesBlob); err != nil {
return nil, errors.Wrap(err, "compression error")
if len(c.Occurrences) == 0 {
return "[]", nil
}
if err = encoder.Close(); err != nil {
return nil, errors.Wrap(err, "flushing encoder")

var occurrenceJSONs []string
for _, occ := range c.Occurrences {
jsonData, err := marshaler.Marshal(occ)
if err != nil {
return "", errors.Wrap(err, "failed to serialize occurrence as protobuf JSON")
}
occurrenceJSONs = append(occurrenceJSONs, string(jsonData))
}
return buf.Bytes(), nil

// Combine into a JSON array
return "[" + strings.Join(occurrenceJSONs, ",") + "]", nil
}

func (c *Chunk) fromDBFormat(reader *bytes.Reader, decoder *zstd.Decoder) error {
if err := decoder.Reset(reader); err != nil {
return errors.Wrap(err, "resetting zstd Decoder")
func (c *Chunk) fromDBFormat(jsonData string) error {
// Parse the JSON array directly
var rawArray []json.RawMessage
if err := json.Unmarshal([]byte(jsonData), &rawArray); err != nil {
return errors.Wrap(err, "failed to parse JSON array")
}
protoBytes, err := io.ReadAll(decoder)
if err != nil {
return errors.Wrap(err, "reading compressed data")

// Use protobuf JSON unmarshaling for each occurrence
unmarshaler := protojson.UnmarshalOptions{
DiscardUnknown: true,
}

var tmpDoc scip.Document
if err = proto.Unmarshal(protoBytes, &tmpDoc); err != nil {
return errors.Wrap(err, "failed to unmarshal occurrences")
c.Occurrences = make([]*scip.Occurrence, len(rawArray))
for i, rawOcc := range rawArray {
var occ scip.Occurrence
if err := unmarshaler.Unmarshal(rawOcc, &occ); err != nil {
return errors.Wrapf(err, "failed to unmarshal occurrence at index %d", i)
}
c.Occurrences[i] = &occ
}
c.Occurrences = tmpDoc.Occurrences

return nil
}

func (c *Converter) insertChunk(chunk Chunk, docID int64, chunkIndex int) (chunkID int64, err error) {
compressedOccurrences, err := chunk.toDBFormat(c.zstdWriter)
jsonOccurrences, err := chunk.toDBFormat()
if err != nil {
return 0, errors.Wrap(err, "failed to serialize chunk")
}
Expand All @@ -513,7 +519,7 @@ func (c *Converter) insertChunk(chunk Chunk, docID int64, chunkIndex int) (chunk
chunkStmt.BindInt64(2, int64(chunkIndex))
chunkStmt.BindInt64(3, int64(chunk.StartLine))
chunkStmt.BindInt64(4, int64(chunk.EndLine))
chunkStmt.BindBytes(5, compressedOccurrences)
chunkStmt.BindText(5, jsonOccurrences)

_, err = chunkStmt.Step()
if err != nil {
Expand Down
50 changes: 39 additions & 11 deletions cmd/scip/convert_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package main

import (
"bytes"
"cmp"
"path/filepath"
"testing"

"github.com/klauspost/compress/zstd"
"github.com/stretchr/testify/require"
"golang.org/x/exp/slices"
"zombiezen.com/go/sqlite"
Expand All @@ -27,9 +25,7 @@ func TestConvert_SmokeTest(t *testing.T) {
require.NoError(t, err)
defer func() { require.NoError(t, db.Close()) }()

writer, err := zstd.NewWriter(nil)
require.NoError(t, err)
converter := NewConverter(db, chunkSizeHint, writer)
converter := NewConverter(db, chunkSizeHint)
err = converter.Convert(index)
require.NoError(t, err)

Expand All @@ -40,6 +36,7 @@ func TestConvert_SmokeTest(t *testing.T) {
{"documents", checkDocuments},
{"symbols", checkSymbols},
{"occurrences", checkOccurrences},
{"query_symbol_at_position", checkQuerySymbolAtPosition},
}

for _, check := range checks {
Expand All @@ -49,8 +46,9 @@ func TestConvert_SmokeTest(t *testing.T) {
}
}

const pkg1S1Sym = "scip-go go . . pkg1/S1#"

func testIndex1() *scip.Index {
pkg1S1Sym := "scip-go go . . pkg1/S1#"
return &scip.Index{
Documents: []*scip.Document{
{
Expand Down Expand Up @@ -121,17 +119,14 @@ func checkSymbols(t *testing.T, index *scip.Index, db *sqlite.Conn) {
}

func checkOccurrences(t *testing.T, index *scip.Index, db *sqlite.Conn) {
zstdReader, err := zstd.NewReader(bytes.NewBuffer(nil))
require.NoError(t, err)

query := `SELECT d.relative_path, occurrences
FROM documents d
JOIN chunks c ON c.document_id = d.id`
dbOccurrences := []occurrenceData{}
err = sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
err := sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
ResultFunc: func(stmt *sqlite.Stmt) error {
var c Chunk
err = c.fromDBFormat(stmt.ColumnReader(1), zstdReader)
err := c.fromDBFormat(stmt.ColumnText(1))
require.NoError(t, err)
for _, occ := range c.Occurrences {
dbOccurrences = append(dbOccurrences, occurrenceData{
Expand Down Expand Up @@ -173,3 +168,36 @@ type occurrenceData struct {
Role int32
Range scip.Range
}

// checkQuerySymbolAtPosition verifies that a symbol can be resolved from a
// (document, line, character) position by querying the JSON-encoded
// occurrences column with SQLite's JSON operators — the motivating use
// case for storing occurrences as JSON rather than an opaque blob.
func checkQuerySymbolAtPosition(t *testing.T, index *scip.Index, db *sqlite.Conn) {
	// Query for the symbol at line 10, character 3 in document "a.go"
	// This should return pkg1S1Sym according to our test data
	targetLine := int32(10)
	targetChar := int32(3)
	targetDoc := "a.go"

	// occ.value is one occurrence object produced by expanding the chunk's
	// occurrences array with json_each(); ->> extracts 'symbol' as text,
	// while -> keeps 'range' as JSON so its elements can be indexed.
	// NOTE(review): this matches range[0]/range[1] exactly, i.e. it assumes
	// the occurrence range starts at the queried position — confirm that is
	// the intended lookup semantics (vs. position containment).
	query := `
	SELECT occ.value ->> 'symbol' as symbol
	FROM documents d
	JOIN chunks c ON c.document_id = d.id,
		json_each(c.occurrences) AS occ
	WHERE d.relative_path = ?
		AND json_extract(occ.value -> 'range', '$[0]') = ?
		AND json_extract(occ.value -> 'range', '$[1]') = ?
		AND ? BETWEEN c.start_line AND c.end_line
	`

	var foundSymbol string
	err := sqlitex.ExecuteTransient(db, query, &sqlitex.ExecOptions{
		Args: []any{targetDoc, targetLine, targetChar, targetLine},
		ResultFunc: func(stmt *sqlite.Stmt) error {
			// Capture the matching row's symbol (test data yields one match;
			// if multiple rows matched, the last one would win).
			foundSymbol = stmt.ColumnText(0)
			return nil
		},
	})

	require.NoError(t, err, "Query should succeed once occurrences are stored as JSON")
	require.Equal(t, pkg1S1Sym, foundSymbol,
		"Expected to find symbol %s at position %d:%d in document %s",
		pkg1S1Sym, targetLine, targetChar, targetDoc)
}