Skip to content

Commit

Permalink
Add searcher implementation (#6432)
Browse files Browse the repository at this point in the history
  • Loading branch information
tylerwilliams authored Apr 24, 2024
1 parent 41da461 commit 33148df
Show file tree
Hide file tree
Showing 6 changed files with 347 additions and 2 deletions.
17 changes: 17 additions & 0 deletions codesearch/searcher/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Bazel build definition for the codesearch searcher package: a single Go
# library target built from searcher.go.
load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "searcher",
srcs = ["searcher.go"],
importpath = "github.com/buildbuddy-io/buildbuddy/codesearch/searcher",
visibility = ["//visibility:public"],
deps = [
"//codesearch/query",
"//codesearch/result",
"//codesearch/types",
"//server/util/log",
"//server/util/status",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@org_golang_x_sync//errgroup",
],
)
302 changes: 302 additions & 0 deletions codesearch/searcher/searcher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
package searcher

import (
"bufio"
"bytes"
"fmt"
"regexp"
"regexp/syntax"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"

"github.com/buildbuddy-io/buildbuddy/codesearch/query"
"github.com/buildbuddy-io/buildbuddy/codesearch/result"
"github.com/buildbuddy-io/buildbuddy/codesearch/types"
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/go-enry/go-enry/v2"
"golang.org/x/sync/errgroup"
)

// Searcher runs a textual query and returns the matching results.
type Searcher interface {
// Search evaluates the query string q and returns the resulting matches or
// an error. The CodeSearcher implementation keeps at most numResults of
// the highest-scoring documents.
Search(q string, numResults int) ([]*result.Result, error)
}

// CodeSearcher is the Searcher implementation in this package. It resolves
// queries against a types.IndexReader.
type CodeSearcher struct {
// indexReader is used to run the raw s-expression query and to load
// stored documents by ID.
indexReader types.IndexReader
// log is the logger used for query and timing messages.
log log.Logger
}

// New returns a Searcher that answers queries using the given index reader.
// Messages logged by the returned searcher go to a sub-logger named
// "searcher".
func New(ir types.IndexReader) Searcher {
	return &CodeSearcher{
		indexReader: ir,
		log:         log.NamedSubLogger("searcher"),
	}
}

// TODO: remove these one-off helpers.

// nl is the newline separator used when counting lines.
var nl = []byte{'\n'}

// countNL reports how many newline bytes b contains.
func countNL(b []byte) int {
	return bytes.Count(b, nl)
}

// extractLine returns the contents of the 1-based line lineNumber in buf,
// without its line terminator ("\n" or "\r\n"). It returns nil when buf has
// no such line (including lineNumber < 1).
//
// The returned slice is backed by the scanner's internal buffer, not by buf.
func extractLine(buf []byte, lineNumber int) []byte {
	if lineNumber < 1 {
		return nil
	}
	s := bufio.NewScanner(bytes.NewReader(buf))
	// The default Scanner limit is 64 KiB per token; a longer line (e.g. in a
	// minified file) would make Scan fail and hide that line and every line
	// after it. Allow a single line to be as large as the whole input.
	s.Buffer(nil, len(buf)+1)
	currentLine := 0
	for s.Scan() {
		currentLine++
		if currentLine == lineNumber {
			return s.Bytes()
		}
	}
	// Reading from an in-memory reader with an unbounded token size cannot
	// fail, so reaching this point means the line does not exist.
	return nil
}

// region describes a single regexp match inside a buffer.
// TODO: formalize this?
type region struct {
	startOffset int // byte offset in the buffer where the match begins
	endOffset   int // byte offset in the buffer just past the end of the match
	lineNumber  int // 1-based number of the line on which the match begins
}

// reScorer locates matches of a compiled regexp in document fields. It is
// used both to score documents and to build result snippets.
type reScorer struct {
	re *regexp.Regexp
}

// match returns one region for every match of s.re in buf, in the order the
// matches occur. The result is empty (length 0) when there are no matches.
func (s *reScorer) match(buf []byte) []region {
	matchIndexes := s.re.FindAllIndex(buf, -1)
	results := make([]region, len(matchIndexes))

	// FindAllIndex reports successive matches, so start offsets never
	// decrease. Count newlines only in the gap since the previous match
	// rather than rescanning from the start of buf for every match.
	newline := []byte{'\n'}
	line, scanned := 1, 0
	for i, pair := range matchIndexes {
		line += bytes.Count(buf[scanned:pair[0]], newline)
		scanned = pair[0]
		results[i] = region{
			startOffset: pair[0],
			endOffset:   pair[1],
			lineNumber:  line,
		}
	}
	return results
}

// Score returns the sum of the per-field scores of doc. Fields with empty
// contents contribute nothing. For every other field the score is a
// BM25-style saturation of the number of regexp matches, using the field's
// whitespace-separated word count as the length term and the k1/b constants
// from bm25Params.
func (s *reScorer) Score(doc types.Document) float64 {
	var total float64
	for _, name := range doc.Fields() {
		field := doc.Field(name)
		contents := field.Contents()
		if len(contents) == 0 {
			continue
		}
		// Term frequency: how many times the query regexp matched.
		matchCount := float64(len(s.match(contents)))
		// Field length, measured in whitespace-separated words.
		wordCount := float64(len(strings.Fields(string(contents))))
		k1, b := bm25Params(field.Name())
		total += (matchCount * (k1 + 1)) / (matchCount + k1*(1-b+b*wordCount))
	}
	return total
}

// makeResults builds one result per document, in the same order as docs.
// The "filename" field's contents become the result's Filename. Every regexp
// match in any field increments MatchCount and appends a snippet of the form
// "<line number>: <line text>\n".
func (s *reScorer) makeResults(docs []types.Document) []*result.Result {
	out := make([]*result.Result, 0, len(docs))
	for _, doc := range docs {
		res := new(result.Result)
		for _, name := range doc.Fields() {
			field := doc.Field(name)
			contents := field.Contents()
			if field.Name() == "filename" {
				res.Filename = string(contents)
			}
			for _, m := range s.match(contents) {
				res.MatchCount++
				text := extractLine(contents, m.lineNumber)
				snippet := fmt.Sprintf("%d: %s\n", m.lineNumber, text)
				res.Snippets = append(res.Snippets, []byte(snippet))
			}
		}
		out = append(out, res)
	}
	return out
}

// Matchers for the query atoms understood by parse. They are compiled once
// at package scope rather than on every query.
var (
	// caseMatcher matches `case:yes` or `case:y`.
	caseMatcher = regexp.MustCompile(`case:(yes|y)`)
	// fileMatcher matches `file:test.js`, `f:test.js`, and `path:test.js`.
	fileMatcher = regexp.MustCompile(`(?:file:|f:|path:)(?P<filepath>[[:graph:]]+)`)
	// langMatcher matches `lang:go`, `lang:java`, etc.
	// The list of supported languages (and their aliases) is here:
	// https://github.com/github-linguist/linguist/blob/master/lib/linguist/languages.yml
	langMatcher = regexp.MustCompile(`(?:lang:)(?P<lang>[[:graph:]]+)`)
)

// parse turns a raw user query into a scorer and an s-expression index query.
//
// Recognized atoms are removed from q before the remainder is treated as a
// regular expression:
//   - `case:yes` / `case:y` makes the match case-sensitive (the default is
//     case-insensitive),
//   - `file:`, `f:` or `path:` followed by a regexp restricts the "filename"
//     field,
//   - `lang:` followed by a language name or alias restricts the "lang" field.
//
// It returns the scorer built from the remaining regexp, the serialized
// s-expression query, and an error if the file or main regexp does not parse
// or the language is unknown.
func (c *CodeSearcher) parse(q string) (*reScorer, []byte, error) {
	c.log.Infof("raw query: [%s]", q)

	// S-expression clauses that must all be satisfied in addition to the
	// main regexp query; they are ANDed onto it below.
	var requiredSClauses []string
	regexOpts := []string{
		"(?m)", // always use multiline mode.
	}

	if caseMatcher.MatchString(q) {
		q = caseMatcher.ReplaceAllString(q, "")
	} else {
		// otherwise default to case-insensitive
		regexOpts = append(regexOpts, "(?i)")
	}

	if fileMatch := fileMatcher.FindStringSubmatch(q); len(fileMatch) == 2 {
		q = fileMatcher.ReplaceAllString(q, "")
		syn, err := syntax.Parse(fileMatch[1], syntax.Perl)
		if err != nil {
			return nil, nil, err
		}
		subQ := query.RegexpQuery(syn).SQuery("filename")
		requiredSClauses = append(requiredSClauses, subQ)
	}

	if langMatch := langMatcher.FindStringSubmatch(q); len(langMatch) == 2 {
		q = langMatcher.ReplaceAllString(q, "")
		lang, ok := enry.GetLanguageByAlias(langMatch[1])
		if !ok {
			return nil, nil, status.InvalidArgumentErrorf("unknown lang %q", langMatch[1])
		}
		subQ := fmt.Sprintf("(:eq lang %s)", strconv.Quote(strings.ToLower(lang)))
		requiredSClauses = append(requiredSClauses, subQ)
	}

	q = strings.TrimSpace(q)
	q = strings.Join(regexOpts, "") + q
	c.log.Infof("parsed query: [%s]", q)

	syn, err := syntax.Parse(q, syntax.Perl)
	if err != nil {
		return nil, nil, err
	}

	// Annoyingly, we have to compile the regexp in the normal way too: the
	// syntax tree feeds the index query, the compiled regexp feeds the scorer.
	re, err := regexp.Compile(q)
	if err != nil {
		return nil, nil, err
	}

	scorer := &reScorer{re}
	squery := query.RegexpQuery(syn).SQuery(types.AllFields)

	if len(requiredSClauses) > 0 {
		clauses := strings.Join(requiredSClauses, " ")
		// Keep a separator between the main query and the required clauses so
		// every element of the :and list is delimited the same way.
		squery = "(:and " + squery + " " + clauses + ")"
	}
	c.log.Infof("squery: %q", squery)
	return scorer, []byte(squery), nil
}

// bm25Params returns the BM25 tuning constants (k1, b) used when scoring the
// named field. The "filename" field has its own pair; every other field
// shares the default pair.
func bm25Params(fieldName string) (k1 float64, b float64) {
	if fieldName == "filename" {
		return 1.2, 0.8
	}
	return 1.4, 0.9
}

// retrieveDocs loads the stored document for every ID in candidateDocIDs and
// returns them in the same order as the IDs. Loads run concurrently, with at
// most GOMAXPROCS in flight. If any load fails, no documents are returned and
// the error reported by the group is returned instead.
func (c *CodeSearcher) retrieveDocs(candidateDocIDs []uint64) ([]types.Document, error) {
	begin := time.Now()

	var eg errgroup.Group
	eg.SetLimit(runtime.GOMAXPROCS(0))

	fetched := make([]types.Document, len(candidateDocIDs))
	for idx := range candidateDocIDs {
		// Per-iteration copies so each goroutine sees its own values.
		idx, id := idx, candidateDocIDs[idx]
		eg.Go(func() error {
			doc, err := c.indexReader.GetStoredDocument(id)
			if err != nil {
				return err
			}
			// Each goroutine writes a distinct slot, so no locking is needed.
			fetched[idx] = doc
			return nil
		})
	}
	if err := eg.Wait(); err != nil {
		return nil, err
	}

	c.log.Infof("Fetching docs took %s", time.Since(begin))
	return fetched, nil
}

// scoreDocs scores every candidate document with scorer and returns the IDs
// of the numResults highest-scoring ones, best first.
//
// Only the "filename" and "content" fields are loaded for scoring. Documents
// are scored concurrently, with at most GOMAXPROCS in flight. Scoring is best
// effort: if a document fails to load, the error is logged and that document
// keeps a score of zero. Candidates with equal scores keep their incoming
// relative order, so repeated identical queries rank ties the same way.
//
// candidateDocIDs is sorted in place and the returned slice aliases it. A
// negative numResults is treated as zero. The returned error is currently
// always nil.
func (c *CodeSearcher) scoreDocs(scorer types.Scorer, candidateDocIDs []uint64, numResults int) ([]uint64, error) {
	start := time.Now()
	numCandidateDocIDs := len(candidateDocIDs)
	if numResults < 0 {
		// Guard the truncation below: slicing with a negative bound panics.
		numResults = 0
	}

	scoreMap := make(map[uint64]float64, numCandidateDocIDs)
	var mu sync.Mutex // guards scoreMap

	// TODO(tylerw): use a priority-queue; stop iteration early.
	g := new(errgroup.Group)
	g.SetLimit(runtime.GOMAXPROCS(0))

	for _, docID := range candidateDocIDs {
		docID := docID
		g.Go(func() error {
			doc, err := c.indexReader.GetStoredDocument(docID, "filename", "content")
			if err != nil {
				return err
			}
			score := scorer.Score(doc)
			mu.Lock()
			scoreMap[docID] = score
			mu.Unlock()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		// Best effort: keep ranking the documents that did load.
		log.Errorf("error: %s", err)
	}

	// Stable sort so that ties are ordered deterministically.
	sort.SliceStable(candidateDocIDs, func(i, j int) bool {
		return scoreMap[candidateDocIDs[i]] > scoreMap[candidateDocIDs[j]]
	})

	if len(candidateDocIDs) > numResults {
		candidateDocIDs = candidateDocIDs[:numResults]
	}

	c.log.Infof("Scoring %d docs took %s", numCandidateDocIDs, time.Since(start))
	return candidateDocIDs, nil
}

// Search runs rawQ against the index and returns results for the
// best-scoring documents. The pipeline is: parse the query, fetch candidate
// document IDs from the index, score and truncate them to numResults, load
// the surviving documents, and build results from their matches. The first
// error from any stage is returned as-is.
func (c *CodeSearcher) Search(rawQ string, numResults int) ([]*result.Result, error) {
	began := time.Now()

	scorer, squery, err := c.parse(rawQ)
	if err != nil {
		return nil, err
	}

	docIDs, err := c.indexReader.RawQuery(squery)
	if err != nil {
		return nil, err
	}

	topDocIDs, err := c.scoreDocs(scorer, docIDs, numResults)
	if err != nil {
		return nil, err
	}

	docs, err := c.retrieveDocs(topDocIDs)
	if err != nil {
		return nil, err
	}

	results := scorer.makeResults(docs)
	c.log.Infof("Search took %s", time.Since(began))
	return results, nil
}
12 changes: 10 additions & 2 deletions codesearch/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,19 @@ type IndexWriter interface {
}

type IndexReader interface {
GetStoredFieldValue(docID uint64, field string) ([]byte, error)
GetStoredDocument(docID uint64) (Document, error)
GetStoredDocument(docID uint64, fieldNames ...string) (Document, error)
RawQuery(squery []byte) ([]uint64, error)
}

// Scorer assigns a numeric score to a Document.
type Scorer interface {
// Score returns the score for the given document. The searcher package
// ranks documents with larger scores first.
Score(docs Document) float64
}

// RegionMatch holds a Start/End pair of ints.
// NOTE(review): presumably the offsets delimiting a matched region; the
// units and whether End is inclusive are not visible here — confirm against
// the code that produces these values.
type RegionMatch struct {
Start int
End int
}

type NamedField struct {
ftype FieldType
name string
Expand Down
12 changes: 12 additions & 0 deletions deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,18 @@ def install_go_mod_dependencies(workspace_name = "buildbuddy"):
sum = "h1:abibh5XYBTASawfTQ0rA7dVtQT+6KzpGqb/J+DxRDaw=",
version = "v0.6.3",
)
go_repository(
name = "com_github_go_enry_go_enry_v2",
importpath = "github.com/go-enry/go-enry/v2",
sum = "h1:vbab0pcf5Yo1cHQLzbWZ+QomUh3EfEU8EiR5n7W0lnQ=",
version = "v2.8.7",
)
go_repository(
name = "com_github_go_enry_go_oniguruma",
importpath = "github.com/go-enry/go-oniguruma",
sum = "h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=",
version = "v1.2.1",
)
go_repository(
name = "com_github_go_errors_errors",
importpath = "github.com/go-errors/errors",
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ require (
github.com/dop251/goja v0.0.0-20230626124041-ba8a63e79201
github.com/elastic/gosigar v0.14.2
github.com/firecracker-microvm/firecracker-go-sdk v0.0.0-00010101000000-000000000000
github.com/go-enry/go-enry/v2 v2.8.7
github.com/go-faker/faker/v4 v4.0.0-beta.3
github.com/go-git/go-git/v5 v5.11.0
github.com/go-redis/redis/extra/redisotel/v8 v8.11.5
Expand Down Expand Up @@ -224,6 +225,7 @@ require (
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/getsentry/sentry-go v0.22.0 // indirect
github.com/go-chi/chi/v5 v5.0.7 // indirect
github.com/go-enry/go-oniguruma v1.2.1 // indirect
github.com/go-faster/city v1.0.1 // indirect
github.com/go-faster/errors v0.6.1 // indirect
github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,10 @@ github.com/gliderlabs/ssh v0.3.5/go.mod h1:8XB4KraRrX39qHhT6yxPsHedjA08I/uBVwj4x
github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=
github.com/go-chi/chi/v5 v5.0.7 h1:rDTPXLDHGATaeHvVlLcR4Qe0zftYethFucbjVQ1PxU8=
github.com/go-chi/chi/v5 v5.0.7/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
github.com/go-enry/go-enry/v2 v2.8.7 h1:vbab0pcf5Yo1cHQLzbWZ+QomUh3EfEU8EiR5n7W0lnQ=
github.com/go-enry/go-enry/v2 v2.8.7/go.mod h1:9yrj4ES1YrbNb1Wb7/PWYr2bpaCXUGRt0uafN0ISyG8=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-errors/errors v1.0.1/go.mod h1:f4zRHt4oKfwPJE5k8C9vpYG+aDHdBFUsgrm6/TyX73Q=
github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA=
github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og=
Expand Down

0 comments on commit 33148df

Please sign in to comment.