Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bleve fuzziness search (#33078) #33087

Merged
merged 3 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -1482,6 +1482,10 @@ LEVEL = Info
;REPO_INDEXER_EXCLUDE =
;;
;MAX_FILE_SIZE = 1048576
;;
;; Bleve engine has performance problems with fuzzy search, so we limit the fuzziness to 0 by default to disable it.
;; If you'd like to enable it, you can set it to a value between 0 and 2.
;TYPE_BLEVE_MAX_FUZZINESS = 0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Expand Down
3 changes: 1 addition & 2 deletions modules/indexer/code/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,12 @@ func Init() {
for _, indexerData := range items {
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
unhandled = append(unhandled, indexerData)
if !setting.IsInTesting {
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
}
}
}
return unhandled
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
}

indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
Expand Down
4 changes: 3 additions & 1 deletion modules/indexer/code/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import (
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"

_ "code.gitea.io/gitea/models"
_ "code.gitea.io/gitea/models/actions"
Expand Down Expand Up @@ -279,7 +281,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {

func TestBleveIndexAndSearch(t *testing.T) {
unittest.PrepareTestEnv(t)

defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
dir := t.TempDir()

idx := bleve.NewIndexer(dir)
Expand Down
9 changes: 5 additions & 4 deletions modules/indexer/internal/bleve/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"unicode"

"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"

"github.com/blevesearch/bleve/v2"
Expand Down Expand Up @@ -54,9 +55,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}

// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
// may be different on two string and they still be considered equivalent.
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
// may be different on two string, and they still be considered equivalent.
// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
tokens := tokenizer.Tokenize([]byte(s))
Expand Down Expand Up @@ -85,5 +86,5 @@ func guessFuzzinessByKeyword(s string) int {
return 0
}
}
return min(maxFuzziness, len(s)/4)
return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
}
7 changes: 6 additions & 1 deletion modules/indexer/internal/bleve/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@ import (
"fmt"
"testing"

"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"

"github.com/stretchr/testify/assert"
)

func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()

scenarios := []struct {
Input string
Fuzziness int // See util.go for the definition of fuzziness in this particular context
Expand Down Expand Up @@ -46,7 +51,7 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
}

for _, scenario := range scenarios {
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
})
}
Expand Down
3 changes: 3 additions & 0 deletions modules/setting/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ var Indexer = struct {
IncludePatterns []*GlobMatcher
ExcludePatterns []*GlobMatcher
ExcludeVendored bool

TypeBleveMaxFuzzniess int
}{
IssueType: "bleve",
IssuePath: "indexers/issues.bleve",
Expand Down Expand Up @@ -88,6 +90,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) {
Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true)
Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024)
Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second)
Indexer.TypeBleveMaxFuzzniess = sec.Key("TYPE_BLEVE_MAX_FUZZINESS").MustInt(0)
}

// IndexerGlobFromString parses a comma separated list of patterns and returns a glob.Glob slice suited for repo indexing
Expand Down
39 changes: 39 additions & 0 deletions routers/common/codesearch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package common

import (
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/services/context"
)

func PrepareCodeSearch(ctx *context.Context) (ret struct {
Keyword string
Language string
IsFuzzy bool
},
) {
ret.Language = ctx.FormTrim("l")
ret.Keyword = ctx.FormTrim("q")

fuzzyDefault := setting.Indexer.RepoIndexerEnabled
fuzzyAllow := true
if setting.Indexer.RepoType == "bleve" && setting.Indexer.TypeBleveMaxFuzzniess == 0 {
fuzzyDefault = false
fuzzyAllow = false
}
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(fuzzyDefault)
if isFuzzy && !fuzzyAllow {
ctx.Flash.Info("Fuzzy search is disabled by default due to performance reasons")
isFuzzy = false
}

ctx.Data["IsBleveFuzzyDisabled"] = true
ctx.Data["Keyword"] = ret.Keyword
ctx.Data["Language"] = ret.Language
ctx.Data["IsFuzzy"] = isFuzzy

ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
return ret
}
21 changes: 7 additions & 14 deletions routers/web/explore/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/modules/base"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/common"
"code.gitea.io/gitea/services/context"
)

Expand All @@ -32,18 +33,10 @@ func Code(ctx *context.Context) {
ctx.Data["Title"] = ctx.Tr("explore")
ctx.Data["PageIsExplore"] = true
ctx.Data["PageIsExploreCode"] = true

language := ctx.FormTrim("l")
keyword := ctx.FormTrim("q")

isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true

if keyword == "" {
prepareSearch := common.PrepareCodeSearch(ctx)
if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplExploreCode)
return
}
Expand Down Expand Up @@ -80,9 +73,9 @@ func Code(ctx *context.Context) {
if (len(repoIDs) > 0) || isAdmin {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: repoIDs,
Keyword: keyword,
IsKeywordFuzzy: isFuzzy,
Language: language,
Keyword: prepareSearch.Keyword,
IsKeywordFuzzy: prepareSearch.IsFuzzy,
Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
Expand Down Expand Up @@ -138,7 +131,7 @@ func Code(ctx *context.Context) {

pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
pager.AddParamString("l", language)
pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager

ctx.HTML(http.StatusOK, tplExploreCode)
Expand Down
26 changes: 9 additions & 17 deletions routers/web/repo/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"code.gitea.io/gitea/modules/git"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/common"
"code.gitea.io/gitea/services/context"
)

Expand All @@ -29,18 +30,9 @@ func indexSettingToGitGrepPathspecList() (list []string) {

// Search render repository search page
func Search(ctx *context.Context) {
language := ctx.FormTrim("l")
keyword := ctx.FormTrim("q")

isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled

if keyword == "" {
prepareSearch := common.PrepareCodeSearch(ctx)
if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplSearch)
return
}
Expand All @@ -57,9 +49,9 @@ func Search(ctx *context.Context) {
var err error
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: []int64{ctx.Repo.Repository.ID},
Keyword: keyword,
IsKeywordFuzzy: isFuzzy,
Language: language,
Keyword: prepareSearch.Keyword,
IsKeywordFuzzy: prepareSearch.IsFuzzy,
Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
Expand All @@ -75,9 +67,9 @@ func Search(ctx *context.Context) {
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
}
} else {
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{
ContextLineNumber: 1,
IsFuzzy: isFuzzy,
IsFuzzy: prepareSearch.IsFuzzy,
RefName: git.RefNameFromBranch(ctx.Repo.BranchName).String(), // BranchName should be default branch or the first existing branch
PathspecList: indexSettingToGitGrepPathspecList(),
})
Expand Down Expand Up @@ -109,7 +101,7 @@ func Search(ctx *context.Context) {

pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
pager.AddParamString("l", language)
pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager

ctx.HTML(http.StatusOK, tplSearch)
Expand Down
22 changes: 7 additions & 15 deletions routers/web/user/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/modules/base"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/common"
shared_user "code.gitea.io/gitea/routers/web/shared/user"
"code.gitea.io/gitea/services/context"
)
Expand All @@ -34,20 +35,11 @@ func CodeSearch(ctx *context.Context) {
}

ctx.Data["IsPackageEnabled"] = setting.Packages.Enabled
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
ctx.Data["Title"] = ctx.Tr("explore.code")

language := ctx.FormTrim("l")
keyword := ctx.FormTrim("q")

isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["IsCodePage"] = true

if keyword == "" {
prepareSearch := common.PrepareCodeSearch(ctx)
if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplUserCode)
return
}
Expand Down Expand Up @@ -77,9 +69,9 @@ func CodeSearch(ctx *context.Context) {
if len(repoIDs) > 0 {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: repoIDs,
Keyword: keyword,
IsKeywordFuzzy: isFuzzy,
Language: language,
Keyword: prepareSearch.Keyword,
IsKeywordFuzzy: prepareSearch.IsFuzzy,
Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
Expand Down Expand Up @@ -122,7 +114,7 @@ func CodeSearch(ctx *context.Context) {

pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
pager.AddParamString("l", language)
pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager

ctx.HTML(http.StatusOK, tplUserCode)
Expand Down
3 changes: 2 additions & 1 deletion templates/shared/search/code/search.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
{{template "shared/search/combo_fuzzy" dict "Value" .Keyword "Disabled" .CodeIndexerUnavailable "IsFuzzy" .IsFuzzy "Placeholder" (ctx.Locale.Tr "search.code_kind")}}
</form>
<div class="divider"></div>
<div class="ui user list">
<div class="ui list">
{{template "base/alert" .}}
{{if .CodeIndexerUnavailable}}
<div class="ui error message">
<p>{{ctx.Locale.Tr "search.code_search_unavailable"}}</p>
Expand Down
Loading