Skip to content

Commit

Permalink
Merge branch 'feat/update-jieba-dependency' into 'main' (merge request !58)
Browse files Browse the repository at this point in the history

feat/update-jieba-dependency
feat: update encoder
  • Loading branch information
rogersqsliu committed Nov 13, 2024
2 parents bc5b6ec + 125c7ef commit c31e01e
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 197 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog


## v1.4.5
* 将依赖 cgo 的分词包更换为纯 Go 实现的分词包,以更好地支持跨平台编译

## v1.4.4
* 新增/index/add接口实现

Expand Down
17 changes: 15 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
module github.com/tencent/vectordatabase-sdk-go

go 1.15
go 1.17

require (
github.com/go-ego/gse v0.80.3
github.com/pkg/errors v0.9.1
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72
github.com/tencentyun/cos-go-sdk-v5 v0.7.57
github.com/yanyiwu/gojieba v1.4.3
google.golang.org/grpc v1.48.0
google.golang.org/protobuf v1.31.0
)

require (
github.com/clbanning/mxj v1.8.4 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/go-querystring v1.0.0 // indirect
github.com/mitchellh/mapstructure v1.4.3 // indirect
github.com/mozillazg/go-httpheader v0.2.1 // indirect
github.com/vcaesar/cedar v0.20.2 // indirect
golang.org/x/net v0.0.0-20201021035429-f5854403a974 // indirect
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4 // indirect
golang.org/x/text v0.3.3 // indirect
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 // indirect
)
8 changes: 6 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.m
github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-ego/gse v0.80.3 h1:YNFkjMhlhQnUeuoFcUEd1ivh6SOB764rT8GDsEbDiEg=
github.com/go-ego/gse v0.80.3/go.mod h1:Gt3A9Ry1Eso2Kza4MRaiZ7f2DTAvActmETY46Lxg0gU=
github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
Expand Down Expand Up @@ -72,8 +74,10 @@ github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common v1.0.563/go.mod
github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/kms v1.0.563/go.mod h1:uom4Nvi9W+Qkom0exYiJ9VWJjXwyxtPYTkKkaLMlfE0=
github.com/tencentyun/cos-go-sdk-v5 v0.7.57 h1:V3L7ae7KcsjDEGJcL16DAgkzOBQldHgrt3ZA0THsoBI=
github.com/tencentyun/cos-go-sdk-v5 v0.7.57/go.mod h1:8+hG+mQMuRP/OIS9d83syAvXvrMj9HhkND6Q1fLghw0=
github.com/yanyiwu/gojieba v1.4.3 h1:nYLKSOCq3ZIClNwf8ZloSPfHobDw3Jt0CJihSCu54tQ=
github.com/yanyiwu/gojieba v1.4.3/go.mod h1:54wkP7sMJ6bklf7yPl6F+JG71dzVUU1WigZbR47nGdY=
github.com/vcaesar/cedar v0.20.2 h1:TDx7AdZhilKcfE1WvdToTJf5VrC/FXcUOW+KY1upLZ4=
github.com/vcaesar/cedar v0.20.2/go.mod h1:lyuGvALuZZDPNXwpzv/9LyxW+8Y6faN7zauFezNsnik=
github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4=
github.com/vcaesar/tt v0.20.1/go.mod h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU=
go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
Expand Down
6 changes: 2 additions & 4 deletions tcvdbtext/encoder/bm25_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package encoder
import (
"encoding/json"
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
Expand Down Expand Up @@ -114,7 +113,7 @@ func (bm25 *BM25Encoder) SetParams(paramsFileLoadPath string) error {
if !tcvdbtext.FileExists(paramsFileLoadPath) {
return fmt.Errorf("the filepath %v doesn't exist", paramsFileLoadPath)
}
data, err := ioutil.ReadFile(paramsFileLoadPath)
data, err := os.ReadFile(paramsFileLoadPath)
if err != nil {
return fmt.Errorf("cannot read file: %v", err)
}
Expand All @@ -133,7 +132,6 @@ func (bm25 *BM25Encoder) SetParams(paramsFileLoadPath string) error {
ForSearch: bm25ParamsByFile.ForSearch,
CutAll: bm25ParamsByFile.CutAll,
Hmm: bm25ParamsByFile.Hmm,
LowerCase: bm25ParamsByFile.LowerCase,

UserDictFilePath: bm25ParamsByFile.UserDictFilePath,
StopWords: bm25ParamsByFile.StopWords,
Expand All @@ -156,7 +154,7 @@ func (bm25 *BM25Encoder) DownloadParams(paramsFileDownloadPath string) error {
return fmt.Errorf("download bm25 params failed because marshal params failed. err: %v", err.Error())
}

err = ioutil.WriteFile(paramsFileDownloadPath, jsonData, os.ModePerm)
err = os.WriteFile(paramsFileDownloadPath, jsonData, os.ModePerm)
if err != nil {
return fmt.Errorf("download bm25 params failed because write file failed. err: %v", err.Error())
}
Expand Down
12 changes: 6 additions & 6 deletions tcvdbtext/encoder/bm25_encoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,12 @@ func Test_BM25Encoder_fitAndLoad(t *testing.T) {
fmt.Println("fit with your own corpus")

bm25.FitCorpus([]string{
"腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。",
"作为专门为处理输入向量查询而设计的数据库,它支持多种索引类型和相似度计算方法,单索引支持10亿级向量规模,百万级 QPS 及毫秒级查询延迟。",
"不仅能为大模型提供外部知识库,提高大模型回答的准确性,还可广泛应用于推荐系统、NLP 服务、计算机视觉、智能客服等 AI 领域。",
"腾讯云向量数据库(Tencent Cloud VectorDB)作为一种专门存储和检索向量数据的服务提供给用户, 在高性能、高可用、大规模、低成本、简单易用、稳定可靠等方面体现出显著优势。",
"腾讯云向量数据库可以和大语言模型 LLM 配合使用。企业的私域数据在经过文本分割、向量化后,可以存储在腾讯云向量数据库中,构建起企业专属的外部知识库,从而在后续的检索任务中,为大模型提供提示信息,辅助大模型生成更加准确的答案。",
"腾讯云数据库托管机房分布在全球多个位置,这些位置节点称为地域(Region),每个地域又由多个可用区(Zone)构成。每个地域(Region)都是一个独立的地理区域。每个地域内都有多个相互隔离的位置,称为可用区(Zone)。每个可用区都是独立的,但同一地域下的可用区通过低时延的内网链路相连。腾讯云支持用户在不同位置分配云资源,建议用户在设计系统时考虑将资源放置在不同可用区以屏蔽单点故障导致的服务不可用状态。",
"腾讯云向量数据库(tencent cloud vectordb)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。",
"作为专门为处理输入向量查询而设计的数据库,它支持多种索引类型和相似度计算方法,单索引支持10亿级向量规模,高达百万级 qps 及毫秒级查询延迟。",
"不仅能为大模型提供外部知识库,提高大模型回答的准确性,还可广泛应用于推荐系统、nlp 服务、计算机视觉、智能客服等 AI 领域。",
"腾讯云向量数据库(tencent cloud vectordb)作为一种专门存储和检索向量数据的服务提供给用户, 在高性能、高可用、大规模、低成本、简单易用、稳定可靠等方面体现出显著优势。 ",
"腾讯云向量数据库可以和大语言模型 llm 配合使用。企业的私域数据在经过文本分割、向量化后,可以存储在腾讯云向量数据库中,构建起企业专属的外部知识库,从而在后续的检索任务中,为大模型提供提示信息,辅助大模型生成更加准确的答案。",
"腾讯云数据库托管机房分布在全球多个位置,这些位置节点称为地域(region),每个地域又由多个可用区(zone)构成。每个地域(region)都是一个独立的地理区域。每个地域内都有多个相互隔离的位置,称为可用区(zone)。每个可用区都是独立的,但同一地域下的可用区通过低时延的内网链路相连。腾讯云支持用户在不同位置分配云资源,建议用户在设计系统时考虑将资源放置在不同可用区以屏蔽单点故障导致的服务不可用状态。",
})
fmt.Println("download bm25 params")
bm25.DownloadParams("./bm25_params.json")
Expand Down
Loading

0 comments on commit c31e01e

Please sign in to comment.