Skip to content

Commit b48478b

Browse files
committed
Merge branch 'dev'
2 parents 822a2e5 + d956874 commit b48478b

File tree

5 files changed

+97
-15
lines changed

5 files changed

+97
-15
lines changed

core/ranker.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ func (ranker *Ranker) Rank(
6565
outputDocs = append(outputDocs, types.ScoredDocument{
6666
DocId: d.DocId,
6767
Scores: scores,
68-
TokenSnippetPositions: d.TokenSnippetLocations,
69-
TokenPositions: d.TokenLocations})
68+
TokenSnippetLocations: d.TokenSnippetLocations,
69+
TokenLocations: d.TokenLocations})
7070
}
7171
}
7272

engine/engine_test.go

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,15 @@ func TestEngineIndexDocument(t *testing.T) {
7676

7777
utils.Expect(t, "1", outputs.Docs[0].DocId)
7878
utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
79-
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetPositions)
79+
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
8080

8181
utils.Expect(t, "4", outputs.Docs[1].DocId)
8282
utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
83-
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetPositions)
83+
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
8484

8585
utils.Expect(t, "0", outputs.Docs[2].DocId)
8686
utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
87-
utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetPositions)
87+
utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
8888
}
8989

9090
func TestReverseOrder(t *testing.T) {
@@ -246,3 +246,58 @@ func TestRemoveDocument(t *testing.T) {
246246
utils.Expect(t, "0", outputs.Docs[0].DocId)
247247
utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
248248
}
249+
250+
func TestEngineIndexDocumentWithTokens(t *testing.T) {
251+
var engine Engine
252+
engine.Init(types.EngineInitOptions{
253+
SegmenterDictionaries: "../testdata/test_dict.txt",
254+
DefaultRankOptions: &types.RankOptions{
255+
OutputOffset: 0,
256+
MaxOutputs: 10,
257+
ScoringCriteria: &RankByTokenProximity{},
258+
},
259+
IndexerInitOptions: &types.IndexerInitOptions{
260+
IndexType: types.LocationsIndex,
261+
},
262+
})
263+
264+
docId := uint64(0)
265+
engine.IndexDocument(docId, types.DocumentIndexData{
266+
Content: "",
267+
Tokens: []types.TokenData{
268+
{"中国", []int{0}},
269+
{"人口", []int{18, 24}},
270+
},
271+
Fields: ScoringFields{1, 2, 3},
272+
})
273+
docId++
274+
engine.IndexDocument(docId, types.DocumentIndexData{
275+
Content: "",
276+
Tokens: []types.TokenData{
277+
{"中国", []int{0}},
278+
{"人口", []int{6}},
279+
},
280+
Fields: ScoringFields{1, 2, 3},
281+
})
282+
docId++
283+
engine.IndexDocument(docId, types.DocumentIndexData{
284+
Content: "中国十三亿人口",
285+
Fields: ScoringFields{0, 9, 1},
286+
})
287+
288+
engine.FlushIndex()
289+
290+
outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
291+
utils.Expect(t, "2", len(outputs.Tokens))
292+
utils.Expect(t, "中国", outputs.Tokens[0])
293+
utils.Expect(t, "人口", outputs.Tokens[1])
294+
utils.Expect(t, "3", len(outputs.Docs))
295+
296+
utils.Expect(t, "1", outputs.Docs[0].DocId)
297+
utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
298+
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
299+
300+
utils.Expect(t, "4", outputs.Docs[1].DocId)
301+
utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
302+
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
303+
}

engine/segmenter_worker.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,27 @@ func (engine *Engine) segmenterWorker() {
1414
for {
1515
request := <-engine.segmenterChannel
1616
shard := engine.getShard(request.hash)
17-
segments := engine.segmenter.Segment([]byte(request.data.Content))
18-
tokensMap := make(map[string][]int)
1917

20-
// 加入分词得到的关键词
21-
for _, segment := range segments {
22-
token := segment.Token().Text()
23-
if !engine.stopTokens.IsStopToken(token) {
24-
tokensMap[token] = append(tokensMap[token], segment.Start())
18+
tokensMap := make(map[string][]int)
19+
numTokens := 0
20+
if request.data.Content != "" {
21+
// 当文档正文不为空时,优先从内容分词中得到关键词
22+
segments := engine.segmenter.Segment([]byte(request.data.Content))
23+
for _, segment := range segments {
24+
token := segment.Token().Text()
25+
if !engine.stopTokens.IsStopToken(token) {
26+
tokensMap[token] = append(tokensMap[token], segment.Start())
27+
}
28+
}
29+
numTokens = len(segments)
30+
} else {
31+
// 否则载入用户输入的关键词
32+
for _, t := range request.data.Tokens {
33+
if !engine.stopTokens.IsStopToken(t.Text) {
34+
tokensMap[t.Text] = t.Locations
35+
}
2536
}
37+
numTokens = len(request.data.Tokens)
2638
}
2739

2840
// 加入非分词的文档标签
@@ -35,7 +47,7 @@ func (engine *Engine) segmenterWorker() {
3547
indexerRequest := indexerAddDocumentRequest{
3648
document: &types.DocumentIndex{
3749
DocId: request.docId,
38-
TokenLength: float32(len(segments)),
50+
TokenLength: float32(numTokens),
3951
Keywords: make([]types.KeywordIndex, len(tokensMap)),
4052
},
4153
}

types/document_index_data.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,24 @@ type DocumentIndexData struct {
44
// 文档全文(必须是UTF-8格式),用于生成待索引的关键词
55
Content string
66

7+
// 文档的关键词
8+
// 当Content不为空的时候,优先从Content中分词得到关键词。
9+
// Tokens存在的意义在于绕过悟空内置的分词器,在引擎外部
10+
// 进行分词和预处理。
11+
Tokens []TokenData
12+
713
// 文档标签(必须是UTF-8格式),比如文档的类别属性等,这些标签并不出现在文档文本中
814
Labels []string
915

1016
// 文档的评分字段,可以接纳任何类型的结构体
1117
Fields interface{}
1218
}
19+
20+
// 文档的一个关键词
21+
type TokenData struct {
22+
// 关键词的字符串
23+
Text string
24+
25+
// 关键词的首字节在文档中出现的位置
26+
Locations []int
27+
}

types/search_response.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ type ScoredDocument struct {
2424

2525
// 用于生成摘要的关键词在文本中的字节位置,该切片长度和SearchResponse.Tokens的长度一样
2626
// 只有当IndexType == LocationsIndex时不为空
27-
TokenSnippetPositions []int
27+
TokenSnippetLocations []int
2828

2929
// 关键词出现的位置
3030
// 只有当IndexType == LocationsIndex时不为空
31-
TokenPositions [][]int
31+
TokenLocations [][]int
3232
}
3333

3434
// 为了方便排序

0 commit comments

Comments (0)