Allow documents to be segmented outside the Wukong engine before indexing.
Hui Chen committed Aug 13, 2013
1 parent 22e7052 commit d956874
Showing 5 changed files with 97 additions and 15 deletions.
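
In short: DocumentIndexData gains a Tokens field, and the segmenter worker consumes it whenever Content is empty, so segmentation and preprocessing can happen outside the engine. A minimal usage sketch adapted from the test added below (the import paths and the reduced set of Init options are assumptions for illustration, not taken verbatim from this diff):

package main

import (
	"fmt"

	"github.com/huichen/wukong/engine"
	"github.com/huichen/wukong/types"
)

func main() {
	var searcher engine.Engine
	searcher.Init(types.EngineInitOptions{
		SegmenterDictionaries: "testdata/test_dict.txt",
		IndexerInitOptions: &types.IndexerInitOptions{
			// LocationsIndex is what makes TokenSnippetLocations and
			// TokenLocations available in search results.
			IndexType: types.LocationsIndex,
		},
	})

	// Content is left empty, so the engine bypasses its built-in segmenter
	// and indexes exactly the keywords and byte offsets supplied here.
	searcher.IndexDocument(0, types.DocumentIndexData{
		Tokens: []types.TokenData{
			{Text: "中国", Locations: []int{0}},
			{Text: "人口", Locations: []int{6}},
		},
	})
	searcher.FlushIndex()

	output := searcher.Search(types.SearchRequest{Text: "中国人口"})
	fmt.Println(output.Tokens, len(output.Docs))
}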
4 changes: 2 additions & 2 deletions core/ranker.go
@@ -65,8 +65,8 @@ func (ranker *Ranker) Rank(
 			outputDocs = append(outputDocs, types.ScoredDocument{
 				DocId:  d.DocId,
 				Scores: scores,
-				TokenSnippetPositions: d.TokenSnippetLocations,
-				TokenPositions:        d.TokenLocations})
+				TokenSnippetLocations: d.TokenSnippetLocations,
+				TokenLocations:        d.TokenLocations})
 		}
 	}

61 changes: 58 additions & 3 deletions engine/engine_test.go
@@ -76,15 +76,15 @@ func TestEngineIndexDocument(t *testing.T) {

utils.Expect(t, "1", outputs.Docs[0].DocId)
utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetPositions)
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)

utils.Expect(t, "4", outputs.Docs[1].DocId)
utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetPositions)
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)

utils.Expect(t, "0", outputs.Docs[2].DocId)
utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetPositions)
utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
}

func TestReverseOrder(t *testing.T) {
@@ -246,3 +246,58 @@ func TestRemoveDocument(t *testing.T) {
utils.Expect(t, "0", outputs.Docs[0].DocId)
utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
}

func TestEngineIndexDocumentWithTokens(t *testing.T) {
var engine Engine
engine.Init(types.EngineInitOptions{
SegmenterDictionaries: "../testdata/test_dict.txt",
DefaultRankOptions: &types.RankOptions{
OutputOffset: 0,
MaxOutputs: 10,
ScoringCriteria: &RankByTokenProximity{},
},
IndexerInitOptions: &types.IndexerInitOptions{
IndexType: types.LocationsIndex,
},
})

docId := uint64(0)
engine.IndexDocument(docId, types.DocumentIndexData{
Content: "",
Tokens: []types.TokenData{
{"中国", []int{0}},
{"人口", []int{18, 24}},
},
Fields: ScoringFields{1, 2, 3},
})
docId++
engine.IndexDocument(docId, types.DocumentIndexData{
Content: "",
Tokens: []types.TokenData{
{"中国", []int{0}},
{"人口", []int{6}},
},
Fields: ScoringFields{1, 2, 3},
})
docId++
engine.IndexDocument(docId, types.DocumentIndexData{
Content: "中国十三亿人口",
Fields: ScoringFields{0, 9, 1},
})

engine.FlushIndex()

outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
utils.Expect(t, "2", len(outputs.Tokens))
utils.Expect(t, "中国", outputs.Tokens[0])
utils.Expect(t, "人口", outputs.Tokens[1])
utils.Expect(t, "3", len(outputs.Docs))

utils.Expect(t, "1", outputs.Docs[0].DocId)
utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)

utils.Expect(t, "4", outputs.Docs[1].DocId)
utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
}
28 changes: 20 additions & 8 deletions engine/segmenter_worker.go
@@ -14,15 +14,27 @@ func (engine *Engine) segmenterWorker() {
 	for {
 		request := <-engine.segmenterChannel
 		shard := engine.getShard(request.hash)
-		segments := engine.segmenter.Segment([]byte(request.data.Content))
-		tokensMap := make(map[string][]int)
-
-		// Add the keywords produced by segmentation
-		for _, segment := range segments {
-			token := segment.Token().Text()
-			if !engine.stopTokens.IsStopToken(token) {
-				tokensMap[token] = append(tokensMap[token], segment.Start())
-			}
-		}
+		tokensMap := make(map[string][]int)
+		numTokens := 0
+		if request.data.Content != "" {
+			// When the document content is non-empty, prefer the keywords obtained by segmenting the content
+			segments := engine.segmenter.Segment([]byte(request.data.Content))
+			for _, segment := range segments {
+				token := segment.Token().Text()
+				if !engine.stopTokens.IsStopToken(token) {
+					tokensMap[token] = append(tokensMap[token], segment.Start())
+				}
+			}
+			numTokens = len(segments)
+		} else {
+			// Otherwise load the keywords supplied by the user
+			for _, t := range request.data.Tokens {
+				if !engine.stopTokens.IsStopToken(t.Text) {
+					tokensMap[t.Text] = t.Locations
+				}
+			}
+			numTokens = len(request.data.Tokens)
+		}

 		// Add document labels that do not come from segmentation
@@ -35,7 +47,7 @@ func (engine *Engine) segmenterWorker() {
 		indexerRequest := indexerAddDocumentRequest{
 			document: &types.DocumentIndex{
 				DocId:       request.docId,
-				TokenLength: float32(len(segments)),
+				TokenLength: float32(numTokens),
 				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
 			},
 		}
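The worker only consumes tokens that are already computed; producing them is left to the caller. As a hypothetical illustration (not part of this commit), any external tokenizer can be adapted by emitting types.TokenData values whose Locations are byte offsets, the same unit that segment.Start() reports for the built-in segmenter:

package main

import (
	"fmt"
	"strings"

	"github.com/huichen/wukong/types"
)

// tokenizeByWhitespace stands in for an arbitrary external tokenizer: it
// splits text on single spaces and records the byte offset of each word.
func tokenizeByWhitespace(text string) []types.TokenData {
	locations := make(map[string][]int)
	offset := 0
	for _, word := range strings.Split(text, " ") {
		if word != "" {
			locations[word] = append(locations[word], offset)
		}
		offset += len(word) + 1 // +1 for the space separator
	}
	tokens := make([]types.TokenData, 0, len(locations))
	for word, locs := range locations {
		tokens = append(tokens, types.TokenData{Text: word, Locations: locs})
	}
	return tokens
}

func main() {
	// "to" appears at byte offsets 0 and 13, "be" at 3 and 16.
	fmt.Printf("%+v\n", tokenizeByWhitespace("to be or not to be"))
}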
15 changes: 15 additions & 0 deletions types/document_index_data.go
@@ -4,9 +4,24 @@ type DocumentIndexData struct {
 	// Full text of the document (must be UTF-8), used to generate the keywords to be indexed
 	Content string

+	// Keywords of the document.
+	// When Content is non-empty, keywords are preferentially derived by segmenting Content.
+	// Tokens exists so that Wukong's built-in segmenter can be bypassed and
+	// segmentation and preprocessing can be done outside the engine.
+	Tokens []TokenData
+
 	// Document labels (must be UTF-8), such as the document's category attributes; these labels do not appear in the document text
 	Labels []string

 	// Scoring fields of the document; can hold a struct of any type
 	Fields interface{}
 }
+
+// A single keyword of a document
+type TokenData struct {
+	// Text of the keyword
+	Text string
+
+	// Byte offsets in the document at which the keyword's first byte appears
+	Locations []int
+}
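
Note that Locations holds byte offsets, not character (rune) indices. That is why doc 2 in the test above, with content "中国十三亿人口", produces snippet locations [0 15]: the five characters before "人口" occupy three bytes each in UTF-8. A quick standalone check:

package main

import (
	"fmt"
	"strings"
)

func main() {
	doc := "中国十三亿人口"
	// strings.Index returns a byte offset; the five preceding characters
	// take 5*3 = 15 bytes in UTF-8, so "人口" starts at byte 15.
	fmt.Println(strings.Index(doc, "中国")) // 0
	fmt.Println(strings.Index(doc, "人口")) // 15
}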
4 changes: 2 additions & 2 deletions types/search_response.go
@@ -24,11 +24,11 @@ type ScoredDocument struct {

 	// Byte locations in the text of the keywords used to generate the snippet;
 	// this slice has the same length as SearchResponse.Tokens.
 	// Non-empty only when IndexType == LocationsIndex
-	TokenSnippetPositions []int
+	TokenSnippetLocations []int

 	// Locations at which the keywords appear
 	// Non-empty only when IndexType == LocationsIndex
-	TokenPositions [][]int
+	TokenLocations [][]int
 }

 // For convenient sorting
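A sketch of how a caller might consume these fields to render a snippet, assuming the original document text is at hand and that the locations are ascending, non-overlapping byte offsets (an assumption of this illustration, not something the engine is documented to guarantee here):

package main

import "fmt"

// highlight brackets each snippet keyword inside text. tokens would come from
// SearchResponse.Tokens and locations from ScoredDocument.TokenSnippetLocations,
// which are aligned index-by-index per the comment above.
func highlight(text string, tokens []string, locations []int) string {
	result := ""
	last := 0
	for i, loc := range locations {
		result += text[last:loc] + "[" + tokens[i] + "]"
		last = loc + len(tokens[i])
	}
	return result + text[last:]
}

func main() {
	// Values taken from the test above: doc 2 with snippet locations [0 15].
	fmt.Println(highlight("中国十三亿人口", []string{"中国", "人口"}, []int{0, 15}))
	// Output: [中国]十三亿[人口]
}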
