Skip to content

Commit

Permalink
Merge pull request #166 from go-ego/en-pr
Browse files Browse the repository at this point in the history
Add: add idf embed and dict path custom support, update godoc
  • Loading branch information
vcaesar authored Jan 16, 2023
2 parents f20a3db + 393a386 commit 7b5179d
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 81 deletions.
6 changes: 3 additions & 3 deletions dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (seg *Segmenter) loadZhST(d string) (begin int, err error) {
return
}

// LoadDictEmbed load dictionary by embed file
// LoadDictEmbed load the dictionary by embed file
func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down Expand Up @@ -87,7 +87,7 @@ func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
return seg.loadZh()
}

// LoadDictStr load dictionary from string
// LoadDictStr load the dictionary from string
func (seg *Segmenter) LoadDictStr(dict string) error {
if seg.Dict == nil {
seg.Dict = NewDict()
Expand Down Expand Up @@ -128,7 +128,7 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
return nil
}

// LoadStopEmbed load stop dictionary from embed file
// LoadStopEmbed load the stop dictionary from embed file
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down
3 changes: 3 additions & 0 deletions dict_embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ var (
zhT string
//go:embed data/dict/zh/s_1.txt
zhS string

//go:embed data/dict/zh/idf.txt
zhIdf string
)

//go:embed data/dict/zh/stop_tokens.txt
Expand Down
13 changes: 9 additions & 4 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func (seg *Segmenter) Init() {
seg.TextFreq = "2.0"
}

// init the model of hmm cut
if !seg.NotLoadHMM {
seg.LoadModel()
}
Expand Down Expand Up @@ -154,7 +155,7 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath string
// load bool
)
Expand Down Expand Up @@ -216,15 +217,19 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

// GetCurrentFilePath get the current file path
func GetCurrentFilePath() string {
func (seg *Segmenter) GetCurrentFilePath() string {
if seg.DictPath != "" {
return seg.DictPath
}

_, filePath, _, _ := runtime.Caller(1)
return filePath
}

// GetIdfPath get the idf path
func GetIdfPath(files ...string) []string {
func (seg *Segmenter) GetIdfPath(files ...string) []string {
var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath = path.Join(dictDir, "dict/zh/idf.txt")
)

Expand Down
2 changes: 1 addition & 1 deletion hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func (i *Idf) AddToken(text string, freq float64, pos ...string) error {
// LoadDict load the idf dictionary
func (i *Idf) LoadDict(files ...string) error {
if len(files) <= 0 {
files = gse.GetIdfPath(files...)
files = i.seg.GetIdfPath(files...)
}

return i.seg.LoadDict(files...)
Expand Down
6 changes: 6 additions & 0 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ func (t *TagExtracter) LoadIdf(fileName ...string) error {
return t.Idf.LoadDict(fileName...)
}

// LoadIdfStr load and create a new Idf dictionary from the string.
func (t *TagExtracter) LoadIdfStr(str string) error {
	idf := NewIdf()
	t.Idf = idf

	return idf.seg.LoadDictStr(str)
}

// LoadStopWords load and create a new StopWord dictionary from the file.
func (t *TagExtracter) LoadStopWords(fileName ...string) error {
t.stopWord = NewStopWord()
Expand Down
32 changes: 11 additions & 21 deletions seg_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@ import (
"fmt"
)

// ToString segments to string 输出分词结果为字符串
// ToString converts a segments slice to a string and returns it
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
// two output modes:
//
// 普通模式(searchMode=false)输出一个分词 "山达尔星联邦共和国/ns "
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "山达尔星/nz 联邦/n 共和/nz 国/n 共和国/ns 联邦共和国/nt 山达尔星联邦共和国/ns "
// normal mode (searchMode=false)
// search mode(searchMode=true)
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见 Token 结构体的注释。
// default searchMode=false
// search mode is mainly used by search engines, and will output more results
func ToString(segs []Segment, searchMode ...bool) (output string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -76,16 +75,7 @@ func tokenToBytes(token *Token) (output []byte) {
return
}

// ToSlice segments to slice 输出分词结果到一个字符串 slice
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
//
// 普通模式(searchMode=false)输出一个分词"[山达尔星联邦共和国]"
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
// ToSlice converts a segments slice to a string slice and returns it
func ToSlice(segs []Segment, searchMode ...bool) (output []string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -121,7 +111,7 @@ func tokenToSlice(token *Token) (output []string) {
return
}

// ToPos segments to SegPos
// ToPos converts a segments slice to []SegPos
func ToPos(segs []Segment, searchMode ...bool) (output []SegPos) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -168,20 +158,20 @@ func tokenToPos(token *Token) (output []SegPos) {
return
}

// textToString concatenates the words of a []Text slice into a single string.
func textToString(text []Text) (output string) {
	for _, word := range text {
		output += string(word)
	}
	return
}

// textSliceToString joins a []Text slice into one string and returns it.
func textSliceToString(text []Text) string {
	return Join(text)
}

// 返回多个字元的字节总长度
// return the total byte length of the text slice
func textSliceByteLen(text []Text) (length int) {
for _, word := range text {
length += len(word)
Expand Down
57 changes: 35 additions & 22 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (

// Segmenter define the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string
Dict *Dictionary
Load bool
DictSep string
DictPath string

// NotLoadHMM option load the default hmm model config (Chinese char)
NotLoadHMM bool
Expand Down Expand Up @@ -57,7 +58,8 @@ type Segmenter struct {
StopWordMap map[string]bool
}

// jumper 该结构体用于记录 Viterbi 算法中某字元处的向前分词跳转信息
// jumper this structure is used to record information
// about the forward leap at a word in the Viterbi algorithm
type jumper struct {
minDistance float32
token *Token
Expand Down Expand Up @@ -87,7 +89,7 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// specific case
// special cases
if len(bytes) == 0 {
// return []Segment{}
return nil
Expand All @@ -100,14 +102,16 @@ func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
// 搜索模式下该分词已无继续划分可能的情况
// The case where the division is no longer possible in the search mode
if searchMode && len(text) == 1 {
return nil
}

// jumpers 定义了每个字元处的向前跳转信息,
// 包括这个跳转对应的分词,
// 以及从文本段开始到该字元的最短路径值
// jumpers defines the forward jump information at each literal,
// including the subword corresponding to this jump,
// and the value of the shortest path from the start
// of the text segment to that literal
//
jumpers := make([]jumper, len(text))

if seg.Dict == nil {
Expand All @@ -116,43 +120,49 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {

tokens := make([]*Token, seg.Dict.maxTokenLen)
for current := 0; current < len(text); current++ {
// 找到前一个字元处的最短路径,以便计算后续路径值
// find the shortest path of the previous token,
// to calculate the subsequent path values
var baseDistance float32
if current == 0 {
// 当本字元在文本首部时,基础距离应该是零
// When this character is at the beginning of the text,
// the base distance should be zero
baseDistance = 0
} else {
baseDistance = jumpers[current-1].minDistance
}

// 寻找所有以当前字元开头的分词
// find all the segments starting with this token
tx := text[current:minInt(current+seg.Dict.maxTokenLen, len(text))]
numTokens := seg.Dict.LookupTokens(tx, tokens)

// 对所有可能的分词,更新分词结束字元处的跳转信息
// Update the jump information at the end of the split word
// for all possible splits
for iToken := 0; iToken < numTokens; iToken++ {
location := current + len(tokens[iToken].text) - 1
if !searchMode || current != 0 || location != len(text)-1 {
updateJumper(&jumpers[location], baseDistance, tokens[iToken])
}
}

// 当前字元没有对应分词时补加一个伪分词
// Add a pseudo-token if there is no corresponding token
// for the current character
if numTokens == 0 || len(tokens[0].text) > 1 {
updateJumper(&jumpers[current], baseDistance,
&Token{text: []Text{text[current]}, freq: 1, distance: 32, pos: "x"})
}
}

// 从后向前扫描第一遍得到需要添加的分词数目
// Scan the first pass from back to front
// to get the number of subwords to be added
numSeg := 0
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg++
index = location - 1
}

// 从后向前扫描第二遍添加分词到最终结果
// Scan from back to front for a second time
// to add the split to the final result
outputSegments := make([]Segment, numSeg)
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
Expand All @@ -161,7 +171,7 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
index = location - 1
}

// 计算各个分词的字节位置
// Calculate the byte position of each participle
bytePosition := 0
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
outputSegments[iSeg].start = bytePosition
Expand All @@ -172,11 +182,14 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
return outputSegments
}

// updateJumper 更新跳转信息:
// 1. 当该位置从未被访问过时 (jumper.minDistance 为零的情况),或者
// 2. 当该位置的当前最短路径大于新的最短路径时
// updateJumper Update the jump information:
// 1. When the location has never been visited
// (the case where jumper.minDistance is zero), or
// 2. When the current shortest path at the location
// is greater than the new shortest path
//
// 将当前位置的最短路径值更新为 baseDistance 加上新分词的概率
// Update the shortest path value of the current location to baseDistance
// plus the distance of the new token
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
newDistance := baseDistance + token.distance
if jumper.minDistance == 0 || jumper.minDistance > newDistance {
Expand All @@ -202,7 +215,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
isNum := unicode.IsNumber(r) && !seg.Num
isAlpha := unicode.IsLetter(r) && !seg.Alpha
if size <= 2 && (isAlpha || isNum) {
// 当前是拉丁字母或数字(非中日韩文字)
// Currently is Latin alphabet or numbers (not in CJK)
if !inAlphanumeric {
alphanumericStart = current
inAlphanumeric = true
Expand Down
9 changes: 7 additions & 2 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ func TestToken(t *testing.T) {
}

func TestDictPaths(t *testing.T) {
var seg1 Segmenter
// seg.SkipLog = true
paths := DictPaths("./dictDir", "zh, jp")
tt.Expect(t, "3", len(paths))
Expand All @@ -238,9 +239,13 @@ func TestDictPaths(t *testing.T) {
tt.Expect(t, "3", len(paths))
tt.Equal(t, paths, paths1)

p := strings.ReplaceAll(GetCurrentFilePath(), "/segmenter_test.go", "") +
p := strings.ReplaceAll(seg1.GetCurrentFilePath(), "/segmenter_test.go", "") +
`/data/dict/zh/idf.txt`
tt.Equal(t, "["+p+"]", GetIdfPath([]string{}...))
tt.Equal(t, "["+p+"]", seg1.GetIdfPath([]string{}...))

seg1.DictPath = "testdata/zh"
tt.Equal(t, "testdata/zh", seg1.GetCurrentFilePath())
tt.Equal(t, "[testdata/data/dict/zh/idf.txt]", seg1.GetIdfPath([]string{}...))
}

func TestInAlphaNum(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (seg *Segmenter) LoadStop(files ...string) error {
seg.StopWordMap = make(map[string]bool)
}

dictDir := path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
if len(files) <= 0 {
dictPath := path.Join(dictDir, "dict/zh/stop_word.txt")
files = append(files, dictPath)
Expand Down
Loading

0 comments on commit 7b5179d

Please sign in to comment.