Skip to content

Commit

Permalink
Merge pull request #166 from go-ego/en-pr
Browse files Browse the repository at this point in the history
Add: add idf embed and dict path custom support, update godoc
  • Loading branch information
vcaesar authored Jan 16, 2023
2 parents f20a3db + 393a386 commit 7b5179d
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 81 deletions.
6 changes: 3 additions & 3 deletions dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (seg *Segmenter) loadZhST(d string) (begin int, err error) {
return
}

// LoadDictEmbed load dictionary by embed file
// LoadDictEmbed load the dictionary by embed file
func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down Expand Up @@ -87,7 +87,7 @@ func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
return seg.loadZh()
}

// LoadDictStr load dictionary from string
// LoadDictStr load the dictionary from string
func (seg *Segmenter) LoadDictStr(dict string) error {
if seg.Dict == nil {
seg.Dict = NewDict()
Expand Down Expand Up @@ -128,7 +128,7 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
return nil
}

// LoadStopEmbed load stop dictionary from embed file
// LoadStopEmbed load the stop dictionary from embed file
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down
3 changes: 3 additions & 0 deletions dict_embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ var (
zhT string
//go:embed data/dict/zh/s_1.txt
zhS string

//go:embed data/dict/zh/idf.txt
zhIdf string
)

//go:embed data/dict/zh/stop_tokens.txt
Expand Down
13 changes: 9 additions & 4 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func (seg *Segmenter) Init() {
seg.TextFreq = "2.0"
}

// init the model of hmm cut
if !seg.NotLoadHMM {
seg.LoadModel()
}
Expand Down Expand Up @@ -154,7 +155,7 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath string
// load bool
)
Expand Down Expand Up @@ -216,15 +217,19 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

// GetCurrentFilePath get the current file path
func GetCurrentFilePath() string {
func (seg *Segmenter) GetCurrentFilePath() string {
if seg.DictPath != "" {
return seg.DictPath
}

_, filePath, _, _ := runtime.Caller(1)
return filePath
}

// GetIdfPath get the idf path
func GetIdfPath(files ...string) []string {
func (seg *Segmenter) GetIdfPath(files ...string) []string {
var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath = path.Join(dictDir, "dict/zh/idf.txt")
)

Expand Down
2 changes: 1 addition & 1 deletion hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func (i *Idf) AddToken(text string, freq float64, pos ...string) error {
// LoadDict load the idf dictionary
func (i *Idf) LoadDict(files ...string) error {
if len(files) <= 0 {
files = gse.GetIdfPath(files...)
files = i.seg.GetIdfPath(files...)
}

return i.seg.LoadDict(files...)
Expand Down
6 changes: 6 additions & 0 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ func (t *TagExtracter) LoadIdf(fileName ...string) error {
return t.Idf.LoadDict(fileName...)
}

// LoadIdfStr load and create a new Idf dictionary from the string.
func (t *TagExtracter) LoadIdfStr(str string) error {
	idf := NewIdf()
	t.Idf = idf

	return idf.seg.LoadDictStr(str)
}

// LoadStopWords load and create a new StopWord dictionary from the file.
func (t *TagExtracter) LoadStopWords(fileName ...string) error {
t.stopWord = NewStopWord()
Expand Down
32 changes: 11 additions & 21 deletions seg_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@ import (
"fmt"
)

// ToString segments to string 输出分词结果为字符串
// ToString converts a segments slice to a string and returns it
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
// two output modes:
//
// 普通模式(searchMode=false)输出一个分词 "山达尔星联邦共和国/ns "
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "山达尔星/nz 联邦/n 共和/nz 国/n 共和国/ns 联邦共和国/nt 山达尔星联邦共和国/ns "
// normal mode (searchMode=false)
// search mode(searchMode=true)
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见 Token 结构体的注释。
// default searchMode=false
// search mode is mainly used by search engines, and will output more results
func ToString(segs []Segment, searchMode ...bool) (output string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -76,16 +75,7 @@ func tokenToBytes(token *Token) (output []byte) {
return
}

// ToSlice segments to slice 输出分词结果到一个字符串 slice
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
//
// 普通模式(searchMode=false)输出一个分词"[山达尔星联邦共和国]"
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
// ToSlice converts a segments slice to a string slice and returns it
func ToSlice(segs []Segment, searchMode ...bool) (output []string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -121,7 +111,7 @@ func tokenToSlice(token *Token) (output []string) {
return
}

// ToPos segments to SegPos
// ToPos converts a segments slice to []SegPos
func ToPos(segs []Segment, searchMode ...bool) (output []SegPos) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -168,20 +158,20 @@ func tokenToPos(token *Token) (output []SegPos) {
return
}

// textToString concatenates the words of a []Text slice into a single string.
func textToString(text []Text) (output string) {
	for _, word := range text {
		output += string(word)
	}
	return
}

// textSliceToString joins a []Text slice into one string and returns it.
func textSliceToString(text []Text) string {
	return Join(text)
}

// 返回多个字元的字节总长度
// return the total byte length of the text slice
func textSliceByteLen(text []Text) (length int) {
for _, word := range text {
length += len(word)
Expand Down
57 changes: 35 additions & 22 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (

// Segmenter define the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string
Dict *Dictionary
Load bool
DictSep string
DictPath string

// NotLoadHMM option load the default hmm model config (Chinese char)
NotLoadHMM bool
Expand Down Expand Up @@ -57,7 +58,8 @@ type Segmenter struct {
StopWordMap map[string]bool
}

// jumper 该结构体用于记录 Viterbi 算法中某字元处的向前分词跳转信息
// jumper this structure is used to record information
// about the forward leap at a word in the Viterbi algorithm
type jumper struct {
minDistance float32
token *Token
Expand Down Expand Up @@ -87,7 +89,7 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// specific case
// special cases
if len(bytes) == 0 {
// return []Segment{}
return nil
Expand All @@ -100,14 +102,16 @@ func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
// 搜索模式下该分词已无继续划分可能的情况
// The case where the division is no longer possible in the search mode
if searchMode && len(text) == 1 {
return nil
}

// jumpers 定义了每个字元处的向前跳转信息,
// 包括这个跳转对应的分词,
// 以及从文本段开始到该字元的最短路径值
// jumpers defines the forward jump information at each literal,
// including the subword corresponding to this jump,
// and the value of the shortest path from the start
// of the text segment to that literal
//
jumpers := make([]jumper, len(text))

if seg.Dict == nil {
Expand All @@ -116,43 +120,49 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {

tokens := make([]*Token, seg.Dict.maxTokenLen)
for current := 0; current < len(text); current++ {
// 找到前一个字元处的最短路径,以便计算后续路径值
// find the shortest path of the previous token,
// to calculate the subsequent path values
var baseDistance float32
if current == 0 {
// 当本字元在文本首部时,基础距离应该是零
// When this character is at the beginning of the text,
// the base distance should be zero
baseDistance = 0
} else {
baseDistance = jumpers[current-1].minDistance
}

// 寻找所有以当前字元开头的分词
// find all the segments starting with this token
tx := text[current:minInt(current+seg.Dict.maxTokenLen, len(text))]
numTokens := seg.Dict.LookupTokens(tx, tokens)

// 对所有可能的分词,更新分词结束字元处的跳转信息
// Update the jump information at the end of the split word
// for all possible splits
for iToken := 0; iToken < numTokens; iToken++ {
location := current + len(tokens[iToken].text) - 1
if !searchMode || current != 0 || location != len(text)-1 {
updateJumper(&jumpers[location], baseDistance, tokens[iToken])
}
}

// 当前字元没有对应分词时补加一个伪分词
// Add a pseudo-token if there is no corresponding token
// for the current character
if numTokens == 0 || len(tokens[0].text) > 1 {
updateJumper(&jumpers[current], baseDistance,
&Token{text: []Text{text[current]}, freq: 1, distance: 32, pos: "x"})
}
}

// 从后向前扫描第一遍得到需要添加的分词数目
// Scan the first pass from back to front
// to get the number of subwords to be added
numSeg := 0
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg++
index = location - 1
}

// 从后向前扫描第二遍添加分词到最终结果
// Scan from back to front for a second time
// to add the split to the final result
outputSegments := make([]Segment, numSeg)
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
Expand All @@ -161,7 +171,7 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
index = location - 1
}

// 计算各个分词的字节位置
// Calculate the byte position of each participle
bytePosition := 0
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
outputSegments[iSeg].start = bytePosition
Expand All @@ -172,11 +182,14 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
return outputSegments
}

// updateJumper 更新跳转信息:
// 1. 当该位置从未被访问过时 (jumper.minDistance 为零的情况),或者
// 2. 当该位置的当前最短路径大于新的最短路径时
// updateJumper Update the jump information:
// 1. When the location has never been visited
// (the case where jumper.minDistance is zero), or
// 2. When the current shortest path at the location
// is greater than the new shortest path
//
// 将当前位置的最短路径值更新为 baseDistance 加上新分词的概率
// Update the shortest path value of the current location to baseDistance
// plus the distance of the new token
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
newDistance := baseDistance + token.distance
if jumper.minDistance == 0 || jumper.minDistance > newDistance {
Expand All @@ -202,7 +215,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
isNum := unicode.IsNumber(r) && !seg.Num
isAlpha := unicode.IsLetter(r) && !seg.Alpha
if size <= 2 && (isAlpha || isNum) {
// 当前是拉丁字母或数字(非中日韩文字)
// Currently is Latin alphabet or numbers (not in CJK)
if !inAlphanumeric {
alphanumericStart = current
inAlphanumeric = true
Expand Down
9 changes: 7 additions & 2 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ func TestToken(t *testing.T) {
}

func TestDictPaths(t *testing.T) {
var seg1 Segmenter
// seg.SkipLog = true
paths := DictPaths("./dictDir", "zh, jp")
tt.Expect(t, "3", len(paths))
Expand All @@ -238,9 +239,13 @@ func TestDictPaths(t *testing.T) {
tt.Expect(t, "3", len(paths))
tt.Equal(t, paths, paths1)

p := strings.ReplaceAll(GetCurrentFilePath(), "/segmenter_test.go", "") +
p := strings.ReplaceAll(seg1.GetCurrentFilePath(), "/segmenter_test.go", "") +
`/data/dict/zh/idf.txt`
tt.Equal(t, "["+p+"]", GetIdfPath([]string{}...))
tt.Equal(t, "["+p+"]", seg1.GetIdfPath([]string{}...))

seg1.DictPath = "testdata/zh"
tt.Equal(t, "testdata/zh", seg1.GetCurrentFilePath())
tt.Equal(t, "[testdata/data/dict/zh/idf.txt]", seg1.GetIdfPath([]string{}...))
}

func TestInAlphaNum(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (seg *Segmenter) LoadStop(files ...string) error {
seg.StopWordMap = make(map[string]bool)
}

dictDir := path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
if len(files) <= 0 {
dictPath := path.Join(dictDir, "dict/zh/stop_word.txt")
files = append(files, dictPath)
Expand Down
Loading

0 comments on commit 7b5179d

Please sign in to comment.