Skip to content

Commit

Permalink
Add: add idf embed and dict path custom support
Browse files Browse the repository at this point in the history
  • Loading branch information
vcaesar committed Jan 16, 2023
1 parent 1394956 commit 393a386
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 12 deletions.
3 changes: 3 additions & 0 deletions dict_embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ var (
zhT string
//go:embed data/dict/zh/s_1.txt
zhS string

//go:embed data/dict/zh/idf.txt
zhIdf string
)

//go:embed data/dict/zh/stop_tokens.txt
Expand Down
12 changes: 8 additions & 4 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath string
// load bool
)
Expand Down Expand Up @@ -217,15 +217,19 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

// GetCurrentFilePath returns the path that callers use as the starting point
// for deriving the dictionary data directory (they take path.Dir of the
// result and join "data" onto it).
//
// If seg.DictPath is non-empty it is returned unchanged. Otherwise the result
// is the source file path reported by runtime.Caller(1), i.e. the file of the
// function that called GetCurrentFilePath.
func GetCurrentFilePath() string {
func (seg *Segmenter) GetCurrentFilePath() string {
// A user-supplied DictPath takes precedence over the runtime lookup.
if seg.DictPath != "" {
return seg.DictPath
}

// skip=1 selects the caller's stack frame; the ok result is discarded, so
// filePath is whatever runtime.Caller reports even if the lookup failed.
_, filePath, _, _ := runtime.Caller(1)
return filePath
}

// GetIdfPath get the idf path
func GetIdfPath(files ...string) []string {
func (seg *Segmenter) GetIdfPath(files ...string) []string {
var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath = path.Join(dictDir, "dict/zh/idf.txt")
)

Expand Down
2 changes: 1 addition & 1 deletion hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func (i *Idf) AddToken(text string, freq float64, pos ...string) error {
// LoadDict load the idf dictionary
func (i *Idf) LoadDict(files ...string) error {
if len(files) <= 0 {
files = gse.GetIdfPath(files...)
files = i.seg.GetIdfPath(files...)
}

return i.seg.LoadDict(files...)
Expand Down
6 changes: 6 additions & 0 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ func (t *TagExtracter) LoadIdf(fileName ...string) error {
return t.Idf.LoadDict(fileName...)
}

// LoadIdfStr replaces t.Idf with a fresh Idf from NewIdf and loads the
// dictionary content given in str into that Idf's segmenter via LoadDictStr.
// It returns whatever error LoadDictStr reports.
//
// NOTE(review): the expected format of str is defined by LoadDictStr, which is
// not visible here — presumably the same text layout as the idf dictionary
// file; confirm before relying on it.
func (t *TagExtracter) LoadIdfStr(str string) error {
// Any previously loaded Idf is discarded.
t.Idf = NewIdf()
return t.Idf.seg.LoadDictStr(str)
}

// LoadStopWords load and create a new StopWord dictionary from the file.
func (t *TagExtracter) LoadStopWords(fileName ...string) error {
t.stopWord = NewStopWord()
Expand Down
8 changes: 4 additions & 4 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ import (

// Segmenter define the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string
// DictPath string
Dict *Dictionary
Load bool
DictSep string
DictPath string

// NotLoadHMM option load the default hmm model config (Chinese char)
NotLoadHMM bool
Expand Down
9 changes: 7 additions & 2 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ func TestToken(t *testing.T) {
}

func TestDictPaths(t *testing.T) {
var seg1 Segmenter
// seg.SkipLog = true
paths := DictPaths("./dictDir", "zh, jp")
tt.Expect(t, "3", len(paths))
Expand All @@ -238,9 +239,13 @@ func TestDictPaths(t *testing.T) {
tt.Expect(t, "3", len(paths))
tt.Equal(t, paths, paths1)

p := strings.ReplaceAll(GetCurrentFilePath(), "/segmenter_test.go", "") +
p := strings.ReplaceAll(seg1.GetCurrentFilePath(), "/segmenter_test.go", "") +
`/data/dict/zh/idf.txt`
tt.Equal(t, "["+p+"]", GetIdfPath([]string{}...))
tt.Equal(t, "["+p+"]", seg1.GetIdfPath([]string{}...))

seg1.DictPath = "testdata/zh"
tt.Equal(t, "testdata/zh", seg1.GetCurrentFilePath())
tt.Equal(t, "[testdata/data/dict/zh/idf.txt]", seg1.GetIdfPath([]string{}...))
}

func TestInAlphaNum(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (seg *Segmenter) LoadStop(files ...string) error {
seg.StopWordMap = make(map[string]bool)
}

dictDir := path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
if len(files) <= 0 {
dictPath := path.Join(dictDir, "dict/zh/stop_word.txt")
files = append(files, dictPath)
Expand Down

0 comments on commit 393a386

Please sign in to comment.