From 4aaed1d8bd0e2073ec346e7cf3273b7d9fa247d0 Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 13:13:32 +0900 Subject: [PATCH 1/7] chore: move user dict sample to testdata for testing --- cmd/lattice/cmd_test.go | 6 +++--- cmd/server/cmd_test.go | 6 +++--- cmd/tokenize/cmd_test.go | 2 +- testdata/userdict.txt | 15 +++++++++++++++ tokenizer/lattice/lattice_test.go | 2 +- tokenizer/token_test.go | 2 +- tokenizer/tokenizer_option_test.go | 2 +- 7 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 testdata/userdict.txt diff --git a/cmd/lattice/cmd_test.go b/cmd/lattice/cmd_test.go index 9a6723d..fefbc60 100644 --- a/cmd/lattice/cmd_test.go +++ b/cmd/lattice/cmd_test.go @@ -43,7 +43,7 @@ func TestOptionCheck(t *testing.T) { { name: "all options and input", args: []string{ - "-udict", "../../sample/dict/userdict.txt", + "-udict", "../../testdata/userdict.txt", "-dict", "ipa", "-mode", "search", "-output", "/dev/null", @@ -83,7 +83,7 @@ func TestRun(t *testing.T) { { name: "all options and input", args: []string{ - "-udict", "../../sample/dict/userdict.txt", + "-udict", "../../testdata/userdict.txt", "-dict", "ipa", "-mode", "search", "-output", "/dev/null", @@ -139,7 +139,7 @@ func Test_command(t *testing.T) { { name: "verbose", args: &option{ - udict: "../../sample/dict/userdict.txt", + udict: "../../testdata/userdict.txt", dict: "uni", mode: "extended", output: "", diff --git a/cmd/server/cmd_test.go b/cmd/server/cmd_test.go index a2e8388..887dc13 100644 --- a/cmd/server/cmd_test.go +++ b/cmd/server/cmd_test.go @@ -33,7 +33,7 @@ func TestOptionCheck(t *testing.T) { { name: "all args", args: []string{ - "-userdict", "../../sample/dict/userdict.txt", + "-userdict", "../../testdata/userdict.txt", "-http", ":8888", "-dict", "ipa", }, @@ -73,7 +73,7 @@ func TestRun(t *testing.T) { { name: "normal operation w/ options", args: []string{ - "-userdict", "../../sample/dict/userdict.txt", + "-userdict", "../../testdata/userdict.txt", "-http", ":0", 
"-dict", "ipa", }, @@ -127,7 +127,7 @@ func Test_command(t *testing.T) { opt: &option{ http: ":0", dict: "ipa", - udict: "../../sample/dict/userdict.txt", + udict: "../../testdata/userdict.txt", flagSet: flag.NewFlagSet(CommandName, flag.ContinueOnError), }, wantErr: false, diff --git a/cmd/tokenize/cmd_test.go b/cmd/tokenize/cmd_test.go index 675e827..e6b7036 100644 --- a/cmd/tokenize/cmd_test.go +++ b/cmd/tokenize/cmd_test.go @@ -258,7 +258,7 @@ func TestRun(t *testing.T) { { name: "normal operation w/ options", args: []string{ - "-udict", "../../sample/dict/userdict.txt", + "-udict", "../../testdata/userdict.txt", "-file", "../../testdata/nekodearu.txt", "-split", }, diff --git a/testdata/userdict.txt b/testdata/userdict.txt new file mode 100644 index 0000000..0a240c5 --- /dev/null +++ b/testdata/userdict.txt @@ -0,0 +1,15 @@ +## +## This file should use UTF-8 encoding +## +## User dictionary format: +## ,,, +## or +## , ... , ... , +## + +# Custom reading for former sumo wrestler Asashoryu +朝青龍,朝青龍,アサショウリュウ,カスタム人名 + +# Custom segmentation for long entries +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞 diff --git a/tokenizer/lattice/lattice_test.go b/tokenizer/lattice/lattice_test.go index 753cc11..a67dacd 100644 --- a/tokenizer/lattice/lattice_test.go +++ b/tokenizer/lattice/lattice_test.go @@ -88,7 +88,7 @@ func Test_LatticeBuild(t *testing.T) { } func Test_LatticeBuildWithUserDict(t *testing.T) { - const udictPath = "../../sample/dict/userdict.txt" + const udictPath = "../../testdata/userdict.txt" udic, err := dict.NewUserDict(udictPath) if err != nil { diff --git a/tokenizer/token_test.go b/tokenizer/token_test.go index 92ba06e..5a56448 100644 --- a/tokenizer/token_test.go +++ b/tokenizer/token_test.go @@ -9,7 +9,7 @@ import ( ) const ( - userDictSample = "../sample/dict/userdict.txt" + userDictSample = "../testdata/userdict.txt" ) func Test_TokenClassString(t *testing.T) { diff --git a/tokenizer/tokenizer_option_test.go 
b/tokenizer/tokenizer_option_test.go index 3c218f3..63fe139 100644 --- a/tokenizer/tokenizer_option_test.go +++ b/tokenizer/tokenizer_option_test.go @@ -8,7 +8,7 @@ import ( ) const ( - testUserDictPath = "../sample/dict/userdict.txt" + testUserDictPath = "../testdata/userdict.txt" ) func TestTokenizer_Analyze_Nop(t *testing.T) { From 7685a8f557d4e2ac79aeffc91c023507afa1a36e Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 13:17:43 +0900 Subject: [PATCH 2/7] chore: revamp sample dir as _example Refactor directory structure of examples. - related issue #200 https://github.com/ikawaha/kagome/issues/299#issuecomment-2052892253 --- _examples/db_search/README.md | 65 +++++++++++++++++++ .../_example => _examples/db_search}/go.mod | 2 +- .../_example => _examples/db_search}/go.sum | 0 .../_example => _examples}/db_search/main.go | 34 ++-------- _examples/tokenize/go.mod | 12 ++++ _examples/tokenize/go.sum | 4 ++ .../_example => _examples}/tokenize/main.go | 0 _examples/user_dict/go.mod | 11 ++++ _examples/user_dict/go.sum | 4 ++ _examples/user_dict/main.go | 36 ++++++++++ .../dict => _examples/user_dict}/userdict.txt | 0 _examples/wakati/go.mod | 12 ++++ _examples/wakati/go.sum | 4 ++ {sample/_example => _examples}/wakati/main.go | 0 _examples/wasm/README.md | 23 +++++++ _examples/wasm/go.mod | 12 ++++ _examples/wasm/go.sum | 4 ++ {sample => _examples}/wasm/kagome.html | 0 {sample => _examples}/wasm/main.go | 4 +- sample/wasm/README.md | 21 ------ sample/wasm/go.mod | 3 - 21 files changed, 196 insertions(+), 55 deletions(-) create mode 100644 _examples/db_search/README.md rename {sample/_example => _examples/db_search}/go.mod (88%) rename {sample/_example => _examples/db_search}/go.sum (100%) rename {sample/_example => _examples}/db_search/main.go (71%) create mode 100644 _examples/tokenize/go.mod create mode 100644 _examples/tokenize/go.sum rename {sample/_example => _examples}/tokenize/main.go (100%) create mode 100644 _examples/user_dict/go.mod create mode 
100644 _examples/user_dict/go.sum create mode 100644 _examples/user_dict/main.go rename {sample/dict => _examples/user_dict}/userdict.txt (100%) create mode 100644 _examples/wakati/go.mod create mode 100644 _examples/wakati/go.sum rename {sample/_example => _examples}/wakati/main.go (100%) create mode 100644 _examples/wasm/README.md create mode 100644 _examples/wasm/go.mod create mode 100644 _examples/wasm/go.sum rename {sample => _examples}/wasm/kagome.html (100%) rename {sample => _examples}/wasm/main.go (96%) delete mode 100644 sample/wasm/README.md delete mode 100644 sample/wasm/go.mod diff --git a/_examples/db_search/README.md b/_examples/db_search/README.md new file mode 100644 index 0000000..f308768 --- /dev/null +++ b/_examples/db_search/README.md @@ -0,0 +1,65 @@ +# Full-text search with Kagome and SQLite3 + +This example provides a practical example of how to work with Japanese text data and **perform efficient [full-text search](https://en.wikipedia.org/wiki/Full-text_search) using Kagome and SQLite3**. + +- Target text data is as follows: + +```text +人魚は、南の方の海にばかり棲んでいるのではありません。 +北の海にも棲んでいたのであります。 +北方の海の色は、青うございました。 +ある時、岩の上に、女の人魚があがって、 +あたりの景色を眺めながら休んでいました。 +小川未明 『赤い蝋燭と人魚』 +``` + +- Example output: + +```shellsession +$ cd /path/to/kagome/_examples/db_search +$ go run . +Searching for: 人魚 + Found content: 人魚は、南の方の海にばかり棲んでいるのではありません。 at line: 1 + Found content: ある時、岩の上に、女の人魚があがって、 at line: 4 + Found content: 小川未明 『赤い蝋燭と人魚』 at line: 6 +Searching for: 人 + No results found +Searching for: 北方 + Found content: 北方の海の色は、青うございました。 at line: 3 +Searching for: 北 + Found content: 北の海にも棲んでいたのであります。 at line: 2 +``` + +- [View main.go](main.go) + +## Details + +In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "人魚" and "人". + +Note that the string tokenized by Kagome, a.k.a. 
"Wakati", is recorded in a separate table for [FTS4](https://www.sqlite.org/fts3.html) (Full-Text-Search) at the same time as the original text. + +This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. + +### Aim of this example + +This example can be useful in scenarios where you need to perform full-text searches on Japanese text. + +It demonstrates how to tokenize Japanese text using Kagome, which is a common requirement when working with text data in the Japanese language. + +By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: + +1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. +2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. +3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. +4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. + +## Acknowledgements + +This example is taken in part from the following book for reference. 
+ +- p.204, 9.2 "データーベース登録プログラム", "Go言語プログラミングエッセンス エンジニア選書" + - Written by: [Mattn](https://github.com/mattn) + - Published: 2023/3/9 (技術評論社) + - ISBN: 4297134195 / 978-4297134198 + - ASIN: B0BVZCJQ4F / [https://amazon.co.jp/dp/4297134195](https://amazon.co.jp/dp/4297134195) + - Original sample code: [https://github.com/mattn/aozora-search](https://github.com/mattn/aozora-search) diff --git a/sample/_example/go.mod b/_examples/db_search/go.mod similarity index 88% rename from sample/_example/go.mod rename to _examples/db_search/go.mod index 1b747b4..c4c8eff 100644 --- a/sample/_example/go.mod +++ b/_examples/db_search/go.mod @@ -1,4 +1,4 @@ -module kagome/examples +module kagome/examples/db_search go 1.19 diff --git a/sample/_example/go.sum b/_examples/db_search/go.sum similarity index 100% rename from sample/_example/go.sum rename to _examples/db_search/go.sum diff --git a/sample/_example/db_search/main.go b/_examples/db_search/main.go similarity index 71% rename from sample/_example/db_search/main.go rename to _examples/db_search/main.go index a811a27..e92de2d 100644 --- a/sample/_example/db_search/main.go +++ b/_examples/db_search/main.go @@ -1,35 +1,9 @@ /* -# TL; DR +# Full-text search with Kagome and SQLite3 This example provides a practical example of how to work with Japanese text data and perform efficient full-text search using Kagome and SQLite3. -# TS; WM - -In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "人魚" and "人". - -Note that the string tokenized by Kagome, a.k.a. "Wakati", is recorded in a separate table for FTS (Full-Text-Search) at the same time as the original text. - -This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. - -Aim of this example: - -This example can be useful in scenarios where you need to perform full-text searches on Japanese text. 
It demonstrates how to tokenize Japanese text using Kagome, which is a common requirement when working with text data in the Japanese language. By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: - -1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. -2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. -3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. -4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. - -Acknowledgements: - -This example is taken in part from the following book for reference. - -- p.204, 9.2 "データーベース登録プログラム", "Go言語プログラミングエッセンス エンジニア選書" - - Written by: Mattn - - Published: 2023/3/9 (技術評論社) - - ISBN: 4297134195 / 978-4297134198 - - ASIN: B0BVZCJQ4F / https://amazon.co.jp/dp/4297134195 - - Original sample code: https://github.com/mattn/aozora-search +For details and acknowledgements, see the README.md file in the same directory. 
*/ package main @@ -39,6 +13,7 @@ import ( "fmt" "log" "os" + "slices" "strings" "github.com/ikawaha/kagome-dict/ipa" @@ -165,6 +140,9 @@ func insertSearchToken(db *sql.DB, rowID int64, content string) error { } seg := tknzr.Wakati(content) + + seg = slices.Compact(seg) // remove duplicate segment tokens + tokenizedContent := strings.Join(seg, " ") _, err = db.Exec( diff --git a/_examples/tokenize/go.mod b/_examples/tokenize/go.mod new file mode 100644 index 0000000..9b8c0fa --- /dev/null +++ b/_examples/tokenize/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/tokenize + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/tokenize/go.sum b/_examples/tokenize/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/tokenize/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/_example/tokenize/main.go b/_examples/tokenize/main.go similarity index 100% rename from sample/_example/tokenize/main.go rename to _examples/tokenize/main.go diff --git a/_examples/user_dict/go.mod b/_examples/user_dict/go.mod new file mode 100644 index 0000000..8c61210 --- /dev/null +++ b/_examples/user_dict/go.mod @@ -0,0 +1,11 @@ +module kagome/examples/user_dict + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict v1.0.9 + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/user_dict/go.sum 
b/_examples/user_dict/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/user_dict/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/_examples/user_dict/main.go b/_examples/user_dict/main.go new file mode 100644 index 0000000..148e223 --- /dev/null +++ b/_examples/user_dict/main.go @@ -0,0 +1,36 @@ +package main + +import ( + "fmt" + + "github.com/ikawaha/kagome-dict/dict" + "github.com/ikawaha/kagome-dict/ipa" + "github.com/ikawaha/kagome/v2/tokenizer" +) + +func main() { + // Use IPA dictionary as a system dictionary. + sysDic := ipa.Dict() + + // Build a user dictionary from a file. + userDic, err := dict.NewUserDict("userdict.txt") + if err != nil { + panic(err) + } + + // Specify the user dictionary as an option. 
+ t, err := tokenizer.New(sysDic, tokenizer.UserDict(userDic), tokenizer.OmitBosEos()) + if err != nil { + panic(err) + } + + tokens := t.Analyze("関西国際空港限定トートバッグ", tokenizer.Search) + for _, token := range tokens { + fmt.Printf("%s\t%v\n", token.Surface, token.Features()) + } + + // Output: + // 関西国際空港 [テスト名詞 関西/国際/空港 カンサイ/コクサイ/クウコウ] + // 限定 [名詞 サ変接続 * * * * 限定 ゲンテイ ゲンテイ] + // トートバッグ [名詞 一般 * * * * *] +} diff --git a/sample/dict/userdict.txt b/_examples/user_dict/userdict.txt similarity index 100% rename from sample/dict/userdict.txt rename to _examples/user_dict/userdict.txt diff --git a/_examples/wakati/go.mod b/_examples/wakati/go.mod new file mode 100644 index 0000000..3193fd0 --- /dev/null +++ b/_examples/wakati/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/wakati + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/wakati/go.sum b/_examples/wakati/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/wakati/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/_example/wakati/main.go b/_examples/wakati/main.go similarity index 100% rename from sample/_example/wakati/main.go rename to _examples/wakati/main.go diff --git a/_examples/wasm/README.md b/_examples/wasm/README.md new file mode 100644 index 0000000..ffbdc01 --- /dev/null +++ b/_examples/wasm/README.md @@ -0,0 +1,23 @@ +# WebAssembly Example of Kagome + +- Build + +```sh +GOOS=js GOARCH=wasm go build -o kagome.wasm main.go +``` + 
+```shellsession +├── docs ... gh-pages +│   ├── index.html +│   ├── kagome.wasm +│   └── wasm_exec.js +├── _examples +│   └── wasm +│   ├── README.md ... this document +│   ├── kagome.html ... html sample +│   ├── main.go ... source code +│   ├── go.mod +│   └── go.sum +``` + +- Online demo: [https://ikawaha.github.io/kagome/](https://ikawaha.github.io/kagome/) diff --git a/_examples/wasm/go.mod b/_examples/wasm/go.mod new file mode 100644 index 0000000..a916734 --- /dev/null +++ b/_examples/wasm/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/wasm + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/wasm/go.sum b/_examples/wasm/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/wasm/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/wasm/kagome.html b/_examples/wasm/kagome.html similarity index 100% rename from sample/wasm/kagome.html rename to _examples/wasm/kagome.html diff --git a/sample/wasm/main.go b/_examples/wasm/main.go similarity index 96% rename from sample/wasm/main.go rename to _examples/wasm/main.go index a4f08f0..24a2571 100644 --- a/sample/wasm/main.go +++ b/_examples/wasm/main.go @@ -1,5 +1,5 @@ -//go:build ignore -// +build ignore +//go:build js && wasm +// +build js,wasm package main diff --git a/sample/wasm/README.md b/sample/wasm/README.md deleted file mode 100644 index 9e72e2c..0000000 --- a/sample/wasm/README.md +++ /dev/null @@ -1,21 +0,0 @@ -WebAssembly Sample ---- 
- -``` -GOOS=js GOARCH=wasm go build -o kagome.wasm main.go -``` - - -``` -├── docs ... gh-pages -│   ├── index.html -│   ├── kagome.wasm -│   └── wasm_exec.js -├── sample -│   └── wasm -│   ├── README.md ... this document. -│   ├── go.mod -│   ├── kagome.html ... html sample -│   └── main.go ... -``` -demo. https://ikawaha.github.io/kagome/ \ No newline at end of file diff --git a/sample/wasm/go.mod b/sample/wasm/go.mod deleted file mode 100644 index 89d4416..0000000 --- a/sample/wasm/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module sample - -go 1.16 From 6c17f98ff90f763c3a326a80effaac1ed278d51f Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 13:18:15 +0900 Subject: [PATCH 3/7] chore: add go.work for multi-modules under _example --- _examples/go.work | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 _examples/go.work diff --git a/_examples/go.work b/_examples/go.work new file mode 100644 index 0000000..d50069c --- /dev/null +++ b/_examples/go.work @@ -0,0 +1,9 @@ +go 1.19 + +use ( + ./db_search + ./tokenize + ./user_dict + ./wakati + ./wasm +) From 63cabe1188829ffa4af89314201c58534e213d67 Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 13:44:21 +0900 Subject: [PATCH 4/7] fix: graphviz error during ci ("brew link" fail on macOS) --- .github/workflows/go.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index f016752..bccb6c7 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -28,6 +28,12 @@ jobs: go-version-file: 'go.mod' cache: true + - name: Remove symlink 2to3 + if: matrix.os == 'macos-latest' + run: | + : # Workaround GitHub Actions Python issues + brew unlink python && brew link --overwrite python + - name: Set up Graphviz uses: ts-graphviz/setup-graphviz@v2 From e212656ce01b236369d2c92c2d28ee8223d9a483 Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 14:23:36 +0900 Subject: [PATCH 5/7] chore: add comment for the workaround 
Sometimes `brew upgrade` of Graphviz action fails on macOS runner while upgrading python packages. This error is based on the below issue: - https://github.com/actions/setup-python/issues/577 Re-linking python will fix this problem: - Ref: https://github.com/Homebrew/homebrew-core/issues/165793#issuecomment-1989441193 --- .github/workflows/go.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index bccb6c7..e211d86 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -32,6 +32,7 @@ jobs: if: matrix.os == 'macos-latest' run: | : # Workaround GitHub Actions Python issues + : # https://github.com/Homebrew/homebrew-core/issues/165793#issuecomment-1989441193 brew unlink python && brew link --overwrite python - name: Set up Graphviz From 9b43806b7bdf36f7c3e03ebb2c1e71e11a9565fe Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 15:01:26 +0900 Subject: [PATCH 6/7] docs: more description about its functionality (FTS on SQLite3) --- _examples/db_search/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/_examples/db_search/README.md b/_examples/db_search/README.md index f308768..24972be 100644 --- a/_examples/db_search/README.md +++ b/_examples/db_search/README.md @@ -34,12 +34,18 @@ Searching for: 北 ## Details -In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "人魚" and "人". +In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "人魚", "人", "北方" and "北". -Note that the string tokenized by Kagome, a.k.a. "Wakati", is recorded in a separate table for [FTS4](https://www.sqlite.org/fts3.html) (Full-Text-Search) at the same time as the original text. +When inserting text data into the database, Kagome is used to tokenize the text into words. + +The string (or a line) tokenized by Kagome, a.k.a. 
"Wakati", is recorded in a separate table for [FTS4](https://www.sqlite.org/fts3.html) (Full-Text-Search) relative to the original text. This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. +Note that it is searching by word and not by character. For example "人" doesn't match "人魚". Likewise, "北" doesn't match "北方". + +This is due to the fact that the FTS4 module in SQLite3 is designed to search for words, not characters. + ### Aim of this example This example can be useful in scenarios where you need to perform full-text searches on Japanese text. From b65900b079835c2de0ed9e498e8a20451e1bc952 Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 15:30:59 +0900 Subject: [PATCH 7/7] docs: add README to each example --- _examples/db_search/README.md | 2 +- _examples/tokenize/README.md | 28 ++++++++++++++++++++++++++++ _examples/tokenize/main.go | 16 ++++++++-------- _examples/wakati/README.md | 25 +++++++++++++++++++++++++ _examples/wasm/README.md | 19 +++++++++++++++---- 5 files changed, 77 insertions(+), 13 deletions(-) create mode 100644 _examples/tokenize/README.md create mode 100644 _examples/wakati/README.md diff --git a/_examples/db_search/README.md b/_examples/db_search/README.md index 24972be..c77ac9a 100644 --- a/_examples/db_search/README.md +++ b/_examples/db_search/README.md @@ -55,7 +55,7 @@ It demonstrates how to tokenize Japanese text using Kagome, which is a common re By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: 1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. -2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. +2. 
**Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. 3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. 4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. diff --git a/_examples/tokenize/README.md b/_examples/tokenize/README.md new file mode 100644 index 0000000..00af8b9 --- /dev/null +++ b/_examples/tokenize/README.md @@ -0,0 +1,28 @@ +# Tokenizing Example with Kagome + +## Analyzing a Japanese text into words and parts of speech with Kagome + +This example demonstrates how to analyze (tokenize) a sentence and get the part-of-speech (POS) of each word using Kagome. + +- Target text data is as follows: + +```text +すもももももももものうち +``` + +- Example output: + +```shellsession +$ cd /path/to/kagome/_examples/tokenize +$ go run . +---tokenize--- +すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ +も 助詞,係助詞,*,*,*,*,も,モ,モ +もも 名詞,一般,*,*,*,*,もも,モモ,モモ +も 助詞,係助詞,*,*,*,*,も,モ,モ +もも 名詞,一般,*,*,*,*,もも,モモ,モモ +の 助詞,連体化,*,*,*,*,の,ノ,ノ +うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ +``` + +> __Note__ that tokenization varies depending on the dictionary used. In this example we use the IPA dictionary. 
diff --git a/_examples/tokenize/main.go b/_examples/tokenize/main.go index 9fd5710..d0601fa 100644 --- a/_examples/tokenize/main.go +++ b/_examples/tokenize/main.go @@ -22,12 +22,12 @@ func main() { } // Output: - //---tokenize--- - //すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ - //も 助詞,係助詞,*,*,*,*,も,モ,モ - //もも 名詞,一般,*,*,*,*,もも,モモ,モモ - //も 助詞,係助詞,*,*,*,*,も,モ,モ - //もも 名詞,一般,*,*,*,*,もも,モモ,モモ - //の 助詞,連体化,*,*,*,*,の,ノ,ノ - //うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ + // ---tokenize--- + // すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ + // も 助詞,係助詞,*,*,*,*,も,モ,モ + // もも 名詞,一般,*,*,*,*,もも,モモ,モモ + // も 助詞,係助詞,*,*,*,*,も,モ,モ + // もも 名詞,一般,*,*,*,*,もも,モモ,モモ + // の 助詞,連体化,*,*,*,*,の,ノ,ノ + // うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ } diff --git a/_examples/wakati/README.md b/_examples/wakati/README.md new file mode 100644 index 0000000..b64785c --- /dev/null +++ b/_examples/wakati/README.md @@ -0,0 +1,25 @@ +# Wakati Example with Kagome + +## Segmenting Japanese text into words with Kagome + +In this example, we demonstrate how to segment Japanese text into words using Kagome. + +- Target text data is as follows: + +```text +すもももももももものうち +``` + +- Example output: + +```shellsession +$ cd /path/to/kagome/_examples/wakati +$ go run . +----wakati--- +すもも/も/もも/も/もも/の/うち +``` + +> __Note__ that segmentation varies depending on the dictionary used. +> In this example we use the IPA dictionary. But for searching purposes, the Uni dictionary is recommended. +> +> - [What is a Kagome dictionary?](https://github.com/ikawaha/kagome/wiki/About-the-dictionary#what-is-a-kagome-dictionary) | Wiki | kagome @ GitHub diff --git a/_examples/wasm/README.md b/_examples/wasm/README.md index ffbdc01..0332802 100644 --- a/_examples/wasm/README.md +++ b/_examples/wasm/README.md @@ -1,11 +1,24 @@ -# WebAssembly Example of Kagome +# WebAssembly Example with Kagome -- Build +In this example we will demonstrate how to use Kagome in a WebAssembly application and show how responsive it can be. 
+- See: "[Kagome As a Server Side Tokenizer (Feeling Kagome Slow?)](https://github.com/ikawaha/kagome/wiki/Kagome-As-a-Server-Side-Tokenizer)" | Wiki | kagome @ GitHub + +## How to Use ```sh +# Build the wasm binary GOOS=js GOARCH=wasm go build -o kagome.wasm main.go + +# Copy wasm_exec.js that matches the compiled binary +cp "$(go env GOROOT)/misc/wasm/wasm_exec.js" . +**snip** ``` +Now call the `wasm_exec.js` and `kagome.wasm` from the HTML file and run a web server. + +- Online demo: [https://ikawaha.github.io/kagome/](https://ikawaha.github.io/kagome/) + ```shellsession ├── docs ... gh-pages │   ├── index.html @@ -19,5 +32,3 @@ GOOS=js GOARCH=wasm go build -o kagome.wasm main.go │   ├── go.mod │   └── go.sum ``` - -- Online demo: [https://ikawaha.github.io/kagome/](https://ikawaha.github.io/kagome/)