Skip to content

Commit c3f97cb

Browse files
committed
feat(splitter): add split_to_substring for easy process
1 parent fa2da9f commit c3f97cb

File tree

4 files changed

+185
-51
lines changed

4 files changed

+185
-51
lines changed

README.md

Lines changed: 88 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ Vielen Dank, merci beaucoup, for your help.
3737
- [3.1. Installation](#31-installation)
3838
- [3.2. Sample Code](#32-sample-code)
3939
- [3.2.1. Basic](#321-basic)
40+
- [`split_to_substring`](#split_to_substring)
41+
- [Advanced](#advanced)
42+
- [`split`](#split)
4043
- [3.2.2. Chinese, Japanese, Korean, English (Simple Usage)](#322-chinese-japanese-korean-english-simple-usage)
4144
- [3.2.2.1. Code](#3221-code)
4245
- [3.2.2.2. Output](#3222-output)
@@ -59,15 +62,46 @@ pip install split-lang
5962
## 3.2. Sample Code
6063

6164
### 3.2.1. Basic
65+
#### `split_to_substring`
6266

6367
```python
64-
from langsplit import split
68+
from langsplit import split_to_substring
6569

6670
texts = [
6771
"你喜欢看アニメ吗?",
6872
]
6973

7074
for text in texts:
75+
substr = split_to_substring(
76+
text=text,
77+
verbose=False,
78+
lang_map=new_lang_map,
79+
threshold=4.9e-5,
80+
default_lang="en",
81+
)
82+
for index, item in enumerate(substr):
83+
print(f"{index}|{item.lang}:{item.text}")
84+
print("----------------------")
85+
```
86+
87+
```
88+
0|zh:你喜欢看
89+
1|ja:アニメ
90+
2|zh:吗
91+
3|punctuation:?
92+
```
93+
94+
### Advanced
95+
#### `split`
96+
97+
```python
98+
from langsplit import split
99+
100+
texts = [
101+
"我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
102+
]
103+
104+
for text in texts:
71105
substr_sections = split(
72106
text=text,
73107
verbose=False,
@@ -76,18 +110,64 @@ for text in texts:
76110
default_lang="en",
77111
)
78112
for index, section in enumerate(substr_sections):
79-
print(f"{index}: ", end="")
113+
print(f"{index}:{section.text}")
80114
if section.is_punctuation:
81-
print(f"{section.text}")
115+
print(f"\t|——punctuation:{section.text}")
82116
continue
83-
for substr in section.substrings:
84-
print(f"{substr.lang}:{substr.text}", end="|")
85-
print()
117+
for inner_index, substr in enumerate(section.substrings):
118+
print(f"\t|——{substr.lang}:{substr.text}")
119+
print("----------------------")
86120
```
87121

88122
```
89-
0: zh:你喜欢看|ja:アニメ|zh:吗|
90-
1: ?
123+
0:我是
124+
|——zh:我是
125+
1:VGroupChatBot
126+
|——en:VGroupChatBot
127+
2:,
128+
|——punctuation:,
129+
3:一个旨在支持多人通信的助手
130+
|——zh:一个旨在支持多人通信的助手
131+
4:,
132+
|——punctuation:,
133+
5:通过可视化消息来帮助团队成员更好地交流
134+
|——zh:通过可视化消息来帮助团队成员更好地交流
135+
6:。
136+
|——punctuation:。
137+
7:我可以帮助团队成员更好地整理和共享信息
138+
|——zh:我可以帮助团队成员更好地整理和共享信息
139+
8:,
140+
|——punctuation:,
141+
9:特别是在讨论
142+
|——zh:特别是在讨论
143+
10:、
144+
|——punctuation:、
145+
11:会议和
146+
|——zh:会议和
147+
12:Brainstorming
148+
|——en:Brainstorming
149+
13:等情况下
150+
|——zh:等情况下
151+
14:。
152+
|——punctuation:。
153+
15:你好我的名字是西野くまです
154+
|——zh:你好我的名字是
155+
|——ja:西野くまです
156+
16:my name is bob
157+
|——en:my name is bob
158+
17:很高兴认识你どうぞよろしくお願いいたします
159+
|——zh:很高兴认识你
160+
|——ja:どうぞよろしくお願いいたします
161+
18:「
162+
|——punctuation:「
163+
19:こんにちは
164+
|——ja:こんにちは
165+
20:」
166+
|——punctuation:」
167+
21:是什么意思
168+
|——zh:是什么意思
169+
22:。
170+
|——punctuation:。
91171
```
92172

93173
### 3.2.2. Chinese, Japanese, Korean, English (Simple Usage)

langsplit/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .split.splitter import split, SentenceSplitter, SubString
1+
from .split.splitter import split, split_to_substring, SentenceSplitter, SubString
22
from .detect_lang.detector import LANG_MAP, DEFAULT_LANG

langsplit/split/splitter.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,30 @@ def split(self, text: str, threshold: float = 4.9e-5, verbose=False) -> List[str
7575
default_sentence_splitter = SentenceSplitter()
7676

7777

78+
def split_to_substring(
    text: str,
    threshold: float = 4.9e-5,
    lang_map: Dict = None,
    default_lang: str = DEFAULT_LANG,
    verbose=False,
    splitter: SentenceSplitter = default_sentence_splitter,
) -> List[SubString]:
    """Split ``text`` and return the result as one flat list of ``SubString``.

    This is a convenience wrapper around ``split``: every argument is
    forwarded to it unchanged, and the sections it returns are flattened
    so callers do not have to walk sections and substrings themselves.

    Args:
        text: The text to split.
        threshold: Forwarded to ``split``.
        lang_map: Forwarded to ``split``.
        default_lang: Forwarded to ``split``.
        verbose: Forwarded to ``split``.
        splitter: Forwarded to ``split``.

    Returns:
        The substrings of all sections in order. A section flagged as
        punctuation contributes exactly one ``SubString`` whose ``lang`` is
        ``"punctuation"`` and whose ``text`` is the section text.
    """
    sections = split(
        text=text,
        threshold=threshold,
        lang_map=lang_map,
        default_lang=default_lang,
        verbose=verbose,
        splitter=splitter,
    )
    substrings: List[SubString] = []
    for section in sections:
        if section.is_punctuation:
            substrings.append(SubString(lang="punctuation", text=section.text))
            # A punctuation section is represented by the single entry above;
            # skip its own substrings so the same text is never emitted twice.
            continue
        substrings.extend(section.substrings)
    return substrings
100+
101+
78102
def split(
79103
text: str,
80104
threshold: float = 4.9e-5,

tests/test_split.py

Lines changed: 72 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from langsplit import split
1+
from langsplit import split, split_to_substring
22
from tests.data.generate_test_json import texts_de_fr_en, texts_zh_jp_ko_en
33

44

@@ -10,44 +10,74 @@
1010
"ja": "ja",
1111
}
1212

13-
for text in texts_zh_jp_ko_en:
14-
substr_sections = split(
15-
text=text,
16-
verbose=False,
17-
lang_map=new_lang_map,
18-
threshold=4.9e-5,
19-
default_lang="en",
20-
)
21-
for index, section in enumerate(substr_sections):
22-
print(f"{index}: ", end="")
23-
if section.is_punctuation:
24-
print(f"{section.text}")
25-
continue
26-
for substr in section.substrings:
27-
print(f"{substr.lang}:{substr.text}", end="|")
28-
print()
29-
print("----------------------")
30-
31-
for text in texts_de_fr_en:
32-
substr_sections = split(
33-
text=text,
34-
verbose=False,
35-
# lang_map=new_lang_map,
36-
threshold=4.9e-4,
37-
default_lang="en",
38-
)
39-
for index, section in enumerate(substr_sections):
40-
print(f"{index}: ", end="")
41-
if section.is_punctuation:
42-
print(f"{section.text}")
43-
continue
44-
for substr in section.substrings:
45-
print(f"{substr.lang}:{substr.text}", end="|")
46-
print()
47-
print("----------------------")
48-
49-
# for text in texts_de_fr_en:
50-
# substr_list = split(text=text, verbose=False, lang_map=new_lang_map, threshold=1e-3)
51-
# for index, substr in enumerate(substr_list):
52-
# print(f"{substr.lang}|{index}: {substr.text}")
53-
# print("----------------------")
13+
14+
def test_split():
    """Print the section / substring breakdown that ``split`` produces.

    Runs over both sample corpora; the output is meant for eyeballing, no
    assertions are made.
    """

    def _dump(sections):
        # One line per section, followed by either a punctuation marker or
        # one line per language-tagged substring, then a separator.
        for position, part in enumerate(sections):
            print(f"{position}:{part.text}")
            if part.is_punctuation:
                print(f"\t|——punctuation:{part.text}")
            else:
                for piece in part.substrings:
                    print(f"\t|——{piece.lang}:{piece.text}")
        print("----------------------")

    for sample in texts_zh_jp_ko_en:
        _dump(
            split(
                text=sample,
                verbose=False,
                lang_map=new_lang_map,
                threshold=4.9e-5,
                default_lang="en",
            )
        )

    # The de/fr/en corpus is run without a custom lang_map and with a
    # different threshold.
    for sample in texts_de_fr_en:
        _dump(
            split(
                text=sample,
                verbose=False,
                threshold=4.9e-4,
                default_lang="en",
            )
        )
48+
49+
50+
def test_split_to_substring():
    """Print the flat substring list that ``split_to_substring`` produces.

    Runs over both sample corpora; the output is meant for eyeballing, no
    assertions are made.
    """

    def _dump(pieces):
        # One "index|lang:text" line per substring, then a separator.
        for position, piece in enumerate(pieces):
            print(f"{position}|{piece.lang}:{piece.text}")
        print("----------------------")

    for sample in texts_zh_jp_ko_en:
        _dump(
            split_to_substring(
                text=sample,
                verbose=False,
                lang_map=new_lang_map,
                threshold=4.9e-5,
                default_lang="en",
            )
        )

    # The de/fr/en corpus is run without a custom lang_map and with a
    # different threshold.
    for sample in texts_de_fr_en:
        _dump(
            split_to_substring(
                text=sample,
                verbose=False,
                threshold=4.9e-4,
                default_lang="en",
            )
        )
74+
75+
76+
def main():
    """Run the demo when this file is executed as a script."""
    # Only the section-level demo runs by default; add a call to
    # test_split_to_substring() here to print the flattened output as well.
    test_split()


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)