Skip to content

Commit 14ea85a

Browse files
committed
feat(detector): support and test texts in which fr, de, and en appear at the same time.
fix(splitter): add parameter `lang_map`, which allows mapping different languages to the same language for better results when you know the range of your target languages
1 parent 48caf3d commit 14ea85a

File tree

4 files changed

+106
-19
lines changed

4 files changed

+106
-19
lines changed

langsplit/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .split.splitter import split
1+
from .split.splitter import split, SentenceSplitter, SubString
2+
from .detect_lang.detector import LANG_MAP

langsplit/detect_lang/detector.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
from langdetect import detect
22
import fast_langdetect
33

4-
lang_map = {
4+
LANG_MAP = {
55
"zh": "zh",
66
"zh-cn": "zh",
77
"zh-tw": "x",
88
"ko": "ko",
99
"ja": "ja",
10+
"de": "de",
11+
"fr": "fr",
12+
"en": "en",
13+
"x": "en",
1014
}
1115

1216

langsplit/split/splitter.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
from typing import List
1+
from typing import List, Dict
22
from dataclasses import dataclass
33

44
from langdetect.lang_detect_exception import LangDetectException
55
from wtpsplit import SaT, WtP
66

7-
from langsplit.detect_lang.detector import detect_lang, fast_detect_lang, lang_map
7+
from langsplit.detect_lang.detector import detect_lang, fast_detect_lang, LANG_MAP
88

99

1010
@dataclass
@@ -29,33 +29,42 @@ def split(self, text: str, threshold: float = 5e-5, verbose=False):
2929
def split(
3030
text: str,
3131
threshold: float = 5e-5,
32+
lang_map=None,
3233
verbose=False,
3334
splitter: SentenceSplitter = default_sentence_splitter,
34-
):
35+
) -> List[SubString]:
3536
"""using
3637
1. `wtpsplit` to split sentences into 'small' substring
3738
2. concat substring based on language using `fasttext` and `langdetect`
3839
3940
Args:
4041
text (str): text to split
41-
threshold (float, optional): the lower the more separated (more) substring will return. Defaults to 5e-5.
42+
threshold (float, optional): the lower the threshold, the more (and smaller) substrings are returned. Defaults to 5e-5 (if your text contains no Chinese, Japanese, or Korean, 1e-3 is suggested).
43+
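lang_map (Dict[str, str], optional): maps detected language codes to the language codes to report, so that several detected languages can be treated as the same language; gives better results if you know the range of your target languages. Defaults to None, in which case `LANG_MAP` is used.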
44+
verbose (bool, optional): print the process. Defaults to False.
45+
splitter (SentenceSplitter, optional): sentence splitter. Defaults to default_sentence_splitter.
46+
47+
Returns:
48+
List[SubString]: the resulting substrings, each with `.lang` and `.text`
4249
"""
4350
substr_list = splitter.split(text=text, threshold=threshold, verbose=verbose)
4451
if verbose:
4552
print(f"substr_list: {substr_list}")
46-
substr_list = _init_substr_lang(substr_list)
53+
substr_list = _init_substr_lang(substr=substr_list, lang_map=lang_map)
4754
if verbose:
4855
print(f"substr_list: {substr_list}")
49-
substr_list = _smart_concat(substr_list)
56+
substr_list = _smart_concat(substr_list=substr_list, lang_map=lang_map)
5057
if verbose:
5158
print(f"split_result: {substr_list}")
5259
return substr_list
5360

5461

55-
def _smart_concat(substr_list: List[SubString]):
62+
def _smart_concat(substr_list: List[SubString], lang_map=None):
63+
if lang_map is None:
64+
lang_map = LANG_MAP
5665
is_concat_complete = False
5766
while is_concat_complete is False:
58-
substr_list = _smart_concat_logic(substr_list)
67+
substr_list = _smart_concat_logic(substr_list, lang_map=lang_map)
5968
is_concat_complete = True
6069
for index, block in enumerate(substr_list):
6170
if block.lang == "x":
@@ -68,15 +77,20 @@ def _smart_concat(substr_list: List[SubString]):
6877
return substr_list
6978

7079

71-
def _init_substr_lang(substr: List[str]) -> List[SubString]:
80+
def _init_substr_lang(substr: List[str], lang_map=None) -> List[SubString]:
7281
concat_result = []
7382
lang = ""
83+
if lang_map is None:
84+
lang_map = LANG_MAP
7485
for block in substr:
7586
try:
7687
cur_lang = detect_lang(block)
7788
except LangDetectException:
7889
cur_lang = lang
79-
cur_lang = lang_map.get(cur_lang, "en")
90+
cur_lang = lang_map.get(cur_lang, "x")
91+
if cur_lang == "x":
92+
cur_lang = fast_detect_lang(block)
93+
cur_lang = lang_map.get(cur_lang, "x")
8094
concat_result.append(SubString(cur_lang, block))
8195
lang = cur_lang
8296
return concat_result
@@ -143,7 +157,7 @@ def _find_nearest_lang_with_direction(
143157
for i in range(1, len(concat_result)):
144158
if index + i < len(concat_result) and concat_result[index + i].lang != "x":
145159
return concat_result[index + i].lang
146-
return "en"
160+
return "x"
147161

148162

149163
def _get_find_direction(substr_list: List[SubString], index: int) -> bool:
@@ -179,13 +193,15 @@ def _merge_blocks(concat_result: List[SubString]):
179193
return smart_concat_result
180194

181195

182-
def _check_languages(lang_text_list: List[SubString]):
196+
def _check_languages(lang_text_list: List[SubString], lang_map=None):
197+
if lang_map is None:
198+
lang_map = LANG_MAP
183199
for index, block in enumerate(lang_text_list):
184200
try:
185201
cur_lang = fast_detect_lang(block.text)
186202
except LangDetectException:
187-
cur_lang = "en"
188-
cur_lang = lang_map.get(cur_lang, "en")
203+
cur_lang = "x"
204+
cur_lang = lang_map.get(cur_lang, "x")
189205
if cur_lang == "ko":
190206
fast_lang = fast_detect_lang(block.text, text_len_threshold=0)
191207
if fast_lang != "ko":
@@ -198,13 +214,13 @@ def _check_languages(lang_text_list: List[SubString]):
198214
return lang_text_list
199215

200216

201-
def _smart_concat_logic(lang_text_list: List[SubString]):
217+
def _smart_concat_logic(lang_text_list: List[SubString], lang_map=None):
202218
lang_text_list = _merge_middle_substr_to_two_side(lang_text_list)
203219
lang_text_list = _merge_blocks(lang_text_list)
204-
lang_text_list = _check_languages(lang_text_list)
220+
lang_text_list = _check_languages(lang_text_list=lang_text_list, lang_map=lang_map)
205221
lang_text_list = _merge_middle_substr_to_two_side(lang_text_list)
206222
lang_text_list = _fill_missing_languages(lang_text_list)
207223
lang_text_list = _merge_two_side_substr_to_near(lang_text_list)
208224
lang_text_list = _merge_blocks(lang_text_list)
209-
lang_text_list = _check_languages(lang_text_list)
225+
lang_text_list = _check_languages(lang_text_list=lang_text_list, lang_map=lang_map)
210226
return lang_text_list

tests/test_split_multi_lang.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from langsplit import split
2+
3+
texts = [
4+
"我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
5+
"你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
6+
"你好,今日はどこへ行きますか?",
7+
"我的名字是田中さんです。",
8+
"我喜欢吃寿司和拉面おいしいです。",
9+
"今天の天気はとてもいいですね。",
10+
"我在学习日本語少し難しいです。",
11+
"日语真是おもしろい啊",
12+
"你喜欢看アニメ吗?",
13+
"我想去日本旅行、特に京都に行きたいです。",
14+
"昨天見た映画はとても感動的でした。" "我朋友是日本人、彼はとても優しいです。",
15+
"我们一起去カラオケ吧、楽しそうです。",
16+
"你今天吃了什么、朝ごはんは何ですか?",
17+
"我的家在北京、でも、仕事で東京に住んでいます。",
18+
"我在学做日本料理、日本料理を作るのを習っています。",
19+
"你会说几种语言、何ヶ国語話せますか?",
20+
"我昨天看了一本书、その本はとても面白かったです。",
21+
"我们一起去逛街、買い物に行きましょう。",
22+
"你最近好吗、最近どうですか?",
23+
"我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
24+
"你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
25+
"我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
26+
"我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
27+
"你最近好吗、最近どうですか?요즘 어떻게 지내요?",
28+
]
29+
30+
texts_2 = [
31+
"Ich liebe Paris, c'est une belle ville, and the food is amazing!",
32+
"Berlin ist wunderbar, je veux y retourner, and explore more.",
33+
"Bonjour, wie geht's dir today?",
34+
"Die Musik hier ist fantastisch, la musique est superbe, and I enjoy it a lot.",
35+
"Guten Morgen, je t'aime, have a great day!",
36+
"Das Wetter ist heute schön, il fait beau aujourd'hui, and it's perfect for a walk.",
37+
"Ich mag dieses Buch, ce livre est intéressant, and it has a great story.",
38+
"Vielen Dank, merci beaucoup, for your help.",
39+
"Wir reisen nach Deutschland, nous voyageons en Allemagne, and we are excited.",
40+
"Ich bin müde, je suis fatigué, and I need some rest.",
41+
]
42+
43+
new_lang_map = {
44+
"zh": "zh",
45+
"zh-cn": "zh",
46+
"zh-tw": "x",
47+
"ko": "ko",
48+
"ja": "ja",
49+
"de": "de",
50+
"fr": "fr",
51+
"en": "en",
52+
"x": "en",
53+
}
54+
55+
56+
for text in texts:
57+
substr_list = split(text=text, verbose=False, lang_map=new_lang_map, threshold=5e-5)
58+
for index, substr in enumerate(substr_list):
59+
print(f"{substr.lang}|{index}: {substr.text}")
60+
print("----------------------")
61+
62+
for text in texts_2:
63+
substr_list = split(text=text, verbose=False, lang_map=new_lang_map, threshold=1e-3)
64+
for index, substr in enumerate(substr_list):
65+
print(f"{substr.lang}|{index}: {substr.text}")
66+
print("----------------------")

0 commit comments

Comments
 (0)