Skip to content

Commit

Permalink
修复韵母相关拼音风格在 strict=True 时未按预期只返回拼音标准中定义过的韵母
Browse files Browse the repository at this point in the history
Fixes #266
  • Loading branch information
mozillazg committed Jan 23, 2022
1 parent a421a83 commit 9521e47
Show file tree
Hide file tree
Showing 23 changed files with 840 additions and 571 deletions.
42 changes: 19 additions & 23 deletions pypinyin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from enum import IntEnum, unique

from pypinyin import pinyin_dict
from pypinyin.compat import SUPPORT_UCS4

# 词语拼音库
if os.environ.get('PYPINYIN_NO_PHRASES'):
Expand All @@ -30,28 +29,25 @@
# Matches a string ending in one letter from "aeoiuvnm" followed by a single
# tone digit 1-4; group 1 captures the letter and group 2 captures the digit.
RE_TONE2 = re.compile(r'([aeoiuvnm])([1-4])$')

# 有拼音的汉字
# Pattern for strings made up entirely of Han characters covered here.
# The supplementary-plane ranges (\U000xxxxx) are only compiled in when
# SUPPORT_UCS4 is truthy; otherwise only the BMP ranges are used.
# NOTE(review): SUPPORT_UCS4 is imported from pypinyin.compat; presumably it
# flags interpreters that can represent code points above U+FFFF -- confirm.
RE_HANS = re.compile(
    (
        r'^(?:['
        r'\u3007'                  # 〇
        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
        r'\u4e00-\u9fff'           # CJK basic block: [4E00-9FFF]
        r'\uf900-\ufaff'           # CJK Compatibility: [F900-FAFF]
        r'\U00020000-\U0002A6DF'   # CJK Extension B: [20000-2A6DF]
        r'\U0002A703-\U0002B73F'   # CJK Extension C: [2A700-2B73F]
        r'\U0002B740-\U0002B81D'   # CJK Extension D: [2B740-2B81D]
        r'\U0002F80A-\U0002FA1F'   # CJK Compatibility Supplement: [2F800-2FA1F]
        r'])+$'
    ) if SUPPORT_UCS4 else (  # pragma: no cover
        r'^(?:['
        r'\u3007'                  # 〇
        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
        r'\u4e00-\u9fff'           # CJK basic block: [4E00-9FFF]
        r'\uf900-\ufaff'           # CJK Compatibility: [F900-FAFF]
        r'])+$'
    )
)
# Pattern for strings made up entirely of Han characters covered here.
# The character class is assembled from the list below, one entry per
# code point or code-point range, in the original order.
RE_HANS = re.compile(
    r'^(?:[' + ''.join([
        r'\u3007',                 # 〇
        # NOTE(review): U+E815-U+E864 lie in the Unicode Private Use Area;
        # presumably legacy GBK/GB18030 mappings -- confirm.
        r'\ue815-\ue864',
        # Already inside the F900-FAFF range below; kept as listed.
        r'\ufa18',
        r'\u3400-\u4dbf',          # CJK Extension A: [3400-4DBF]
        r'\u4e00-\u9fff',          # CJK basic block: [4E00-9FFF]
        r'\uf900-\ufaff',          # CJK Compatibility: [F900-FAFF]
        r'\U00020000-\U0002A6DF',  # CJK Extension B: [20000-2A6DF]
        r'\U0002A703-\U0002B73F',  # CJK Extension C: [2A700-2B73F]
        r'\U0002B740-\U0002B81D',  # CJK Extension D: [2B740-2B81D]
        # NOTE(review): the next two ranges fall inside CJK Extension E
        # [2B820-2CEAF] -- confirm against the Unicode charts.
        r'\U0002B825-\U0002BF6E',
        r'\U0002C029-\U0002CE93',
        # NOTE(review): appears to be a single CJK Extension F code point.
        r'\U0002D016',
        r'\U0002F80A-\U0002FA1F',  # CJK Compatibility Supplement: [2F800-2FA1F]
        # NOTE(review): appear to be two CJK Extension G code points.
        r'\U00030EDD',
        r'\U00030EDE',
    ]) + r'])+$'
)


@unique
Expand Down
44 changes: 2 additions & 42 deletions pypinyin/contrib/_tone_rule.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

"""
标调位置
有 ɑ 不放过,
  没 ɑ 找 o、e;
  ɑ、o、e、i、u、ü
  标调就按这顺序;
  i、u 若是连在一起,
  谁在后面就标谁。
http://www.hwjyw.com/resource/content/2010/06/04/8183.shtml
https://www.zhihu.com/question/23655297
https://github.com/mozillazg/python-pinyin/issues/160
http://www.pinyin.info/rules/where.html
"""


def right_mark_index(pinyin_no_tone):
    """Return the index of the character that should carry the tone mark.

    The search follows a fixed priority:

    1. The first of ``a``, ``o``, ``e`` (checked in that order) present.
    2. For ``iu`` / ``ui``, the second letter of the pair.
    3. The first of ``i``, ``u``, ``v``, ``ü`` present.
    4. The first of ``n``, ``m``, ``ê`` present.

    :param pinyin_no_tone: pinyin string without any tone marking.
    :return: zero-based index into ``pinyin_no_tone``, or ``None`` when
             none of the characters above occur in it.
    """
    # "a" always wins; without "a", look for "o", then "e".
    for vowel in ('a', 'o', 'e'):
        position = pinyin_no_tone.find(vowel)
        if position != -1:
            return position

    # When "i" and "u" are adjacent, the mark goes on whichever comes second.
    for pair in ('iu', 'ui'):
        position = pinyin_no_tone.find(pair)
        if position != -1:
            return position + 1

    # Remaining single vowels first, then the syllabic n / m / ê.
    for candidate in ('i', 'u', 'v', 'ü', 'n', 'm', 'ê'):
        position = pinyin_no_tone.find(candidate)
        if position != -1:
            return position

    return None
# 向后兼容
from pypinyin.style._tone_rule import right_mark_index # noqa
Loading

0 comments on commit 9521e47

Please sign in to comment.