From 3f832563972a6e04c1a85292ac272b936b8233ba Mon Sep 17 00:00:00 2001 From: mozillazg Date: Sat, 13 Jul 2019 13:13:58 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20m=CC=84=20=C3=AA=CC=84=20?= =?UTF-8?q?=E1=BA=BF=20=C3=AA=CC=8C=20=E1=BB=81=20=E8=BF=99=E5=87=A0?= =?UTF-8?q?=E4=B8=AA=E9=9F=B3=E6=97=A0=E6=B3=95=E8=BD=AC=E6=8D=A2=E4=B8=BA?= =?UTF-8?q?=E4=B8=8D=E5=90=AB=E5=A3=B0=E8=B0=83=E7=BB=93=E6=9E=9C=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.rst | 7 +++++++ pypinyin/phonetic_symbol.py | 6 ++++++ pypinyin/style/_constants.py | 5 +++++ pypinyin/style/_constants.pyi | 1 + pypinyin/style/_utils.py | 20 ++++++++++---------- tests/test_pinyin.py | 16 +++++++++++++++- 6 files changed, 44 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index abea629b..88ea3c68 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,12 @@ Changelog --------- +`0.35.4`_ (2019-07-xx) ++++++++++++++++++++++++ + +* **[Bugfixed]** 修复 ``m̄`` ``ê̄`` ``ế`` ``ê̌`` ``ề`` 这几个音无法转换为不含声调结果的问题。 + + `0.35.3`_ (2019-05-11) ++++++++++++++++++++++++ @@ -795,3 +801,4 @@ __ https://github.com/mozillazg/python-pinyin/issues/8 .. _0.35.1: https://github.com/mozillazg/python-pinyin/compare/v0.35.0...v0.35.1 .. _0.35.2: https://github.com/mozillazg/python-pinyin/compare/v0.35.1...v0.35.2 .. _0.35.3: https://github.com/mozillazg/python-pinyin/compare/v0.35.2...v0.35.3 +.. _0.35.4: https://github.com/mozillazg/python-pinyin/compare/v0.35.3...v0.35.4 diff --git a/pypinyin/phonetic_symbol.py b/pypinyin/phonetic_symbol.py index d42a606b..53b8b2ce 100644 --- a/pypinyin/phonetic_symbol.py +++ b/pypinyin/phonetic_symbol.py @@ -41,7 +41,13 @@ "ň": "n3", "ǹ": "n4", + "m̄": "m1", # len('m̄') == 2 "ḿ": "m2", "m̀": "m4", # len("m̀") == 2 + + "ê̄": "ê1", # len('ê̄') == 2 + "ế": "ê2", + "ê̌": "ê3", # len('ê̌') == 2 + "ề": "ê4", } phonetic_symbol_reverse = dict((v, k) for k, v in phonetic_symbol.items()) diff --git a/pypinyin/style/_constants.py b/pypinyin/style/_constants.py index 04942eb7..0d4d98be 100644 --- a/pypinyin/style/_constants.py +++ b/pypinyin/style/_constants.py @@ -11,6 +11,11 @@ # 带声调字符与数字表示声调的对应关系 PHONETIC_SYMBOL_DICT = phonetic_symbol.phonetic_symbol.copy() +PHONETIC_SYMBOL_DICT_KEY_LENGTH_NOT_ONE = dict( + (k, v) + for k, v in PHONETIC_SYMBOL_DICT.items() + if len(k) > 1 +) # 匹配带声调字符的正则表达式 RE_PHONETIC_SYMBOL = re.compile( r'[{0}]'.format( diff --git a/pypinyin/style/_constants.pyi b/pypinyin/style/_constants.pyi index b39b17a7..341c9526 100644 --- a/pypinyin/style/_constants.pyi +++ b/pypinyin/style/_constants.pyi @@ -7,6 +7,7 @@ _INITIALS_NOT_STRICT = ... # type: List[Text] PHONETIC_SYMBOL_DICT = ... # type: Dict[Text, Text] +PHONETIC_SYMBOL_DICT_KEY_LENGTH_NOT_ONE = ... # type: Dict[Text, Text] RE_PHONETIC_SYMBOL = ... # type : Any diff --git a/pypinyin/style/_utils.py b/pypinyin/style/_utils.py index 06935efa..111316c5 100644 --- a/pypinyin/style/_utils.py +++ b/pypinyin/style/_utils.py @@ -5,6 +5,7 @@ from pypinyin.style._constants import ( _INITIALS, _INITIALS_NOT_STRICT, RE_PHONETIC_SYMBOL, PHONETIC_SYMBOL_DICT, + PHONETIC_SYMBOL_DICT_KEY_LENGTH_NOT_ONE, RE_NUMBER ) @@ -57,24 +58,23 @@ def _replace(match): return PHONETIC_SYMBOL_DICT[symbol] # 替换拼音中的带声调字符 - return RE_PHONETIC_SYMBOL.sub(_replace, pinyin).replace('m̀', 'm4') + value = RE_PHONETIC_SYMBOL.sub(_replace, pinyin) + for symbol, to in PHONETIC_SYMBOL_DICT_KEY_LENGTH_NOT_ONE.items(): + value = value.replace(symbol, to) + + return value def replace_symbol_to_no_symbol(pinyin): """把带声调字符替换为没有声调的字符""" - def _replace(match): - symbol = match.group(0) # 带声调的字符 - # 去掉声调: a1 -> a - return RE_NUMBER.sub(r'', PHONETIC_SYMBOL_DICT[symbol]) - - # 替换拼音中的带声调字符 - return RE_PHONETIC_SYMBOL.sub(_replace, pinyin).replace('m̀', 'm') + value = replace_symbol_to_number(pinyin) + return RE_NUMBER.sub('', value) def has_finals(pinyin): """判断是否有韵母""" - # 鼻音: 'ḿ', 'm̀', 'ń', 'ň', 'ǹ ' 没有韵母 - for symbol in ['ḿ', 'm̀', 'ń', 'ň', 'ǹ']: + # 鼻音: 'm̄', 'ḿ', 'm̀', 'ń', 'ň', 'ǹ ' 没有韵母 + for symbol in ['m̄', 'ḿ', 'm̀', 'ń', 'ň', 'ǹ', 'ê̄', 'ế', 'ê̌', 'ề']: if symbol in pinyin: return False diff --git a/tests/test_pinyin.py b/tests/test_pinyin.py index 80208fea..328e2c71 100644 --- a/tests/test_pinyin.py +++ b/tests/test_pinyin.py @@ -9,7 +9,7 @@ pinyin, slug, lazy_pinyin, load_single_dict, load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3, - BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC, CYRILLIC_FIRST + BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC, CYRILLIC_FIRST, Style ) from pypinyin.compat import SUPPORT_UCS4 from pypinyin.core import seg @@ -545,6 +545,20 @@ def test_m4(): han, heteronym=True, style=FINALS_TONE3) == [['m2', 'ou2', 'm4']] +@pytest.mark.parametrize('han,style,expect', [ + ['呣', Style.TONE, ['ḿ', 'm̀']], + ['呣', Style.TONE2, ['m2', 'm4']], + ['嘸', Style.TONE, ['m̄', 'ḿ']], + ['嘸', Style.TONE2, ['m1', 'm2']], + ['誒', Style.TONE, ['ê̄', 'ế', 'ê̌', 'ề']], + ['誒', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']], +]) +def test_m_e(han, style, expect): + result = pinyin(han, style=style, heteronym=True) + assert len(result) == 1 + assert (set(result[0]) & set(expect)) == set(expect) + + if __name__ == '__main__': import pytest pytest.cmdline.main()