From 59f7ffcc354652e0e1e36579c5c237056f7ae0b1 Mon Sep 17 00:00:00 2001 From: taishi-i Date: Thu, 8 Sep 2022 20:37:29 +0900 Subject: [PATCH] =?UTF-8?q?fix=20latin=20capital=20letter=20I=20with=20dot?= =?UTF-8?q?=20above=20(=C4=B0)=20AssertionError?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nagisa/nagisa_utils.pyx | 2 ++ test/nagisa_test.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nagisa/nagisa_utils.pyx b/nagisa/nagisa_utils.pyx index 42c75eb..dd3d7aa 100644 --- a/nagisa/nagisa_utils.pyx +++ b/nagisa/nagisa_utils.pyx @@ -42,6 +42,7 @@ cpdef unicode normalize(unicode text): cpdef unicode preprocess(text): text = utf8rstrip(text) text = normalize(text) + text = text.replace('İ', 'I') text = text.replace(' ', ' ') return text @@ -50,6 +51,7 @@ cpdef unicode preprocess_without_rstrip(text): if type(text) != unicode: text = unicode(text, 'utf-8') text = normalize(text) + text = text.replace('İ', 'I') text = text.replace(' ', ' ') return text diff --git a/test/nagisa_test.py b/test/nagisa_test.py index fbbaf6b..aa41983 100644 --- a/test/nagisa_test.py +++ b/test/nagisa_test.py @@ -18,7 +18,6 @@ def test_tagging(self): words = nagisa.tagging(text, lower=True) self.assertEqual(output, str(words)) - # test_3 text = 'ニューラルネットワークを使ってます。' output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号' @@ -116,6 +115,13 @@ def test_tagging(self): words = nagisa.tagging(text) self.assertEqual(output, str(words)) + # test_26 + text = "エラーを避けるため、İはIに変換される" + output = "エラー/名詞 を/助詞 避ける/動詞 ため/名詞 、/補助記号 I/名詞 は/助詞 I/名詞 に/助詞 変換/名詞 さ/動詞 れる/助動詞" + words = nagisa.tagging(text) + self.assertEqual(output, str(words)) + + def test_utils(self): # test_20 output = "oov"