Skip to content

Commit

Permalink
fix latin capital letter I with dot above (İ) AssertionError
Browse files (browse the repository at this point in the history)
  • Loading branch information
taishi-i committed Sep 8, 2022
1 parent c55a84b commit 59f7ffc
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
2 changes: 2 additions & 0 deletions nagisa/nagisa_utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ cpdef unicode normalize(unicode text):
# Prepare raw input text for downstream processing and return the cleaned
# unicode string. Steps, in order: utf8rstrip -> normalize -> map 'İ' to 'I'
# -> whitespace replacement.
# NOTE(review): utf8rstrip and normalize are defined outside this hunk; their
# exact behavior (presumably trailing-whitespace stripping with UTF-8 decoding,
# and Unicode normalization) should be confirmed against the full file.
cpdef unicode preprocess(text):
text = utf8rstrip(text)
text = normalize(text)
# Map LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130) to plain ASCII 'I'.
# This commit adds the mapping to avoid an AssertionError raised for inputs
# containing that character (presumably because lowercasing it changes the
# string length -- confirm against the caller that asserts on length).
text = text.replace('İ', 'I')
# NOTE(review): as rendered here both arguments look like the same space
# character, which would make this call a no-op; the page extraction may have
# normalized a different whitespace character (e.g. a full-width space).
# Verify the actual literals in the repository before relying on this line.
text = text.replace(' ', ' ')
return text

Expand All @@ -50,6 +51,7 @@ cpdef unicode preprocess_without_rstrip(text):
if type(text) != unicode:
text = unicode(text, 'utf-8')
text = normalize(text)
text = text.replace('İ', 'I')
text = text.replace(' ', ' ')
return text

Expand Down
8 changes: 7 additions & 1 deletion test/nagisa_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def test_tagging(self):
words = nagisa.tagging(text, lower=True)
self.assertEqual(output, str(words))


# test_3
text = 'ニューラルネットワークを使ってます。'
output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
Expand Down Expand Up @@ -116,6 +115,13 @@ def test_tagging(self):
words = nagisa.tagging(text)
self.assertEqual(output, str(words))

# test_26
text = "エラーを避けるため、İはIに変換される"
output = "エラー/名詞 を/助詞 避ける/動詞 ため/名詞 、/補助記号 I/名詞 は/助詞 I/名詞 に/助詞 変換/名詞 さ/動詞 れる/助動詞"
words = nagisa.tagging(text)
self.assertEqual(output, str(words))


def test_utils(self):
# test_20
output = "oov"
Expand Down

0 comments on commit 59f7ffc

Please sign in to comment.