# test_ginza.py
from typing import Any, Dict, List, Text

import spacy

from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message
# Quick demo: run the GiNZA pipeline over a sample text and print the
# main attributes of every token, sentence by sentence.
nlp = spacy.load('ja_ginza')
doc = nlp('あのラーメン屋にはよく行く。美味しいんだ。')
for sent in doc.sents:
    for token in sent:
        info = [
            token.i,          # token index
            token.orth_,      # surface text
            token._.reading,  # reading in kana
            token.lemma_,     # lemma (base form)
            token.pos_,       # part of speech
            token.tag_,       # detailed part-of-speech tag
            token._.inf,      # inflection information
        ]
        print(info)
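
# Note: each spaCy token also carries its character offset into the original
# string via token.idx; the tokenizer below relies on this instead of
# re-scanning the text for every word. A minimal sketch using the pipeline
# loaded above:
for token in nlp('あのラーメン屋にはよく行く。'):
    print(token.text, token.idx)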
class JapaneseTokenizer(Tokenizer):
    """Custom Rasa tokenizer component that segments Japanese text with GiNZA."""

    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        # Store the GiNZA pipeline on the instance so tokenize() can use it.
        self.nlp = spacy.load('ja_ginza')
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        # Let GiNZA segment the text. Each spaCy token already knows its
        # character offset (token.idx), so no manual offset scan is needed.
        doc = self.nlp(text)
        tokens = []
        for word in doc:
            tokens.append(Token(word.text, word.idx))
        return tokens
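
# A minimal, hypothetical smoke test for the tokenizer above. It assumes a
# Rasa 1.x environment where Message can be constructed from raw text and
# message.get('text') returns that text; the attribute name may differ
# across Rasa versions.
if __name__ == '__main__':
    tokenizer = JapaneseTokenizer()
    message = Message('あのラーメン屋にはよく行く。')
    for token in tokenizer.tokenize(message, 'text'):
        print(token.text)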