Skip to content

Commit

Permalink
Merge pull request #41 from Kensuke-Mitsuzawa/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
Kensuke-Mitsuzawa committed Sep 27, 2017
2 parents 94d51ec + 8b52534 commit 8a28e2d
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 7 deletions.
6 changes: 5 additions & 1 deletion JapaneseTokenizer/common/sever_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
import os


class ProcessDownException(Exception):
    """Signal that the wrapped unix process did not answer in time.

    Raised by the handler's timeout callback when a query takes longer
    than ``timeout_second`` seconds. Callers can catch this type
    specifically (instead of a bare ``Exception``) and recover, e.g. by
    calling ``restart_process()`` and re-sending the query.
    """


class UnixProcessHandler(object):
def __init__(self,
command,
Expand Down Expand Up @@ -109,7 +113,7 @@ def __query(self, input_string):
buffer += line_string

# Timeout callback: raises an error whose message reports self.timeout_second and
# suggests changing the 'timeout_second' setting or calling restart_process().
# NOTE(review): the (signum, frame) signature suggests this is registered as a
# signal handler -- confirm where it is installed.
# NOTE(review): this is a diff rendering -- the first raise (plain Exception) is the
# removed line and the second (ProcessDownException) is its replacement; only the
# second one exists after the commit.
def __notify_handler(self, signum, frame):
raise Exception("""It takes longer time than {time} seconds. You're able to try,
raise ProcessDownException("""It takes longer time than {time} seconds. You're able to try,
1. Change your setting of 'timeout_second' parameter
2. Run restart_process() method when the exception happens.""".format(**{"time": self.timeout_second}))

Expand Down
27 changes: 24 additions & 3 deletions JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# modules
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess, juman_utils
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
from JapaneseTokenizer.common.sever_handler import JumanppHnadler, ProcessDownException
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedSenetence
from typing import List, Dict, Tuple, Union, TypeVar, Any, Callable
Expand Down Expand Up @@ -128,6 +128,13 @@ def __init__(self,
# type: (text_type,int,text_type,text_type,bool)->None
self.eos_pattern = pattern
self.is_use_pyknp = is_use_pyknp


if six.PY2:
self.dummy_text = 'これはダミーテキストです'.decode('utf-8')
elif six.PY3:
self.dummy_text = 'これはダミーテキストです'

if not server is None:
pattern = pattern.encode('utf-8')
else:
Expand All @@ -145,14 +152,18 @@ def __init__(self,

if server is None and self.is_use_pyknp:
# jumanpp-pexpect #
logger.debug('jumanpp wrapper is initialized with pyknp package')
self.jumanpp_obj = Jumanpp(
command=command,
timeout=timeout,
pattern=pattern,
**args)
elif server is None:
# jumanpp-pexpect #
self.jumanpp_obj = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern)
logger.debug('jumanpp wrapper is initialized with pexpect unix handler')
self.jumanpp_obj = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern) # type: JumanppHnadler
# put dummy sentence to avoid exception just after command initialization #
res = self.jumanpp_obj.query(self.dummy_text)
else:
# jumanpp-server #
self.jumanpp_obj = JumanppClient(hostname=server, port=port, timeout=timeout)
Expand Down Expand Up @@ -181,11 +192,21 @@ def call_juman_interface(self, input_str):
elif isinstance(self.jumanpp_obj, JumanppHnadler):
try:
result_token = self.jumanpp_obj.query(input_string=input_str)
except ProcessDownException:
"""Unix process is down by any reason."""
logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(self.jumanpp_obj.timeout_second))
self.jumanpp_obj.restart_process()
self.jumanpp_obj.query(self.dummy_text)
result_token = self.jumanpp_obj.query(input_string=input_str)
ml_token_object = MList(result_token)
except UnicodeDecodeError:
logger.warning(msg="Process is down by some reason. It restarts process automatically.")
self.jumanpp_obj.restart_process()
self.jumanpp_obj.query(self.dummy_text)
result_token = self.jumanpp_obj.query(input_string=input_str)
ml_token_object = MList(result_token)
ml_token_object = MList(result_token)
else:
ml_token_object = MList(result_token)
elif isinstance(self.jumanpp_obj, JumanppClient):
server_response = self.jumanpp_obj.query(sentence=input_str, pattern=self.eos_pattern)
ml_token_object = MList(server_response)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[![Build Status](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers.svg?branch=travis)](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers)[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)
[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)[![Build Status](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers.svg?branch=master)](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers)


# What's this?
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
else:
raise NotImplementedError()

version = '1.3.4'
version = '1.3.5'
name = 'JapaneseTokenizer'
short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'

Expand Down
16 changes: 16 additions & 0 deletions test/test_jumanpp_wrapper_python2.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,22 @@ def test_jumanpp_localmode_pyexpect(self):
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)

def test_jumanpp_huge_amount_text(self):
    """Tokenize many sentences through the pexpect handler, restarting the process midway.

    The same sentence is tokenized 500 times with a JumanppWrapper created
    with ``is_use_pyknp=False``. After every 100th sentence (except the
    very first one) the underlying unix process is restarted explicitly,
    and tokenization must keep returning TokenizedSenetence objects.
    """
    logger.info('under testing of processing huge amount of text...')
    seq_test_sentence = [u'外国人参政権を欲しい。'] * 500
    jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
    self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
    for i, test_s in enumerate(seq_test_sentence):
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)
        if i != 0 and i % 100 == 0:
            # Forcibly kill and restart the process.
            logger.info('It forces stop unix process.')
            jumanpp_tokenizer.jumanpp_obj.restart_process()


if __name__ == '__main__':
unittest.main()
18 changes: 17 additions & 1 deletion test/test_jumanpp_wrapper_python3.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_jumanpp_servermode_stress(self):
del jumanpp_tokenizer

def test_jumanpp_localmode_pyexpect(self):
"""pyexepectを使ったプロセス呼び出しのテスト"""
"""pexpectを使ったプロセス呼び出しのテスト"""
test_sentence = '外国人参政権を欲しい。'
jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
Expand All @@ -99,6 +99,22 @@ def test_jumanpp_localmode_pyexpect(self):
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)

def test_jumanpp_huge_amount_text(self):
    """Tokenize many sentences through the pexpect handler, restarting the process midway.

    The same sentence is tokenized 500 times with a JumanppWrapper created
    with ``is_use_pyknp=False``. After every 100th sentence (except the
    very first one) the underlying unix process is restarted explicitly,
    and tokenization must keep returning TokenizedSenetence objects.
    """
    logger.info('under testing of processing huge amount of text...')
    seq_test_sentence = ['外国人参政権を欲しい。'] * 500
    jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
    self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
    for i, test_s in enumerate(seq_test_sentence):
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)
        if i != 0 and i % 100 == 0:
            # Forcibly kill and restart the process.
            logger.info('It forces stop unix process.')
            jumanpp_tokenizer.jumanpp_obj.restart_process()


if __name__ == '__main__':
unittest.main()

0 comments on commit 8a28e2d

Please sign in to comment.