Merge pull request #54 from Kensuke-Mitsuzawa/enhancement/#53
cleaned up type hint
Kensuke-Mitsuzawa committed Jan 21, 2019
2 parents 76e2e5c + 3ef0cf0 commit 45af698
Showing 11 changed files with 49 additions and 49 deletions.
2 changes: 1 addition & 1 deletion JapaneseTokenizer/common/juman_utils.py
@@ -62,4 +62,4 @@ def feature_parser(uni_feature, word_surface)
else:
word_stem = word_surface

return tuple_pos, word_stem
return tuple_pos, word_stem
16 changes: 6 additions & 10 deletions JapaneseTokenizer/common/sever_handler.py
@@ -7,7 +7,6 @@
# logger
from JapaneseTokenizer import init_logger
import logging
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# typing
from typing import Union
# else
@@ -17,6 +16,7 @@
import shutil
import signal
import os
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))


class ProcessDownException(Exception):
@@ -29,8 +29,8 @@ def __init__(self,
option=None,
pattern='EOS',
timeout_second=10):
"""* Get communication with unix process using pexpect module."""
# type: (text_type,text_type,text_type,int)->None
"""* Get communication with unix process using pexpect module."""
self.command = command
self.timeout_second = timeout_second
self.pattern = pattern
@@ -42,10 +42,10 @@ def __del__(self):
self.process_analyzer.kill(sig=9)

def launch_process(self, command):
# type: (Union[bytes,text_type])->None
"""* What you can do
- It starts process and keep it.
"""
# type: (Union[bytes,text_type])->None
if not self.option is None:
command_plus_option = self.command + " " + self.option
else:
@@ -67,7 +67,6 @@ def launch_process(self, command):
self.process_id = self.process_analyzer.pid

def restart_process(self):
""""""
# type: ()->None
if not self.option is None:
command_plus_option = self.command + " " + self.option
@@ -79,10 +78,10 @@ def restart_process(self):
self.process_id = self.process_analyzer.pid

def stop_process(self):
# type: ()->bool
"""* What you can do
- You're able to stop the process which this instance has now.
"""
# type: ()->bool
if hasattr(self, "process_analyzer"):
self.process_analyzer.kill(sig=9)
else:
@@ -91,11 +90,11 @@ def stop_process(self):
return True

def __query(self, input_string):
# type: (text_type)->text_type
"""* What you can do
- It takes the result of Juman++
- This function monitors time which takes for getting the result.
"""
# type: (text_type)->text_type
signal.signal(signal.SIGALRM, self.__notify_handler)
signal.alarm(self.timeout_second)
self.process_analyzer.sendline(input_string)
@@ -118,8 +117,6 @@ def __notify_handler(self, signum, frame):
2. Run restart_process() method when the exception happens.""".format(**{"time": self.timeout_second}))

def query(self, input_string):
"""* What you can do
"""
# type: (text_type)->text_type
return self.__query(input_string=input_string)

@@ -135,6 +132,5 @@ def __init__(self,
super(JumanppHnadler, self).__init__(command=jumanpp_command, option=option, pattern=pattern, timeout_second=timeout_second)

def launch_jumanpp_process(self, command):
""""""
# type: (text_type)->None
return self.launch_process(command)
return self.launch_process(command)
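Note on the pattern above: every hunk in sever_handler.py moves the `# type:` comment so it sits immediately under the `def` line, above the docstring. PEP 484 requires that placement for function type comments; left below the docstring, checkers such as mypy never see the annotation. A minimal sketch of the convention (names here are illustrative, not taken from the diff):

```python
from typing import Optional


def launch(command, option=None, timeout_second=10):
    # type: (str, Optional[str], int) -> None
    """The type comment sits on the first line after the signature;
    the docstring follows. With the old ordering, type checkers
    ignore the comment entirely."""
    full_command = command if option is None else command + ' ' + option
    print(full_command)
```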
8 changes: 4 additions & 4 deletions JapaneseTokenizer/common/text_preprocess.py
@@ -33,14 +33,14 @@ def b(str): return str.encode("utf-8")


def denormalize_text(input_text):
# type: (text_type)->text_type
"""* What you can do
- It converts text into standard japanese writing way
* Note
- hankaku-katakana is to zenkaku-katakana
- zenkaku-eisu is to hankaku-eisu
"""
# type: (text_type)->text_type
if input_text in STRING_EXCEPTION:
return input_text
else:
@@ -54,13 +54,13 @@ def normalize_text(input_text,
is_kana=True,
is_ascii=True,
is_digit=True):
# type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
"""* What you can do
- It converts input-text into normalized-text which is good for tokenizer input.
* Params
- new_line_replaced: a string which replaces from \n string.
"""
# type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
if is_replace_eos:
without_new_line = input_text.replace('\n', new_line_replaced)
else:
@@ -75,9 +75,9 @@


def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
# type: (text_type,bool,bool,bool)->text_type
"""
* All hankaku Katanaka is converted into Zenkaku Katakana
* All hankaku English alphabet and numberc string are converted into Zenkaku one
"""
# type: (text_type,bool,bool,bool)->text_type
return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)
return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)
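For context, `jaconv.h2z` converts half-width (hankaku) characters to full-width (zenkaku), and the keyword flags pick which character classes are touched. A small usage sketch of what `normalize_text_normal_ipadic` delegates to (assuming jaconv is installed; output described in comments rather than asserted):

```python
import jaconv

text = 'ガギグゲゴ abc 123'  # half-width katakana, ASCII letters, digits

# Convert all three classes, as normalize_text_normal_ipadic does by default:
print(jaconv.h2z(text, kana=True, ascii=True, digit=True))

# Convert katakana only, leaving the alphanumerics half-width:
print(jaconv.h2z(text, kana=True, ascii=False, digit=False))
```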
4 changes: 3 additions & 1 deletion JapaneseTokenizer/common/timeout_handler.py
@@ -1,9 +1,11 @@
#! -*- coding: utf-8 -*-
from functools import wraps


class TimeoutException(Exception):
pass


def handler_func(msg):
raise TimeoutException()

@@ -26,4 +28,4 @@ def __wrapper(*args, **kwargs):
signal.alarm(0)
return result
return wraps(function)(__wrapper)
return __decorator
return __decorator
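The hunk above shows only fragments of the decorator, which arms SIGALRM so that a hung tokenizer call raises `TimeoutException` instead of blocking forever. A self-contained sketch of that pattern (Unix-only; the decorator name and handler signature are assumptions, since the full file is not shown):

```python
import signal
from functools import wraps
from typing import Callable


class TimeoutException(Exception):
    pass


def handler_func(signum, frame):
    # standard signal-handler signature; the file's handler takes a msg argument instead
    raise TimeoutException()


def on_timeout(limit):
    # type: (int) -> Callable
    def __decorator(function):
        def __wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, handler_func)
            signal.alarm(limit)   # deliver SIGALRM after `limit` seconds
            try:
                result = function(*args, **kwargs)
            finally:
                signal.alarm(0)   # disarm whether the call succeeded or timed out
            return result
        return wraps(function)(__wrapper)
    return __decorator
```

Wrapping a blocking call such as `process.expect(pattern)` with `@on_timeout(limit=10)` then turns a stuck backend process into a catchable exception.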
33 changes: 19 additions & 14 deletions JapaneseTokenizer/datamodels.py
@@ -4,7 +4,7 @@
# datemodels #
from MeCab import Node
# typing #
from typing import List, Union, Any, Tuple, Dict, Callable
from typing import List, Union, Any, Tuple, Dict, Callable, Optional
from future.utils import text_type, string_types
import sys
import six
@@ -23,12 +23,11 @@ def __is_sotpwords(token, stopwords):


def __is_valid_pos(pos_tuple, valid_pos):
# type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool
"""This function checks token's pos is with in POS set that user specified.
If token meets all conditions, Return True; else return False
"""
# type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool
def is_valid_pos(valid_pos_tuple):
""""""
# type: (Tuple[text_type,...])->bool
length_valid_pos_tuple = len(valid_pos_tuple)
if valid_pos_tuple == pos_tuple[:length_valid_pos_tuple]:
@@ -93,9 +92,16 @@ def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'):


class TokenizedResult(object):
def __init__(self, node_obj, tuple_pos, word_stem, word_surface,
is_feature=True, is_surface=False, misc_info=None, analyzed_line=None):
# type: (Union[Node, None], Union[str, Tuple[text_type, ...], str, str, bool, bool, Union[None, Dict[str, Any]], str])->None
def __init__(self,
node_obj,
tuple_pos,
word_stem,
word_surface,
is_feature=True,
is_surface=False,
misc_info=None,
analyzed_line=None):
# type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str)->None
assert isinstance(node_obj, (Node, type(None)))
assert isinstance(tuple_pos, (string_types, tuple))
assert isinstance(word_stem, (string_types))
@@ -120,12 +126,12 @@ def __init__(self, node_obj, tuple_pos, word_stem, word_surface,

class TokenizedSenetence(object):
def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
# type: (text_type, List[TokenizedResult], text_type)->None
"""* Parameters
- sentence: sentence
- tokenized_objects: list of TokenizedResult object
- string_encoding: Encoding type of string type. This option is used only under python2.x
"""
# type: (text_type, List[TokenizedResult])->None
assert isinstance(sentence, text_type)
assert isinstance(tokenized_objects, list)

@@ -137,9 +143,9 @@ def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
def __extend_token_object(self, token_object,
is_denormalize=True,
func_denormalizer=denormalize_text):
# type: (TokenizedResult,bool,Callable[[str],str])->Tuple
"""This method creates dict object from token object.
"""
# type: (TokenizedResult,bool,Callable[[str],str])->Tuple[str,...]
assert isinstance(token_object, TokenizedResult)

if is_denormalize:
@@ -170,14 +176,14 @@ def __extend_token_object(self, token_object,
def convert_list_object(self,
is_denormalize=True,
func_denormalizer=denormalize_text):
# type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]]
"""* What you can do
- You extract string object from TokenizedResult object
* Args
- is_denormalize: boolen object. True; it makes denormalize string
- func_denormalizer: callable object. de-normalization function.
"""
# type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]]
sentence_in_list_obj = [
self.__extend_token_object(token_object,is_denormalize,func_denormalizer)
for token_object
@@ -187,14 +193,14 @@ def convert_list_object(self,
return sentence_in_list_obj

def __convert_string_type(self, p_c_tuple):
# type: (Tuple[text_type,...])->Tuple[text_type]
"""* What you can do
- it normalizes string types into str
"""
# type: (Tuple[text_type,...])->Tuple[text_type]
if not isinstance(p_c_tuple, tuple):
raise Exception('Pos condition expects tuple of string. However = {}'.format(p_c_tuple))

converted = [object] * len(p_c_tuple)
converted = [text_type] * len(p_c_tuple)
for i, pos_element in enumerate(p_c_tuple):
if six.PY2 and isinstance(pos_element, str):
"""str into unicode if python2.x"""
@@ -209,11 +215,11 @@ def __convert_string_type(self, p_c_tuple):
return tuple(converted)

def __check_pos_condition(self, pos_condistion):
# type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]]
"""* What you can do
- Check your pos condition
- It converts character type into unicode if python version is 2.x
"""
# type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]]
assert isinstance(pos_condistion, list)

return [self.__convert_string_type(p_c_tuple) for p_c_tuple in pos_condistion]
@@ -224,6 +230,7 @@ def filter(self,
is_normalize=True,
func_normalizer=normalize_text,
check_field_name='stem'):
# type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject
"""* What you can do
- It filters out token which does NOT meet the conditions (stopwords & part-of-speech tag)
- Under python2.x, pos_condition & stopwords are converted into unicode type.
@@ -243,7 +250,6 @@ def filter(self,
>>> pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')]
>>> stopwords = ['これ', 'それ']
"""
# type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject
assert isinstance(pos_condition, (type(None), list))
assert isinstance(stopwords, (type(None), list))

@@ -280,7 +286,6 @@ def filter(self,

class FilteredObject(TokenizedSenetence):
def __init__(self, sentence, tokenized_objects, pos_condition, stopwords):
""""""
# type: (str, List[TokenizedResult], List[str, ...], List[str])->None
super(FilteredObject, self).__init__(
sentence=sentence,
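The filtering in this file rests on `__is_valid_pos`, which keeps a token when any user-supplied POS tuple is a prefix of the token's full POS tuple; that is why a condition like `('名詞',)` matches `('名詞', '一般')` in the `filter()` docstring example. A standalone restatement of the prefix rule (a sketch, not the file's exact code):

```python
from typing import List, Tuple


def is_valid_pos(pos_tuple, valid_pos):
    # type: (Tuple[str, ...], List[Tuple[str, ...]]) -> bool
    """True if any condition tuple is a prefix of the token's POS tuple."""
    return any(cond == pos_tuple[:len(cond)] for cond in valid_pos)


assert is_valid_pos(('名詞', '一般'), [('名詞',)])      # noun, any sub-category
assert not is_valid_pos(('動詞', '自立'), [('名詞',)])  # verbs filtered out
```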
17 changes: 8 additions & 9 deletions JapaneseTokenizer/juman_wrapper/juman_wrapper.py
@@ -6,7 +6,7 @@
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
# else
from typing import List, Union, Any, Callable, Tuple
from typing import List, Union, Callable, Tuple
from six import text_type
from pyknp import MList
import logging
@@ -25,7 +25,8 @@
logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')

if six.PY3:
import socket, re
import socket
import re

class MonkeyPatchSocket(object):
"""* Class for overwriting pyknp.Socket because it is only for python2.x"""
@@ -39,15 +40,14 @@ def __init__(self, hostname, port, option=None):
self.sock.send(option)
data = b""
while b"OK" not in data:
#while isinstance(data, bytes) and b"OK" not in data:
# while isinstance(data, bytes) and b"OK" not in data:
data = self.sock.recv(1024)

def __del__(self):
if self.sock:
self.sock.close()

def query(self, sentence, pattern):
""""""
# type: (str,str)->str
assert(isinstance(sentence, six.text_type))
sentence_bytes = sentence.encode('utf-8').strip()
@@ -74,9 +74,9 @@ def __init__(self,
pattern='EOS',
is_use_pyknp=False,
**args):
# type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None
"""* Class to call Juman tokenizer
"""
# type: (text_type,text_type,int,int,text_type,Union[bytes,text_type],Union[bytes,text_type],bool)->None

self.timeout = timeout
self.pattern = pattern
@@ -101,8 +101,7 @@ def __init__(self,
else:
pass


if not server is None:
if server is not None:
# use server mode #
self.juman = pyknp.Juman(command=command, server=server, port=port,
timeout=self.timeout, rcfile=rcfile, option=option,
@@ -128,10 +127,10 @@ def __del__(self):
self.juman.stop_process()

def __monkey_patch_juman_lines(self, input_str):
# type: (text_type)->text_type
"""* What you can do
- It overwrites juman_line() method because this method causes TypeError in python3
"""
# type: (text_type,)->text_type
assert isinstance(self.juman, pyknp.Juman)
if not self.juman.socket and not self.juman.subprocess:
if self.juman.server is not None:
@@ -236,4 +235,4 @@ def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
assert isinstance(pos_condition, (type(None), list))
assert isinstance(stopwords, (type(None), list))

return parsed_sentence.filter(pos_condition, stopwords)
return parsed_sentence.filter(pos_condition, stopwords)
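`MonkeyPatchSocket` replaces pyknp's python2-only socket client: it connects to a running JUMAN server, sends one UTF-8 sentence terminated by a newline, and reads until the EOS pattern appears. A simplified sketch of that request/response loop (the default port and framing details are assumptions based on the visible code):

```python
import socket


def query_juman_server(sentence, host='localhost', port=32000, pattern=b'EOS'):
    # type: (str, str, int, bytes) -> str
    """Send one sentence to a JUMAN-style server and read until EOS."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((host, port))
        sock.sendall(sentence.encode('utf-8').strip() + b'\n')
        data = b''
        while pattern not in data:   # accumulate chunks until the EOS marker
            data += sock.recv(1024)
        return data.decode('utf-8')
    finally:
        sock.close()
```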
2 changes: 1 addition & 1 deletion JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py
@@ -112,6 +112,7 @@ def __init__(self,
port=12000,
is_use_pyknp = False,
** args):
# type: (text_type,int,text_type,text_type,bool)
"""* What you can do
- You can select backend process of jumanpp.
- jumanpp-pexpect: It calls jumanpp on your local machine. It keeps jumanpp process running.
@@ -125,7 +126,6 @@ def __init__(self,
- server: hostname where jumanpp is running
- port: port number where jumanpp is running
"""
# type: (text_type,int,text_type,text_type,bool)
self.eos_pattern = pattern
self.is_use_pyknp = is_use_pyknp

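The docstring above describes the two backends this wrapper can run on. A hedged usage sketch, assuming the class is exposed as `JapaneseTokenizer.JumanppWrapper` and that `tokenize()` returns the `TokenizedSenetence` object from datamodels.py:

```python
from JapaneseTokenizer import JumanppWrapper  # assumed public import path

# jumanpp-pexpect backend: spawns jumanpp locally and keeps it running
tokenizer = JumanppWrapper()
print(tokenizer.tokenize('これは例文です。').convert_list_object())

# jumanpp-server backend: talk to a jumanpp server that is already running
remote_tokenizer = JumanppWrapper(server='localhost', port=12000)
```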