From af267d8fa482aa826b0706d11f2b42e4d69b6377 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Mon, 25 Mar 2019 16:15:23 +0100 Subject: [PATCH 1/2] added jumandic and unidic to mecab wrapper module --- .gitignore | 2 + .../mecab_wrapper/mecab_wrapper.py | 104 +++++++++++------- setup.py | 2 +- test/Dockerfile-dev | 14 ++- test/test_mecab_wrapper_python2.py | 30 +++-- test/test_mecab_wrapper_python3.py | 49 ++++++--- 6 files changed, 139 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index e48f59b..204c601 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ Mykytea-python/ .DS_Store *tox .cache/ +python/ +python2/ \ No newline at end of file diff --git a/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py b/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py index 2843c08..d58f88b 100644 --- a/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py +++ b/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py @@ -1,5 +1,5 @@ #! -*- coding: utf-8 -*- -# core mddule +# core module from JapaneseTokenizer.object_models import WrapperBase from JapaneseTokenizer.common.text_preprocess import normalize_text from JapaneseTokenizer import init_logger @@ -13,7 +13,7 @@ import six from six import text_type # typing -from typing import List, Dict, Tuple, Union, TypeVar, Callable +from typing import List, Tuple, Union, TypeVar, Callable ContentsTypes = TypeVar('T') __author__ = 'kensuke-mi' @@ -30,27 +30,54 @@ class MecabWrapper(WrapperBase): - def __init__(self, dictType, pathUserDictCsv='', path_mecab_config=None, string_encoding='utf-8'): - # type: (text_type, text_type, text_type, text_type)->None - assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", None] + def __init__(self, + dictType, + pathUserDictCsv=None, + path_mecab_config=None, + path_dictionary=None, + string_encoding='utf-8'): + # type: (text_type, text_type, text_type, text_type, text_type)->None + """ + + :param dictType: a dictionary type called by mecab + :param 
pathUserDictCsv: path to your original dictionary file + :param path_mecab_config: path to 'mecab_config' command. It's automatically detected if not give + :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected + :param string_encoding: encoding option to parse command line result. This is mainly used for python2.x + """ self.string_encoding = string_encoding - if dictType == 'all' or dictType == 'user': assert os.path.exists(pathUserDictCsv) + self._dictType = dictType + self._pathUserDictCsv = pathUserDictCsv + self._path_dictionary = path_dictionary if path_mecab_config is None: self._path_mecab_config = self.__get_path_to_mecab_config() else: self._path_mecab_config = path_mecab_config - self._dictType = dictType - self._pathUserDictCsv = pathUserDictCsv - self._mecab_dictionary_path = self.__check_mecab_dict_path() + if self._path_dictionary is not None: + assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.' + self._mecab_dictionary_path = None + else: + self._mecab_dictionary_path = self.__check_mecab_dict_path() logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path)) - self.mecabObj = self.__CallMecab() + assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \ + 'Dictionary Type Error. Your dict = {} is NOT available.' + if dictType == 'all': + logger.error('dictionary type "all" is deprecated from version1.6') + raise Exception('dictionary type "all" is deprecated from version1.6') + if dictType == 'user': + logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.') + raise Exception('dictionary type "all" is deprecated from version1.6. 
You just give path to dictionary csv.') + + if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '': + assert os.path.exists(pathUserDictCsv), \ + 'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv) + def __get_path_to_mecab_config(self): - """* What you can do - - You get path into mecab-config + """You get path into mecab-config """ if six.PY2: path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config']) @@ -62,7 +89,6 @@ def __get_path_to_mecab_config(self): logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir)) return path_mecab_config_dir - def __check_mecab_dict_path(self): """check path to dict of Mecab in system environment """ @@ -78,16 +104,13 @@ def __check_mecab_dict_path(self): logger.error("{}".format(mecab_dic_cmd)) raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config command") if path_mecab_dict == '': - raise SystemError( - 'mecab dictionary path is not found with following command: {} You are not able to use additional dictionary. Still you are able to call mecab default dictionary'.format(mecab_dic_cmd) - ) + raise SystemError("""mecab dictionary path is not found with following command: {} + You are not able to use additional dictionary. + Still you are able to call mecab default dictionary""".format(mecab_dic_cmd)) return path_mecab_dict def __check_mecab_libexe(self): - """* What you can do - """ - mecab_libexe_cmd = "echo `{} --libexecdir`".format(os.path.join(self._path_mecab_config, 'mecab-config')) try: @@ -100,35 +123,40 @@ def __check_mecab_libexe(self): logger.error("{}".format(mecab_libexe_cmd)) raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config --libexecdir") if path_mecab_libexe == '': - raise SystemError('Mecab config is not callable with following command: {} You are not able to compile your user dictionary. 
Still, you are able to use default mecab dictionary.'.format(mecab_libexe_cmd)) + raise SystemError("""Mecab config is not callable with following command: {} + You are not able to compile your user dictionary. + Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd)) return path_mecab_libexe def __CallMecab(self): - """* What you can do - """ - if self._dictType == 'neologd': + if self._path_dictionary is not None and self._mecab_dictionary_path is None: + logger.debug('Use dictionary you specified.') + cmMecabInitialize = '-d {}'.format(self._path_dictionary) + elif self._dictType == 'neologd': + # use neologd logger.debug('Use neologd additional dictionary') cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd")) - - elif self._dictType == 'all': - logger.debug('Use neologd additional dictionary') - pathUserDict = self.__CompileUserdict() - cmMecabInitialize = '-u {} -d {}'.format(pathUserDict, - os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd")) - elif self._dictType == 'ipadic': - logger.debug('Use ipadic additional dictionary') + elif self._dictType == 'ipadic' or self._dictType == 'ipaddic': + # use ipadic + logger.debug('Use ipadic dictionary') cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic")) - - elif self._dictType == 'user': - logger.debug('Use User dictionary') - pathUserDict = self.__CompileUserdict() - cmMecabInitialize = '-u {}'.format(pathUserDict) - + elif six.PY2 is False and self._dictType == 'jumandic': + # use jumandic. 
This is impossible to call in Python2.x + logger.debug('Use jumandic dictionary') + cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic")) + elif six.PY2 and self._dictType == 'jumandic': + raise Exception('In python2.x, impossible to call jumandic.') else: logger.debug('Use no default dictionary') cmMecabInitialize = '' + # execute compile if user dictionary is given + if self._pathUserDictCsv is not None: + logger.debug('Use User dictionary') + pathUserDict = self.__CompileUserdict() + cmMecabInitialize += ' -u {}'.format(pathUserDict) + if six.PY2: cmMecabCall = "-Ochasen {}".format(cmMecabInitialize) else: @@ -246,7 +274,7 @@ def tokenize(self, sentence, else: pass - ### decide normalization function depending on dictType + # decide normalization function depending on dictType if func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid: normalized_sentence = neologdn.normalize(sentence) elif func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid == False: diff --git a/setup.py b/setup.py index 9fe3209..c6e7da2 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ else: raise NotImplementedError() -version = '1.5' +version = '1.6' name = 'JapaneseTokenizer' short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization' diff --git a/test/Dockerfile-dev b/test/Dockerfile-dev index 2cc8056..b23b1cf 100644 --- a/test/Dockerfile-dev +++ b/test/Dockerfile-dev @@ -6,6 +6,8 @@ ENV MECAB_VERSION 0.996 ENV IPADIC_VERSION 2.7.0-20070801 ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM +ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM +ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip ENV build_deps 'curl git bash file sudo openssh gcc make build-base' ENV 
dependencies 'openssl' @@ -38,10 +40,20 @@ RUN apk add --update --no-cache ${build_deps} \ # Install Neologd && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \ && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \ + # Install jumandic + && curl -SL -o jumandic.tar.gz ${jumandic_url} \ + && tar zxf jumandic.tar.gz \ + && cd mecab-jumandic-7.0-20130310 \ + && ./configure --with-charset=utf8 \ + && make \ + && make install \ + # delete dictionary files + && cd \ && rm -rf \ mecab-${MECAB_VERSION}* \ mecab-${IPADIC_VERSION}* \ - mecab-ipadic-neologd + mecab-ipadic-neologd \ + mecab-jumandic-7.0-20130310 # general RUN apk --no-cache add vim \ diff --git a/test/test_mecab_wrapper_python2.py b/test/test_mecab_wrapper_python2.py index bd0591a..e281a0b 100644 --- a/test/test_mecab_wrapper_python2.py +++ b/test/test_mecab_wrapper_python2.py @@ -47,18 +47,32 @@ def test_default_parse(self): for morph in parsed_obj: assert isinstance(morph, string_types) - def test_init_userdict(self): + # test when user dictionary is called + mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict) + assert isinstance(mecab_obj, MecabWrapper) + parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) + is_ok = False + for morph in parsed_obj: + if u'さくらまな' == morph: + is_ok = True + else: + pass + assert is_ok + + def test_parse_jumandic(self): + with self.assertRaises(Exception): + mecab_obj = MecabWrapper(dictType='jumandic') + assert isinstance(mecab_obj, MecabWrapper) + + def test_init_alldict(self): """* Test case - すべての辞書を利用した場合の動作を確認する """ - mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict) - assert isinstance(mecab_obj, MecabWrapper) - - res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) - assert isinstance(res, list) - assert u'さくらまな' in res + with self.assertRaises(Exception): + mecab_obj = MecabWrapper(dictType='all', 
pathUserDictCsv=self.path_user_dict) + assert isinstance(mecab_obj, MecabWrapper) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/test_mecab_wrapper_python3.py b/test/test_mecab_wrapper_python3.py index 5833001..755cbcf 100644 --- a/test/test_mecab_wrapper_python3.py +++ b/test/test_mecab_wrapper_python3.py @@ -16,9 +16,7 @@ def setUp(self): self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv') def test_neologd_parse(self): - """* Test case - - neologd辞書で正しく分割できることを確認する - """ + # test using neologd dictionary mecab_obj = MecabWrapper(dictType='neologd') parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence) self.assertTrue(parsed_obj, TokenizedSenetence) @@ -31,9 +29,7 @@ def test_neologd_parse(self): self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object())) def test_default_parse(self): - """* Test case - - デフォルトの状態で動作を確認する - """ + # test default status dictType = "ipadic" mecab_obj = MecabWrapper(dictType=dictType) assert isinstance(mecab_obj, MecabWrapper) @@ -48,19 +44,44 @@ def test_default_parse(self): for morph in parsed_obj: assert isinstance(morph, str) - def test_init_userdict(self): - """* Test case - - すべての辞書を利用した場合の動作を確認する - """ - mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict) + def test_parse_jumandic(self): + mecab_obj = MecabWrapper(dictType='jumandic') assert isinstance(mecab_obj, MecabWrapper) - res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) - assert isinstance(res, list) - assert 'さくらまな' in res + parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) + assert isinstance(parsed_obj, TokenizedSenetence) + for tokenized_obj in parsed_obj.tokenized_objects: + if tokenized_obj.word_stem == '女優': + # ドメイン:文化・芸術 is special output only in Jumandic + assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line + + def test_parse_userdic(self): + 
pass + + def test_parse_dictionary_path(self): + # put path to dictionary and parse sentence. + path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd' + if os.path.exists(path_default_ipadic): + mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic) + assert mecab_obj._path_dictionary == path_default_ipadic + parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) + assert isinstance(parsed_obj, TokenizedSenetence) + + def test_init_userdict(self): + # this test should be error response. + mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict) + assert isinstance(mecab_obj, MecabWrapper) + parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) + assert isinstance(parsed_obj, TokenizedSenetence) + is_ok = False + for tokenized_obj in parsed_obj.tokenized_objects: + if tokenized_obj.word_stem == 'さくらまな': + is_ok = True + assert is_ok if __name__ == '__main__': unittest.main() + From cb7819a581136e06816bbe0edb253fe08fa1c6e9 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Mon, 25 Mar 2019 16:33:36 +0100 Subject: [PATCH 2/2] added mecab juman dictionary command in travis file --- travis-mecab-install.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/travis-mecab-install.sh b/travis-mecab-install.sh index e72e6e8..b2a380d 100644 --- a/travis-mecab-install.sh +++ b/travis-mecab-install.sh @@ -23,5 +23,13 @@ make sudo make install sudo ldconfig +wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM' +tar zxfv jumandic.tar.gz +cd mecab-jumandic-7.0-20130310 +./configure --with-charset=utf8 +make +sudo make install +sudo ldconfig + cd $base_dir -rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 \ No newline at end of file +rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310 \ No newline at end of file