Merge pull request #57 from Kensuke-Mitsuzawa/issue/#56

added jumandic and unidic to mecab wrapper module
Kensuke-Mitsuzawa · Mar 25, 2019 · 3bdfb6b · 3bdfb6b
2 parents 45af698 + cb7819a
commit 3bdfb6b
Show file tree

Hide file tree

Showing 7 changed files with 148 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,5 @@ Mykytea-python/
 .DS_Store
 *tox
 .cache/
+python/
+python2/
diff --git a/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py b/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py
@@ -1,5 +1,5 @@
 #! -*- coding: utf-8 -*-
-# core mddule
+# core module
 from JapaneseTokenizer.object_models import WrapperBase
 from JapaneseTokenizer.common.text_preprocess import normalize_text
 from JapaneseTokenizer import init_logger
@@ -13,7 +13,7 @@
 import six
 from six import text_type
 # typing
-from typing import List, Dict, Tuple, Union, TypeVar, Callable
+from typing import List, Tuple, Union, TypeVar, Callable
 ContentsTypes = TypeVar('T')
 
 __author__ = 'kensuke-mi'
@@ -30,27 +30,54 @@
 
 
 class MecabWrapper(WrapperBase):
-    def __init__(self, dictType, pathUserDictCsv='', path_mecab_config=None, string_encoding='utf-8'):
-        # type: (text_type, text_type, text_type, text_type)->None
-        assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", None]
+    def __init__(self,
+                 dictType,
+                 pathUserDictCsv=None,
+                 path_mecab_config=None,
+                 path_dictionary=None,
+                 string_encoding='utf-8'):
+        # type: (text_type, text_type, text_type, text_type, text_type)->None
+        """
+
+        :param dictType: a dictionary type called by mecab
+        :param pathUserDictCsv: path to your original dictionary file
+        :param path_mecab_config: path to 'mecab-config' command. It's automatically detected if not given
+        :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected
+        :param string_encoding: encoding option to parse command line result. This is mainly used for python2.x
+        """
         self.string_encoding = string_encoding
-        if dictType == 'all' or dictType == 'user': assert os.path.exists(pathUserDictCsv)
+        self._dictType = dictType
+        self._pathUserDictCsv = pathUserDictCsv
+        self._path_dictionary = path_dictionary
         if path_mecab_config is None:
             self._path_mecab_config = self.__get_path_to_mecab_config()
         else:
             self._path_mecab_config = path_mecab_config
 
-        self._dictType = dictType
-        self._pathUserDictCsv = pathUserDictCsv
-        self._mecab_dictionary_path = self.__check_mecab_dict_path()
+        if self._path_dictionary is not None:
+            assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.'
+            self._mecab_dictionary_path = None
+        else:
+            self._mecab_dictionary_path = self.__check_mecab_dict_path()
 
         logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))
-
         self.mecabObj = self.__CallMecab()
 
+        assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \
+            'Dictionary Type Error. Your dict = {} is NOT available.'
+        if dictType == 'all':
+            logger.error('dictionary type "all" is deprecated from version1.6')
+            raise Exception('dictionary type "all" is deprecated from version1.6')
+        if dictType == 'user':
+            logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')
+            raise Exception('dictionary type "all" is deprecated from version1.6. You just give path to dictionary csv.')
+
+        if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '':
+            assert os.path.exists(pathUserDictCsv), \
+                'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv)
+
     def __get_path_to_mecab_config(self):
-        """* What you can do
-        - You get path into mecab-config
+        """You get path into mecab-config
         """
         if six.PY2:
             path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config'])
@@ -62,7 +89,6 @@ def __get_path_to_mecab_config(self):
         logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir))
         return path_mecab_config_dir
 
-
     def __check_mecab_dict_path(self):
         """check path to dict of Mecab in system environment
         """
@@ -78,16 +104,13 @@ def __check_mecab_dict_path(self):
             logger.error("{}".format(mecab_dic_cmd))
             raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config command")
         if path_mecab_dict == '':
-            raise SystemError(
-                'mecab dictionary path is not found with following command: {} You are not able to use additional dictionary. Still you are able to call mecab default dictionary'.format(mecab_dic_cmd)
-            )
+            raise SystemError("""mecab dictionary path is not found with following command: {} 
+            You are not able to use additional dictionary. 
+            Still you are able to call mecab default dictionary""".format(mecab_dic_cmd))
 
         return path_mecab_dict
 
     def __check_mecab_libexe(self):
-        """* What you can do
-        """
-
         mecab_libexe_cmd = "echo `{} --libexecdir`".format(os.path.join(self._path_mecab_config, 'mecab-config'))
 
         try:
@@ -100,35 +123,40 @@ def __check_mecab_libexe(self):
             logger.error("{}".format(mecab_libexe_cmd))
             raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config --libexecdir")
         if path_mecab_libexe == '':
-            raise SystemError('Mecab config is not callable with following command: {} You are not able to compile your user dictionary. Still, you are able to use default mecab dictionary.'.format(mecab_libexe_cmd))
+            raise SystemError("""Mecab config is not callable with following command: {} 
+            You are not able to compile your user dictionary. 
+            Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd))
 
         return path_mecab_libexe
 
     def __CallMecab(self):
-        """* What you can do
-        """
-        if self._dictType == 'neologd':
+        if self._path_dictionary is not None and self._mecab_dictionary_path is None:
+            logger.debug('Use dictionary you specified.')
+            cmMecabInitialize = '-d {}'.format(self._path_dictionary)
+        elif self._dictType == 'neologd':
+            # use neologd
             logger.debug('Use neologd additional dictionary')
             cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))
-
-        elif self._dictType == 'all':
-            logger.debug('Use neologd additional dictionary')
-            pathUserDict = self.__CompileUserdict()
-            cmMecabInitialize = '-u {} -d {}'.format(pathUserDict,
-                                                     os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))
-        elif self._dictType == 'ipadic':
-            logger.debug('Use ipadic additional dictionary')
+        elif self._dictType == 'ipadic' or self._dictType == 'ipaddic':
+            # use ipadic
+            logger.debug('Use ipadic dictionary')
             cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic"))
-
-        elif self._dictType == 'user':
-            logger.debug('Use User dictionary')
-            pathUserDict = self.__CompileUserdict()
-            cmMecabInitialize = '-u {}'.format(pathUserDict)
-
+        elif six.PY2 is False and self._dictType == 'jumandic':
+            # use jumandic. This is impossible to call in Python2.x
+            logger.debug('Use jumandic dictionary')
+            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic"))
+        elif six.PY2 and self._dictType == 'jumandic':
+            raise Exception('In python2.x, impossible to call jumandic.')
         else:
             logger.debug('Use no default dictionary')
             cmMecabInitialize = ''
 
+        # execute compile if user dictionary is given
+        if self._pathUserDictCsv is not None:
+            logger.debug('Use User dictionary')
+            pathUserDict = self.__CompileUserdict()
+            cmMecabInitialize += ' -u {}'.format(pathUserDict)
+
         if six.PY2:
             cmMecabCall = "-Ochasen {}".format(cmMecabInitialize)
         else:
@@ -246,7 +274,7 @@ def tokenize(self, sentence,
         else:
             pass
 
-        ### decide normalization function depending on dictType
+        # decide normalization function depending on dictType
         if func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid:
             normalized_sentence = neologdn.normalize(sentence)
         elif func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid == False:

diff --git a/setup.py b/setup.py
@@ -48,7 +48,7 @@
 else:
     raise NotImplementedError()
 
-version = '1.5'
+version = '1.6'
 name = 'JapaneseTokenizer'
 short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'
 

diff --git a/test/Dockerfile-dev b/test/Dockerfile-dev
@@ -6,6 +6,8 @@ ENV MECAB_VERSION 0.996
 ENV IPADIC_VERSION 2.7.0-20070801
 ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
 ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
+ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM
+ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip
 ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
 ENV dependencies 'openssl'
 
@@ -38,10 +40,20 @@ RUN apk add --update --no-cache ${build_deps} \
   # Install Neologd
   && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
   && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
+  # Install jumandic
+  && curl -SL -o jumandic.tar.gz ${jumandic_url} \
+  && tar zxf jumandic.tar.gz \
+  && cd mecab-jumandic-7.0-20130310 \
+  && ./configure --with-charset=utf8 \
+  && make \
+  && make install \
+  # delete dictionary files
+  && cd \
   && rm -rf \
     mecab-${MECAB_VERSION}* \
     mecab-${IPADIC_VERSION}* \
-    mecab-ipadic-neologd
+    mecab-ipadic-neologd \
+    mecab-jumandic-7.0-20130310
 
 # general
 RUN apk --no-cache add vim \

diff --git a/test/test_mecab_wrapper_python2.py b/test/test_mecab_wrapper_python2.py
@@ -47,18 +47,32 @@ def test_default_parse(self):
             for morph in parsed_obj:
                 assert isinstance(morph, string_types)
 
-
     def test_init_userdict(self):
+        # test when user dictionary is called
+        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
+        assert isinstance(mecab_obj, MecabWrapper)
+        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
+        is_ok = False
+        for morph in parsed_obj:
+            if u'さくらまな' == morph:
+                is_ok = True
+        else:
+            pass
+        assert is_ok
+
+    def test_parse_jumandic(self):
+        with self.assertRaises(Exception):
+            mecab_obj = MecabWrapper(dictType='jumandic')
+            assert isinstance(mecab_obj, MecabWrapper)
+
+    def test_init_alldict(self):
         """* Test case
         - すべての辞書を利用した場合の動作を確認する
         """
-        mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
-        assert isinstance(mecab_obj, MecabWrapper)
-
-        res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
-        assert isinstance(res, list)
-        assert u'さくらまな' in res
+        with self.assertRaises(Exception):
+            mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
+            assert isinstance(mecab_obj, MecabWrapper)
 
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
diff --git a/test/test_mecab_wrapper_python3.py b/test/test_mecab_wrapper_python3.py
@@ -16,9 +16,7 @@ def setUp(self):
         self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')
 
     def test_neologd_parse(self):
-        """* Test case
-        - neologd辞書で正しく分割できることを確認する
-        """
+        # test using neologd dictionary
         mecab_obj = MecabWrapper(dictType='neologd')
         parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)
         self.assertTrue(parsed_obj, TokenizedSenetence)
@@ -31,9 +29,7 @@ def test_neologd_parse(self):
         self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))
 
     def test_default_parse(self):
-        """* Test case
-        - デフォルトの状態で動作を確認する
-        """
+        # test default status
         dictType = "ipadic"
         mecab_obj = MecabWrapper(dictType=dictType)
         assert isinstance(mecab_obj, MecabWrapper)
@@ -48,19 +44,44 @@ def test_default_parse(self):
         for morph in parsed_obj:
             assert isinstance(morph, str)
 
-    def test_init_userdict(self):
-        """* Test case
-        - すべての辞書を利用した場合の動作を確認する
-        """
-        mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
+    def test_parse_jumandic(self):
+        mecab_obj = MecabWrapper(dictType='jumandic')
         assert isinstance(mecab_obj, MecabWrapper)
 
-        res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
-        assert isinstance(res, list)
-        assert 'さくらまな' in res
+        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
+        assert isinstance(parsed_obj, TokenizedSenetence)
+        for tokenized_obj in parsed_obj.tokenized_objects:
+            if tokenized_obj.word_stem == '女優':
+                # ドメイン:文化・芸術 is special output only in Jumandic
+                assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line
+
+    def test_parse_userdic(self):
+        pass
+
+    def test_parse_dictionary_path(self):
+        # put path to dictionary and parse sentence.
+        path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
+        if os.path.exists(path_default_ipadic):
+            mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic)
+            assert mecab_obj._path_dictionary == path_default_ipadic
+            parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
+            assert isinstance(parsed_obj, TokenizedSenetence)
+
+    def test_init_userdict(self):
+        # this test should be error response.
+        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
+        assert isinstance(mecab_obj, MecabWrapper)
+        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
+        assert isinstance(parsed_obj, TokenizedSenetence)
+        is_ok = False
+        for tokenized_obj in parsed_obj.tokenized_objects:
+            if tokenized_obj.word_stem == 'さくらまな':
+                is_ok = True
+        assert is_ok
 
 
 if __name__ == '__main__':
     unittest.main()
 
 
+
diff --git a/travis-mecab-install.sh b/travis-mecab-install.sh
@@ -23,5 +23,13 @@ make
 sudo make install
 sudo ldconfig
 
+wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM'
+tar zxfv jumandic.tar.gz
+cd mecab-jumandic-7.0-20130310
+./configure --with-charset=utf8
+make
+sudo make install
+sudo ldconfig
+
 cd $base_dir
-rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801
+rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310