diff --git a/CHANGELOG.md b/CHANGELOG.md index bb9b8e119..884118fcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,24 @@ and this project adheres to ## [5.3.2] - 2026-03-20 -This release focuses on security issues related to path traversal. +This release focuses on security issues related to path traversal +and renaming functions to conform with PEP 8. Old function names are +still accessible but migration to new names is recommended. + +### Added + +- `pythainlp.chunk` module: canonical home for chunking/phrase-structure + parsing, following the NLTK `nltk.chunk` naming convention. + +### Deprecated + +The following names are deprecated and will be removed in 6.0 (#1339): + +- `pythainlp.util.isthaichar()`: use `pythainlp.util.is_thai_char()`. +- `pythainlp.util.isthai()`: use `pythainlp.util.is_thai()`. +- `pythainlp.util.countthai()`: use `pythainlp.util.count_thai()`. +- `pythainlp.tag.crfchunk.CRFchunk`: use `pythainlp.chunk.CRFChunkParser`. +- `pythainlp.tag.chunk_parse()`: use `pythainlp.chunk.chunk_parse()`. ### Security diff --git a/pythainlp/chunk/__init__.py b/pythainlp/chunk/__init__.py new file mode 100644 index 000000000..82c035e5b --- /dev/null +++ b/pythainlp/chunk/__init__.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Thai phrase structure (chunking) module. + +This module provides chunk parsing for Thai text, following the +NLTK :mod:`nltk.chunk` naming convention. + +:Example: + + .. 
code-block:: python + + from pythainlp.chunk import chunk_parse, CRFChunkParser + from pythainlp.tag import pos_tag + + tokens = ["ผม", "รัก", "คุณ"] + tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid") + + # Using the convenience function + print(chunk_parse(tokens_pos)) + # output: ['B-NP', 'B-VP', 'I-VP'] + + # Using the class directly + with CRFChunkParser() as parser: + print(parser.parse(tokens_pos)) + # output: ['B-NP', 'B-VP', 'I-VP'] +""" +from __future__ import annotations + +__all__: list[str] = [ + "CRFChunkParser", + "chunk_parse", +] + +from pythainlp.chunk.crfchunk import CRFChunkParser + + +def chunk_parse( + sent: list[tuple[str, str]], + engine: str = "crf", + corpus: str = "orchidpp", +) -> list[str]: + """Parse a Thai sentence into phrase-structure chunks (IOB format). + + :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs. + :param str engine: chunking engine; currently only ``"crf"`` is + supported. + :param str corpus: corpus name for the CRF model; currently only + ``"orchidpp"`` is supported. + :return: list of IOB chunk labels, one per token. + :rtype: list[str] + + :Example: + + .. 
code-block:: python + + from pythainlp.chunk import chunk_parse + from pythainlp.tag import pos_tag + + tokens = ["ผม", "รัก", "คุณ"] + tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid") + + print(chunk_parse(tokens_pos)) + # output: ['B-NP', 'B-VP', 'I-VP'] + """ + _parser = CRFChunkParser(corpus=corpus) + return _parser.parse(sent) diff --git a/pythainlp/chunk/crfchunk.py b/pythainlp/chunk/crfchunk.py new file mode 100644 index 000000000..4a354d0cc --- /dev/null +++ b/pythainlp/chunk/crfchunk.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""CRF-based Thai phrase structure (chunk) parser.""" +from __future__ import annotations + +from importlib.resources import as_file, files +from typing import TYPE_CHECKING, Any, Optional, Union + +if TYPE_CHECKING: + import types + from contextlib import AbstractContextManager + + from pycrfsuite import ( + Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute] + ) + +from pythainlp.corpus import thai_stopwords + + +def _is_stopword(word: str) -> bool: + return word in thai_stopwords() + + +def _doc2features( + tokens: list[tuple[str, str]], index: int +) -> dict[str, Union[str, bool]]: + """Extract features for a single token in a POS-tagged sentence. + + :param list[tuple[str, str]] tokens: POS-tagged sentence, + a list of (word, POS-tag) pairs. + :param int index: index of the token to extract features for. + :return: feature dictionary for the token. 
+ :rtype: dict[str, Union[str, bool]] + """ + word, pos = tokens[index] + f: dict[str, Union[str, bool]] = { + "word": word, + "word_is_stopword": _is_stopword(word), + "pos": pos, + } + if index > 1: + prevprevword, prevprevpos = tokens[index - 2] + f["prev-prev-word"] = prevprevword + f["prev-prevz-word_is_stopword"] = _is_stopword(prevprevword) + f["prev-prevz-pos"] = prevprevpos + if index > 0: + prevword, prevpos = tokens[index - 1] + f["prev-word"] = prevword + f["prev-word_is_stopword"] = _is_stopword(prevword) + f["prev-pos"] = prevpos + else: + f["BOS"] = True + if index < len(tokens) - 2: + nextnextword, nextnextpos = tokens[index + 2] + f["nextnext-word"] = nextnextword + f["nextnext-word_is_stopword"] = _is_stopword(nextnextword) + f["nextnext-pos"] = nextnextpos + if index < len(tokens) - 1: + nextword, nextpos = tokens[index + 1] + f["next-word"] = nextword + f["next-word_is_stopword"] = _is_stopword(nextword) + f["next-pos"] = nextpos + else: + f["EOS"] = True + + return f + + +def _extract_features( + doc: list[tuple[str, str]], +) -> list[dict[str, Union[str, bool]]]: + return [_doc2features(doc, i) for i in range(len(doc))] + + +class CRFChunkParser: + """CRF-based chunk parser for Thai text. + + Parses a POS-tagged sentence into phrase-structure chunks + (IOB format), following the NLTK :class:`nltk.chunk.ChunkParserI` + convention. + + This class supports the context manager protocol for deterministic + resource cleanup: + + .. code-block:: python + + from pythainlp.chunk import CRFChunkParser + + with CRFChunkParser() as parser: + result = parser.parse(tokens_pos) + + :param str corpus: corpus name for the CRF model + (default: ``"orchidpp"``). 
+ """ + + corpus: str + _model_file_ctx: Optional[AbstractContextManager[Any]] + tagger: CRFTagger + xseq: list[dict[str, Union[str, bool]]] + + def __init__(self, corpus: str = "orchidpp") -> None: + self.corpus = corpus + self._model_file_ctx = None + self.load_model(self.corpus) + + def load_model(self, corpus: str) -> None: + """Load the CRF model for the given corpus. + + :param str corpus: corpus name. + """ + from pycrfsuite import ( + Tagger as CRFTagger, # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute] + ) + + self.tagger = CRFTagger() + if corpus == "orchidpp": + corpus_files = files("pythainlp.corpus") + model_file = corpus_files.joinpath("crfchunk_orchidpp.model") + self._model_file_ctx = as_file(model_file) + model_path = self._model_file_ctx.__enter__() + self.tagger.open(str(model_path)) + + def parse(self, token_pos: list[tuple[str, str]]) -> list[str]: + """Parse a POS-tagged sentence into IOB chunk labels. + + :param list[tuple[str, str]] token_pos: list of (word, POS-tag) + pairs. + :return: list of IOB chunk labels, one per token. + :rtype: list[str] + """ + self.xseq = _extract_features(token_pos) + return self.tagger.tag(self.xseq) # type: ignore[no-any-return] + + def __enter__(self) -> CRFChunkParser: + """Context manager entry.""" + return self + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[types.TracebackType], + ) -> None: + """Context manager exit — clean up resources.""" + if self._model_file_ctx is not None: + try: + self._model_file_ctx.__exit__(exc_type, exc_val, exc_tb) + self._model_file_ctx = None + except Exception: # noqa: S110 + pass + + def __del__(self) -> None: + """Attempt resource cleanup on garbage collection. + + .. note:: + :meth:`__del__` is not guaranteed to be called. + Use the context manager protocol for reliable cleanup. 
+ """ + if self._model_file_ctx is not None: + try: + self._model_file_ctx.__exit__(None, None, None) + except Exception: # noqa: S110 + pass diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 8141c605a..ac3210d2e 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -17,7 +17,7 @@ from pythainlp import thai_digits, thai_letters from pythainlp.corpus import phupha, thai_orst_words -from pythainlp.util import isthaichar +from pythainlp.util import is_thai_char def _no_filter(word: str) -> bool: @@ -26,7 +26,7 @@ def _no_filter(word: str) -> bool: def _is_thai_and_not_num(word: str) -> bool: for ch in word: - if ch != "." and not isthaichar(ch): + if ch != "." and not is_thai_char(ch): return False if ch in thai_digits or ch in digits: return False diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 585888c5d..abf4dbdc7 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -5,6 +5,11 @@ Tagging each token in a sentence with supplementary information, such as its part-of-speech (POS) tag, and named entity (NE) tag. + +.. note:: + :func:`chunk_parse` has moved to :mod:`pythainlp.chunk`. + Importing it from :mod:`pythainlp.tag` still works but emits a + :class:`DeprecationWarning` and will be removed in 6.0. """ __all__: list[str] = [ diff --git a/pythainlp/tag/chunk.py b/pythainlp/tag/chunk.py index dc53e6f21..a8bfb971c 100644 --- a/pythainlp/tag/chunk.py +++ b/pythainlp/tag/chunk.py @@ -1,33 +1,37 @@ # SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 +"""Deprecated. Use :func:`pythainlp.chunk.chunk_parse` instead. + +.. deprecated:: 5.3.2 + :func:`chunk_parse` has moved to :mod:`pythainlp.chunk`. 
+""" from __future__ import annotations +from pythainlp.chunk import chunk_parse as _chunk_parse +from pythainlp.tools import warn_deprecation + def chunk_parse( - sent: list[tuple[str, str]], engine: str = "crf", corpus: str = "orchidpp" + sent: list[tuple[str, str]], + engine: str = "crf", + corpus: str = "orchidpp", ) -> list[str]: - """This function parses Thai sentence to phrase structure in IOB format. + """Parse a Thai sentence into phrase-structure chunks (IOB format). - :param list[tuple[str, str]] sent: list [(word, part-of-speech)] - :param str engine: chunk parse engine (now, it has crf only) - :param str corpus: chunk parse corpus (now, it has orchidpp only) + .. deprecated:: 5.3.2 + Use :func:`pythainlp.chunk.chunk_parse` instead. - :return: a list of tuples (word, part-of-speech, chunking) + :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs. + :param str engine: chunking engine (default: ``"crf"``). + :param str corpus: corpus name (default: ``"orchidpp"``). + :return: list of IOB chunk labels, one per token. 
:rtype: list[str] - - :Example: - :: - - from pythainlp.tag import chunk_parse, pos_tag - - tokens = ["ผม", "รัก", "คุณ"] - tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid") - - print(chunk_parse(tokens_pos)) - # output: ['B-NP', 'B-VP', 'I-VP'] """ - from .crfchunk import CRFchunk - - _engine = CRFchunk() - return _engine.parse(sent) + warn_deprecation( + "pythainlp.tag.chunk_parse", + "pythainlp.chunk.chunk_parse", + "5.3.2", + "6.0", + ) + return _chunk_parse(sent, engine=engine, corpus=corpus) diff --git a/pythainlp/tag/crfchunk.py b/pythainlp/tag/crfchunk.py index 87aa883f0..5f2faf107 100644 --- a/pythainlp/tag/crfchunk.py +++ b/pythainlp/tag/crfchunk.py @@ -1,134 +1,31 @@ # SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from importlib.resources import as_file, files -from typing import TYPE_CHECKING, Any, Optional, Union - -if TYPE_CHECKING: - import types - from contextlib import AbstractContextManager - -from pycrfsuite import Tagger as CRFTagger - -from pythainlp.corpus import thai_stopwords - - -def _is_stopword(word: str) -> bool: # check Thai stopword - return word in thai_stopwords() - - -def _doc2features( - tokens: list[tuple[str, str]], index: int -) -> dict[str, Union[str, bool]]: - """`tokens` = a POS-tagged sentence [(w1, t1), ...] 
- `index` = the index of the token we want to extract features for - """ - word, pos = tokens[index] - f: dict[str, Union[str, bool]] = { - "word": word, - "word_is_stopword": _is_stopword(word), - "pos": pos, - } - if index > 0 and index > 1: - prevprevword, prevprevpos = tokens[index - 2] - f["prev-prev-word"] = prevprevword - f["prev-prevz-word_is_stopword"] = _is_stopword(prevprevword) - f["prev-prevz-pos"] = prevprevpos - if index > 0: - prevword, prevpos = tokens[index - 1] - f["prev-word"] = prevword - f["prev-word_is_stopword"] = _is_stopword(prevword) - f["prev-pos"] = prevpos - else: - f["BOS"] = True - if index < len(tokens) - 2: - nextnextword, nextnextpos = tokens[index + 2] - f["nextnext-word"] = nextnextword - f["nextnext-word_is_stopword"] = _is_stopword(nextnextword) - f["nextnext-pos"] = nextnextpos - if index < len(tokens) - 1: - nextword, nextpos = tokens[index + 1] - f["next-word"] = nextword - f["next-word_is_stopword"] = _is_stopword(nextword) - f["next-pos"] = nextpos - else: - f["EOS"] = True - - return f - - -def extract_features( - doc: list[tuple[str, str]], -) -> list[dict[str, Union[str, bool]]]: - return [_doc2features(doc, i) for i in range(0, len(doc))] +"""Deprecated. Use :mod:`pythainlp.chunk` instead. +.. deprecated:: 5.3.2 + This module has been superseded by :mod:`pythainlp.chunk`. + Import :class:`pythainlp.chunk.CRFChunkParser` directly. +""" +from __future__ import annotations -class CRFchunk: - """CRF-based chunker for Thai text. +from pythainlp.chunk.crfchunk import CRFChunkParser +from pythainlp.tools import warn_deprecation - This class can be used as a context manager to ensure proper cleanup - of resources. Example: - with CRFchunk() as chunker: - result = chunker.parse(tokens) +# Backward-compatible alias. Deprecated since 5.3.2; removed in 6.0. +class CRFchunk(CRFChunkParser): + """Deprecated. Use :class:`pythainlp.chunk.CRFChunkParser` instead. 
- Alternatively, the object will attempt to clean up resources when - garbage collected, though this is not guaranteed. + .. deprecated:: 5.3.2 + Use :class:`pythainlp.chunk.CRFChunkParser` instead. """ - corpus: str - _model_file_ctx: Optional[AbstractContextManager[Any]] - tagger: CRFTagger - xseq: list[dict[str, Union[str, bool]]] - def __init__(self, corpus: str = "orchidpp") -> None: - self.corpus = corpus - self._model_file_ctx = None - self.load_model(self.corpus) - - def load_model(self, corpus: str) -> None: - self.tagger = CRFTagger() - if corpus == "orchidpp": - corpus_files = files("pythainlp.corpus") - model_file = corpus_files.joinpath("crfchunk_orchidpp.model") - self._model_file_ctx = as_file(model_file) - model_path = self._model_file_ctx.__enter__() - self.tagger.open(str(model_path)) - - def parse(self, token_pos: list[tuple[str, str]]) -> list[str]: - self.xseq = extract_features(token_pos) - return self.tagger.tag(self.xseq) # type: ignore[no-any-return] - - def __enter__(self) -> CRFchunk: - """Context manager entry.""" - return self - - def __exit__( - self, - exc_type: Optional[type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[types.TracebackType], - ) -> None: - """Context manager exit - clean up resources.""" - if self._model_file_ctx is not None: - try: - self._model_file_ctx.__exit__(exc_type, exc_val, exc_tb) - self._model_file_ctx = None - except Exception: # noqa: S110 - pass - - def __del__(self) -> None: - """Clean up the context manager when object is destroyed. - - Note: __del__ is not guaranteed to be called and should not be - relied upon for critical cleanup. Use the context manager protocol - (with statement) for reliable resource management. 
- """ - if self._model_file_ctx is not None: - try: - self._model_file_ctx.__exit__(None, None, None) - except Exception: # noqa: S110 - # Silently ignore cleanup errors during garbage collection - pass + warn_deprecation( + "pythainlp.tag.crfchunk.CRFchunk", + "pythainlp.chunk.CRFChunkParser", + "5.3.2", + "6.0", + ) + super().__init__(corpus) diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py index 63f804ca2..0e27892f9 100644 --- a/pythainlp/tag/thainer.py +++ b/pythainlp/tag/thainer.py @@ -13,10 +13,12 @@ from pythainlp.corpus import get_corpus_path, thai_stopwords from pythainlp.tag.pos_tag import pos_tag from pythainlp.tokenize import word_tokenize -from pythainlp.util import isthai +from pythainlp.util import is_thai if TYPE_CHECKING: - from pycrfsuite import Tagger as CRFTagger + from pycrfsuite import ( + Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute] + ) _TOKENIZER_ENGINE: str = "mm" @@ -33,7 +35,7 @@ def _doc2features(doc: list, i: int) -> dict: features = { "word.word": word, "word.stopword": _is_stopword(word), - "word.isthai": isthai(word), + "word.isthai": is_thai(word), "word.isspace": word.isspace(), "postag": postag, "word.isdigit": word.isdigit(), @@ -48,7 +50,7 @@ def _doc2features(doc: list, i: int) -> dict: prev_features = { "word.prevword": prevword, "word.previsspace": prevword.isspace(), - "word.previsthai": isthai(prevword), + "word.previsthai": is_thai(prevword), "word.prevstopword": _is_stopword(prevword), "word.prevpostag": prevpostag, "word.prevwordisdigit": prevword.isdigit(), @@ -65,7 +67,7 @@ def _doc2features(doc: list, i: int) -> dict: "word.nextword": nextword, "word.nextisspace": nextword.isspace(), "word.nextpostag": nextpostag, - "word.nextisthai": isthai(nextword), + "word.nextisthai": is_thai(nextword), "word.nextstopword": _is_stopword(nextword), "word.nextwordisdigit": nextword.isdigit(), } @@ -102,7 +104,9 @@ def __init__(self, version: str = 
"1.4") -> None: It's support Thai NER 1.4 & 1.5. The default value is `1.4` """ - from pycrfsuite import Tagger as CRFTagger + from pycrfsuite import ( + Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute] + ) self.crf: "CRFTagger" = CRFTagger() @@ -186,7 +190,7 @@ def get_ner( pos_tags = pos_tag( tokens, engine="perceptron", corpus=self.pos_tag_name ) - x_test = ThaiNameTagger.__extract_features(pos_tags) + x_test = ThaiNameTagger._extract_features(pos_tags) y = self.crf.tag(x_test) sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)] @@ -221,7 +225,7 @@ def get_ner( return sent_ner @staticmethod - def __extract_features( + def _extract_features( doc: list[tuple[str, str]], ) -> list[dict[str, Union[str, bool]]]: return [_doc2features(doc, i) for i in range(len(doc))] diff --git a/pythainlp/tokenize/crfcut.py b/pythainlp/tokenize/crfcut.py index 34941555a..dfdedfde4 100644 --- a/pythainlp/tokenize/crfcut.py +++ b/pythainlp/tokenize/crfcut.py @@ -126,7 +126,7 @@ } -def extract_features( +def _extract_features( doc: list[str], window: int = 2, max_n_gram: int = 3 ) -> list[list[str]]: """Extract features for CRF by sliding `max_n_gram` of tokens @@ -175,7 +175,7 @@ def extract_features( _CRFCUT_DATA_FILENAME: str = "sentenceseg_crfcut.model" -_tagger: pycrfsuite.Tagger = pycrfsuite.Tagger() +_tagger: pycrfsuite.Tagger = pycrfsuite.Tagger() # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-attribute] _tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME)) @@ -186,7 +186,7 @@ def segment(text: str) -> list[str]: :return: list of words, tokenized from the text """ toks = word_tokenize(text) - feat = extract_features(toks) + feat = _extract_features(toks) labs = _tagger.tag(feat) labs[-1] = "E" # make sure it cuts the last sentence diff --git a/pythainlp/tokenize/nlpo3.py b/pythainlp/tokenize/nlpo3.py index 05ed29e2b..71aa6d6d8 100644 --- a/pythainlp/tokenize/nlpo3.py 
+++ b/pythainlp/tokenize/nlpo3.py @@ -14,11 +14,10 @@ ) from nlpo3 import segment as nlpo3_segment # noqa: F401 -from pythainlp.corpus.common import _THAI_WORDS_FILENAME - _NLPO3_DEFAULT_DICT_NAME: str = ( "_73bcj049dzbu9t49b4va170k" # supposed to be unique ) +_NLPO3_WORDS_FILENAME: str = "words_th.txt" _NLPO3_DEFAULT_DICT: Optional[str] = None # Will be lazily loaded _dict_file_ctx: Optional[Any] = ( None # File context manager kept alive for program lifetime @@ -49,7 +48,7 @@ def _ensure_default_dict_loaded() -> None: # Double-check pattern to avoid race conditions if _NLPO3_DEFAULT_DICT is None: corpus_files = files("pythainlp.corpus") - dict_file = corpus_files.joinpath(_THAI_WORDS_FILENAME) + dict_file = corpus_files.joinpath(_NLPO3_WORDS_FILENAME) _dict_file_ctx = as_file(dict_file) dict_path = _dict_file_ctx.__enter__() msg, success = nlpo3_load_dict( diff --git a/pythainlp/tools/core.py b/pythainlp/tools/core.py index e4002cd15..8a18b5d5d 100644 --- a/pythainlp/tools/core.py +++ b/pythainlp/tools/core.py @@ -10,26 +10,30 @@ def warn_deprecation( - deprecated_func: str, - replacing_func: str = "", + deprecated_symbol: str, + replacing_symbol: str = "", deprecated_version: str = "", removal_version: str = "", ) -> None: - """Warn about the deprecation of a function. - - :param str deprecated_func: Name of the deprecated function. - :param str replacing_func: Name of the function to use instead (optional). - :param str deprecated_version: Version in which the function will be deprecated (optional). - :param str removal_version: Version in which the function will be removed (optional). + """Warn about the deprecation of a function, class, or other symbol. + + :param str deprecated_symbol: Fully qualified name of the deprecated + symbol (e.g. ``"pythainlp.util.isthaichar"``). + :param str replacing_symbol: Fully qualified name of the replacement + (optional). + :param str deprecated_version: Version in which the symbol was + deprecated (optional). 
+ :param str removal_version: Version in which the symbol will be + removed (optional). """ - message = f"The '{deprecated_func}' function is deprecated" + message = f"'{deprecated_symbol}' is deprecated" if deprecated_version: message += f" since {deprecated_version}" if not removal_version: removal_version = "a future release" message += f" and will be removed in {removal_version}." - if replacing_func: - message += f" Please use '{replacing_func}' instead." + if replacing_symbol: + message += f" Use '{replacing_symbol}' instead." warnings.warn(message, DeprecationWarning, stacklevel=2) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 305f726ae..76273b953 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -13,6 +13,7 @@ "collate", "contains_profanity", "convert_years", + "count_thai", "count_thai_chars", "countthai", "dict_trie", @@ -24,6 +25,8 @@ "find_keyword", "find_profanity", "ipa_to_rtgs", + "is_thai", + "is_thai_char", "isthai", "isthaichar", "longest_common_subsequence", @@ -129,9 +132,12 @@ from pythainlp.util.thai import ( analyze_thai_text, + count_thai, count_thai_chars, countthai, display_thai_char, + is_thai, + is_thai_char, isthai, isthaichar, thai_word_tone_detector, diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index f443c4e6f..37552bb27 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -7,6 +7,7 @@ import string from collections import defaultdict +from types import MappingProxyType from typing import Optional from pythainlp import ( @@ -21,6 +22,7 @@ thai_tonemarks, thai_vowels, ) +from pythainlp.tools import warn_deprecation _DEFAULT_IGNORE_CHARS: str = ( string.whitespace + string.digits + string.punctuation @@ -29,7 +31,8 @@ _TH_LAST_CHAR_ASCII: int = 3711 # A comprehensive map of Thai characters to their descriptive names. -THAI_CHAR_NAMES: dict[str, str] = { +# MappingProxyType makes this constant read-only at runtime. 
+_THAI_CHAR_NAMES: MappingProxyType[str, str] = MappingProxyType({ # Consonants **{char: char for char in thai_consonants}, # Vowels and Signs @@ -72,10 +75,10 @@ **{char: char for char in thai_digits}, # Symbol "\u0e3f": "฿", -} +}) -def isthaichar(ch: str) -> bool: +def is_thai_char(ch: str) -> bool: """Check if a character is a Thai character. :param ch: input character @@ -86,12 +89,12 @@ def isthaichar(ch: str) -> bool: :Example: :: - from pythainlp.util import isthaichar + from pythainlp.util import is_thai_char - isthaichar("ก") # THAI CHARACTER KO KAI + is_thai_char("ก") # THAI CHARACTER KO KAI # output: True - isthaichar("๕") # THAI DIGIT FIVE + is_thai_char("๕") # THAI DIGIT FIVE # output: True """ ch_val = ord(ch) @@ -100,7 +103,27 @@ def isthaichar(ch: str) -> bool: return False -def isthai(text: str, ignore_chars: str = ".") -> bool: +def isthaichar(ch: str) -> bool: + """Check if a character is a Thai character. + + .. deprecated:: 5.3.2 + Use :func:`is_thai_char` instead. + + :param ch: input character + :type ch: str + :return: True if ch is a Thai character, otherwise False. + :rtype: bool + """ + warn_deprecation( + "pythainlp.util.isthaichar", + "pythainlp.util.is_thai_char", + "5.3.2", + "6.0", + ) + return is_thai_char(ch) + + +def is_thai(text: str, ignore_chars: str = ".") -> bool: """Check if every character in a string is a Thai character. 
:param text: input text @@ -114,18 +137,18 @@ def isthai(text: str, ignore_chars: str = ".") -> bool: :Example: :: - from pythainlp.util import isthai + from pythainlp.util import is_thai - isthai("กาลเวลา") + is_thai("กาลเวลา") # output: True - isthai("กาลเวลา.") + is_thai("กาลเวลา.") # output: True - isthai("กาล-เวลา") + is_thai("กาล-เวลา") # output: False - isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,") + is_thai("กาล-เวลา +66", ignore_chars="01234567890+-.,") # output: True """ @@ -133,13 +156,36 @@ def isthai(text: str, ignore_chars: str = ".") -> bool: ignore_chars = "" for ch in text: - if ch not in ignore_chars and not isthaichar(ch): + if ch not in ignore_chars and not is_thai_char(ch): return False return True -def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: - """Find proportion of Thai characters in a given text +def isthai(text: str, ignore_chars: str = ".") -> bool: + """Check if every character in a string is a Thai character. + + .. deprecated:: 5.3.2 + Use :func:`is_thai` instead. + + :param text: input text + :type text: str + :param ignore_chars: characters to be ignored, defaults to "." + :type ignore_chars: str, optional + :return: True if every character in the input string is Thai, + otherwise False. + :rtype: bool + """ + warn_deprecation( + "pythainlp.util.isthai", + "pythainlp.util.is_thai", + "5.3.2", + "6.0", + ) + return is_thai(text, ignore_chars) + + +def count_thai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: + """Find proportion of Thai characters in a given text. 
:param text: input text :type text: str @@ -152,18 +198,18 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: :Example: :: - from pythainlp.util import countthai + from pythainlp.util import count_thai - countthai("ไทยเอ็นแอลพี 3.0") + count_thai("ไทยเอ็นแอลพี 3.0") # output: 100.0 - countthai("PyThaiNLP 3.0") + count_thai("PyThaiNLP 3.0") # output: 0.0 - countthai("ใช้งาน PyThaiNLP 3.0") + count_thai("ใช้งาน PyThaiNLP 3.0") # output: 40.0 - countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="") + count_thai("ใช้งาน PyThaiNLP 3.0", ignore_chars="") # output: 30.0 """ if not text or not isinstance(text, str): @@ -178,7 +224,7 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: for ch in text: if ch in ignore_chars: num_ignore += 1 - elif isthaichar(ch): + elif is_thai_char(ch): num_thai += 1 num_count = len(text) - num_ignore @@ -189,6 +235,29 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: return (num_thai / num_count) * 100 +def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: + """Find proportion of Thai characters in a given text. + + .. deprecated:: 5.3.2 + Use :func:`count_thai` instead. + + :param text: input text + :type text: str + :param ignore_chars: characters to be ignored, defaults to whitespace,\\ + digits, and punctuation marks. + :type ignore_chars: str, optional + :return: proportion of Thai characters in the text (percentage) + :rtype: float + """ + warn_deprecation( + "pythainlp.util.countthai", + "pythainlp.util.count_thai", + "5.3.2", + "6.0", + ) + return count_thai(text, ignore_chars) + + def display_thai_char(ch: str) -> str: """Prefix an underscore (_) to a high-position vowel or a tone mark, to ease readability. 
@@ -348,8 +417,8 @@ def analyze_thai_text(text: str) -> dict: # Iterate over each character in the input string for char in text: # Check if the character is in our mapping - if char in THAI_CHAR_NAMES: - name = THAI_CHAR_NAMES[char] + if char in _THAI_CHAR_NAMES: + name = _THAI_CHAR_NAMES[char] results[name] += 1 else: # If the character is not a known Thai character, classify it as character diff --git a/tests/compact/testc_parse.py b/tests/compact/testc_parse.py index 52d01147d..5166079b1 100644 --- a/tests/compact/testc_parse.py +++ b/tests/compact/testc_parse.py @@ -3,8 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +import warnings -from pythainlp.tag import chunk_parse, pos_tag +from pythainlp.chunk import CRFChunkParser, chunk_parse +from pythainlp.tag import pos_tag class ChunkParseTestCaseC(unittest.TestCase): @@ -13,3 +15,41 @@ def test_chunk_parse(self): w_p = pos_tag(tokens, engine="perceptron", corpus="orchid") self.assertIsNotNone(chunk_parse(w_p)) + + def test_crf_chunk_parser(self): + tokens = ["ผม", "รัก", "คุณ"] + w_p = pos_tag(tokens, engine="perceptron", corpus="orchid") + with CRFChunkParser() as parser: + result = parser.parse(w_p) + self.assertIsNotNone(result) + self.assertEqual(len(result), len(tokens)) + + def test_deprecated_tag_chunk_parse(self): + """pythainlp.tag.chunk_parse still works but emits DeprecationWarning.""" + from pythainlp.tag import chunk_parse as old_chunk_parse + + tokens = ["ผม", "รัก", "คุณ"] + w_p = pos_tag(tokens, engine="perceptron", corpus="orchid") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = old_chunk_parse(w_p) + self.assertIsNotNone(result) + self.assertTrue( + any(issubclass(warning.category, DeprecationWarning) for warning in w) + ) + + def test_deprecated_crfchunk(self): + """pythainlp.tag.crfchunk.CRFchunk still works but emits DeprecationWarning.""" + from pythainlp.tag.crfchunk import CRFchunk + + tokens = ["ผม", "รัก", "คุณ"] + w_p = 
pos_tag(tokens, engine="perceptron", corpus="orchid") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + chunker = CRFchunk() + self.assertTrue( + any(issubclass(warning.category, DeprecationWarning) for warning in w) + ) + result = chunker.parse(w_p) + self.assertIsNotNone(result) + self.assertEqual(len(result), len(tokens)) diff --git a/tests/core/test_tools.py b/tests/core/test_tools.py index a87751d7d..9b949087b 100644 --- a/tests/core/test_tools.py +++ b/tests/core/test_tools.py @@ -130,10 +130,10 @@ def test_warn_deprecation(self): self.assertIn("old_func", str(w[0].message)) self.assertIn("deprecated", str(w[0].message)) - # Test with replacement function + # Test with replacement symbol with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - warn_deprecation("old_func", replacing_func="new_func") + warn_deprecation("old_func", replacing_symbol="new_func") self.assertEqual(len(w), 1) self.assertIn("old_func", str(w[0].message)) self.assertIn("new_func", str(w[0].message)) @@ -153,7 +153,7 @@ def test_warn_deprecation(self): warnings.simplefilter("always") warn_deprecation( "old_func", - replacing_func="new_func", + replacing_symbol="new_func", deprecated_version="1.0", removal_version="2.0", ) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index 11d4ff8e9..35f34b055 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -10,8 +10,7 @@ from collections import Counter from datetime import date, datetime, time, timedelta, timezone -from pythainlp.corpus import _CORPUS_PATH, thai_words -from pythainlp.corpus.common import _THAI_WORDS_FILENAME +from pythainlp.corpus import corpus_path, thai_words from pythainlp.util import ( Trie, analyze_thai_text, @@ -19,6 +18,7 @@ bahttext, collate, convert_years, + count_thai, count_thai_chars, countthai, dict_trie, @@ -29,6 +29,8 @@ expand_maiyamok, find_keyword, ipa_to_rtgs, + is_thai, + is_thai_char, isthai, isthaichar, 
longest_common_subsequence, @@ -547,7 +549,7 @@ def test_trie(self): self.assertIsNotNone(dict_trie({"ลอง", "สร้าง", "Trie", "ลน"})) self.assertIsNotNone(dict_trie(thai_words())) self.assertIsNotNone( - dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)) + dict_trie(os.path.join(corpus_path(), "words_th.txt")) ) with self.assertRaises(TypeError): dict_trie("") @@ -716,6 +718,11 @@ def test_isthaichar(self): self.assertFalse(isthaichar("a")) self.assertFalse(isthaichar("0")) + def test_is_thai_char(self): + self.assertTrue(is_thai_char("ก")) + self.assertFalse(is_thai_char("a")) + self.assertFalse(is_thai_char("0")) + def test_isthai(self): self.assertTrue(isthai("ไทย")) self.assertTrue(isthai("ต.ค.")) @@ -724,6 +731,23 @@ def test_isthai(self): self.assertFalse(isthai("(ต.ค.)")) self.assertFalse(isthai("ต.ค.", ignore_chars=None)) + def test_is_thai(self): + self.assertTrue(is_thai("ไทย")) + self.assertTrue(is_thai("ต.ค.")) + self.assertTrue(is_thai("(ต.ค.)", ignore_chars=".()")) + self.assertFalse(is_thai("ไทย0")) + self.assertFalse(is_thai("(ต.ค.)")) + self.assertFalse(is_thai("ต.ค.", ignore_chars=None)) + + def test_count_thai(self): + self.assertEqual(count_thai(""), 0.0) + self.assertEqual(count_thai("123"), 0.0) + self.assertEqual(count_thai("1 2 3"), 0.0) + self.assertEqual(count_thai("ประเทศไทย"), 100.0) + self.assertEqual(count_thai("โรค COVID-19"), 37.5) + self.assertEqual(count_thai("(กกต.)", ".()"), 100.0) + self.assertEqual(count_thai("(กกต.)", None), 50.0) + def test_display_thai_char(self): self.assertEqual(display_thai_char("้"), "_้") self.assertEqual(display_thai_char("ป"), "ป")