Skip to content
Merged
19 changes: 18 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,24 @@ and this project adheres to

## [5.3.2] - 2026-03-20

This release focuses on security issues related to path traversal.
This release focuses on security issues related to path traversal
and on renaming functions to conform with PEP 8. Old function names
are still accessible, but migration to the new names is recommended.

### Added

- `pythainlp.chunk` module: canonical home for chunking/phrase-structure
parsing, following the NLTK `nltk.chunk` naming convention.

### Deprecated

The following names are deprecated and will be removed in 6.0 (#1339):

- `pythainlp.util.isthaichar()`: use `pythainlp.util.is_thai_char()`.
- `pythainlp.util.isthai()`: use `pythainlp.util.is_thai()`.
- `pythainlp.util.countthai()`: use `pythainlp.util.count_thai()`.
- `pythainlp.tag.crfchunk.CRFchunk`: use `pythainlp.chunk.CRFChunkParser`.
- `pythainlp.tag.chunk_parse()`: use `pythainlp.chunk.chunk_parse()`.

### Security

Expand Down
67 changes: 67 additions & 0 deletions pythainlp/chunk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai phrase structure (chunking) module.

This module provides chunk parsing for Thai text, following the
NLTK :mod:`nltk.chunk` naming convention.

:Example:

.. code-block:: python

from pythainlp.chunk import chunk_parse, CRFChunkParser
from pythainlp.tag import pos_tag

tokens = ["ผม", "รัก", "คุณ"]
tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

# Using the convenience function
print(chunk_parse(tokens_pos))
# output: ['B-NP', 'B-VP', 'I-VP']

# Using the class directly
with CRFChunkParser() as parser:
print(parser.parse(tokens_pos))
# output: ['B-NP', 'B-VP', 'I-VP']
"""
from __future__ import annotations

__all__: list[str] = [
"CRFChunkParser",
"chunk_parse",
]

from pythainlp.chunk.crfchunk import CRFChunkParser


def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine; currently only ``"crf"`` is
        supported.
    :param str corpus: corpus name for the CRF model; currently only
        ``"orchidpp"`` is supported.
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]
    :raises ValueError: if *engine* is not a supported engine name.

    :Example:

    .. code-block:: python

        from pythainlp.chunk import chunk_parse
        from pythainlp.tag import pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Fail fast on an unsupported engine instead of silently ignoring
    # the argument (previously `engine` was accepted but never read).
    if engine != "crf":
        raise ValueError(f"Chunk parse engine not supported: {engine}")
    _parser = CRFChunkParser(corpus=corpus)
    return _parser.parse(sent)
164 changes: 164 additions & 0 deletions pythainlp/chunk/crfchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""CRF-based Thai phrase structure (chunk) parser."""
from __future__ import annotations

from importlib.resources import as_file, files
from typing import TYPE_CHECKING, Any, Optional, Union

if TYPE_CHECKING:
import types
from contextlib import AbstractContextManager

from pycrfsuite import (
Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
)

from pythainlp.corpus import thai_stopwords


def _is_stopword(word: str) -> bool:
    """Return ``True`` if *word* is a Thai stopword."""
    stopwords = thai_stopwords()
    return word in stopwords


def _doc2features(
    tokens: list[tuple[str, str]], index: int
) -> dict[str, Union[str, bool]]:
    """Extract features for a single token in a POS-tagged sentence.

    Builds a CRF feature dictionary from a context window of up to two
    tokens on each side of ``tokens[index]``: the word itself, its POS
    tag, and a stopword flag, plus the same triple for neighbors where
    they exist. ``BOS``/``EOS`` markers are set only when there is no
    previous/next token at all.

    .. note::
        The feature-key strings below are the exact names the shipped
        CRF model was trained with. NOTE(review): the ``"prev-prevz-*"``
        keys look like typos of ``"prev-prev-*"``, but renaming them
        would break compatibility with the trained model — confirm
        against the training pipeline before changing.

    :param list[tuple[str, str]] tokens: POS-tagged sentence,
        a list of (word, POS-tag) pairs.
    :param int index: index of the token to extract features for.
    :return: feature dictionary for the token.
    :rtype: dict[str, Union[str, bool]]
    """
    word, pos = tokens[index]
    f: dict[str, Union[str, bool]] = {
        "word": word,
        "word_is_stopword": _is_stopword(word),
        "pos": pos,
    }
    # Token two positions back exists only when index >= 2.
    if index > 1:
        prevprevword, prevprevpos = tokens[index - 2]
        f["prev-prev-word"] = prevprevword
        f["prev-prevz-word_is_stopword"] = _is_stopword(prevprevword)
        f["prev-prevz-pos"] = prevprevpos
    if index > 0:
        prevword, prevpos = tokens[index - 1]
        f["prev-word"] = prevword
        f["prev-word_is_stopword"] = _is_stopword(prevword)
        f["prev-pos"] = prevpos
    else:
        # First token of the sentence: mark beginning-of-sentence.
        f["BOS"] = True
    # Token two positions ahead exists only when index <= len - 3.
    if index < len(tokens) - 2:
        nextnextword, nextnextpos = tokens[index + 2]
        f["nextnext-word"] = nextnextword
        f["nextnext-word_is_stopword"] = _is_stopword(nextnextword)
        f["nextnext-pos"] = nextnextpos
    if index < len(tokens) - 1:
        nextword, nextpos = tokens[index + 1]
        f["next-word"] = nextword
        f["next-word_is_stopword"] = _is_stopword(nextword)
        f["next-pos"] = nextpos
    else:
        # Last token of the sentence: mark end-of-sentence.
        f["EOS"] = True

    return f


def _extract_features(
    doc: list[tuple[str, str]],
) -> list[dict[str, Union[str, bool]]]:
    """Build the per-token feature sequence for a POS-tagged sentence."""
    features = []
    for position, _ in enumerate(doc):
        features.append(_doc2features(doc, position))
    return features


class CRFChunkParser:
    """CRF-based chunk parser for Thai text.

    Parses a POS-tagged sentence into phrase-structure chunks
    (IOB format), following the NLTK :class:`nltk.chunk.ChunkParserI`
    convention.

    This class supports the context manager protocol for deterministic
    resource cleanup:

    .. code-block:: python

        from pythainlp.chunk import CRFChunkParser

        with CRFChunkParser() as parser:
            result = parser.parse(tokens_pos)

    :param str corpus: corpus name for the CRF model
        (default: ``"orchidpp"``).
    :raises ValueError: if *corpus* is not a supported corpus name.
    """

    # Name of the corpus the loaded model was trained on.
    corpus: str
    # Context manager returned by importlib.resources.as_file();
    # kept open while the tagger holds the extracted model file.
    _model_file_ctx: Optional[AbstractContextManager[Any]]
    tagger: CRFTagger
    # Feature sequence from the most recent parse() call.
    xseq: list[dict[str, Union[str, bool]]]

    def __init__(self, corpus: str = "orchidpp") -> None:
        self.corpus = corpus
        self._model_file_ctx = None
        self.load_model(self.corpus)

    def load_model(self, corpus: str) -> None:
        """Load the CRF model for the given corpus.

        Safe to call repeatedly: any model file extracted by a previous
        call is released before the new one is opened.

        :param str corpus: corpus name; currently only ``"orchidpp"``
            is supported.
        :raises ValueError: if *corpus* is not a supported corpus name.
        """
        from pycrfsuite import (
            Tagger as CRFTagger, # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
        )

        if corpus != "orchidpp":
            # Fail fast: leaving the tagger without a model would only
            # surface as a confusing error later, inside parse().
            raise ValueError(f"Chunk parse corpus not supported: {corpus}")

        # Release the previously extracted model file (if any) so that
        # repeated load_model() calls do not leak temporary resources.
        self._close()
        self.tagger = CRFTagger()
        corpus_files = files("pythainlp.corpus")
        model_file = corpus_files.joinpath("crfchunk_orchidpp.model")
        self._model_file_ctx = as_file(model_file)
        model_path = self._model_file_ctx.__enter__()
        self.tagger.open(str(model_path))

    def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
        """Parse a POS-tagged sentence into IOB chunk labels.

        :param list[tuple[str, str]] token_pos: list of (word, POS-tag)
            pairs.
        :return: list of IOB chunk labels, one per token.
        :rtype: list[str]
        """
        self.xseq = _extract_features(token_pos)
        return self.tagger.tag(self.xseq)  # type: ignore[no-any-return]

    def _close(self) -> None:
        """Release the extracted model file, if any (idempotent).

        Clears ``_model_file_ctx`` before exiting it, so the cleanup
        runs at most once even if both ``__exit__`` and ``__del__``
        are invoked. Best-effort: cleanup failures are swallowed.
        """
        ctx, self._model_file_ctx = self._model_file_ctx, None
        if ctx is not None:
            try:
                ctx.__exit__(None, None, None)
            except Exception:  # noqa: S110
                pass

    def __enter__(self) -> CRFChunkParser:
        """Context manager entry."""
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        """Context manager exit — clean up resources."""
        self._close()

    def __del__(self) -> None:
        """Attempt resource cleanup on garbage collection.

        .. note::
            :meth:`__del__` is not guaranteed to be called.
            Use the context manager protocol for reliable cleanup.
        """
        self._close()
4 changes: 2 additions & 2 deletions pythainlp/spell/pn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import phupha, thai_orst_words
from pythainlp.util import isthaichar
from pythainlp.util import is_thai_char


def _no_filter(word: str) -> bool:
Expand All @@ -26,7 +26,7 @@ def _no_filter(word: str) -> bool:

def _is_thai_and_not_num(word: str) -> bool:
for ch in word:
if ch != "." and not isthaichar(ch):
if ch != "." and not is_thai_char(ch):
return False
if ch in thai_digits or ch in digits:
return False
Expand Down
5 changes: 5 additions & 0 deletions pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

Tagging each token in a sentence with supplementary information,
such as its part-of-speech (POS) tag, and named entity (NE) tag.

.. note::
:func:`chunk_parse` has moved to :mod:`pythainlp.chunk`.
Importing it from :mod:`pythainlp.tag` still works but emits a
:class:`DeprecationWarning` and will be removed in 6.0.
"""

__all__: list[str] = [
Expand Down
46 changes: 25 additions & 21 deletions pythainlp/tag/chunk.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,37 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Deprecated. Use :func:`pythainlp.chunk.chunk_parse` instead.

.. deprecated:: 5.3.2
:func:`chunk_parse` has moved to :mod:`pythainlp.chunk`.
"""
from __future__ import annotations

from pythainlp.chunk import chunk_parse as _chunk_parse
from pythainlp.tools import warn_deprecation


def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    .. deprecated:: 5.3.2
        Use :func:`pythainlp.chunk.chunk_parse` instead.

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine (default: ``"crf"``).
    :param str corpus: corpus name (default: ``"orchidpp"``).
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.tag import chunk_parse, pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Emit the deprecation notice, then delegate to the canonical
    # implementation in pythainlp.chunk.
    warn_deprecation(
        "pythainlp.tag.chunk_parse",
        "pythainlp.chunk.chunk_parse",
        "5.3.2",
        "6.0",
    )
    return _chunk_parse(sent, engine=engine, corpus=corpus)
Loading
Loading