Skip to content
Merged
19 changes: 18 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,24 @@ and this project adheres to

## [5.3.2] - 2026-03-20

This release focuses on security issues related to path traversal.
This release focuses on security issues related to path traversal
and on renaming functions to conform with PEP 8. Old function names
are still accessible, but migration to the new names is recommended.

### Added

- `pythainlp.chunk` module: canonical home for chunking/phrase-structure
parsing, following the NLTK `nltk.chunk` naming convention.

### Deprecated

The following names are deprecated and will be removed in 6.0 (#1339):

- `pythainlp.util.isthaichar()`: use `pythainlp.util.is_thai_char()`.
- `pythainlp.util.isthai()`: use `pythainlp.util.is_thai()`.
- `pythainlp.util.countthai()`: use `pythainlp.util.count_thai()`.
- `pythainlp.tag.crfchunk.CRFchunk`: use `pythainlp.chunk.CRFChunkParser`.
- `pythainlp.tag.chunk_parse()`: use `pythainlp.chunk.chunk_parse()`.

### Security

Expand Down
67 changes: 67 additions & 0 deletions pythainlp/chunk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai phrase structure (chunking) module.

This module provides chunk parsing for Thai text, following the
NLTK :mod:`nltk.chunk` naming convention.

:Example:

.. code-block:: python

from pythainlp.chunk import chunk_parse, CRFChunkParser
from pythainlp.tag import pos_tag

tokens = ["ผม", "รัก", "คุณ"]
tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

# Using the convenience function
print(chunk_parse(tokens_pos))
# output: ['B-NP', 'B-VP', 'I-VP']

# Using the class directly
with CRFChunkParser() as parser:
print(parser.parse(tokens_pos))
# output: ['B-NP', 'B-VP', 'I-VP']
"""
from __future__ import annotations

__all__: list[str] = [
"CRFChunkParser",
"chunk_parse",
]

from pythainlp.chunk.crfchunk import CRFChunkParser


def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine; currently only ``"crf"`` is
        supported.
    :param str corpus: corpus name for the CRF model; currently only
        ``"orchidpp"`` is supported.
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]
    :raises ValueError: if *engine* is not a supported engine name.

    :Example:

    .. code-block:: python

        from pythainlp.chunk import chunk_parse
        from pythainlp.tag import pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Fail fast on an unsupported engine instead of silently ignoring
    # the argument (previously `engine` was accepted but never read).
    if engine != "crf":
        raise ValueError(f"Chunk parse engine not supported: {engine}")
    _parser = CRFChunkParser(corpus=corpus)
    return _parser.parse(sent)
164 changes: 164 additions & 0 deletions pythainlp/chunk/crfchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""CRF-based Thai phrase structure (chunk) parser."""
from __future__ import annotations

from importlib.resources import as_file, files
from typing import TYPE_CHECKING, Any, Optional, Union

if TYPE_CHECKING:
import types
from contextlib import AbstractContextManager

from pycrfsuite import (
Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
)

from pythainlp.corpus import thai_stopwords


def _is_stopword(word: str) -> bool:
    """Return ``True`` if *word* is a Thai stopword."""
    stopwords = thai_stopwords()
    return word in stopwords


def _doc2features(
    tokens: list[tuple[str, str]], index: int
) -> dict[str, Union[str, bool]]:
    """Extract features for a single token in a POS-tagged sentence.

    Builds a CRF feature dictionary from a context window of up to two
    tokens on each side of ``tokens[index]``: the word itself, its POS
    tag, and a stopword flag, plus the same triple for neighbors where
    they exist. ``BOS``/``EOS`` markers are set only when there is no
    previous/next token at all.

    .. note::
        The feature-key strings below are the exact names the shipped
        CRF model was trained with. NOTE(review): the ``"prev-prevz-*"``
        keys look like typos of ``"prev-prev-*"``, but renaming them
        would break compatibility with the trained model — confirm
        against the training pipeline before changing.

    :param list[tuple[str, str]] tokens: POS-tagged sentence,
        a list of (word, POS-tag) pairs.
    :param int index: index of the token to extract features for.
    :return: feature dictionary for the token.
    :rtype: dict[str, Union[str, bool]]
    """
    word, pos = tokens[index]
    f: dict[str, Union[str, bool]] = {
        "word": word,
        "word_is_stopword": _is_stopword(word),
        "pos": pos,
    }
    # Token two positions back exists only when index >= 2.
    if index > 1:
        prevprevword, prevprevpos = tokens[index - 2]
        f["prev-prev-word"] = prevprevword
        f["prev-prevz-word_is_stopword"] = _is_stopword(prevprevword)
        f["prev-prevz-pos"] = prevprevpos
    if index > 0:
        prevword, prevpos = tokens[index - 1]
        f["prev-word"] = prevword
        f["prev-word_is_stopword"] = _is_stopword(prevword)
        f["prev-pos"] = prevpos
    else:
        # First token of the sentence: mark beginning-of-sentence.
        f["BOS"] = True
    # Token two positions ahead exists only when index <= len - 3.
    if index < len(tokens) - 2:
        nextnextword, nextnextpos = tokens[index + 2]
        f["nextnext-word"] = nextnextword
        f["nextnext-word_is_stopword"] = _is_stopword(nextnextword)
        f["nextnext-pos"] = nextnextpos
    if index < len(tokens) - 1:
        nextword, nextpos = tokens[index + 1]
        f["next-word"] = nextword
        f["next-word_is_stopword"] = _is_stopword(nextword)
        f["next-pos"] = nextpos
    else:
        # Last token of the sentence: mark end-of-sentence.
        f["EOS"] = True

    return f


def _extract_features(
    doc: list[tuple[str, str]],
) -> list[dict[str, Union[str, bool]]]:
    """Build the per-token feature sequence for a POS-tagged sentence."""
    features = []
    for position, _ in enumerate(doc):
        features.append(_doc2features(doc, position))
    return features


class CRFChunkParser:
    """CRF-based chunk parser for Thai text.

    Parses a POS-tagged sentence into phrase-structure chunks
    (IOB format), following the NLTK :class:`nltk.chunk.ChunkParserI`
    convention.

    This class supports the context manager protocol for deterministic
    resource cleanup:

    .. code-block:: python

        from pythainlp.chunk import CRFChunkParser

        with CRFChunkParser() as parser:
            result = parser.parse(tokens_pos)

    :param str corpus: corpus name for the CRF model
        (default: ``"orchidpp"``).
    :raises ValueError: if *corpus* is not a supported corpus name.
    """

    # Name of the corpus the loaded model was trained on.
    corpus: str
    # Context manager returned by importlib.resources.as_file();
    # kept open while the tagger holds the extracted model file.
    _model_file_ctx: Optional[AbstractContextManager[Any]]
    tagger: CRFTagger
    # Feature sequence from the most recent parse() call.
    xseq: list[dict[str, Union[str, bool]]]

    def __init__(self, corpus: str = "orchidpp") -> None:
        self.corpus = corpus
        self._model_file_ctx = None
        self.load_model(self.corpus)

    def load_model(self, corpus: str) -> None:
        """Load the CRF model for the given corpus.

        Safe to call repeatedly: any model file extracted by a previous
        call is released before the new one is opened.

        :param str corpus: corpus name; currently only ``"orchidpp"``
            is supported.
        :raises ValueError: if *corpus* is not a supported corpus name.
        """
        from pycrfsuite import (
            Tagger as CRFTagger, # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
        )

        if corpus != "orchidpp":
            # Fail fast: leaving the tagger without a model would only
            # surface as a confusing error later, inside parse().
            raise ValueError(f"Chunk parse corpus not supported: {corpus}")

        # Release the previously extracted model file (if any) so that
        # repeated load_model() calls do not leak temporary resources.
        self._close()
        self.tagger = CRFTagger()
        corpus_files = files("pythainlp.corpus")
        model_file = corpus_files.joinpath("crfchunk_orchidpp.model")
        self._model_file_ctx = as_file(model_file)
        model_path = self._model_file_ctx.__enter__()
        self.tagger.open(str(model_path))

    def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
        """Parse a POS-tagged sentence into IOB chunk labels.

        :param list[tuple[str, str]] token_pos: list of (word, POS-tag)
            pairs.
        :return: list of IOB chunk labels, one per token.
        :rtype: list[str]
        """
        self.xseq = _extract_features(token_pos)
        return self.tagger.tag(self.xseq)  # type: ignore[no-any-return]

    def _close(self) -> None:
        """Release the extracted model file, if any (idempotent).

        Clears ``_model_file_ctx`` before exiting it, so the cleanup
        runs at most once even if both ``__exit__`` and ``__del__``
        are invoked. Best-effort: cleanup failures are swallowed.
        """
        ctx, self._model_file_ctx = self._model_file_ctx, None
        if ctx is not None:
            try:
                ctx.__exit__(None, None, None)
            except Exception:  # noqa: S110
                pass

    def __enter__(self) -> CRFChunkParser:
        """Context manager entry."""
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        """Context manager exit — clean up resources."""
        self._close()

    def __del__(self) -> None:
        """Attempt resource cleanup on garbage collection.

        .. note::
            :meth:`__del__` is not guaranteed to be called.
            Use the context manager protocol for reliable cleanup.
        """
        self._close()
4 changes: 2 additions & 2 deletions pythainlp/spell/pn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import phupha, thai_orst_words
from pythainlp.util import isthaichar
from pythainlp.util import is_thai_char


def _no_filter(word: str) -> bool:
Expand All @@ -26,7 +26,7 @@ def _no_filter(word: str) -> bool:

def _is_thai_and_not_num(word: str) -> bool:
for ch in word:
if ch != "." and not isthaichar(ch):
if ch != "." and not is_thai_char(ch):
return False
if ch in thai_digits or ch in digits:
return False
Expand Down
5 changes: 5 additions & 0 deletions pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

Tagging each token in a sentence with supplementary information,
such as its part-of-speech (POS) tag, and named entity (NE) tag.

.. note::
:func:`chunk_parse` has moved to :mod:`pythainlp.chunk`.
Importing it from :mod:`pythainlp.tag` still works but emits a
:class:`DeprecationWarning` and will be removed in 6.0.
"""

__all__: list[str] = [
Expand Down
46 changes: 25 additions & 21 deletions pythainlp/tag/chunk.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,37 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Deprecated. Use :func:`pythainlp.chunk.chunk_parse` instead.

.. deprecated:: 5.3.2
:func:`chunk_parse` has moved to :mod:`pythainlp.chunk`.
"""
from __future__ import annotations

from pythainlp.chunk import chunk_parse as _chunk_parse
from pythainlp.tools import warn_deprecation


def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    .. deprecated:: 5.3.2
        Use :func:`pythainlp.chunk.chunk_parse` instead.

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine (default: ``"crf"``).
    :param str corpus: corpus name (default: ``"orchidpp"``).
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.tag import chunk_parse, pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Emit the deprecation notice, then delegate to the canonical
    # implementation in pythainlp.chunk.
    warn_deprecation(
        "pythainlp.tag.chunk_parse",
        "pythainlp.chunk.chunk_parse",
        "5.3.2",
        "6.0",
    )
    return _chunk_parse(sent, engine=engine, corpus=corpus)
Loading
Loading