From 59613e44ddecf00318abcde3480488286dd2ead1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 08:37:51 +0000 Subject: [PATCH 1/2] Initial plan From bdb69efdb1b60580d8806bb2bebdd7e0f643bb91 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 08:41:20 +0000 Subject: [PATCH 2/2] Add type hints, docstrings, and improve code quality in core.py Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- spacy_pythainlp/ __init__.py | 0 spacy_pythainlp/__init__.py | 12 ++ spacy_pythainlp/core.py | 346 +++++++++++++++++++++++------------ 3 files changed, 237 insertions(+), 121 deletions(-) delete mode 100644 spacy_pythainlp/ __init__.py create mode 100644 spacy_pythainlp/__init__.py diff --git a/spacy_pythainlp/ __init__.py b/spacy_pythainlp/ __init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spacy_pythainlp/__init__.py b/spacy_pythainlp/__init__.py new file mode 100644 index 0000000..1e837fe --- /dev/null +++ b/spacy_pythainlp/__init__.py @@ -0,0 +1,12 @@ +""" +spaCy-PyThaiNLP: Thai language support for spaCy using PyThaiNLP. + +This package provides a spaCy pipeline component that integrates PyThaiNLP's +Thai NLP capabilities, including tokenization, POS tagging, NER, sentence +segmentation, dependency parsing, and word vectors. +""" + +from spacy_pythainlp.core import PyThaiNLP + +__version__ = "0.1.0" +__all__ = ["PyThaiNLP"] diff --git a/spacy_pythainlp/core.py b/spacy_pythainlp/core.py index 6fcf225..1d8c790 100644 --- a/spacy_pythainlp/core.py +++ b/spacy_pythainlp/core.py @@ -1,5 +1,6 @@ -from pythainlp.tag import pos_tag +from typing import List, Optional +from pythainlp.tag import pos_tag from pythainlp.tokenize import ( word_tokenize, DEFAULT_SENT_TOKENIZE_ENGINE, @@ -13,10 +14,17 @@ DEFAULT_POS_ENGINE = "perceptron" DEFAULT_NER_ENGINE = "thainer" +# Constants for sentence splitting +SENTENCE_SPLIT_MARKER = "SPLIT" + +# Constants for NER tags +NER_TAG_BEGIN = "B-" +NER_TAG_OUTSIDE = "O" + @Language.factory( "pythainlp", - assigns=["token.pos","token.is_sent_start","doc.ents"], + assigns=["token.pos", "token.is_sent_start", "doc.ents"], default_config={ "pos_engine": DEFAULT_POS_ENGINE, "pos": True, @@ -36,30 +44,56 @@ ) class PyThaiNLP: """ - SpaCy - PyThaiNLP + SpaCy pipeline component for Thai language processing using PyThaiNLP. + + This component provides Thai-specific NLP capabilities including: + - Word tokenization + - Part-of-speech tagging + - Named entity recognition + - Sentence segmentation + - Dependency parsing + - Word vectors """ def __init__( self, - nlp, - name, - tokenize_engine, - pos_engine, - sent_engine, - ner_engine, - dependency_parsing_engine, - tokenize, - pos, - sent, - ner, - dependency_parsing, - word_vector, - dependency_parsing_model, - word_vector_model, - pos_corpus - ): - """ - Initialize + nlp: Language, + name: str, + tokenize_engine: str, + pos_engine: str, + sent_engine: str, + ner_engine: str, + dependency_parsing_engine: str, + tokenize: bool, + pos: bool, + sent: bool, + ner: bool, + dependency_parsing: bool, + word_vector: bool, + dependency_parsing_model: Optional[str], + word_vector_model: str, + pos_corpus: str + ) -> None: + """ + Initialize the PyThaiNLP pipeline component. + + Args: + nlp: The spaCy Language object + name: Name of the pipeline component + tokenize_engine: Engine for word tokenization + pos_engine: Engine for part-of-speech tagging + sent_engine: Engine for sentence segmentation + ner_engine: Engine for named entity recognition + dependency_parsing_engine: Engine for dependency parsing + tokenize: Enable word tokenization + pos: Enable part-of-speech tagging + sent: Enable sentence segmentation + ner: Enable named entity recognition + dependency_parsing: Enable dependency parsing + word_vector: Enable word vectors + dependency_parsing_model: Model for dependency parsing + word_vector_model: Model for word vectors + pos_corpus: Corpus for POS tagging """ self.nlp = nlp self.word_vector = word_vector @@ -82,7 +116,16 @@ def __init__( from pythainlp.tag import NER self.ner = NER(engine=self.ner_engine) - def __call__(self, doc:Doc): + def __call__(self, doc: Doc) -> Doc: + """ + Process a Doc object through the PyThaiNLP pipeline. + + Args: + doc: The spaCy Doc to process + + Returns: + The processed Doc with Thai NLP annotations + """ if self.dependency_parsing: doc = self._dep(doc) self.on_tokenize = False @@ -97,147 +140,208 @@ def __call__(self, doc:Doc): doc = self._ner(doc) return doc - def _tokenize(self, doc:Doc): + def _tokenize(self, doc: Doc) -> Doc: + """ + Tokenize text using PyThaiNLP tokenizer. + + Args: + doc: The spaCy Doc to tokenize + + Returns: + New Doc with tokenized words + """ words = list(word_tokenize(doc.text, engine=self.tokenize_engine)) - spaces = [False for i in words] + spaces = [False] * len(words) return Doc(self.nlp.vocab, words=words, spaces=spaces) - def _pos(self, doc:Doc): - _pos_tag = [] + def _pos(self, doc: Doc) -> Doc: + """ + Add part-of-speech tags to tokens. + + Args: + doc: The spaCy Doc to tag + + Returns: + Doc with POS tags added + """ + pos_tags = [] if doc.is_sentenced: - _list_txt = [[j.text for j in i] for i in list(doc.sents)] + list_txt = [[token.text for token in sent] for sent in doc.sents] else: - _list_txt = [[j.text for j in doc]] - for i in _list_txt: - _word = i - _tag_ = pos_tag(_word, engine=self.pos_engine, corpus=self.pos_corpus) - _pos_tag.extend([tag for _,tag in _tag_]) - for i,_ in enumerate(_pos_tag): - doc[i].pos_ = _pos_tag[i] + list_txt = [[token.text for token in doc]] + for words in list_txt: + tagged = pos_tag(words, engine=self.pos_engine, corpus=self.pos_corpus) + pos_tags.extend([tag for _, tag in tagged]) + for i in range(len(pos_tags)): + doc[i].pos_ = pos_tags[i] return doc - def _sent(self, doc:Doc): + def _sent(self, doc: Doc) -> Doc: + """ + Add sentence boundaries to the document. + + Args: + doc: The spaCy Doc to segment + + Returns: + Doc with sentence boundaries marked + """ from pythainlp.tokenize import sent_tokenize - _text = sent_tokenize(str(doc.text), engine=self.sent_engine) - _doc = word_tokenize('SPLIT'.join(_text), engine=self.tokenize_engine) + sentences = sent_tokenize(str(doc.text), engine=self.sent_engine) + tokenized = word_tokenize(SENTENCE_SPLIT_MARKER.join(sentences), engine=self.tokenize_engine) number_skip = 0 seen_break = False - _new_cut = [] - for i, word in enumerate(_doc): - if 'SPLIT' in word: - if word.startswith("SPLIT"): - _new_cut.append("SPLIT") - _new_cut.append(word.replace('SPLIT','')) - elif word.endswith("SPLIT"): - _new_cut.append(word.replace('SPLIT','')) - _new_cut.append("SPLIT") + new_tokens = [] + for word in tokenized: + if SENTENCE_SPLIT_MARKER in word: + if word.startswith(SENTENCE_SPLIT_MARKER): + new_tokens.append(SENTENCE_SPLIT_MARKER) + new_tokens.append(word.replace(SENTENCE_SPLIT_MARKER, '')) + elif word.endswith(SENTENCE_SPLIT_MARKER): + new_tokens.append(word.replace(SENTENCE_SPLIT_MARKER, '')) + new_tokens.append(SENTENCE_SPLIT_MARKER) else: - _new_cut.append(word) + new_tokens.append(word) else: - _new_cut.append(word) - for i, word in enumerate(_new_cut): - if i-number_skip == len(doc) -1: + new_tokens.append(word) + for i, word in enumerate(new_tokens): + if i - number_skip == len(doc) - 1: break elif i == 0: - doc[i-number_skip].is_sent_start = True + doc[i - number_skip].is_sent_start = True elif seen_break: - doc[i-number_skip].is_sent_start = True + doc[i - number_skip].is_sent_start = True seen_break = False - elif 'SPLIT' in word: + elif SENTENCE_SPLIT_MARKER in word: seen_break = True number_skip += 1 else: - doc[i-number_skip].is_sent_start = False + doc[i - number_skip].is_sent_start = False return doc - def _dep(self, doc:Doc): + def _dep(self, doc: Doc) -> Doc: + """ + Perform dependency parsing on the document. + + Args: + doc: The spaCy Doc to parse + + Returns: + New Doc with dependency annotations + + Raises: + ValueError: If dependency parsing output has fewer than 10 fields + """ from pythainlp.parse import dependency_parsing text = str(doc.text) words = [] spaces = [] - pos = [] - tags = [] - morphs = [] + pos_tags = [] deps = [] heads = [] - lemmas = [] - offset = 0 - _dep_temp = dependency_parsing(text, model=self.dependency_parsing_model, engine=self.dependency_parsing_engine, tag="list") - for i in _dep_temp: - # Handle variable number of fields returned by dependency_parsing - # CoNLL-U format requires at least 10 fields, but some engines may return more - if len(i) < 10: - raise ValueError(f"Expected at least 10 fields in dependency parsing output, got {len(i)}") - # Only unpack the first 10 fields we need (CoNLL-U format) - idx, word, _, postag, _, _, head, dep, _, space = i[:10] + + dep_output = dependency_parsing( + text, + model=self.dependency_parsing_model, + engine=self.dependency_parsing_engine, + tag="list" + ) + + for fields in dep_output: + if len(fields) < 10: + raise ValueError(f"Expected at least 10 fields in dependency parsing output, got {len(fields)}") + # Extract CoNLL-U format fields (only first 10) + idx, word, _, postag, _, _, head, dep, _, space = fields[:10] words.append(word) - pos.append(postag) + pos_tags.append(postag) heads.append(int(head)) deps.append(dep) - if space == '_': - spaces.append(True) - else: - spaces.append(False) - return Doc(self.nlp.vocab, words=words, spaces=spaces,pos=pos,deps=deps,heads=heads) + spaces.append(space == '_') + + return Doc(self.nlp.vocab, words=words, spaces=spaces, pos=pos_tags, deps=deps, heads=heads) + + def _ner(self, doc: Doc) -> Doc: + """ + Add named entity recognition tags to the document. - def _ner(self, doc:Doc): - _list_txt = [] + Args: + doc: The spaCy Doc to tag + + Returns: + Doc with named entities added + """ + # Extract text segments if doc.is_sentenced: - _list_txt = [i.text for i in list(doc.sents)] + text_segments = [sent.text for sent in doc.sents] else: - _list_txt = [j.text for j in doc] - _ner_ =[] - for i in _list_txt: - _ner_.extend(self.ner.tag(i, pos=False)) - _new_ner = [] - c=0 - _t="" - for i,(w, tag) in enumerate(_ner_): - len_w = len(w) - if i+1 == len(_ner_) and _t != "": - _new_ner[-1][1] = c+len_w - elif i+1 == len(_ner_) and tag.startswith("B-"): - _t = tag.replace("B-","") - _new_ner.append([c,c+len_w,_t]) - elif tag.startswith("B-") and _t=="": - _t = tag.replace("B-","") - _new_ner.append([c,None,_t]) - elif tag.startswith("B-") and _t!="": - _new_ner[-1][1] = c - _t = tag.replace("B-","") - _new_ner.append([c,None,_t]) - elif tag == "O" and _t!="": - _new_ner[-1][1] = c - _t="" - c+=len_w - _ents = [] - for start, end, label in _new_ner: + text_segments = [token.text for token in doc] + + # Get NER tags for all segments + ner_tags = [] + for segment in text_segments: + ner_tags.extend(self.ner.tag(segment, pos=False)) + + # Merge consecutive entity tokens into spans + entity_spans = [] + char_offset = 0 + current_entity_label = "" + + for i, (word, tag) in enumerate(ner_tags): + word_length = len(word) + is_last = (i + 1 == len(ner_tags)) + + if is_last and current_entity_label: + entity_spans[-1][1] = char_offset + word_length + elif is_last and tag.startswith(NER_TAG_BEGIN): + entity_label = tag.replace(NER_TAG_BEGIN, "") + entity_spans.append([char_offset, char_offset + word_length, entity_label]) + elif tag.startswith(NER_TAG_BEGIN) and not current_entity_label: + current_entity_label = tag.replace(NER_TAG_BEGIN, "") + entity_spans.append([char_offset, None, current_entity_label]) + elif tag.startswith(NER_TAG_BEGIN) and current_entity_label: + entity_spans[-1][1] = char_offset + current_entity_label = tag.replace(NER_TAG_BEGIN, "") + entity_spans.append([char_offset, None, current_entity_label]) + elif tag == NER_TAG_OUTSIDE and current_entity_label: + entity_spans[-1][1] = char_offset + current_entity_label = "" + + char_offset += word_length + + # Create entity spans + entities = [] + for start, end, label in entity_spans: span = doc.char_span(start, end, label=label, alignment_mode="contract") - if span is None: - pass - else: - _ents.append(span) - - doc.ents = _ents + if span is not None: + entities.append(span) + + doc.ents = entities return doc - def _vec(self): + def _vec(self) -> None: + """ + Load word vectors into the spaCy vocabulary. + """ from pythainlp.word_vector import WordVector - _wv = WordVector(model_name=self.word_vector_model) - self.nlp.vocab.reset_vectors(width=_wv.model["แมว"].shape[0]) - _temp = list(dict(_wv.model.key_to_index).keys()) - for i in _temp: - self.nlp.vocab[i].vector = _wv.model[i] + wv = WordVector(model_name=self.word_vector_model) + self.nlp.vocab.reset_vectors(width=wv.model["แมว"].shape[0]) + words = list(dict(wv.model.key_to_index).keys()) + for word in words: + self.nlp.vocab[word].vector = wv.model[word] - def to_bytes(self, **kwargs): + def to_bytes(self, **kwargs) -> bytes: + """Serialize the component to bytes.""" return b"" - def from_bytes(self, _bytes_data, **kwargs): + def from_bytes(self, _bytes_data: bytes, **kwargs) -> "PyThaiNLP": + """Deserialize the component from bytes.""" return self - def to_disk(self, _path, **kwargs): + def to_disk(self, _path: str, **kwargs) -> None: + """Serialize the component to disk.""" return None - def from_disk(self, _path, **kwargs): + def from_disk(self, _path: str, **kwargs) -> "PyThaiNLP": + """Deserialize the component from disk.""" return self \ No newline at end of file