From 59613e44ddecf00318abcde3480488286dd2ead1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 08:37:51 +0000
Subject: [PATCH 1/2] Initial plan


From bdb69efdb1b60580d8806bb2bebdd7e0f643bb91 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 08:41:20 +0000
Subject: [PATCH 2/2] Add type hints, docstrings, and improve code quality in
 core.py

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
---
 spacy_pythainlp/ __init__.py |   0
 spacy_pythainlp/__init__.py  |  12 ++
 spacy_pythainlp/core.py      | 346 +++++++++++++++++++++++------------
 3 files changed, 237 insertions(+), 121 deletions(-)
 delete mode 100644 spacy_pythainlp/ __init__.py
 create mode 100644 spacy_pythainlp/__init__.py

diff --git a/spacy_pythainlp/ __init__.py b/spacy_pythainlp/ __init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/spacy_pythainlp/__init__.py b/spacy_pythainlp/__init__.py
new file mode 100644
index 0000000..1e837fe
--- /dev/null
+++ b/spacy_pythainlp/__init__.py
@@ -0,0 +1,12 @@
+"""
+spaCy-PyThaiNLP: Thai language support for spaCy using PyThaiNLP.
+
+This package provides a spaCy pipeline component that integrates PyThaiNLP's
+Thai NLP capabilities, including tokenization, POS tagging, NER, sentence
+segmentation, dependency parsing, and word vectors.
+"""
+
+from spacy_pythainlp.core import PyThaiNLP
+
+__version__ = "0.1.0"
+__all__ = ["PyThaiNLP"]
diff --git a/spacy_pythainlp/core.py b/spacy_pythainlp/core.py
index 6fcf225..1d8c790 100644
--- a/spacy_pythainlp/core.py
+++ b/spacy_pythainlp/core.py
@@ -1,5 +1,6 @@
-from pythainlp.tag import pos_tag
+from typing import List, Optional
 
+from pythainlp.tag import pos_tag
 from pythainlp.tokenize import (
     word_tokenize,
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -13,10 +14,17 @@
 DEFAULT_POS_ENGINE = "perceptron"
 DEFAULT_NER_ENGINE = "thainer"
 
+# Constants for sentence splitting
+SENTENCE_SPLIT_MARKER = "SPLIT"
+
+# Constants for NER tags
+NER_TAG_BEGIN = "B-"
+NER_TAG_OUTSIDE = "O"
+
 
 @Language.factory(
     "pythainlp",
-    assigns=["token.pos","token.is_sent_start","doc.ents"],
+    assigns=["token.pos", "token.is_sent_start", "doc.ents"],
     default_config={
         "pos_engine": DEFAULT_POS_ENGINE,
         "pos": True,
@@ -36,30 +44,56 @@
 )
 class PyThaiNLP:
     """
-    SpaCy - PyThaiNLP
+    spaCy pipeline component for Thai language processing using PyThaiNLP.
+
+    This component provides Thai-specific NLP capabilities including:
+    - Word tokenization
+    - Part-of-speech tagging
+    - Named entity recognition
+    - Sentence segmentation
+    - Dependency parsing
+    - Word vectors
     """
 
     def __init__(
         self,
-        nlp,
-        name,
-        tokenize_engine,
-        pos_engine,
-        sent_engine,
-        ner_engine,
-        dependency_parsing_engine,
-        tokenize,
-        pos,
-        sent,
-        ner,
-        dependency_parsing,
-        word_vector,
-        dependency_parsing_model,
-        word_vector_model,
-        pos_corpus
-    ):
-        """
-        Initialize
+        nlp: Language,
+        name: str,
+        tokenize_engine: str,
+        pos_engine: str,
+        sent_engine: str,
+        ner_engine: str,
+        dependency_parsing_engine: str,
+        tokenize: bool,
+        pos: bool,
+        sent: bool,
+        ner: bool,
+        dependency_parsing: bool,
+        word_vector: bool,
+        dependency_parsing_model: Optional[str],
+        word_vector_model: str,
+        pos_corpus: str
+    ) -> None:
+        """
+        Initialize the PyThaiNLP pipeline component.
+
+        Args:
+            nlp: The spaCy Language object
+            name: Name of the pipeline component
+            tokenize_engine: Engine for word tokenization
+            pos_engine: Engine for part-of-speech tagging
+            sent_engine: Engine for sentence segmentation
+            ner_engine: Engine for named entity recognition
+            dependency_parsing_engine: Engine for dependency parsing
+            tokenize: Enable word tokenization
+            pos: Enable part-of-speech tagging
+            sent: Enable sentence segmentation
+            ner: Enable named entity recognition
+            dependency_parsing: Enable dependency parsing
+            word_vector: Enable word vectors
+            dependency_parsing_model: Model for dependency parsing
+            word_vector_model: Model for word vectors
+            pos_corpus: Corpus for POS tagging
         """
         self.nlp = nlp
         self.word_vector = word_vector
@@ -82,7 +116,16 @@ def __init__(
             from pythainlp.tag import NER
             self.ner = NER(engine=self.ner_engine)
 
-    def __call__(self, doc:Doc):
+    def __call__(self, doc: Doc) -> Doc:
+        """
+        Process a Doc object through the PyThaiNLP pipeline.
+
+        Args:
+            doc: The spaCy Doc to process
+
+        Returns:
+            The processed Doc with Thai NLP annotations
+        """
         if self.dependency_parsing:
             doc = self._dep(doc)
             self.on_tokenize = False
@@ -97,147 +140,208 @@ def __call__(self, doc:Doc):
             doc = self._ner(doc)
         return doc
     
-    def _tokenize(self, doc:Doc):
+    def _tokenize(self, doc: Doc) -> Doc:
+        """
+        Tokenize text using PyThaiNLP tokenizer.
+
+        Args:
+            doc: The spaCy Doc to tokenize
+
+        Returns:
+            New Doc with tokenized words
+        """
         words = list(word_tokenize(doc.text, engine=self.tokenize_engine))
-        spaces = [False for i in words]
+        spaces = [False] * len(words)
         return Doc(self.nlp.vocab, words=words, spaces=spaces)
 
-    def _pos(self, doc:Doc):
-        _pos_tag = []
+    def _pos(self, doc: Doc) -> Doc:
+        """
+        Add part-of-speech tags to tokens.
+
+        Args:
+            doc: The spaCy Doc to tag
+
+        Returns:
+            Doc with POS tags added
+        """
+        pos_tags = []
         if doc.is_sentenced:
-            _list_txt = [[j.text for j in i] for i in list(doc.sents)]
+            list_txt = [[token.text for token in sent] for sent in doc.sents]
         else:
-            _list_txt = [[j.text for j in doc]]
-        for i in _list_txt:
-            _word = i
-            _tag_ = pos_tag(_word, engine=self.pos_engine, corpus=self.pos_corpus)
-            _pos_tag.extend([tag for _,tag in _tag_])
-        for i,_ in enumerate(_pos_tag):
-            doc[i].pos_ = _pos_tag[i]
+            list_txt = [[token.text for token in doc]]
+        for words in list_txt:
+            tagged = pos_tag(words, engine=self.pos_engine, corpus=self.pos_corpus)
+            pos_tags.extend([tag for _, tag in tagged])
+        for i in range(len(pos_tags)):
+            doc[i].pos_ = pos_tags[i]
         return doc
 
-    def _sent(self, doc:Doc):
+    def _sent(self, doc: Doc) -> Doc:
+        """
+        Add sentence boundaries to the document.
+
+        Args:
+            doc: The spaCy Doc to segment
+
+        Returns:
+            Doc with sentence boundaries marked
+        """
         from pythainlp.tokenize import sent_tokenize
-        _text = sent_tokenize(str(doc.text), engine=self.sent_engine)
-        _doc = word_tokenize('SPLIT'.join(_text), engine=self.tokenize_engine)
+        sentences = sent_tokenize(str(doc.text), engine=self.sent_engine)
+        tokenized = word_tokenize(SENTENCE_SPLIT_MARKER.join(sentences), engine=self.tokenize_engine)
         number_skip = 0
         seen_break = False
-        _new_cut = []
-        for i, word in enumerate(_doc):
-            if 'SPLIT' in word:
-                if word.startswith("SPLIT"):
-                    _new_cut.append("SPLIT")
-                    _new_cut.append(word.replace('SPLIT',''))
-                elif word.endswith("SPLIT"):
-                    _new_cut.append(word.replace('SPLIT',''))
-                    _new_cut.append("SPLIT")
+        new_tokens = []
+        for word in tokenized:
+            if SENTENCE_SPLIT_MARKER in word:
+                if word.startswith(SENTENCE_SPLIT_MARKER):
+                    new_tokens.append(SENTENCE_SPLIT_MARKER)
+                    new_tokens.append(word.replace(SENTENCE_SPLIT_MARKER, ''))
+                elif word.endswith(SENTENCE_SPLIT_MARKER):
+                    new_tokens.append(word.replace(SENTENCE_SPLIT_MARKER, ''))
+                    new_tokens.append(SENTENCE_SPLIT_MARKER)
                 else:
-                    _new_cut.append(word)
+                    new_tokens.append(word)
             else:
-                _new_cut.append(word)
-        for i, word in enumerate(_new_cut):
-            if i-number_skip == len(doc) -1:
+                new_tokens.append(word)
+        for i, word in enumerate(new_tokens):
+            if i - number_skip == len(doc) - 1:
                 break
             elif i == 0:
-                doc[i-number_skip].is_sent_start = True
+                doc[i - number_skip].is_sent_start = True
             elif seen_break:
-                doc[i-number_skip].is_sent_start = True
+                doc[i - number_skip].is_sent_start = True
                 seen_break = False
-            elif 'SPLIT' in word:
+            elif SENTENCE_SPLIT_MARKER in word:
                 seen_break = True
                 number_skip += 1
             else:
-                doc[i-number_skip].is_sent_start = False
+                doc[i - number_skip].is_sent_start = False
         return doc
 
-    def _dep(self, doc:Doc):
+    def _dep(self, doc: Doc) -> Doc:
+        """
+        Perform dependency parsing on the document.
+
+        Args:
+            doc: The spaCy Doc to parse
+
+        Returns:
+            New Doc with dependency annotations
+
+        Raises:
+            ValueError: If dependency parsing output has fewer than 10 fields
+        """
         from pythainlp.parse import dependency_parsing
         text = str(doc.text)
         words = []
         spaces = []
-        pos = []
-        tags = []
-        morphs = []
+        pos_tags = []
         deps = []
         heads = []
-        lemmas = []
-        offset = 0
-        _dep_temp = dependency_parsing(text, model=self.dependency_parsing_model, engine=self.dependency_parsing_engine, tag="list")
-        for i in _dep_temp:
-            # Handle variable number of fields returned by dependency_parsing
-            # CoNLL-U format requires at least 10 fields, but some engines may return more
-            if len(i) < 10:
-                raise ValueError(f"Expected at least 10 fields in dependency parsing output, got {len(i)}")
-            # Only unpack the first 10 fields we need (CoNLL-U format)
-            idx, word, _, postag, _, _, head, dep, _, space = i[:10]
+        
+        dep_output = dependency_parsing(
+            text,
+            model=self.dependency_parsing_model,
+            engine=self.dependency_parsing_engine,
+            tag="list"
+        )
+        
+        for fields in dep_output:
+            if len(fields) < 10:
+                raise ValueError(f"Expected at least 10 fields in dependency parsing output, got {len(fields)}")
+            # Extract CoNLL-U format fields (only first 10)
+            idx, word, _, postag, _, _, head, dep, _, space = fields[:10]
             words.append(word)
-            pos.append(postag)
+            pos_tags.append(postag)
             heads.append(int(head))
             deps.append(dep)
-            if space == '_':
-                spaces.append(True)
-            else:
-                spaces.append(False)
-        return Doc(self.nlp.vocab, words=words, spaces=spaces,pos=pos,deps=deps,heads=heads)
+            spaces.append(space == '_')
+        
+        return Doc(self.nlp.vocab, words=words, spaces=spaces, pos=pos_tags, deps=deps, heads=heads)
+
 
+    def _ner(self, doc: Doc) -> Doc:
+        """
+        Add named entity recognition tags to the document.
 
-    def _ner(self, doc:Doc):
-        _list_txt = []
+        Args:
+            doc: The spaCy Doc to tag
+
+        Returns:
+            Doc with named entities added
+        """
+        # Extract text segments
         if doc.is_sentenced:
-            _list_txt = [i.text for i in list(doc.sents)]
+            text_segments = [sent.text for sent in doc.sents]
         else:
-            _list_txt = [j.text for j in doc]
-        _ner_ =[]
-        for i in _list_txt:
-            _ner_.extend(self.ner.tag(i, pos=False))
-        _new_ner = []
-        c=0
-        _t=""
-        for i,(w, tag) in enumerate(_ner_):
-            len_w = len(w)
-            if i+1 == len(_ner_) and _t != "":
-                _new_ner[-1][1] = c+len_w
-            elif i+1 == len(_ner_) and tag.startswith("B-"):
-                _t =  tag.replace("B-","")
-                _new_ner.append([c,c+len_w,_t])
-            elif tag.startswith("B-") and _t=="":
-                _t = tag.replace("B-","")
-                _new_ner.append([c,None,_t])
-            elif tag.startswith("B-") and _t!="":
-                _new_ner[-1][1] = c
-                _t =  tag.replace("B-","")
-                _new_ner.append([c,None,_t])
-            elif tag == "O" and _t!="":
-                _new_ner[-1][1] = c
-                _t=""
-            c+=len_w
-        _ents = []
-        for start, end, label in _new_ner:
+            text_segments = [token.text for token in doc]
+        
+        # Get NER tags for all segments
+        ner_tags = []
+        for segment in text_segments:
+            ner_tags.extend(self.ner.tag(segment, pos=False))
+        
+        # Merge consecutive entity tokens into spans
+        entity_spans = []
+        char_offset = 0
+        current_entity_label = ""
+        
+        for i, (word, tag) in enumerate(ner_tags):
+            word_length = len(word)
+            is_last = (i + 1 == len(ner_tags))
+            
+            if is_last and current_entity_label:
+                entity_spans[-1][1] = char_offset + word_length
+            elif is_last and tag.startswith(NER_TAG_BEGIN):
+                entity_label = tag.replace(NER_TAG_BEGIN, "")
+                entity_spans.append([char_offset, char_offset + word_length, entity_label])
+            elif tag.startswith(NER_TAG_BEGIN) and not current_entity_label:
+                current_entity_label = tag.replace(NER_TAG_BEGIN, "")
+                entity_spans.append([char_offset, None, current_entity_label])
+            elif tag.startswith(NER_TAG_BEGIN) and current_entity_label:
+                entity_spans[-1][1] = char_offset
+                current_entity_label = tag.replace(NER_TAG_BEGIN, "")
+                entity_spans.append([char_offset, None, current_entity_label])
+            elif tag == NER_TAG_OUTSIDE and current_entity_label:
+                entity_spans[-1][1] = char_offset
+                current_entity_label = ""
+            
+            char_offset += word_length
+        
+        # Create entity spans
+        entities = []
+        for start, end, label in entity_spans:
             span = doc.char_span(start, end, label=label, alignment_mode="contract")
-            if span is None:
-                pass
-            else:
-                _ents.append(span)
-
-        doc.ents = _ents
+            if span is not None:
+                entities.append(span)
+        
+        doc.ents = entities
         return doc
 
-    def _vec(self):
+    def _vec(self) -> None:
+        """
+        Load word vectors into the spaCy vocabulary.
+        """
         from pythainlp.word_vector import WordVector
-        _wv = WordVector(model_name=self.word_vector_model)
-        self.nlp.vocab.reset_vectors(width=_wv.model["แมว"].shape[0])
-        _temp = list(dict(_wv.model.key_to_index).keys())
-        for i in _temp:
-            self.nlp.vocab[i].vector = _wv.model[i]
+        wv = WordVector(model_name=self.word_vector_model)
+        self.nlp.vocab.reset_vectors(width=wv.model["แมว"].shape[0])
+        words = list(dict(wv.model.key_to_index).keys())
+        for word in words:
+            self.nlp.vocab[word].vector = wv.model[word]
 
-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
+        """Serialize the component to bytes (no state to persist; returns empty bytes)."""
         return b""
 
-    def from_bytes(self, _bytes_data, **kwargs):
+    def from_bytes(self, _bytes_data: bytes, **kwargs) -> "PyThaiNLP":
+        """Deserialize the component from bytes (no-op; returns self unchanged)."""
         return self
 
-    def to_disk(self, _path, **kwargs):
+    def to_disk(self, _path: str, **kwargs) -> None:
+        """Serialize the component to disk (no-op; nothing is written)."""
         return None
 
-    def from_disk(self, _path, **kwargs):
+    def from_disk(self, _path: str, **kwargs) -> "PyThaiNLP":
+        """Deserialize the component from disk (no-op; returns self unchanged)."""
         return self
\ No newline at end of file