Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: Run Tests

on:
push:
branches: [ main, master, dev, copilot/** ]
pull_request:
branches: [ main, master, dev ]

jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e .
pip install pytest pytest-cov

- name: Run tests
run: |
pytest tests/ -v --cov=spacy_pythainlp --cov-report=term-missing

- name: Upload coverage reports
if: matrix.python-version == '3.11'
uses: codecov/codecov-action@v3
with:
fail_ci_if_error: false
7 changes: 6 additions & 1 deletion spacy_pythainlp/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,12 @@ def _dep(self, doc:Doc):
offset = 0
_dep_temp = dependency_parsing(text, model=self.dependency_parsing_model, engine=self.dependency_parsing_engine, tag="list")
for i in _dep_temp:
idx, word, _, postag, _, _, head, dep, _, space = i
# Handle variable number of fields returned by dependency_parsing
# CoNLL-U format requires at least 10 fields, but some engines may return more
if len(i) < 10:
raise ValueError(f"Expected at least 10 fields in dependency parsing output, got {len(i)}")
# Only unpack the first 10 fields we need (CoNLL-U format)
idx, word, _, postag, _, _, head, dep, _, space = i[:10]
words.append(word)
pos.append(postag)
heads.append(int(head))
Expand Down
Empty file added tests/__init__.py
Empty file.
245 changes: 245 additions & 0 deletions tests/test_dependency_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
"""
Tests for spacy-pythainlp dependency parsing functionality.

This test suite verifies the fix for handling variable-length CoNLL-U format
output from PyThaiNLP's dependency_parsing function.
"""

import unittest
from unittest.mock import Mock, patch
import spacy
from spacy.tokens import Doc


class TestDependencyParsing(unittest.TestCase):
"""Test cases for dependency parsing with variable-length field tuples."""

def setUp(self):
"""Set up test fixtures."""
self.nlp = spacy.blank('th')

def test_import_spacy_pythainlp(self):
"""Test that spacy_pythainlp can be imported."""
import spacy_pythainlp.core
self.assertIsNotNone(spacy_pythainlp.core)

def test_add_pythainlp_pipe(self):
"""Test that pythainlp pipeline can be added."""
import spacy_pythainlp.core

# Add pipeline with minimal configuration
self.nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": False,
"word_vector": False,
}
)
self.assertIn("pythainlp", self.nlp.pipe_names)

@patch('pythainlp.parse.dependency_parsing')
def test_dependency_parsing_with_10_fields(self, mock_dep_parsing):
"""Test dependency parsing with exactly 10 fields (standard CoNLL-U)."""
import spacy_pythainlp.core

# Mock the dependency_parsing function to return 10-field tuples
# Using head indices: token 0 points to token 1 (head=1), token 1 is root (head=0)
mock_dep_parsing.return_value = [
['1', 'ฉัน', 'ฉัน', 'PRON', 'PRON', '_', '1', 'nsubj', '_', '_'],
['2', 'ชอบ', 'ชอบ', 'VERB', 'VERB', '_', '0', 'root', '_', '_'],
]

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": True,
"dependency_parsing_engine": "esupar",
"word_vector": False,
}
)

doc = nlp("ฉันชอบ")

# Verify the document was processed
self.assertIsInstance(doc, Doc)
self.assertEqual(len(doc), 2)
self.assertEqual(doc[0].text, 'ฉัน')
self.assertEqual(doc[1].text, 'ชอบ')

@patch('pythainlp.parse.dependency_parsing')
def test_dependency_parsing_with_11_fields(self, mock_dep_parsing):
"""Test dependency parsing with 11 fields (extra field beyond standard)."""
import spacy_pythainlp.core

# Mock the dependency_parsing function to return 11-field tuples
# This simulates the issue reported in the bug
mock_dep_parsing.return_value = [
['1', 'ฉัน', 'ฉัน', 'PRON', 'PRON', '_', '1', 'nsubj', '_', '_', 'SpaceAfter=No'],
['2', 'ชอบ', 'ชอบ', 'VERB', 'VERB', '_', '0', 'root', '_', '_', 'SpaceAfter=Yes'],
]

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": True,
"dependency_parsing_engine": "esupar",
"word_vector": False,
}
)

# This should NOT raise ValueError anymore
doc = nlp("ฉันชอบ")

# Verify the document was processed correctly
self.assertIsInstance(doc, Doc)
self.assertEqual(len(doc), 2)
self.assertEqual(doc[0].text, 'ฉัน')
self.assertEqual(doc[1].text, 'ชอบ')

@patch('pythainlp.parse.dependency_parsing')
def test_dependency_parsing_with_12_fields(self, mock_dep_parsing):
"""Test dependency parsing with 12 fields (multiple extra fields)."""
import spacy_pythainlp.core

# Mock with 12 fields
mock_dep_parsing.return_value = [
['1', 'ฉัน', 'ฉัน', 'PRON', 'PRON', '_', '0', 'root', '_', '_', 'extra1', 'extra2'],
]

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": True,
"dependency_parsing_engine": "esupar",
"word_vector": False,
}
)

# Should handle extra fields gracefully
doc = nlp("ฉัน")
self.assertIsInstance(doc, Doc)
self.assertEqual(len(doc), 1)

@patch('pythainlp.parse.dependency_parsing')
def test_dependency_parsing_with_insufficient_fields(self, mock_dep_parsing):
"""Test that dependency parsing raises error with fewer than 10 fields."""
import spacy_pythainlp.core

# Mock with only 9 fields (insufficient)
mock_dep_parsing.return_value = [
['1', 'ฉัน', 'ฉัน', 'PRON', 'PRON', '_', '2', 'nsubj', '_'],
]

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": True,
"dependency_parsing_engine": "esupar",
"word_vector": False,
}
)

# Should raise ValueError with clear message
with self.assertRaises(ValueError) as context:
doc = nlp("ฉัน")

self.assertIn("Expected at least 10 fields", str(context.exception))

@patch('pythainlp.parse.dependency_parsing')
def test_dependency_parsing_pos_and_dep_tags(self, mock_dep_parsing):
"""Test that POS tags and dependency relations are correctly extracted."""
import spacy_pythainlp.core

# Mock with complete CoNLL-U data
# Head indices: token 0 and 2 point to token 1, token 1 is root
mock_dep_parsing.return_value = [
['1', 'ฉัน', 'ฉัน', 'PRON', 'PRON', '_', '1', 'nsubj', '_', '_'],
['2', 'ชอบ', 'ชอบ', 'VERB', 'VERB', '_', '0', 'root', '_', '_'],
['3', 'แมว', 'แมว', 'NOUN', 'NOUN', '_', '1', 'obj', '_', '_'],
]

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": False,
"dependency_parsing": True,
"dependency_parsing_engine": "esupar",
"word_vector": False,
}
)

doc = nlp("ฉันชอบแมว")

# Check POS tags
self.assertEqual(doc[0].pos_, 'PRON')
self.assertEqual(doc[1].pos_, 'VERB')
self.assertEqual(doc[2].pos_, 'NOUN')

# Check dependency relations
self.assertEqual(doc[0].dep_, 'nsubj')
self.assertEqual(doc[1].dep_, 'root')
self.assertEqual(doc[2].dep_, 'obj')


class TestBasicFunctionality(unittest.TestCase):
"""Test basic spacy-pythainlp functionality."""

def test_blank_model_creation(self):
"""Test that a blank Thai model can be created."""
nlp = spacy.blank('th')
self.assertIsNotNone(nlp)
self.assertEqual(nlp.lang, 'th')

def test_pipeline_with_tokenization(self):
"""Test pythainlp pipeline with tokenization enabled."""
import spacy_pythainlp.core

nlp = spacy.blank('th')
nlp.add_pipe(
"pythainlp",
config={
"pos": False,
"sent": False,
"ner": False,
"tokenize": True,
"tokenize_engine": "newmm",
"dependency_parsing": False,
"word_vector": False,
}
)

doc = nlp("ผมเป็นนักศึกษา")
self.assertIsInstance(doc, Doc)
self.assertGreater(len(doc), 0)


if __name__ == '__main__':
unittest.main()