Skip to content

Commit

Permalink
Merge pull request #916 from PyThaiNLP/add-tud
Browse files Browse the repository at this point in the history
Add TUD postag
  • Loading branch information
wannaphong committed Jun 10, 2024
2 parents 5971300 + ed3e61e commit c08d6eb
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 3 deletions.
8 changes: 5 additions & 3 deletions pythainlp/corpus/corpus_license.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@ https://creativecommons.org/licenses/by/4.0/

| Filename | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
| pos_orchid_perceptron.json | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
| pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram |
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
| pos_ud_perceptron-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
| pos_ud_unigram-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron |
| pos_tud_unigram.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram |


## Thai Dictionary for ICU BreakIterator
Expand Down
1 change: 1 addition & 0 deletions pythainlp/corpus/pos_tud_perceptron.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pythainlp/corpus/pos_tud_unigram.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions pythainlp/tag/perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,13 @@

_BLACKBOARD_NAME = "blackboard_pt_tagger"

_TUD_FILENAME = "pos_tud_perceptron.json"
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TUD_TAGGER = None


def _orchid_tagger():
Expand All @@ -44,6 +48,13 @@ def _blackboard_tagger():
return _LST20_TAGGER


def _tud_tagger():
    """
    Return the shared perceptron tagger for the "tud" corpus.

    The tagger is built from the model file at ``_TUD_PATH`` the first
    time this function is called and then kept in the module-level
    ``_TUD_TAGGER`` so later calls reuse the same object.

    :return: the cached ``PerceptronTagger`` instance
    """
    global _TUD_TAGGER
    # Fast path: a tagger has already been constructed.
    if _TUD_TAGGER:
        return _TUD_TAGGER
    _TUD_TAGGER = PerceptronTagger(path=_TUD_PATH)
    return _TUD_TAGGER


def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
"""
:param list words: a list of tokenized words
Expand All @@ -67,6 +78,9 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _blackboard_tagger().tag(words)
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus in ("tud"):
tagger = _tud_tagger()
word_tags = tagger.tag(words)
else: # by default, use "pud" for corpus
tagger = _pud_tagger()
word_tags = tagger.tag(words)
Expand Down
3 changes: 3 additions & 0 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def pos_tag(
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (support tltk engine only)
* *tud* - `Thai Universal Dependency Treebank (TUD)\
<https://github.com/nlp-chula/TUD>`_
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]
Expand Down Expand Up @@ -96,6 +98,7 @@ def pos_tag(
"orchid",
"orchid_ud",
"pud",
"tud",
]

if engine == "perceptron" and corpus in _support_corpus:
Expand Down
14 changes: 14 additions & 0 deletions pythainlp/tag/unigram.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@

_BLACKBOARD_NAME = "blackboard_unigram_tagger"

_TUD_FILENAME = "pos_tud_unigram.json"
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TUD_TAGGER = None


def _orchid_tagger():
Expand Down Expand Up @@ -49,6 +53,14 @@ def _blackboard_tagger():
return _BLACKBOARD_TAGGER


def _tud_tagger():
    """
    Return the shared unigram tagging data for the "tud" corpus.

    The JSON file at ``_TUD_PATH`` is parsed the first time this
    function is called; the parsed object is kept in the module-level
    ``_TUD_TAGGER`` so later calls reuse it.

    :return: the object decoded from the JSON model file
    """
    global _TUD_TAGGER
    # Fast path: the model has already been loaded.
    if _TUD_TAGGER:
        return _TUD_TAGGER
    # "utf-8-sig" transparently drops a leading BOM if the file has one.
    with open(_TUD_PATH, encoding="utf-8-sig") as model_file:
        _TUD_TAGGER = json.load(model_file)
    return _TUD_TAGGER


def _find_tag(
words: List[str], dictdata: dict, default_tag: str = ""
) -> List[Tuple[str, str]]:
Expand Down Expand Up @@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _find_tag(words, _blackboard_tagger())
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus in ("tud"):
word_tags = _find_tag(words, _tud_tagger())
else: # by default, use "pud" for corpus
word_tags = _find_tag(words, _pud_tagger())

Expand Down
9 changes: 9 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def test_pos_tag(self):
self.assertEqual(unigram.tag([], corpus="orchid"), [])
self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
self.assertEqual(unigram.tag([], corpus="blackboard"), [])
self.assertEqual(unigram.tag(None, corpus="tud"), [])
self.assertEqual(unigram.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid")
)
Expand All @@ -68,6 +70,8 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard_ud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
self.assertEqual(
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
[("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
Expand All @@ -88,6 +92,8 @@ def test_pos_tag(self):
self.assertEqual(perceptron.tag([], corpus="pud"), [])
self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
self.assertEqual(perceptron.tag(None, corpus="tud"), [])
self.assertEqual(perceptron.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid")
)
Expand All @@ -103,6 +109,9 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tud")
)
self.assertIsNotNone(pos_tag(tokens, engine="tltk"))

self.assertEqual(pos_tag_sents(None), [])
Expand Down

0 comments on commit c08d6eb

Please sign in to comment.