Add TUD postag #916

Merged 1 commit on Jun 10, 2024
8 changes: 5 additions & 3 deletions pythainlp/corpus/corpus_license.md
@@ -46,11 +46,13 @@ https://creativecommons.org/licenses/by/4.0/

| Filename | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
| pos_orchid_perceptron.json | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
| pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram |
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
| pos_ud_perceptron-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
| pos_ud_unigram-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron |
| pos_tud_unigram.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram |


## Thai Dictionary for ICU BreakIterator
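The two new TUD model files listed above ship inside the package's corpus directory. As a rough sketch of how they can be located on disk, assuming the `corpus_path()` helper that the tagger modules below use:

```python
import os

from pythainlp.corpus import corpus_path

# corpus_path() points at the installed pythainlp/corpus directory,
# which is where pos_tud_perceptron.json and pos_tud_unigram.json live.
tud_perceptron_model = os.path.join(corpus_path(), "pos_tud_perceptron.json")
tud_unigram_model = os.path.join(corpus_path(), "pos_tud_unigram.json")

print(os.path.exists(tud_perceptron_model), os.path.exists(tud_unigram_model))
```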
1 change: 1 addition & 0 deletions pythainlp/corpus/pos_tud_perceptron.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pythainlp/corpus/pos_tud_unigram.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions pythainlp/tag/perceptron.py
@@ -18,9 +18,13 @@

_BLACKBOARD_NAME = "blackboard_pt_tagger"

_TUD_FILENAME = "pos_tud_perceptron.json"
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TUD_TAGGER = None


def _orchid_tagger():
@@ -44,6 +48,13 @@ def _blackboard_tagger():
return _LST20_TAGGER


def _tud_tagger():
global _TUD_TAGGER
if not _TUD_TAGGER:
_TUD_TAGGER = PerceptronTagger(path=_TUD_PATH)
return _TUD_TAGGER


def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
"""
:param list words: a list of tokenized words
@@ -67,6 +78,9 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _blackboard_tagger().tag(words)
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus == "tud":
tagger = _tud_tagger()
word_tags = tagger.tag(words)
else: # by default, use "pud" for corpus
tagger = _pud_tagger()
word_tags = tagger.tag(words)
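With `_tud_tagger()` wired into `tag()`, the module-level tagger can be exercised directly against the TUD model. A minimal usage sketch; the sample sentence and printed tags are illustrative only:

```python
from pythainlp.tag import perceptron
from pythainlp.tokenize import word_tokenize

words = word_tokenize("แมวกินปลา")          # tag() expects a list of tokens
print(perceptron.tag(words, corpus="tud"))
# e.g. [('แมว', 'NOUN'), ('กิน', 'VERB'), ('ปลา', 'NOUN')]
```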
3 changes: 3 additions & 0 deletions pythainlp/tag/pos_tag.py
@@ -29,6 +29,8 @@ def pos_tag(
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (support tltk engine only)
* *tud* - `Thai Universal Dependency Treebank (TUD)\
<https://github.com/nlp-chula/TUD>`_ \
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]

@@ -96,6 +98,7 @@ def pos_tag(
"orchid",
"orchid_ud",
"pud",
"tud",
]

if engine == "perceptron" and corpus in _support_corpus:
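From the user-facing side, the new corpus value is simply passed to `pos_tag()`. An illustrative call, with hypothetical input text and output tags:

```python
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

tokens = word_tokenize("ผมกินข้าว")

# Both engines accept the new corpus name after this change.
print(pos_tag(tokens, engine="perceptron", corpus="tud"))
print(pos_tag(tokens, engine="unigram", corpus="tud"))
```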
14 changes: 14 additions & 0 deletions pythainlp/tag/unigram.py
@@ -19,9 +19,13 @@

_BLACKBOARD_NAME = "blackboard_unigram_tagger"

_TUD_FILENAME = "pos_tud_unigram.json"
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TUD_TAGGER = None


def _orchid_tagger():
@@ -49,6 +53,14 @@ def _blackboard_tagger():
return _BLACKBOARD_TAGGER


def _tud_tagger():
global _TUD_TAGGER
if not _TUD_TAGGER:
with open(_TUD_PATH, encoding="utf-8-sig") as fh:
_TUD_TAGGER = json.load(fh)
return _TUD_TAGGER


def _find_tag(
words: List[str], dictdata: dict, default_tag: str = ""
) -> List[Tuple[str, str]]:
@@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _find_tag(words, _blackboard_tagger())
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus == "tud":
word_tags = _find_tag(words, _tud_tagger())
else: # by default, use "pud" for corpus
word_tags = _find_tag(words, _pud_tagger())

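The unigram path treats `pos_tud_unigram.json` as a plain word-to-tag mapping and reuses the existing `_find_tag()` helper. A standalone sketch of that lookup idea, assuming the JSON maps each word to its most frequent tag; the miniature model below is hypothetical, not the real file:

```python
import json
from typing import Dict, List, Tuple


def find_tag_sketch(
    words: List[str], dictdata: Dict[str, str], default_tag: str = ""
) -> List[Tuple[str, str]]:
    # Unknown words fall back to default_tag, like a classic unigram tagger.
    return [(word, dictdata.get(word, default_tag)) for word in words]


toy_model = json.loads('{"แมว": "NOUN", "กิน": "VERB"}')
print(find_tag_sketch(["แมว", "กิน", "ปลา"], toy_model))
# -> [('แมว', 'NOUN'), ('กิน', 'VERB'), ('ปลา', '')]
```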
9 changes: 9 additions & 0 deletions tests/test_tag.py
@@ -51,6 +51,8 @@ def test_pos_tag(self):
self.assertEqual(unigram.tag([], corpus="orchid"), [])
self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
self.assertEqual(unigram.tag([], corpus="blackboard"), [])
self.assertEqual(unigram.tag(None, corpus="tud"), [])
self.assertEqual(unigram.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid")
)
@@ -68,6 +70,8 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard_ud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
self.assertEqual(
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
[("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -88,6 +92,8 @@ def test_pos_tag(self):
self.assertEqual(perceptron.tag([], corpus="pud"), [])
self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
self.assertEqual(perceptron.tag(None, corpus="tud"), [])
self.assertEqual(perceptron.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid")
)
@@ -103,6 +109,9 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tud")
)
self.assertIsNotNone(pos_tag(tokens, engine="tltk"))

self.assertEqual(pos_tag_sents(None), [])