-
Notifications
You must be signed in to change notification settings - Fork 0
/
dev_NodeJs_MyWordSegmenter.js
105 lines (90 loc) · 2.81 KB
/
dev_NodeJs_MyWordSegmenter.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/**
* Simple Word Segmenter
* https://github.com/vpnry/myanmar_word_segmenter
* File: dev_NodeJs_MyWordSegmenter.js
*/
const fs = require("fs");
const path = require("path");
// Default word-list file name used when the MyWordSegmenter constructor is
// called without an argument; it is looked up relative to this module's
// directory (__dirname).
const myWordListJsonFile = "MYWORDS.json";
/**
 * Dictionary-driven word segmenter.
 *
 * Text is first cut into syllables with MY_SYLLABLE_REGEX; consecutive
 * syllables are then merged greedily into the longest entry found in the
 * loaded word set (longest match first). Syllables that match no entry are
 * emitted on their own.
 *
 * The MY_SYLLABLE_REGEX and method syllable_segment are adapted from
 * https://github.com/swanhtet1992/ReSegment/blob/master/Javascript/resegment.js
 */
class MyWordSegmenter {
  /**
   * @param {string} [wordListJsonFile] - JSON file containing an array of
   *   words. Relative names are resolved against this module's directory;
   *   absolute paths are used as given.
   */
  constructor(wordListJsonFile = myWordListJsonFile) {
    this.wordListJsonFile = wordListJsonFile;
    this.puncSpecialChars = "…‘’!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + "။၊";

    // Escape the characters that are special inside a regex character class.
    // Without this the backslash in the list is swallowed as the escape for
    // "]" (so a literal "\" is never treated as punctuation) and ",-." only
    // works because it happens to form a valid range.
    const puncClass = this.puncSpecialChars.replace(/[\\\]\[^-]/g, "\\$&");
    // Matches a run of punctuation at the very start or very end of a string.
    this.puncRegex = new RegExp(`^[${puncClass}]+|[${puncClass}]+$`, "g");

    // The regular expression for syllable segmentation: every match marks the
    // first character(s) of a new syllable.
    this.MY_SYLLABLE_REGEX =
      /(?:(?<!\u1039)([\u1000-\u102A\u103F\u104A-\u104F]|[\u1040-\u1049]+|[^\u1000-\u104F]+)(?![\u103E\u103B]?[\u1039\u103A\u1037]))/g;

    // Load all the words from the file into a set
    this.MYWORDS = this.loadWords();
  }

  /**
   * Read the word list from disk.
   *
   * @returns {Set<*>} Set built from the entries of the JSON array.
   * @throws {Error} If the file cannot be read or is not valid JSON.
   * @throws {TypeError} If the JSON document is not an array.
   */
  loadWords() {
    // path.resolve gives the same result as path.join for relative names but
    // also honours an absolute path instead of gluing it onto __dirname.
    const wordListPath = path.resolve(__dirname, this.wordListJsonFile);
    const entries = JSON.parse(fs.readFileSync(wordListPath, "utf8"));
    if (!Array.isArray(entries)) {
      // A JSON string would otherwise silently become a set of characters.
      throw new TypeError(
        `${this.wordListJsonFile} must contain a JSON array of words`
      );
    }
    const MYWORDS = new Set(entries);
    console.log(`Words in ${this.wordListJsonFile}: ${MYWORDS.size}`);
    return MYWORDS;
  }

  /**
   * Split text into syllables.
   *
   * The text is cut at the start offset of every MY_SYLLABLE_REGEX match, so
   * no sentinel character is inserted and no input character is lost. Text
   * preceding the first match (e.g. a stray dependent sign at the start) is
   * returned as its own leading piece.
   *
   * @author Chan Mrate Ko Ko
   * @param {string} text - Text to split.
   * @returns {string[]} Syllables in order; empty array for empty text.
   */
  syllable_segment(text) {
    if (text.length === 0) {
      return [];
    }
    // matchAll works on a clone of the regex, so the shared /g regex's
    // lastIndex is never disturbed.
    const cutPoints = [];
    for (const match of text.matchAll(this.MY_SYLLABLE_REGEX)) {
      cutPoints.push(match.index);
    }
    if (cutPoints.length === 0 || cutPoints[0] !== 0) {
      cutPoints.unshift(0);
    }
    return cutPoints.map((start, k) => text.slice(start, cutPoints[k + 1]));
  }

  /**
   * Segment a single whitespace-free token.
   *
   * @param {string} word - Token to segment.
   * @returns {string[]} `[word]` when the token itself is in the word set,
   *   otherwise the result of theSegmenter.
   */
  word_segment(word) {
    if (this.MYWORDS.has(word)) {
      return [word];
    }
    return this.theSegmenter(word);
  }

  /**
   * Segment a sentence: split it on whitespace and segment each token.
   *
   * @param {string} sentence - Text to segment.
   * @returns {string[]} All resulting words in order.
   */
  word_segment_sentence(sentence) {
    return sentence
      .split(/\s+/)
      // Leading/trailing whitespace yields empty tokens; they carry no text.
      .filter((token) => token.length > 0)
      .flatMap((token) => this.word_segment(token));
  }

  /**
   * Greedy longest-match segmentation over syllables.
   *
   * @param {string} text - Text to segment.
   * @returns {string[]} Words found in the word set, with unmatched syllables
   *   emitted individually. Pieces consisting only of punctuation and/or
   *   whitespace are dropped.
   */
  theSegmenter(text) {
    // filter punctuation, whitespace
    const syllables = this.syllable_segment(text.trim())
      .filter((piece) => piece.replace(this.puncRegex, "").trim().length > 0)
      .map((piece) => piece.trim());

    const wordList = [];
    let i = 0;
    while (i < syllables.length) {
      // Fallback when nothing matches: emit the single syllable at i.
      let end = i + 1;
      // Try the longest candidate first and shrink until one is a known word.
      for (let j = syllables.length; j > i + 1; j--) {
        if (this.MYWORDS.has(syllables.slice(i, j).join(""))) {
          end = j;
          break;
        }
      }
      wordList.push(syllables.slice(i, end).join(""));
      i = end;
    }
    return wordList;
  }
}
// Expose the class as this module's CommonJS export.
module.exports = MyWordSegmenter;