-
Notifications
You must be signed in to change notification settings - Fork 2
/
syllable.js
36 lines (28 loc) · 1.18 KB
/
syllable.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/// https://github.com/ye-kyaw-thu/sylbreak/blob/master/Javascript/resegment.js

// The constants below are regex fragments. The range-style ones are meant to
// be dropped inside a [...] character class, so they carry no brackets.

// Myanmar consonants, U+1000..U+1021: "က-အ"
const myConsonant = "\u1000-\u1021";
// ASCII letters and digits
const enChar = "a-zA-Z0-9";
// Characters that always start a new piece on their own:
// "ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s"
const otherChar = "\u1023\u1024\u1025\u1026\u1027\u1029\u102a\u103f\u104c\u104d\u104f\u1040-\u1049\u104a\u104b!-/:-@\\[-`\\{-~\\s";
// U+1039, the "subscript symbol" (stacking sign)
const ssSymbol = "\u1039";
// U+1004 followed by U+103A; not referenced by BREAK_PATTERN
const ngaThat = "\u1004\u103a";
// U+103A, the "a-That" character
const aThat = "\u103a";

// Regular expression marking where a new Myanmar syllable starts.
// A position matches when it holds either
//   * a consonant that is NOT followed by a-That or by the subscript symbol, or
//   * any character from enChar / otherChar.
// The single capture group wraps the whole match, so "$1" is the matched
// character itself.
// NOTE(review): the leading (?!ssSymbol) is a lookahead, so it tests the
// consonant itself, not the character before it, and can never reject a
// consonant. A "consonant not after a subscript symbol" rule would need a
// lookbehind; confirm the intent before changing it, since every caller's
// output would change.
const BREAK_PATTERN = new RegExp(
    `((?!${ssSymbol})[${myConsonant}](?![${aThat}${ssSymbol}])|[${enChar}${otherChar}])`,
    "mg"
);
/**
 * Split text into pieces, starting a new piece at every position matched by
 * BREAK_PATTERN.
 *
 * Each piece runs from one break position up to the next. Text that comes
 * before the first break position (or text with no break position at all)
 * is returned as a leading piece instead of being discarded, so joining the
 * result always reproduces the input.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]} The pieces in input order; an empty array for "".
 */
function segment(text) {
    // Scan with a private, guaranteed-global copy of the pattern: exec() is
    // stateful through lastIndex, and the shared BREAK_PATTERN must stay clean.
    const sharedFlags = BREAK_PATTERN.flags;
    const scanFlags = sharedFlags.includes("g") ? sharedFlags : sharedFlags + "g";
    const breaker = new RegExp(BREAK_PATTERN.source, scanFlags);

    const pieces = [];
    let pieceStart = 0;
    let found = breaker.exec(text);
    while (found !== null) {
        // Slicing by offset avoids an in-band marker character, which would
        // corrupt input that already contains that marker.
        if (found.index > pieceStart) {
            pieces.push(text.slice(pieceStart, found.index));
        }
        pieceStart = found.index;
        found = breaker.exec(text);
    }
    if (pieceStart < text.length) {
        pieces.push(text.slice(pieceStart));
    }
    return pieces;
}
/**
 * Insert a separator before every position matched by BREAK_PATTERN, then
 * rewrite every U+1039 in the result as U+103A.
 *
 * @param {string} text - Text to segment.
 * @param {string} [separator="|"] - String inserted before each matched
 *     character. It is inserted literally; "$" has no special meaning.
 * @returns {string} The separated text.
 */
function segmentWithSeparator(text, separator = "|") {
    // A replacer function is used instead of a `separator + "$1"` template so
    // that "$" sequences inside the separator are not read as replacement
    // patterns. The whole match equals capture group 1 of BREAK_PATTERN.
    const separated = text.replace(BREAK_PATTERN, (matched) => separator + matched);
    // A string pattern would rewrite only the first occurrence; the global
    // regex rewrites all of them.
    return separated.replace(/\u1039/g, "\u103A");
}