Skip to content

Commit

Permalink
Optimize cyr2lat combo transformations and add more unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
petarov committed Apr 28, 2024
1 parent ff5ea32 commit 3441aa4
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 24 deletions.
13 changes: 12 additions & 1 deletion tokens.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package translitbg

var (
// Възстановяването на оригиналната дума не е водещ принцип!
// Възстановяването на оригиналната дума не е водещ принцип
STREAMLINED = map[string]string{
// lower case
"а": "a",
Expand Down Expand Up @@ -77,6 +77,17 @@ var (
"ИЯ": "IA",
}

// STREAMLINED_CYR2COMBO_UC maps an uppercase Cyrillic rune, keyed by its
// Unicode code point, to its multi-letter Latin equivalent written entirely
// in uppercase. Only the letters that transliterate to more than one Latin
// letter are listed; the trailing comment on each entry shows the Cyrillic
// letter behind the numeric key.
// NOTE(review): the values look like the uppercased forms of the matching
// STREAMLINED entries — confirm the two tables stay in sync when editing.
STREAMLINED_CYR2COMBO_UC = map[rune]string{
1046: "ZH", // Ж
1062: "TS", // Ц
1063: "CH", // Ч
1064: "SH", // Ш
1065: "SHT", // Щ
1070: "YU", // Ю
1071: "YA", // Я
}

// БЪЛГАРИЯ
BULGARIA_CYR_UP = []rune{1041, 1066, 1051, 1043, 1040, 1056, 1048, 1071}
// българия
Expand Down
26 changes: 6 additions & 20 deletions translitbg.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ import (
"fmt"
"io"
"regexp"
"strings"
)

// TranslitBG bundles the lookup tables and the compiled regular expression
// used for transliteration.
//
// New() fills this struct with an unkeyed composite literal, so the order of
// the fields below must stay in sync with the order of values passed there.
type TranslitBG struct {
// chars is the per-character table; lookups use the string form of a
// single rune as the key.
chars map[string]string
// tokens is a string-to-string table for special sequences.
// NOTE(review): presumably multi-character combinations such as
// "ИЯ" -> "IA" — confirm against the token table definition.
tokens map[string]string
// combos maps a rune to the fully uppercase Latin combination that
// replaces the chars entry when the surrounding text is uppercase.
combos map[rune]string
// regex is the compiled pattern supplied at construction time.
// NOTE(review): its exact role is not visible from this declaration.
regex *regexp.Regexp
}

Expand All @@ -20,28 +20,12 @@ func isBGChar(r rune) bool {
return (r >= 1040 && r <= 1103) || r == 1117 || r == 1037
}

// isComboChar reports whether the rune r is one of the Bulgarian letters
// that transliterate to a combination of several Latin characters
// (Ж, Ц, Ч, Ш, Щ, Ю, Я in either case).
func isComboChar(r rune) bool {
	// Fold the lowercase Cyrillic range а..я (1072..1103) onto the
	// uppercase range А..Я (1040..1071) so only one case has to be listed.
	folded := r
	if folded >= 1072 && folded <= 1103 {
		folded -= 32
	}

	switch folded {
	case 'Ж', 'Ц', 'Ч', 'Ш', 'Щ', 'Ю', 'Я':
		return true
	default:
		return false
	}
}

// isUpperBGChar reports whether the rune r is an uppercase Cyrillic rune:
// either the code point 1037 or anything in the range 1040..1071.
func isUpperBGChar(r rune) bool {
	switch {
	case r == 1037:
		// Stand-alone code point outside the contiguous uppercase range.
		return true
	case r < 1040, r > 1071:
		return false
	}
	return true
}

// tryDoBulgaria returns true for the case where input s is the text "България"
// tryDoBulgaria returns true for the case where input s is the text "България".
// In this case the "ъ" needs to be transformed into a "u" as the law dictates
func tryDoBulgaria(input string) (bool, string) {
runes := []rune(input)
Expand Down Expand Up @@ -70,6 +54,7 @@ func New() *TranslitBG {
return &TranslitBG{
STREAMLINED,
STREAMLINED_TOKENS,
STREAMLINED_CYR2COMBO_UC,
regex,
}
}
Expand Down Expand Up @@ -128,8 +113,9 @@ func (tr *TranslitBG) Encode(input string) (string, error) {

token, ok := tr.chars[string(ch)]
if ok {
if isComboChar(ch) && isUpperBGChar(ch) && (ch2 == 0 || isUpperBGChar(ch2)) {
dest.WriteString(strings.ToUpper(token))
ucc, ok := tr.combos[ch]
if ok && (ch2 == 0 || isUpperBGChar(ch2)) {
dest.WriteString(ucc)
} else {
dest.WriteString(token)
}
Expand Down
11 changes: 8 additions & 3 deletions translitbg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ func TestEdgeCases(t *testing.T) {
{"ЖЕЗЪЛ", "ZHEZAL"},
{"жЕЗЪЛ", "zhEZAL"},
{"жезъл", "zhezal"},
{"ЩАСТИЕ", "SHTASTIE"},
{"щАСТИЕ", "shtASTIE"},
{"ЩаСТИЕ", "ShtaSTIE"},
{"ЦАРЕВЕЦ", "TSAREVETS"},
{"Царевец", "Tsarevets"},
{"ЦАРевец", "TSARevets"},
{"цАРЕВЕЦ", "tsAREVETS"},
{"чОвек", "chOvek"},
{"ЧОВек", "CHOVek"},
{"ШИВАЧ", "SHIVACH"},
{"шИВАч", "shIVAch"},
{"ЩАСТИЕ", "SHTASTIE"},
{"щАСТИЕ", "shtASTIE"},
{"ЩаСТИЕ", "ShtaSTIE"},
{"ЮНГА", "YUNGA"},
{"юНГА", "yuNGA"},
{"ЯБЪЛКИ", "YABALKI"},
Expand Down

0 comments on commit 3441aa4

Please sign in to comment.