Skip to content

Commit

Permalink
Another minor speed up by removing duplicate code from CheckWord and …
Browse files Browse the repository at this point in the history
…moving static arrays to the class level
  • Loading branch information
Kirill Belousov committed Sep 16, 2023
1 parent 081c09f commit c84dd47
Showing 1 changed file with 60 additions and 62 deletions.
122 changes: 60 additions & 62 deletions addon/globalPlugins/textnormalizer/textnormalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,63 @@ def __init__(self):
self.OnlyEn = "DdFfGghIiJjLlNQqRrSstUVvWwYZz"
self.Rus = "АаВЕеКкМНОоРрСсТуХхЗОтиапьбт"
self.Eng = "AaBEeKkMHOoPpCcTyXx30mu@nb6m"
self.patterns = [
"[kк][аa][kк]",
"[tт][aа][kк]",
r"[a]([\s:,.?!_(){}=+-]+[а-яёА-ЯЁ])",
"[cс][kк][oо][pр][еe][еe]",
"[kк][yу][pр][сc]",
"[kк][yу][pр][сc][eе]",
"[s][kк][yу][pр][eе]",
"[HН][eе][tт]",
"тe",
"eг",
"дe",
"[cс][pр][oо][kк]",
"CCCP",
"CCP",
r"\b[HН][аa]\b",
r"\b[HН][eе]\b",
r"\b[HН][oо]\b",
r"\b[HН][уy]\b",
r"([а-яёА-ЯЁ])m",
r"m([а-яёА-ЯЁ])",
"∂",
"α",
"ū",
"meх",
r"\bom\b",
r"([а-яёА-ЯЁ])pu"
]
self.replaces = [
"как",
"так",
r"а\1",
"скорее",
"курс",
"курсе",
"skype",
"нет",
"те",
"ег",
"де",
"срок",
"СССР",
"ССР",
r"На",
r"Не",
r"Но",
r"Ну",
r"\1т",
r"т\1",
"д",
"а",
"й",
"тех",
"от",
r"\1ри"
]



def replace(self, old, new, string, case_insensitive = False):
Expand All @@ -64,8 +121,6 @@ def CheckWord(self, word, change_case = True):
# остальные символы из постов VK
for k, v in self.lettersstrng.items():
newword = self.replace(k, v, newword, True)
# убираем символ "мягкий перенос"
newword = newword.replace("\u200d", "").replace(chr(173), "").replace(chr(8205), "")
# один символ не имеет смысла
if len(newword.strip()) == 1:
return newword
Expand Down Expand Up @@ -124,8 +179,7 @@ def CheckText(self, text, change_case = True):
"""

# сразу убираем символ "мягкий перенос"
text = text.replace("\u200d", "").replace(chr(173), "").replace(chr(8205), "")
newText = text
newText = text.replace("\u200d", "").replace(chr(173), "").replace(chr(8205), "")
words = re.findall("[\\w\\@#]+", newText, re.IGNORECASE)
words2 = words.copy()
words2.reverse()
Expand All @@ -146,65 +200,9 @@ def CheckText(self, text, change_case = True):
newText = newText.replace(" C ", " С ")
for i in range(0, len(Rus)):
newText = self.replace(Eng[i], Rus[i], newText, False)
patterns = [
"[kк][аa][kк]",
"[tт][aа][kк]",
r"[a]([\s:,.?!_(){}=+-]+[а-яёА-ЯЁ])",
"[cс][kк][oо][pр][еe][еe]",
"[kк][yу][pр][сc]",
"[kк][yу][pр][сc][eе]",
"[s][kк][yу][pр][eе]",
"[HН][eе][tт]",
"тe",
"eг",
"дe",
"[cс][pр][oо][kк]",
"CCCP",
"CCP",
r"\b[HН][аa]\b",
r"\b[HН][eе]\b",
r"\b[HН][oо]\b",
r"\b[HН][уy]\b",
r"([а-яёА-ЯЁ])m",
r"m([а-яёА-ЯЁ])",
"∂",
"α",
"ū",
"meх",
r"\bom\b",
r"([а-яёА-ЯЁ])pu"
]
replaces = [
"как",
"так",
r"а\1",
"скорее",
"курс",
"курсе",
"skype",
"нет",
"те",
"ег",
"де",
"срок",
"СССР",
"ССР",
r"На",
r"Не",
r"Но",
r"Ну",
r"\1т",
r"т\1",
"д",
"а",
"й",
"тех",
"от",
r"\1ри"
]
for i in range(0, len(patterns)):
for i in range(0, len(self.patterns)):
if text != newText:
newText = re.sub(patterns[i], replaces[i], newText, flags=re.IGNORECASE)
newText = re.sub(self.patterns[i], self.replaces[i], newText, flags=re.IGNORECASE)
newText = re.sub(r"([a-z])у([a-z])", r"\1y\2", newText)
newText = re.sub(r"([a-z])у", r"\1y", newText)
newText = re.sub(r"у([a-z])", r"y\1", newText)
Expand Down

0 comments on commit c84dd47

Please sign in to comment.