platelminto · mhdzumair · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023
diff --git a/PTN/extras.py b/PTN/extras.py
@@ -5,35 +5,51 @@
 delimiters = "[\.\s\-\+_\/(),]"
 
 langs = [
-    ("rus(?:sian)?", "Russian"),
-    ("(?:True)?fre?(?:nch)?", "French"),
+    ("rus(?:sian)?|russo", "Russian"),
+    ("(?:True)?fre?(?:nch)?|fr(?:ench|a|e|anc[eê]s)?", "French"),  # 2nd alternative adds "fra" and "franc[eê]s"; its other forms repeat the 1st
     ("(?:nu)?ita(?:liano?)?", "Italian"),
     ("castellano|spa(?:nish)?|esp?", "Spanish"),
     ("swedish", "Swedish"),
     ("dk|dan(?:ish)?", "Danish"),
-    ("ger(?:man)?|deu(?:tsch)?", "German"),
+    ("ger(?:man)?|deu(?:tsch)?|alem[aã]o", "German"),
     ("nordic", "Nordic"),
     ("exyu", "ExYu"),
-    ("chs|chi(?:nese)?", "Chinese"),
+    ("chs|chi(?:nese)?|(?:mand[ae]rin|ch[sn])|chin[eê]s|zh-hans", "Chinese"),  # "mandarin" now maps to Chinese (separate Mandarin entry removed); "chs" is repeated by ch[sn]
     ("hin(?:di)?", "Hindi"),
     ("polish|poland|pl", "Polish"),
-    ("mandarin", "Mandarin"),
-    ("kor(?:ean)?", "Korean"),
+    ("kor(?:ean)?|coreano", "Korean"),
     ("ben(?:gali)?|bangla", "Bengali"),
     ("kan(?:nada)?", "Kannada"),
-    ("tam(?:il)?", "Tamil"),
+    ("t[aâ]m(?:il)?", "Tamil"),
     ("tel(?:ugu)?", "Telugu"),
     ("mar(?:athi)?", "Marathi"),
     ("mal(?:ayalam)?", "Malayalam"),
-    ("japanese|ja?p", "Japanese"),
+    ("guj(?:arati)?", "Gujarati"),
+    ("pun(?:jabi)?", "Punjabi"),  # NOTE(review): Punjabi is listed a second time below as "p[ua]n(?:jabi)?" - confirm both are wanted
+    ("ori(?:ya)?", "Oriya"),
+    ("japanese|ja?p|jpn|japon[eê]s", "Japanese"),
     ("interslavic", "Interslavic"),
     ("ara(?:bic)?", "Arabic"),
     ("urdu", "Urdu"),
-    ("punjabi", "Punjabi"),
-    ("portuguese", "Portuguese"),
-    ("albanian?", "Albanian"),
-    ("egypt(?:ian)?", "Egyptian"),
-    ("en?(?:g(?:lish)?)?", "English"),  # Must be at end, matches just an 'e'
+    ("tur(?:kish)?|tr", "Turkish"),
+    ("tailand[eê]s|thai?", "Thai"),
+    ("tagalog", "Tagalog"),
+    ("ind(?:onesian)?", "Indonesian"),
+    ("vie(?:tnamese)?", "Vietnamese"),
+    ("heb(?:rew)?", "Hebrew"),
+    ("gre(?:ek)?", "Greek"),
+    ("cz(?:ech)?", "Czech"),
+    ("hun(?:garian)?", "Hungarian"),
+    ("ukr(?:ainian)?", "Ukrainian"),
+    ("fin(?:nish)?", "Finnish"),
+    ("nor(?:wegian)?", "Norwegian"),
+    ("sin(?:hala)?", "Sinhala"),
+    ("dutch|nl", "Dutch"),
+    ("p[ua]n(?:jabi)?", "Punjabi"),  # NOTE(review): superset of the earlier "pun(?:jabi)?" entry; additionally matches "pan"/"panjabi"
+    ("por(?:tuguese)?|portugu[eèê]s[ea]?|p[rt]|port?", "Portuguese"),  # NOTE(review): "port?" repeats "por" from the 1st alternative; "p[rt]" matches bare "pr"/"pt"
+    ("alb(?:anian?)?|albanais", "Albanian"),
+    ("egypt(?:ian)?|egy", "Egyptian"),
+    ("en?(?:g(?:lish)?)?|ing(?:l[eéê]s)?", "English"),  # Must be at end, matches just an 'e'
 ]
 
 genres = [
@@ -88,7 +104,6 @@
     "extended": [r"(EXTENDED{d}(?!(?:CUT|EDITIONS?)))".format(d=delimiters)],
 }
 
-
 channels = [(1, 0), (2, 0), (5, 0), (5, 1), (6, 1), (7, 1)]
 
 
@@ -182,12 +197,12 @@ def link_patterns(pattern_options):
     return (
         "(?:"
         + "|".join(
-            [
-                pattern_option[0]
-                if isinstance(pattern_option, tuple)
-                else pattern_option
-                for pattern_option in pattern_options
-            ]
-        )
+        [
+            pattern_option[0]
+            if isinstance(pattern_option, tuple)
+            else pattern_option
+            for pattern_option in pattern_options
+        ]
+    )
         + ")"
     )
diff --git a/PTN/parse.py b/PTN/parse.py
@@ -36,21 +36,27 @@ def _part(self, name, match_slice, clean, overwrite=False):
             self.match_slices.append(match_slice)
 
     @staticmethod
-    def _clean_string(string):
-        clean = re.sub(r"^( -|\(|\[)", "", string)
-        if clean.find(" ") == -1 and clean.find(".") != -1:
+    def _clean_dots(string: str) -> str:  # dot-to-space cleanup, split out of _clean_string so it can be applied more than once
+        if string.find(" ") == -1 and string.find(".") != -1:  # only rewrite strings that contain dots and no spaces
             # 4 dots likely means we want an ellipsis and a space
-            clean = re.sub(r"\.{4,}", "... ", clean)
+            string = re.sub(r"\.{4,}", "... ", string)
 
             # Replace any instances of less than 3 dots with a space
             # Lookarounds are used to prevent the 3-dots (ellipses) from being replaced
-            clean = re.sub(r"(?<!\.)\.\.(?!\.)", " ", clean)
-            clean = re.sub(r"(?<!\.)\.(?!\.\.)", " ", clean)
+            string = re.sub(r"(?<!\.)\.\.(?!\.)", " ", string)
+            string = re.sub(r"(?<!\.)\.(?!\.\.)", " ", string)
+        return string  # returned unchanged when the guard above is false
+
+    def _clean_string(self, string):  # no longer static: dot handling is delegated to self._clean_dots
+        clean = re.sub(r"^( -|\(|\[)", "", string)
+        clean = self._clean_dots(clean)
 
         clean = re.sub(r"_", " ", clean)
         clean = re.sub(r"([\[)_\]]|- )$", "", clean).strip()
         clean = clean.strip(" _-")
 
+        # Second pass: the substitutions and strips above can leave a dot-separated string with no spaces (seen with non-English-character titles), so clean dots and strip again.
+        clean = self._clean_dots(clean).strip()
         return clean
 
     def parse(self, name, standardise, coherent_types):
@@ -358,7 +364,7 @@ def process_title(self):
                 relative_title_start = m.end()
                 raw = raw[relative_title_start:]
                 title_start = relative_title_start + title_start
-            clean = self._clean_string(raw)
+            clean = self._clean_string(self.clean_title(raw))
             # Re-add title_start to unrelative the index from raw to self.torrent_name
             self._part("title", (title_start, title_end), clean)
         else:
@@ -433,3 +439,14 @@ def clean_unmatched(self):
             ):
                 filtered.append(extra)
         return filtered
+
+    @staticmethod
+    def clean_title(raw_title):
+        cleaned_title = raw_title
+        cleaned_title = cleaned_title.replace(r"[[(]movie[)\]]", "")  # clear movie indication flag
+        cleaned_title = re.sub(patterns["RUSSIAN_CAST_REGEX"], " ", cleaned_title)  # clear russian cast information
+        cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_START"], r"\1", cleaned_title)  # remove release group markings sections from the start
+        cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_END"], r"\1", cleaned_title)  # remove unneeded markings section at the end if present
+        cleaned_title = re.sub(patterns["ALT_TITLES_REGEX"], "", cleaned_title)  # remove alt language titles
+        cleaned_title = re.sub(patterns["NOT_ONLY_NON_ENGLISH_REGEX"], "", cleaned_title)  # remove non english chars if they are not the only ones left
+        return cleaned_title
diff --git a/PTN/patterns.py b/PTN/patterns.py
@@ -411,3 +411,19 @@
     "remux": "boolean",
     "internationalCut": "boolean",
 }
+
+patterns["NON_ENGLISH_CHARS"] = "\u3040-\u30ff"  # Japanese characters
+patterns["NON_ENGLISH_CHARS"] += "\u3400-\u4dbf"  # Chinese characters
+patterns["NON_ENGLISH_CHARS"] += "\u4e00-\u9fff"  # Chinese characters
+patterns["NON_ENGLISH_CHARS"] += "\uf900-\ufaff"  # CJK Compatibility Ideographs
+patterns["NON_ENGLISH_CHARS"] += "\uff66-\uff9f"  # Halfwidth Katakana Japanese characters
+patterns["NON_ENGLISH_CHARS"] += "\u0400-\u04ff"  # Cyrillic characters (Russian)
+patterns["NON_ENGLISH_CHARS"] += "\u0600-\u06ff"  # Arabic characters
+
+patterns["RUSSIAN_CAST_REGEX"] = r"\([^)]*[\u0400-\u04ff][^)]*\)$|\/.*\((.*)\)$"  # a trailing "(...)" containing a Cyrillic char, or everything from a "/" through a trailing "(...)"
+patterns["ALT_TITLES_REGEX"] = f"[^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*/|[/|][^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*"  # a segment containing a NON_ENGLISH_CHARS char that ends in "/", or one that is introduced by "/" or "|"
+patterns["NOT_ONLY_NON_ENGLISH_REGEX"] = rf"(?:[a-zA-Z][^{patterns['NON_ENGLISH_CHARS']}]+|^)[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}]|[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}](?=[^{patterns['NON_ENGLISH_CHARS']}]+[a-zA-Z])"  # a span between two NON_ENGLISH_CHARS chars, with Latin text before it (included in the match) or at string start, or with Latin text after it (lookahead only)
+patterns["NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#[【★]+|[ \-:/\\\[|{{(#$&^]+$"  # leading chars other than \w, NON_ENGLISH_CHARS, "#", "[", "【", "★"; or a trailing run of separator/bracket symbols
+patterns["REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#]+|]$"  # leading chars other than \w, NON_ENGLISH_CHARS, "#"; or a single trailing "]"
+patterns["RELEASE_GROUP_REGEX_START"] = r"^[\[【★].*[\]】★][ .]?(.+)"  # leading tag opened by "[", "【" or "★" and closed by "]", "】" or "★" (greedy); group 1 is the remainder
+patterns["RELEASE_GROUP_REGEX_END"] = r"(.+)[ .]?[\[【★].*[\]】★]$"  # group 1 is the text before a trailing tag delimited the same way
diff --git a/README.md b/README.md
@@ -218,7 +218,7 @@ $ python cli.py --coherent-types 'A freakishly cool movie or TV episode'
 Submit a PR on the `dev` branch. If you have changed the regex for a pattern, I can assume this is because you had a title that was being incorrectly processed, and your change fixes it. Please add the title to the test suite!
 
 To add new titles to the tests, you have 2 options (the first is easier):
-- Add the titles to `tests/test_generator`'s main method (in `add_titles()`), and run it. When asked for input, type 's', and it will automatically add what's needed to `files/input.json`, `files/output_raw.json`, and `files/output_standard.json`. The fields `encoder`, `excess`, `site`, and `episodeName` don't always have to be correct - if they're giving you issues, or seem wrong, feel free to manually remove them from the output test files.
+- Add the titles to `tests/generate_test_data.py`'s main method (in `add_titles()`), and run it. When asked for input, type 's', and it will automatically add what's needed to `files/input.json`, `files/output_raw.json`, and `files/output_standard.json`. The fields `encoder`, `excess`, `site`, and `episodeName` don't always have to be correct - if they're giving you issues, or seem wrong, feel free to manually remove them from the output test files.
 
 - Otherwise, you must add input torrent names to `tests/files/input.json` and full output json objects (with `standardise=False`) to `tests/files/output_raw.json`. Also add the standardised output to `tests/files/output_standard.json`, only including fields that are different from `output_raw.json`, along with `title`.
 

diff --git a/tests/files/input.json b/tests/files/input.json
@@ -405,5 +405,21 @@
   "www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv",
   "www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv",
   "www.1TamilMV.world - Kotha Rangula Prapancham (2024) Telugu HQ PreDVD - 700MB - x264 - HQ Clean Aud.mkv",
-  "The.Lord.of.the.Rings.Extended.Edition.2001.1080p.BluRay.x264.DTS-WiKi"
+  "The.Lord.of.the.Rings.Extended.Edition.2001.1080p.BluRay.x264.DTS-WiKi",
+  "Deadpool 2016 1080p BluRay DTS Rus Ukr 3xEng HDCL",
+  "127.Heures.FRENCH.DVDRip.AC3.XViD-DVDFR",
+  "Men in Black International 2019 (ingl\u00eas portugu\u00eas)",
+  "Quarantine [2008] [DVDRiP.XviD-M14CH0] [Lektor PL] [Arx]",
+  "All.Love.E146.KOR.HDTV.XViD-DeBTV",
+  "Atonement.2017.KOREAN.ENSUBBED.1080p.WEBRip.x264-VXTT",
+  "Fauda.S01.HEBREW.1080p.NF.WEBRip.DD5.1.x264-TrollHD[rartv]",
+  "Chinese Zodiac (2012) 1080p BrRip x264 - YIFY",
+  "Thai Massage (2022) 720p PDVDRip x264 AAC.mkv",
+  "\u6740\u624b\u4e4b\u738b [\u6e2f\u7248\u539f\u76d8/\u56fd\u7ca4\u53cc\u8bed\u4e2d\u5b57].Hitman.1998.1080p.HKG.Blu-ray.AVC.TrueHD.7.1-TAG",
+  "[www.arabp2p.net]_-_\u062a\u0631\u0643\u064a \u0645\u062a\u0631\u062c\u0645 \u0648\u0645\u062f\u0628\u0644\u062c Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent",
+  "\u0413\u043e\u043b\u0443\u0431\u0430\u044f \u0432\u043e\u043b\u043d\u0430 / Blue Crush (2002) DVDRip",
+  "\u3010\u55b5\u840c\u5976\u8336\u5c4b\u3011\u260501\u6708\u65b0\u756a\u2605[Rebirth][01][720p][\u7b80\u4f53][\u62db\u52df\u7ffb\u8bd1]",
+  "08.\u041f\u043b\u0430\u043d\u0435\u0442\u0430.\u043e\u0431\u0435\u0437\u044c\u044f\u043d.\u0420\u0435\u0432\u043e\u043b\u044e\u0446\u0438\u044f.2014.BDRip-HEVC.1080p.mkv",
+  "\u0413\u0440\u0435\u0447\u0435\u0441\u043a\u0430\u044f \u0441\u043c\u043e\u043a\u043e\u0432\u043d\u0438\u0446\u0430 / The fruit is ripe / Griechische Feigen (Siggi G\u00f6tz) [1976, \u0413\u0435\u0440\u043c\u0430\u043d\u0438\u044f, \u042d\u0440\u043e\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043a\u043e\u043c\u0435\u0434\u0438\u044f, DVDRip]",
+  "\u041a\u043d\u0438\u0433\u043e\u043d\u043e\u0448\u0438 / \u041a\u043di\u0433\u0430\u043d\u043e\u0448\u044b (1987) TVRip \u043e\u0442 AND03AND | BLR"
 ]
diff --git a/tests/files/output_raw.json b/tests/files/output_raw.json
@@ -3715,5 +3715,136 @@
     "resolution": "1080p",
     "title": "The Lord of the Rings",
     "year": 2001
+  },
+  {
+    "audio": "DTS",
+    "encoder": "3xEng",
+    "language": ["Rus","Ukr"],
+    "quality": "BluRay",
+    "resolution": "1080p",
+    "site": "HDCL",
+    "title": "Deadpool",
+    "year": 2016
+  },
+  {
+    "audio": "AC3",
+    "codec": "XViD",
+    "encoder": "DVDFR",
+    "language": "FRENCH",
+    "quality": "DVDRip",
+    "title": "127 Heures"
+  },
+  {
+    "language": ["inglês","português"],
+    "title": "Men in Black International",
+    "year": 2019
+  },
+  {
+    "codec": "XviD",
+    "language": "PL",
+    "quality": "DVDRiP",
+    "site": "Arx",
+    "title": "Quarantine",
+    "year": 2008
+  },
+  {
+    "codec": "XViD",
+    "encoder": "DeBTV",
+    "episode": 146,
+    "language": "KOR",
+    "quality": "HDTV",
+    "title": "All Love"
+  },
+  {
+    "codec": "x264",
+    "encoder": "VXTT",
+    "language": "KOREAN",
+    "quality": "WEBRip",
+    "resolution": "1080p",
+    "subtitles": "ENSUBBED",
+    "title": "Atonement",
+    "year": 2017
+  },
+  {
+    "audio": "DD5.1",
+    "codec": "x264",
+    "encoder": "TrollHD",
+    "language": "HEBREW",
+    "network": "NF",
+    "quality": "WEBRip",
+    "resolution": "1080p",
+    "season": 1,
+    "site": "rartv",
+    "title": "Fauda"
+  },
+  {
+    "codec": "x264",
+    "encoder": "YIFY",
+    "quality": "BrRip",
+    "resolution": "1080p",
+    "title": "Chinese Zodiac",
+    "year": 2012
+  },
+  {
+    "audio": "AAC",
+    "codec": "x264",
+    "encoder": "PDVDRip",
+    "filetype": "mkv",
+    "resolution": "720p",
+    "title": "Thai Massage",
+    "year": 2022
+  },
+  {
+    "audio": "TrueHD.7.1",
+    "codec": "AVC",
+    "encoder": "TAG",
+    "quality": "Blu-ray",
+    "resolution": "1080p",
+    "title": "] Hitman",
+    "year": 1998
+  },
+  {
+    "audio": "DDP5.1",
+    "codec": "H.264",
+    "encoder": "torrent",
+    "filetype": "MKV",
+    "network": "NF",
+    "quality": "WEB-DL",
+    "resolution": "1080p",
+    "site": "www.arabp2p.net",
+    "title": "Last Call for Istanbul",
+    "year": 2023
+  },
+  {
+    "quality": "DVDRip",
+    "title": "Blue Crush",
+    "year": 2002
+  },
+  {
+    "encoder": "]",
+    "resolution": "720p",
+    "site": "简体][招募翻译",
+    "title": "Rebirth"
+  },
+  {
+    "codec": "HEVC",
+    "filetype": "mkv",
+    "quality": "BDRip",
+    "resolution": "1080p",
+    "title": "08 Планета обезьян Революция",
+    "year": 2014
+  },
+  {
+    "encoder": "комедия",
+    "quality": "DVDRip",
+    "title": "The fruit is ripe / Griechische Feigen",
+    "year": 1976
+  },
+  {
+    "encoder": "|",
+    "quality": "TVRip",
+    "site": "BLR",
+    "title": "Кнiганошы",
+    "year": 1987
   }
 ]