Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for parsing non-english chars along with english title & More language patterns #66

Open
wants to merge 22 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
d3fea07
Improvements
platelminto Oct 27, 2023
93192ac
Merge branch 'dev'
platelminto Oct 27, 2023
0d1e05e
Update python-publish.yml
platelminto Oct 27, 2023
d326bb7
Delete .github/workflows directory
platelminto Oct 27, 2023
60ee833
Fix only-title torrent names
platelminto Oct 30, 2023
6fe2362
Remove complete series words
platelminto Dec 30, 2023
8ece810
Fix overlapping stuff for seasons
platelminto Dec 31, 2023
6615cba
Improve season range support when many are listed
platelminto Dec 31, 2023
ca89c7b
Improve French subtitle support
platelminto Dec 31, 2023
ee7a3b6
Improve site matching at beginning of title
platelminto Dec 31, 2023
16835f0
Bump version
platelminto Dec 31, 2023
20ae328
added standard resolution types
mhdzumair Jan 4, 2024
74de856
reorder the pattern from highest to lowest.
mhdzumair Jan 4, 2024
c3002b7
Added new test title
mhdzumair Jan 4, 2024
3f399fe
Merge branch 'master' of https://github.com/platelminto/parse-torrent…
mhdzumair Jan 25, 2024
d9a21d2
fix torrent name and site parsing
mhdzumair Jan 25, 2024
a90609f
Merge branch 'dev' of https://github.com/platelminto/parse-torrent-title
mhdzumair Feb 15, 2024
5f4c12b
Add site regex description
mhdzumair Feb 15, 2024
1e4137d
Add more language patterns
mhdzumair May 27, 2024
e6daab6
rename test data generator to not trigger unit test by default
mhdzumair May 27, 2024
c91e1c1
#64: Add support for parsing non-english chars along with english title
mhdzumair May 27, 2024
1c61386
Merge branch 'dev' into master
mhdzumair May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 36 additions & 21 deletions PTN/extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,51 @@
delimiters = "[\.\s\-\+_\/(),]"

langs = [
("rus(?:sian)?", "Russian"),
("(?:True)?fre?(?:nch)?", "French"),
("rus(?:sian)?|russo", "Russian"),
("(?:True)?fre?(?:nch)?|fr(?:ench|a|e|anc[eê]s)?", "French"),
("(?:nu)?ita(?:liano?)?", "Italian"),
("castellano|spa(?:nish)?|esp?", "Spanish"),
("swedish", "Swedish"),
("dk|dan(?:ish)?", "Danish"),
("ger(?:man)?|deu(?:tsch)?", "German"),
("ger(?:man)?|deu(?:tsch)?|alem[aã]o", "German"),
("nordic", "Nordic"),
("exyu", "ExYu"),
("chs|chi(?:nese)?", "Chinese"),
("chs|chi(?:nese)?|(?:mand[ae]rin|ch[sn])|chin[eê]s|zh-hans", "Chinese"),
("hin(?:di)?", "Hindi"),
("polish|poland|pl", "Polish"),
("mandarin", "Mandarin"),
("kor(?:ean)?", "Korean"),
("kor(?:ean)?|coreano", "Korean"),
("ben(?:gali)?|bangla", "Bengali"),
("kan(?:nada)?", "Kannada"),
("tam(?:il)?", "Tamil"),
("t[aâ]m(?:il)?", "Tamil"),
("tel(?:ugu)?", "Telugu"),
("mar(?:athi)?", "Marathi"),
("mal(?:ayalam)?", "Malayalam"),
("japanese|ja?p", "Japanese"),
("guj(?:arati)?", "Gujarati"),
("pun(?:jabi)?", "Punjabi"),
("ori(?:ya)?", "Oriya"),
("japanese|ja?p|jpn|japon[eê]s", "Japanese"),
("interslavic", "Interslavic"),
("ara(?:bic)?", "Arabic"),
("urdu", "Urdu"),
("punjabi", "Punjabi"),
("portuguese", "Portuguese"),
("albanian?", "Albanian"),
("egypt(?:ian)?", "Egyptian"),
("en?(?:g(?:lish)?)?", "English"), # Must be at end, matches just an 'e'
("tur(?:kish)?|tr", "Turkish"),
("tailand[eê]s|thai?", "Thai"),
("tagalog", "Tagalog"),
("ind(?:onesian)?", "Indonesian"),
("vie(?:tnamese)?", "Vietnamese"),
("heb(?:rew)?", "Hebrew"),
("gre(?:ek)?", "Greek"),
("cz(?:ech)?", "Czech"),
("hun(?:garian)?", "Hungarian"),
("ukr(?:ainian)?", "Ukrainian"),
("fin(?:nish)?", "Finnish"),
("nor(?:wegian)?", "Norwegian"),
("sin(?:hala)?", "Sinhala"),
("dutch|nl", "Dutch"),
("p[ua]n(?:jabi)?", "Punjabi"),
("por(?:tuguese)?|portugu[eèê]s[ea]?|p[rt]|port?", "Portuguese"),
("alb(?:anian?)?|albanais", "Albanian"),
("egypt(?:ian)?|egy", "Egyptian"),
("en?(?:g(?:lish)?)?|ing(?:l[eéê]s)?", "English"), # Must be at end, matches just an 'e'
]

genres = [
Expand Down Expand Up @@ -88,7 +104,6 @@
"extended": [r"(EXTENDED{d}(?!(?:CUT|EDITIONS?)))".format(d=delimiters)],
}


channels = [(1, 0), (2, 0), (5, 0), (5, 1), (6, 1), (7, 1)]


Expand Down Expand Up @@ -182,12 +197,12 @@ def link_patterns(pattern_options):
return (
"(?:"
+ "|".join(
[
pattern_option[0]
if isinstance(pattern_option, tuple)
else pattern_option
for pattern_option in pattern_options
]
)
[
pattern_option[0]
if isinstance(pattern_option, tuple)
else pattern_option
for pattern_option in pattern_options
]
)
+ ")"
)
31 changes: 24 additions & 7 deletions PTN/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,27 @@ def _part(self, name, match_slice, clean, overwrite=False):
self.match_slices.append(match_slice)

@staticmethod
def _clean_string(string):
clean = re.sub(r"^( -|\(|\[)", "", string)
if clean.find(" ") == -1 and clean.find(".") != -1:
def _clean_dots(string: str) -> str:
if string.find(" ") == -1 and string.find(".") != -1:
# 4 dots likely means we want an ellipsis and a space
clean = re.sub(r"\.{4,}", "... ", clean)
string = re.sub(r"\.{4,}", "... ", string)

# Replace any instances of less than 3 dots with a space
# Lookarounds are used to prevent the 3-dots (ellipses) from being replaced
clean = re.sub(r"(?<!\.)\.\.(?!\.)", " ", clean)
clean = re.sub(r"(?<!\.)\.(?!\.\.)", " ", clean)
string = re.sub(r"(?<!\.)\.\.(?!\.)", " ", string)
string = re.sub(r"(?<!\.)\.(?!\.\.)", " ", string)
return string

def _clean_string(self, string):
clean = re.sub(r"^( -|\(|\[)", "", string)
clean = self._clean_dots(clean)

clean = re.sub(r"_", " ", clean)
clean = re.sub(r"([\[)_\]]|- )$", "", clean).strip()
clean = clean.strip(" _-")

# Again, we need to clean up the dots & strip for non-english chars titles that get cleaned from above re.sub.
clean = self._clean_dots(clean).strip()
return clean

def parse(self, name, standardise, coherent_types):
Expand Down Expand Up @@ -358,7 +364,7 @@ def process_title(self):
relative_title_start = m.end()
raw = raw[relative_title_start:]
title_start = relative_title_start + title_start
clean = self._clean_string(raw)
clean = self._clean_string(self.clean_title(raw))
# Re-add title_start to unrelative the index from raw to self.torrent_name
self._part("title", (title_start, title_end), clean)
else:
Expand Down Expand Up @@ -433,3 +439,14 @@ def clean_unmatched(self):
):
filtered.append(extra)
return filtered

@staticmethod
def clean_title(raw_title):
    """Strip non-title decorations from a raw title candidate.

    The steps run in a fixed order, each working on the output of the
    previous one:

    1. remove a ``[movie]`` / ``(movie)`` indication flag,
    2. replace trailing text matched by ``RUSSIAN_CAST_REGEX`` with a space,
    3. drop a bracketed section at the start, keeping what follows it,
    4. drop a bracketed section at the end, keeping what precedes it,
    5. remove text matched by ``ALT_TITLES_REGEX``,
    6. remove text matched by ``NOT_ONLY_NON_ENGLISH_REGEX``.

    :param raw_title: the unprocessed title substring.
    :return: the title with the decorations above removed. Surrounding
        whitespace is not stripped here.
    """
    # The flag pattern is a regular expression, so it has to go through
    # re.sub: str.replace() only looks for the literal text of its first
    # argument and would therefore never remove "[movie]" or "(movie)".
    cleaned_title = re.sub(r"[\[(]movie[)\]]", "", raw_title)  # clear movie indication flag
    # clear russian cast information
    cleaned_title = re.sub(patterns["RUSSIAN_CAST_REGEX"], " ", cleaned_title)
    # remove release group markings sections from the start (group 1 is the remainder)
    cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_START"], r"\1", cleaned_title)
    # remove unneeded markings section at the end if present (group 1 is the leading part)
    cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_END"], r"\1", cleaned_title)
    # remove alt language titles
    cleaned_title = re.sub(patterns["ALT_TITLES_REGEX"], "", cleaned_title)
    # remove non english chars if they are not the only ones left
    cleaned_title = re.sub(patterns["NOT_ONLY_NON_ENGLISH_REGEX"], "", cleaned_title)
    return cleaned_title
16 changes: 16 additions & 0 deletions PTN/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,19 @@
"remux": "boolean",
"internationalCut": "boolean",
}

# Character ranges (without enclosing brackets) that are interpolated into the
# character classes of the regexes below, e.g. [{patterns['NON_ENGLISH_CHARS']}].
# These are normal (non-raw) string literals, so every \uXXXX escape is turned
# into the actual character by Python before the regex engine sees it.
patterns["NON_ENGLISH_CHARS"] = "\u3040-\u30ff" # Japanese characters
patterns["NON_ENGLISH_CHARS"] += "\u3400-\u4dbf" # Chinese characters
patterns["NON_ENGLISH_CHARS"] += "\u4e00-\u9fff" # Chinese characters
patterns["NON_ENGLISH_CHARS"] += "\uf900-\ufaff" # CJK Compatibility Ideographs
patterns["NON_ENGLISH_CHARS"] += "\uff66-\uff9f" # Halfwidth Katakana Japanese characters
patterns["NON_ENGLISH_CHARS"] += "\u0400-\u04ff" # Cyrillic characters (Russian)
patterns["NON_ENGLISH_CHARS"] += "\u0600-\u06ff" # Arabic characters

# Two alternatives, both anchored at the end of the string:
#   1. a final parenthesised group containing at least one Cyrillic character;
#   2. a "/" followed by any text that ends in a parenthesised group
#      (group 1 captures the text inside those parentheses).
patterns["RUSSIAN_CAST_REGEX"] = r"\([^)]*[\u0400-\u04ff][^)]*\)$|\/.*\((.*)\)$"
# Segments delimited by "/" or "|" that contain a non-English character:
#   1. a segment that is terminated by "/", or
#   2. a segment that is introduced by "/" or "|".
# In both cases the text before the first non-English character may not
# contain "(". This is an f-string, not a raw string; the literal part contains
# no backslashes, so no escape processing is involved.
patterns["ALT_TITLES_REGEX"] = f"[^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*/|[/|][^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*"
# A span running from one non-English character to the last one (".*" is
# greedy, and at least two such characters are required) when it is either
#   1. preceded by an ASCII letter plus one or more other characters, or placed
#      at the start of the string, or
#   2. followed (lookahead) by other characters and then an ASCII letter.
# NOTE(review): in alternative 1 the leading "[a-zA-Z][^...]+" part is consumed
# by the match rather than asserted, so a substitution also removes that
# preceding text - confirm this is intended.
patterns["NOT_ONLY_NON_ENGLISH_REGEX"] = rf"(?:[a-zA-Z][^{patterns['NON_ENGLISH_CHARS']}]+|^)[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}]|[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}](?=[^{patterns['NON_ENGLISH_CHARS']}]+[a-zA-Z])"
# Either a leading run of characters that are not word characters, non-English
# characters, "#", "[", "【" or "★", or a trailing run of the listed symbols.
# "{{" is the f-string escape for a literal "{".
patterns["NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#[【★]+|[ \-:/\\\[|{{(#$&^]+$"
# Either a leading run of characters that are not word characters, non-English
# characters or "#", or a single trailing "]".
patterns["REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#]+|]$"
# A section at the start of the string opened by "[", "【" or "★" and closed by
# "]", "】" or "★", optionally followed by one space or dot; group 1 captures
# everything after it. ".*" is greedy, so the section extends to the last
# closing character that still leaves at least one character for group 1.
patterns["RELEASE_GROUP_REGEX_START"] = r"^[\[【★].*[\]】★][ .]?(.+)"
# Mirror image of the pattern above for a section at the end of the string;
# group 1 captures everything before it.
patterns["RELEASE_GROUP_REGEX_END"] = r"(.+)[ .]?[\[【★].*[\]】★]$"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ $ python cli.py --coherent-types 'A freakishly cool movie or TV episode'
Submit a PR on the `dev` branch. If you have changed the regex for a pattern, I can assume this is because you had a title that was being incorrectly processed, and your change fixes it. Please add the title to the test suite!

To add new titles to the tests, you have 2 options (the first is easier):
- Add the titles to `tests/test_generator`'s main method (in `add_titles()`), and run it. When asked for input, type 's', and it will automatically add what's needed to `files/input.json`, `files/output_raw.json`, and `files/output_standard.json`. The fields `encoder`, `excess`, `site`, and `episodeName` don't always have to be correct - if they're giving you issues, or seem wrong, feel free to manually remove them from the output test files.
- Add the titles to `tests/generate_test_data.py`'s main method (in `add_titles()`), and run it. When asked for input, type 's', and it will automatically add what's needed to `files/input.json`, `files/output_raw.json`, and `files/output_standard.json`. The fields `encoder`, `excess`, `site`, and `episodeName` don't always have to be correct - if they're giving you issues, or seem wrong, feel free to manually remove them from the output test files.

- Otherwise, you must add input torrent names to `tests/files/input.json` and full output json objects (with `standardise=False`) to `tests/files/output_raw.json`. Also add the standardised output to `tests/files/output_standard.json`, only including fields that are different from `output_raw.json`, along with `title`.

Expand Down
18 changes: 17 additions & 1 deletion tests/files/input.json
Original file line number Diff line number Diff line change
Expand Up @@ -405,5 +405,21 @@
"www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv",
"www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv",
"www.1TamilMV.world - Kotha Rangula Prapancham (2024) Telugu HQ PreDVD - 700MB - x264 - HQ Clean Aud.mkv",
"The.Lord.of.the.Rings.Extended.Edition.2001.1080p.BluRay.x264.DTS-WiKi"
"The.Lord.of.the.Rings.Extended.Edition.2001.1080p.BluRay.x264.DTS-WiKi",
"Deadpool 2016 1080p BluRay DTS Rus Ukr 3xEng HDCL",
"127.Heures.FRENCH.DVDRip.AC3.XViD-DVDFR",
"Men in Black International 2019 (ingl\u00eas portugu\u00eas)",
"Quarantine [2008] [DVDRiP.XviD-M14CH0] [Lektor PL] [Arx]",
"All.Love.E146.KOR.HDTV.XViD-DeBTV",
"Atonement.2017.KOREAN.ENSUBBED.1080p.WEBRip.x264-VXTT",
"Fauda.S01.HEBREW.1080p.NF.WEBRip.DD5.1.x264-TrollHD[rartv]",
"Chinese Zodiac (2012) 1080p BrRip x264 - YIFY",
"Thai Massage (2022) 720p PDVDRip x264 AAC.mkv",
"\u6740\u624b\u4e4b\u738b [\u6e2f\u7248\u539f\u76d8/\u56fd\u7ca4\u53cc\u8bed\u4e2d\u5b57].Hitman.1998.1080p.HKG.Blu-ray.AVC.TrueHD.7.1-TAG",
"[www.arabp2p.net]_-_\u062a\u0631\u0643\u064a \u0645\u062a\u0631\u062c\u0645 \u0648\u0645\u062f\u0628\u0644\u062c Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent",
"\u0413\u043e\u043b\u0443\u0431\u0430\u044f \u0432\u043e\u043b\u043d\u0430 / Blue Crush (2002) DVDRip",
"\u3010\u55b5\u840c\u5976\u8336\u5c4b\u3011\u260501\u6708\u65b0\u756a\u2605[Rebirth][01][720p][\u7b80\u4f53][\u62db\u52df\u7ffb\u8bd1]",
"08.\u041f\u043b\u0430\u043d\u0435\u0442\u0430.\u043e\u0431\u0435\u0437\u044c\u044f\u043d.\u0420\u0435\u0432\u043e\u043b\u044e\u0446\u0438\u044f.2014.BDRip-HEVC.1080p.mkv",
"\u0413\u0440\u0435\u0447\u0435\u0441\u043a\u0430\u044f \u0441\u043c\u043e\u043a\u043e\u0432\u043d\u0438\u0446\u0430 / The fruit is ripe / Griechische Feigen (Siggi G\u00f6tz) [1976, \u0413\u0435\u0440\u043c\u0430\u043d\u0438\u044f, \u042d\u0440\u043e\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043a\u043e\u043c\u0435\u0434\u0438\u044f, DVDRip]",
"\u041a\u043d\u0438\u0433\u043e\u043d\u043e\u0448\u0438 / \u041a\u043di\u0433\u0430\u043d\u043e\u0448\u044b (1987) TVRip \u043e\u0442 AND03AND | BLR"
]
131 changes: 131 additions & 0 deletions tests/files/output_raw.json
Original file line number Diff line number Diff line change
Expand Up @@ -3715,5 +3715,136 @@
"resolution": "1080p",
"title": "The Lord of the Rings",
"year": 2001
},
{
"audio": "DTS",
"encoder": "3xEng",
"language": ["Rus","Ukr"],
"quality": "BluRay",
"resolution": "1080p",
"site": "HDCL",
"title": "Deadpool",
"year": 2016
},
{
"audio": "AC3",
"codec": "XViD",
"encoder": "DVDFR",
"language": "FRENCH",
"quality": "DVDRip",
"title": "127 Heures"
},
{
"language": ["inglês","português"],
"title": "Men in Black International",
"year": 2019
},
{
"codec": "XviD",
"language": "PL",
"quality": "DVDRiP",
"site": "Arx",
"title": "Quarantine",
"year": 2008
},
{
"codec": "XViD",
"encoder": "DeBTV",
"episode": 146,
"language": "KOR",
"quality": "HDTV",
"title": "All Love"
},
{
"codec": "x264",
"encoder": "VXTT",
"language": "KOREAN",
"quality": "WEBRip",
"resolution": "1080p",
"subtitles": "ENSUBBED",
"title": "Atonement",
"year": 2017
},
{
"audio": "DD5.1",
"codec": "x264",
"encoder": "TrollHD",
"language": "HEBREW",
"network": "NF",
"quality": "WEBRip",
"resolution": "1080p",
"season": 1,
"site": "rartv",
"title": "Fauda"
},
{
"codec": "x264",
"encoder": "YIFY",
"quality": "BrRip",
"resolution": "1080p",
"title": "Chinese Zodiac",
"year": 2012
},
{
"audio": "AAC",
"codec": "x264",
"encoder": "PDVDRip",
"filetype": "mkv",
"resolution": "720p",
"title": "Thai Massage",
"year": 2022
},
{
"audio": "TrueHD.7.1",
"codec": "AVC",
"encoder": "TAG",
"quality": "Blu-ray",
"resolution": "1080p",
"title": "] Hitman",
"year": 1998
},
{
"audio": "DDP5.1",
"codec": "H.264",
"encoder": "torrent",
"filetype": "MKV",
"network": "NF",
"quality": "WEB-DL",
"resolution": "1080p",
"site": "www.arabp2p.net",
"title": "Last Call for Istanbul",
"year": 2023
},
{
"quality": "DVDRip",
"title": "Blue Crush",
"year": 2002
},
{
"encoder": "]",
"resolution": "720p",
"site": "简体][招募翻译",
"title": "Rebirth"
},
{
"codec": "HEVC",
"filetype": "mkv",
"quality": "BDRip",
"resolution": "1080p",
"title": "08 Планета обезьян Революция",
"year": 2014
},
{
"encoder": "комедия",
"quality": "DVDRip",
"title": "The fruit is ripe / Griechische Feigen",
"year": 1976
},
{
"encoder": "|",
"quality": "TVRip",
"site": "BLR",
"title": "Кнiганошы",
"year": 1987
}
]
Loading