diff --git a/courlan/clean.py b/courlan/clean.py
index 0548662..fc68527 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -36,6 +36,16 @@
 TRAILING_AMP = re.compile(r"/\&$")
 TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]')
 
+# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt
+# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json
+# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
+TRACKERS_RE = re.compile(
+    r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
+    r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
+    r"^(?:ads?|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"
+    r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
+)
+
 
 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
     "Helper function: chained scrubbing and normalization"
@@ -99,20 +109,20 @@ def scrub_url(url: str) -> str:
 
 
 def clean_query(
-    parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
+    querystring: str, strict: bool = False, language: Optional[str] = None
 ) -> str:
     "Strip unwanted query elements"
-    if len(parsed_url.query) > 0:
-        qdict = parse_qs(parsed_url.query)
+    if querystring:
+        qdict = parse_qs(querystring)
         newqdict = {}
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if (
-                strict
-                and teststr not in ALLOWED_PARAMS
-                and teststr not in CONTROL_PARAMS
-            ):
+            if strict:
+                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
+                    continue
+            # get rid of trackers
+            elif TRACKERS_RE.search(teststr):
                 continue
             # control language
             if language is not None and teststr in CONTROL_PARAMS:
@@ -127,7 +137,7 @@
             # insert
             newqdict[qelem] = qdict[qelem]
         return urlencode(newqdict, doseq=True)
-    return parsed_url.query
+    return querystring
 
 
 def decode_punycode(string: str) -> str:
@@ -151,9 +161,18 @@ def decode_punycode(string: str) -> str:
 def normalize_part(url_part: str) -> str:
     """Normalize URLs parts (specifically path and fragment)
     while accounting for certain characters."""
-    if not "%" in url_part and not "!" in url_part:
-        url_part = quote(url_part)
-    return url_part
+    # "~" for compatibility with Python 3.6
+    return quote(url_part, safe="/%!=:,-~")
+
+
+def normalize_fragment(fragment: str, language: Optional[str] = None) -> str:
+    "Look for trackers in URL fragments using query analysis and normalize the output."
+    if "=" in fragment:
+        if "&" in fragment:
+            fragment = clean_query(fragment, False, language)
+        elif TRACKERS_RE.search(fragment):
+            fragment = ""
+    return normalize_part(fragment)
 
 
 def normalize_url(
@@ -178,10 +197,10 @@
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
     # strip unwanted query elements
-    newquery = clean_query(parsed_url, strict, language) or ""
+    newquery = clean_query(parsed_url.query, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
     # fragment
-    newfragment = "" if strict else normalize_part(parsed_url.fragment)
+    newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
     # rebuild
     return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
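Taken together, the new `TRACKERS_RE`, the string-based `clean_query()` and the added `normalize_fragment()` mean that non-strict cleaning now strips known tracking parameters from both queries and fragments. A minimal sketch of the intended behaviour, assuming the `courlan.clean` module layout above (the printed values mirror the test expectations below):

```python
from courlan.clean import clean_query, normalize_fragment

# "clickid" and "utm_source" match TRACKERS_RE and are dropped;
# "page" is kept, since non-strict mode only removes trackers
print(clean_query("utm_source=rss&clickid=1&page=2"))  # page=2

# fragments containing "&" are run through the same query analysis
print(normalize_fragment("mtm_campaign=docs&catpage=3"))  # catpage=3
# a single key=value fragment is emptied if the key looks like a tracker
print(normalize_fragment("partnerid=123"))  # (empty string)
# plain anchors pass through untouched
print(normalize_fragment("page2"))  # page2
```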
diff --git a/courlan/urlutils.py b/courlan/urlutils.py
index 13af9d0..e3a4181 100644
--- a/courlan/urlutils.py
+++ b/courlan/urlutils.py
@@ -5,6 +5,7 @@
 import re
 
 from functools import lru_cache
+from html import unescape
 from typing import Any, List, Optional, Set, Tuple, Union
 from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
 
@@ -70,7 +71,7 @@ def extract_domain(
 def _parse(url: Any) -> SplitResult:
     "Parse a string or use urllib.parse object directly."
     if isinstance(url, str):
-        parsed_url = urlsplit(url)
+        parsed_url = urlsplit(unescape(url))
     elif isinstance(url, SplitResult):
         parsed_url = url
     else:
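The added `unescape()` call matters for URLs lifted straight out of HTML, where ampersands are routinely entity-encoded. A standard-library-only illustration of the failure mode it prevents:

```python
from html import unescape
from urllib.parse import urlsplit

# an href copied verbatim from an HTML document
raw = "https://example.org/?page=2&amp;lang=en"

print(urlsplit(raw).query)            # page=2&amp;lang=en  (bogus "amp;lang" key)
print(urlsplit(unescape(raw)).query)  # page=2&lang=en
```

Since `_parse()` is the shared entry point for string URLs in `urlutils`, its callers pick this up without any API change.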
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e8a5a87..3c87dde 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -479,8 +479,8 @@ def test_normalization():
         == "http://test.net/foo.html#bar"
     )
     assert (
-        normalize_url("http://test.net/foo.html#:~:text=night-,vision", strict=True)
-        == "http://test.net/foo.html"
+        normalize_url("http://test.net/foo.html#:~:text=night-,vision")
+        == "http://test.net/foo.html#:~:text=night-,vision"
     )
     assert (
         normalize_url("http://www.example.org:80/test.html")
         == "http://www.example.org/test.html"
     )
@@ -493,10 +493,12 @@ def test_normalization():
     assert (
         normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html"
     )
+
     # punycode
     assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
     assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
     assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"
+
     # account for particular characters
     assert (
         normalize_url(
@@ -509,24 +511,44 @@ def test_normalization():
         == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
     )
 
+    # trackers
+    assert normalize_url("http://test.org/?s_cid=123&clickid=1") == "http://test.org/"
+    assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?this_affiliate=0") == "http://test.org/"
+    assert (
+        normalize_url("http://test.org/?utm_source=rss&utm_medium=rss")
+        == "http://test.org/"
+    )
+    assert normalize_url("http://test.org/#partnerid=123") == "http://test.org/"
+    assert (
+        normalize_url(
+            "http://test.org/#mtm_campaign=documentation&mtm_keyword=demo&catpage=3"
+        )
+        == "http://test.org/#catpage=3"
+    )
+    assert normalize_url("http://test.org/#page2") == "http://test.org/#page2"
+
 
 def test_qelems():
     assert (
         normalize_url("http://test.net/foo.html?utm_source=twitter")
-        == "http://test.net/foo.html?utm_source=twitter"
+        == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True)
+        normalize_url("http://test.net/foo.html?testid=1")
+        == "http://test.net/foo.html?testid=1"
+    )
+    assert (
+        normalize_url("http://test.net/foo.html?testid=1", strict=True)
         == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2")
-        == "http://test.net/foo.html?page=2&post=abc&utm_source=twitter"
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
+        == "http://test.net/foo.html?page=2&post=abc&testid=1"
     )
     assert (
-        normalize_url(
-            "http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True
-        )
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
         == "http://test.net/foo.html?page=2&post=abc"
    )
     assert (
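The updated expectations reduce to a simple contract for `normalize_url`: non-strict mode drops known trackers and keeps everything else (sorted), while strict mode keeps only allow-listed parameters. A condensed sketch, assuming the package's usual top-level re-export:

```python
from courlan import normalize_url

# non-strict: trackers go, other parameters stay, sorted alphabetically
assert (
    normalize_url("http://test.net/foo.html?testid=1&utm_source=twitter")
    == "http://test.net/foo.html?testid=1"
)

# strict: only allow-listed parameters such as "page" survive
assert (
    normalize_url("http://test.net/foo.html?testid=1&page=2", strict=True)
    == "http://test.net/foo.html?page=2"
)
```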