strip common tracking parameters in queries and fragments (#65)
* strip common tracking parameters by default

* more tests

* also strip in fragments

* 3.6 compatibility
adbar committed Nov 27, 2023 (1 parent: 32d456a, commit: b61b1b3)
Showing 3 changed files with 70 additions and 24 deletions.
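In practice, the change makes tracker removal the default. A minimal sketch of the new behavior, distilled from the test cases added below (assuming normalize_url is importable from the package top level, as in current courlan releases):

from courlan import normalize_url

# tracking parameters are now stripped without strict mode
assert normalize_url("http://test.org/?utm_source=rss&utm_medium=rss") == "http://test.org/"
# ordinary query elements are kept, sorted alphabetically
assert (
    normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
    == "http://test.net/foo.html?page=2&post=abc&testid=1"
)
# strict mode additionally drops parameters outside the allowlist
assert (
    normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
    == "http://test.net/foo.html?page=2&post=abc"
)
# trackers hiding in URL fragments are removed as well
assert normalize_url("http://test.org/#partnerid=123") == "http://test.org/"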
47 changes: 33 additions & 14 deletions courlan/clean.py
@@ -36,6 +36,16 @@
 TRAILING_AMP = re.compile(r"/\&$")
 TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]')
 
+# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt
+# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json
+# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
+TRACKERS_RE = re.compile(
+    r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
+    r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
+    r"^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"
+    r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
+)
+
 
 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
     "Helper function: chained scrubbing and normalization"
@@ -99,20 +109,20 @@ def scrub_url(url: str) -> str:
 
 
 def clean_query(
-    parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
+    querystring: str, strict: bool = False, language: Optional[str] = None
 ) -> str:
     "Strip unwanted query elements"
-    if len(parsed_url.query) > 0:
-        qdict = parse_qs(parsed_url.query)
+    if querystring:
+        qdict = parse_qs(querystring)
         newqdict = {}
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if (
-                strict
-                and teststr not in ALLOWED_PARAMS
-                and teststr not in CONTROL_PARAMS
-            ):
+            if strict:
+                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
+                    continue
+            # get rid of trackers
+            elif TRACKERS_RE.search(teststr):
                 continue
             # control language
             if language is not None and teststr in CONTROL_PARAMS:
@@ -127,7 +137,7 @@ def clean_query(
             # insert
             newqdict[qelem] = qdict[qelem]
         return urlencode(newqdict, doseq=True)
-    return parsed_url.query
+    return querystring
 
 
 def decode_punycode(string: str) -> str:
@@ -151,9 +161,18 @@ def decode_punycode(string: str) -> str:
 def normalize_part(url_part: str) -> str:
     """Normalize URLs parts (specifically path and fragment) while
     accounting for certain characters."""
-    if not "%" in url_part and not "!" in url_part:
-        url_part = quote(url_part)
-    return url_part
+    # "~" for compatibility with Python 3.6
+    return quote(url_part, safe="/%!=:,-~")
+
+
+def normalize_fragment(fragment: str, language: Optional[str] = None) -> str:
+    "Look for trackers in URL fragments using query analysis, normalize the output."
+    if "=" in fragment:
+        if "&" in fragment:
+            fragment = clean_query(fragment, False, language)
+        elif TRACKERS_RE.search(fragment):
+            fragment = ""
+    return normalize_part(fragment)
 
 
 def normalize_url(
@@ -178,10 +197,10 @@ def normalize_url(
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
     # strip unwanted query elements
-    newquery = clean_query(parsed_url, strict, language) or ""
+    newquery = clean_query(parsed_url.query, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
     # fragment
-    newfragment = "" if strict else normalize_part(parsed_url.fragment)
+    newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
     # rebuild
     return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
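Taken together, the helpers above behave as sketched below. Note that this pokes at module internals (TRACKERS_RE, clean_query, normalize_fragment and normalize_part are implementation details of courlan.clean, not public API), with values taken from the tests added in this commit:

from courlan.clean import (
    TRACKERS_RE,
    clean_query,
    normalize_fragment,
    normalize_part,
)

# the regex targets tracker-style parameter names, not ordinary ones
assert TRACKERS_RE.search("fbclid")
assert TRACKERS_RE.search("utm_source")
assert TRACKERS_RE.search("page") is None

# clean_query now takes the raw query string instead of a SplitResult
assert clean_query("utm_source=rss&page=2") == "page=2"

# fragments with key=value pairs go through the same query analysis
assert normalize_fragment("mtm_campaign=documentation&catpage=3") == "catpage=3"
assert normalize_fragment("partnerid=123") == ""
assert normalize_fragment("page2") == "page2"

# normalize_part quotes parts while leaving common URL characters intact
assert normalize_part("/Zukunft-des-49-Euro-Tickets/!5968518/") == "/Zukunft-des-49-Euro-Tickets/!5968518/"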
3 changes: 2 additions & 1 deletion courlan/urlutils.py
@@ -5,6 +5,7 @@
 import re
 
 from functools import lru_cache
+from html import unescape
 from typing import Any, List, Optional, Set, Tuple, Union
 from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
 
@@ -70,7 +71,7 @@ def extract_domain(
 def _parse(url: Any) -> SplitResult:
     "Parse a string or use urllib.parse object directly."
     if isinstance(url, str):
-        parsed_url = urlsplit(url)
+        parsed_url = urlsplit(unescape(url))
     elif isinstance(url, SplitResult):
         parsed_url = url
     else:
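The added unescape call matters for the tracker stripping above: URLs copied out of HTML often encode ampersands as entities, which would otherwise hide a parameter from the query parser. A small illustration using only the standard library:

from html import unescape
from urllib.parse import urlsplit

url = "http://test.org/?utm_source=rss&#038;utm_medium=rss"
# without unescaping, urlsplit treats everything after "#" as the fragment,
# so utm_medium would never reach the query-cleaning step
assert urlsplit(url).fragment == "038;utm_medium=rss"
assert urlsplit(unescape(url)).query == "utm_source=rss&utm_medium=rss"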
44 changes: 35 additions & 9 deletions tests/unit_tests.py
@@ -479,8 +479,8 @@ def test_normalization():
         == "http://test.net/foo.html#bar"
     )
     assert (
-        normalize_url("http://test.net/foo.html#:~:text=night-,vision", strict=True)
-        == "http://test.net/foo.html"
+        normalize_url("http://test.net/foo.html#:~:text=night-,vision")
+        == "http://test.net/foo.html#:~:text=night-,vision"
     )
     assert (
         normalize_url("http://www.example.org:80/test.html")
@@ -493,10 +493,12 @@ def test_normalization():
     assert (
         normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html"
     )
+
+    # punycode
     assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
     assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
     assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"
 
     # account for particular characters
     assert (
         normalize_url(
@@ -509,24 +511,48 @@ def test_normalization():
         == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
     )
 
+    # trackers
+    assert normalize_url("http://test.org/?s_cid=123&clickid=1") == "http://test.org/"
+    assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?this_affiliate=0") == "http://test.org/"
+    assert (
+        normalize_url("http://test.org/?utm_source=rss&utm_medium=rss")
+        == "http://test.org/"
+    )
+    assert (
+        normalize_url("http://test.org/?utm_source=rss&#038;utm_medium=rss")
+        == "http://test.org/"
+    )
+    assert normalize_url("http://test.org/#partnerid=123") == "http://test.org/"
+    assert (
+        normalize_url(
+            "http://test.org/#mtm_campaign=documentation&mtm_keyword=demo&catpage=3"
+        )
+        == "http://test.org/#catpage=3"
+    )
+    assert normalize_url("http://test.org/#page2") == "http://test.org/#page2"
+
 
 def test_qelems():
     assert (
         normalize_url("http://test.net/foo.html?utm_source=twitter")
-        == "http://test.net/foo.html?utm_source=twitter"
+        == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True)
+        normalize_url("http://test.net/foo.html?testid=1")
+        == "http://test.net/foo.html?testid=1"
+    )
+    assert (
+        normalize_url("http://test.net/foo.html?testid=1", strict=True)
         == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2")
-        == "http://test.net/foo.html?page=2&post=abc&utm_source=twitter"
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
+        == "http://test.net/foo.html?page=2&post=abc&testid=1"
     )
     assert (
-        normalize_url(
-            "http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True
-        )
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
         == "http://test.net/foo.html?page=2&post=abc"
     )
     assert (
