strip common tracking parameters in queries and fragments (#65)
* strip common tracking parameters by default

* more tests

* also strip in fragments

* 3.6 compatibility
adbar committed Nov 27, 2023 (1 parent: 32d456a, commit: b61b1b3)
Showing 3 changed files with 70 additions and 24 deletions.
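In practice, the change makes tracker removal the default. A minimal sketch of the new behavior, distilled from the test cases added below (assuming normalize_url is importable from the package top level, as in current courlan releases):

from courlan import normalize_url

# tracking parameters are now stripped without strict mode
assert normalize_url("http://test.org/?utm_source=rss&utm_medium=rss") == "http://test.org/"
# ordinary query elements are kept, sorted alphabetically
assert (
    normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
    == "http://test.net/foo.html?page=2&post=abc&testid=1"
)
# strict mode additionally drops parameters outside the allowlist
assert (
    normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
    == "http://test.net/foo.html?page=2&post=abc"
)
# trackers hiding in URL fragments are removed as well
assert normalize_url("http://test.org/#partnerid=123") == "http://test.org/"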
47 changes: 33 additions & 14 deletions courlan/clean.py
@@ -36,6 +36,16 @@
 TRAILING_AMP = re.compile(r"/\&$")
 TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]')
 
+# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt
+# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json
+# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
+TRACKERS_RE = re.compile(
+    r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
+    r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
+    r"^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"
+    r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
+)
+
 
 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
     "Helper function: chained scrubbing and normalization"
@@ -99,20 +109,20 @@ def scrub_url(url: str) -> str:
 
 
 def clean_query(
-    parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
+    querystring: str, strict: bool = False, language: Optional[str] = None
 ) -> str:
     "Strip unwanted query elements"
-    if len(parsed_url.query) > 0:
-        qdict = parse_qs(parsed_url.query)
+    if querystring:
+        qdict = parse_qs(querystring)
         newqdict = {}
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if (
-                strict
-                and teststr not in ALLOWED_PARAMS
-                and teststr not in CONTROL_PARAMS
-            ):
+            if strict:
+                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
+                    continue
+            # get rid of trackers
+            elif TRACKERS_RE.search(teststr):
                 continue
             # control language
             if language is not None and teststr in CONTROL_PARAMS:
@@ -127,7 +137,7 @@ def clean_query(
             # insert
             newqdict[qelem] = qdict[qelem]
         return urlencode(newqdict, doseq=True)
-    return parsed_url.query
+    return querystring
 
 
 def decode_punycode(string: str) -> str:
@@ -151,9 +161,18 @@ def decode_punycode(string: str) -> str:
 def normalize_part(url_part: str) -> str:
     """Normalize URLs parts (specifically path and fragment) while
     accounting for certain characters."""
-    if not "%" in url_part and not "!" in url_part:
-        url_part = quote(url_part)
-    return url_part
+    # "~" for compatibility with Python 3.6
+    return quote(url_part, safe="/%!=:,-~")
+
+
+def normalize_fragment(fragment: str, language: Optional[str] = None) -> str:
+    "Look for trackers in URL fragments using query analysis, normalize the output."
+    if "=" in fragment:
+        if "&" in fragment:
+            fragment = clean_query(fragment, False, language)
+        elif TRACKERS_RE.search(fragment):
+            fragment = ""
+    return normalize_part(fragment)
 
 
 def normalize_url(
@@ -178,10 +197,10 @@ def normalize_url(
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
     # strip unwanted query elements
-    newquery = clean_query(parsed_url, strict, language) or ""
+    newquery = clean_query(parsed_url.query, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
     # fragment
-    newfragment = "" if strict else normalize_part(parsed_url.fragment)
+    newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
     # rebuild
     return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
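Taken together, the helpers above behave as sketched below. Note that this pokes at module internals (TRACKERS_RE, clean_query, normalize_fragment and normalize_part are implementation details of courlan.clean, not public API), with values taken from the tests added in this commit:

from courlan.clean import (
    TRACKERS_RE,
    clean_query,
    normalize_fragment,
    normalize_part,
)

# the regex targets tracker-style parameter names, not ordinary ones
assert TRACKERS_RE.search("fbclid")
assert TRACKERS_RE.search("utm_source")
assert TRACKERS_RE.search("page") is None

# clean_query now takes the raw query string instead of a SplitResult
assert clean_query("utm_source=rss&page=2") == "page=2"

# fragments with key=value pairs go through the same query analysis
assert normalize_fragment("mtm_campaign=documentation&catpage=3") == "catpage=3"
assert normalize_fragment("partnerid=123") == ""
assert normalize_fragment("page2") == "page2"

# normalize_part quotes parts while leaving common URL characters intact
assert normalize_part("/Zukunft-des-49-Euro-Tickets/!5968518/") == "/Zukunft-des-49-Euro-Tickets/!5968518/"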
3 changes: 2 additions & 1 deletion courlan/urlutils.py
@@ -5,6 +5,7 @@
 import re
 
 from functools import lru_cache
+from html import unescape
 from typing import Any, List, Optional, Set, Tuple, Union
 from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
 
@@ -70,7 +71,7 @@ def extract_domain(
 def _parse(url: Any) -> SplitResult:
     "Parse a string or use urllib.parse object directly."
     if isinstance(url, str):
-        parsed_url = urlsplit(url)
+        parsed_url = urlsplit(unescape(url))
     elif isinstance(url, SplitResult):
         parsed_url = url
     else:
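The added unescape call matters for the tracker stripping above: URLs copied out of HTML often encode ampersands as entities, which would otherwise hide a parameter from the query parser. A small illustration using only the standard library:

from html import unescape
from urllib.parse import urlsplit

url = "http://test.org/?utm_source=rss&#038;utm_medium=rss"
# without unescaping, urlsplit treats everything after "#" as the fragment,
# so utm_medium would never reach the query-cleaning step
assert urlsplit(url).fragment == "038;utm_medium=rss"
assert urlsplit(unescape(url)).query == "utm_source=rss&utm_medium=rss"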
44 changes: 35 additions & 9 deletions tests/unit_tests.py
@@ -479,8 +479,8 @@ def test_normalization():
         == "http://test.net/foo.html#bar"
     )
     assert (
-        normalize_url("http://test.net/foo.html#:~:text=night-,vision", strict=True)
-        == "http://test.net/foo.html"
+        normalize_url("http://test.net/foo.html#:~:text=night-,vision")
+        == "http://test.net/foo.html#:~:text=night-,vision"
     )
     assert (
         normalize_url("http://www.example.org:80/test.html")
@@ -493,10 +493,12 @@ def test_normalization():
     assert (
         normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html"
     )
+
+    # punycode
     assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
     assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
     assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"
 
     # account for particular characters
     assert (
         normalize_url(
@@ -509,24 +511,48 @@ def test_normalization():
         == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
     )
 
+    # trackers
+    assert normalize_url("http://test.org/?s_cid=123&clickid=1") == "http://test.org/"
+    assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?this_affiliate=0") == "http://test.org/"
+    assert (
+        normalize_url("http://test.org/?utm_source=rss&utm_medium=rss")
+        == "http://test.org/"
+    )
+    assert (
+        normalize_url("http://test.org/?utm_source=rss&#038;utm_medium=rss")
+        == "http://test.org/"
+    )
+    assert normalize_url("http://test.org/#partnerid=123") == "http://test.org/"
+    assert (
+        normalize_url(
+            "http://test.org/#mtm_campaign=documentation&mtm_keyword=demo&catpage=3"
+        )
+        == "http://test.org/#catpage=3"
+    )
+    assert normalize_url("http://test.org/#page2") == "http://test.org/#page2"
+
 
 def test_qelems():
     assert (
         normalize_url("http://test.net/foo.html?utm_source=twitter")
-        == "http://test.net/foo.html?utm_source=twitter"
+        == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True)
+        normalize_url("http://test.net/foo.html?testid=1")
+        == "http://test.net/foo.html?testid=1"
+    )
+    assert (
+        normalize_url("http://test.net/foo.html?testid=1", strict=True)
         == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2")
-        == "http://test.net/foo.html?page=2&post=abc&utm_source=twitter"
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
+        == "http://test.net/foo.html?page=2&post=abc&testid=1"
     )
     assert (
-        normalize_url(
-            "http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True
-        )
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
         == "http://test.net/foo.html?page=2&post=abc"
     )
     assert (
