Skip to content

Commit

Permalink
hardening of domain filter (#64)
Browse files Browse the repository at this point in the history
* hardening of domain filter: IP addresses

* add IPv6 tests

* simplify code

* better domain/hostname filtering

* additional safeguard: numbers

* add test
  • Loading branch information
adbar committed Nov 24, 2023
1 parent b828bd0 commit 32d456a
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 11 deletions.
12 changes: 7 additions & 5 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,15 @@ def normalize_url(
parsed_url = _parse(parsed_url)
# lowercase + remove fragments + normalize punycode
scheme = parsed_url.scheme.lower()
netloc = parsed_url.netloc.lower()
# port
if parsed_url.port and parsed_url.port in (80, 443):
netloc = NETLOC_RE.sub("", parsed_url.netloc)
else:
netloc = parsed_url.netloc
try:
if parsed_url.port and parsed_url.port in (80, 443):
netloc = NETLOC_RE.sub("", netloc)
except ValueError:
pass # Port could not be cast to integer value
# lowercase + remove fragments + normalize punycode
netloc = decode_punycode(netloc.lower())
netloc = decode_punycode(netloc)
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
# leading /../'s in the path are removed
newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
Expand Down
53 changes: 51 additions & 2 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import re

from ipaddress import ip_address
from typing import Any, Optional, Tuple
from urllib.parse import urlsplit

Expand All @@ -21,8 +22,41 @@


# domain/host names
UNSUITABLE_DOMAIN = re.compile(r"[?=`$;,]|:$|\.(.|[0-9]+|[^.]{25,})$")
# Characters that may appear in a textual IPv4/IPv6 address:
# the separators "." and ":", decimal digits, and lowercase hex letters.
IP_SET = set(".:" "0123456789" "abcdef")

# Host name syntax check, adapted from
# https://github.com/python-validators/validators/blob/master/src/validators/domain.py
VALID_DOMAIN = re.compile(
    # One or more dot-terminated labels. Each label starts with an
    # alphanumeric character; if longer than one character it may continue
    # with up to 61 alphanumerics, hyphens or underscores and must then end
    # on an alphanumeric character (1 to 63 characters per label).
    r"^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-_]{0,61}[A-Za-z0-9])?\.)+"
    # Final label: an alphanumeric character, up to 61 further
    # alphanumerics, hyphens or underscores, then a closing letter
    # (2 to 63 characters, never ending in a digit or punctuation).
    r"[A-Za-z0-9][A-Za-z0-9-_]{0,61}[A-Za-z]$",
    flags=re.IGNORECASE,
)

# One or more ASCII digits immediately followed by a dot, e.g. the "0." in
# "0.gravatar.com" or the "12345." in "12345.org".
# NOTE(review): domain_filter appears to apply this with .match(), i.e. only
# at the start of the name, so it targets a purely numeric first label —
# confirm against the caller.
UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")

# content filters
SITE_STRUCTURE = re.compile(
Expand Down Expand Up @@ -117,8 +151,23 @@ def basic_filter(url: str) -> bool:

def domain_filter(domain: str) -> bool:
"Find invalid domain/host names"
# IPv4 or IPv6
if not set(domain).difference(IP_SET):
try:
ip_address(domain)
except ValueError:
return False
return True

# malformed domains
try:
if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
return False
except UnicodeError:
return False

if UNSUITABLE_DOMAIN.search(domain):
# unsuitable content
if UNSUITABLE_DOMAIN.match(domain):
return False

if FILE_TYPE.search(domain):
Expand Down
46 changes: 42 additions & 4 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
lang_filter,
)
from courlan.core import filter_links
from courlan.filters import extension_filter, path_filter, type_filter
from courlan.filters import domain_filter, extension_filter, path_filter, type_filter
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link

Expand Down Expand Up @@ -653,12 +653,50 @@ def test_urlcheck():
# assert check_url('http://www.immobilienscout24.de/de/ueberuns/presseservice/pressestimmen/2_halbjahr_2000.jsp;jsessionid=287EC625A45BD5A243352DD8C86D25CC.worker2', language='de', strict=True) is not None

# domain name
assert check_url("http://`$smarty.server.server_name`") is None
assert check_url("http://$`)}if(a.tryconvertencoding)trycatch(e)const") is None
assert check_url("http://00x200.jpg,") is None
assert check_url("http://-100x100.webp") is None
assert check_url("http://0.gravata.html") is None
assert check_url("http://https:") is None
assert check_url("http://127.0.0.1") is not None
assert check_url("http://111.111.111.111") is not None
assert check_url("http://0127.0.0.1") is None
# assert check_url("http://::1") is not None
assert check_url("http://2001:0db8:85a3:0000:0000:8a2e:0370:7334") is not None
assert check_url("http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]") is None
assert check_url("http://1:2:3:4:5:6:7:8:9") is None


def test_domain_filter():
    "Check domain_filter verdicts for well-formed and malformed host names."
    # (candidate, expected verdict) pairs, evaluated in the listed order
    expectations = [
        # structural problems: empty input, label length, stray punctuation
        ("", False),
        ("too-long" + "g" * 60 + ".org", False),
        ("long" + "g" * 50 + ".org", True),
        ("example.-com", False),
        ("example.", False),
        ("-example.com", False),
        ("_example.com", False),
        ("example.com:", False),
        ("a......b.com", False),
        ("*.example.com", False),
        ("exa-mple.co.uk", True),
        # internationalized names, raw and punycode-encoded
        ("kräuter.de", True),
        ("xn--h1aagokeh.xn--p1ai", True),
        # template or markup residue mistaken for a host name
        ("`$smarty.server.server_name`", False),
        ("$`)}if(a.tryconvertencoding)trycatch(e)const", False),
        ("00x200.jpg,", False),
        ("-100x100.webp", False),
        ("0.gravata.html", False),
        ("https:", False),
        # IPv4 literals: only a syntactically valid address passes
        ("127.0.0.1", True),
        ("900.200.100.75", False),
        ("111.111.111", False),
        ("0127.0.0.1", False),
        # file-like endings and numeric leading labels
        ("example.jpg", False),
        ("example.html", False),
        ("0.gravatar.com", False),
        ("12345.org", False),
        # disabled in the original test: ("test.invalidtld", False)
    ]
    for candidate, expected in expectations:
        assert domain_filter(candidate) is expected, candidate


def test_urlcheck_redirects():
Expand Down

0 comments on commit 32d456a

Please sign in to comment.