From 32d456a19ff0d0aef7dcdf39bca61c42fc55aa19 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 24 Nov 2023 14:49:56 +0100
Subject: [PATCH] hardening of domain filter (#64)

* hardening of domain filter: IP addresses

* add IPv6 tests

* simplify code

* better domain/hostname filtering

* additional safeguard: numbers

* add test
---
 courlan/clean.py    | 12 +++++-----
 courlan/filters.py  | 53 +++++++++++++++++++++++++++++++++++++++++++--
 tests/unit_tests.py | 46 +++++++++++++++++++++++++++++++++++----
 3 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index cabf5bc..0548662 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -165,13 +165,15 @@ def normalize_url(
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
+    netloc = parsed_url.netloc.lower()
     # port
-    if parsed_url.port and parsed_url.port in (80, 443):
-        netloc = NETLOC_RE.sub("", parsed_url.netloc)
-    else:
-        netloc = parsed_url.netloc
+    try:
+        if parsed_url.port and parsed_url.port in (80, 443):
+            netloc = NETLOC_RE.sub("", netloc)
+    except ValueError:
+        pass  # Port could not be cast to integer value
     # lowercase + remove fragments + normalize punycode
-    netloc = decode_punycode(netloc.lower())
+    netloc = decode_punycode(netloc)
     # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
diff --git a/courlan/filters.py b/courlan/filters.py
index 6006f24..8411d74 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -9,6 +9,7 @@
 import logging
 import re
 
+from ipaddress import ip_address
 from typing import Any, Optional, Tuple
 from urllib.parse import urlsplit
 
@@ -21,8 +22,41 @@
 
 
 # domain/host names
-UNSUITABLE_DOMAIN = re.compile(r"[?=`$;,]|:$|\.(.|[0-9]+|[^.]{25,})$")
+IP_SET = {
+    ".",
+    ":",
+    "0",
+    "1",
+    "2",
+    "3",
+    "4",
+    "5",
+    "6",
+    "7",
+    "8",
+    "9",
+    "a",
+    "b",
+    "c",
+    "d",
+    "e",
+    "f",
+}
+
+# https://github.com/python-validators/validators/blob/master/src/validators/domain.py
+VALID_DOMAIN = re.compile(
+    # First character of the domain
+    r"^(?:[a-zA-Z0-9]"
+    # Sub domain + hostname
+    + r"(?:[a-zA-Z0-9-_]{0,61}[A-Za-z0-9])?\.)"
+    # First 61 characters of the gTLD
+    + r"+[A-Za-z0-9][A-Za-z0-9-_]{0,61}"
+    # Last character of the gTLD
+    + r"[A-Za-z]$",
+    re.IGNORECASE,
+)
+UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")
 
 
 # content filters
 SITE_STRUCTURE = re.compile(
@@ -117,8 +151,23 @@ def basic_filter(url: str) -> bool:
 
 def domain_filter(domain: str) -> bool:
     "Find invalid domain/host names"
+    # IPv4 or IPv6
+    if not set(domain).difference(IP_SET):
+        try:
+            ip_address(domain)
+        except ValueError:
+            return False
+        return True
+
+    # malformed domains
+    try:
+        if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+            return False
+    except UnicodeError:
+        return False
 
-    if UNSUITABLE_DOMAIN.search(domain):
+    # unsuitable content
+    if UNSUITABLE_DOMAIN.match(domain):
         return False
 
     if FILE_TYPE.search(domain):
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 2140d38..e8a5a87 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -40,7 +40,7 @@
     lang_filter,
 )
 from courlan.core import filter_links
-from courlan.filters import extension_filter, path_filter, type_filter
+from courlan.filters import domain_filter, extension_filter, path_filter, type_filter
 from courlan.meta import clear_caches
 from courlan.urlutils import _parse, get_tldinfo, is_known_link
 
@@ -653,12 +653,50 @@ def test_urlcheck():
     # assert check_url('http://www.immobilienscout24.de/de/ueberuns/presseservice/pressestimmen/2_halbjahr_2000.jsp;jsessionid=287EC625A45BD5A243352DD8C86D25CC.worker2', language='de', strict=True) is not None
 
     # domain name
-    assert check_url("http://`$smarty.server.server_name`") is None
-    assert check_url("http://$`)}if(a.tryconvertencoding)trycatch(e)const") is None
-    assert check_url("http://00x200.jpg,") is None
     assert check_url("http://-100x100.webp") is None
     assert check_url("http://0.gravata.html") is None
     assert check_url("http://https:") is None
+    assert check_url("http://127.0.0.1") is not None
+    assert check_url("http://111.111.111.111") is not None
+    assert check_url("http://0127.0.0.1") is None
+    # assert check_url("http://::1") is not None
+    assert check_url("http://2001:0db8:85a3:0000:0000:8a2e:0370:7334") is not None
+    assert check_url("http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]") is None
+    assert check_url("http://1:2:3:4:5:6:7:8:9") is None
+
+
+def test_domain_filter():
+    "Test filters related to domain and hostnames."
+    assert domain_filter("") is False
+    assert domain_filter("too-long" + "g" * 60 + ".org") is False
+    assert domain_filter("long" + "g" * 50 + ".org") is True
+    assert domain_filter("example.-com") is False
+    assert domain_filter("example.") is False
+    assert domain_filter("-example.com") is False
+    assert domain_filter("_example.com") is False
+    assert domain_filter("example.com:") is False
+    assert domain_filter("a......b.com") is False
+    assert domain_filter("*.example.com") is False
+    assert domain_filter("exa-mple.co.uk") is True
+    assert domain_filter("kräuter.de") is True
+    assert domain_filter("xn--h1aagokeh.xn--p1ai") is True
+    assert domain_filter("`$smarty.server.server_name`") is False
+    assert domain_filter("$`)}if(a.tryconvertencoding)trycatch(e)const") is False
+    assert domain_filter("00x200.jpg,") is False
+    assert domain_filter("-100x100.webp") is False
+    assert domain_filter("0.gravata.html") is False
+    assert domain_filter("https:") is False
+
+    assert domain_filter("127.0.0.1") is True
+    assert domain_filter("900.200.100.75") is False
+    assert domain_filter("111.111.111") is False
+    assert domain_filter("0127.0.0.1") is False
+
+    assert domain_filter("example.jpg") is False
+    assert domain_filter("example.html") is False
+    assert domain_filter("0.gravatar.com") is False
+    assert domain_filter("12345.org") is False
+    # assert domain_filter("test.invalidtld") is False
 
 
 def test_urlcheck_redirects():
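
Note: a minimal usage sketch of the hardened filter, for reference only. The import path
and the expected results are taken directly from the tests added in this patch; the
inline comments describe which new check handles each case.

    from courlan.filters import domain_filter

    assert domain_filter("127.0.0.1") is True       # well-formed IPv4 literal accepted via ip_address()
    assert domain_filter("0127.0.0.1") is False     # malformed IPv4 literal rejected
    assert domain_filter("kräuter.de") is True      # IDN hostname accepted after IDNA encoding
    assert domain_filter("example.-com") is False   # malformed hostname rejected by VALID_DOMAIN
    assert domain_filter("12345.org") is False      # purely numeric label caught by UNSUITABLE_DOMAIN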