From 32d456a19ff0d0aef7dcdf39bca61c42fc55aa19 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 24 Nov 2023 14:49:56 +0100
Subject: [PATCH] hardening of domain filter (#64)

* hardening of domain filter: IP addresses

* add IPv6 tests

* simplify code

* better domain/hostname filtering

* additional safeguard: numbers

* add test
---
 courlan/clean.py    | 12 +++++-----
 courlan/filters.py  | 53 +++++++++++++++++++++++++++++++++++++++++++--
 tests/unit_tests.py | 46 +++++++++++++++++++++++++++++++++++----
 3 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index cabf5bc..0548662 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -165,13 +165,15 @@ def normalize_url(
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
+    netloc = parsed_url.netloc.lower()
     # port
-    if parsed_url.port and parsed_url.port in (80, 443):
-        netloc = NETLOC_RE.sub("", parsed_url.netloc)
-    else:
-        netloc = parsed_url.netloc
+    try:
+        if parsed_url.port and parsed_url.port in (80, 443):
+            netloc = NETLOC_RE.sub("", netloc)
+    except ValueError:
+        pass  # Port could not be cast to integer value
     # lowercase + remove fragments + normalize punycode
-    netloc = decode_punycode(netloc.lower())
+    netloc = decode_punycode(netloc)
     # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
diff --git a/courlan/filters.py b/courlan/filters.py
index 6006f24..8411d74 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -9,6 +9,7 @@
 import logging
 import re
 
+from ipaddress import ip_address
 from typing import Any, Optional, Tuple
 from urllib.parse import urlsplit
 
@@ -21,8 +22,41 @@
 
 
 # domain/host names
-UNSUITABLE_DOMAIN = re.compile(r"[?=`$;,]|:$|\.(.|[0-9]+|[^.]{25,})$")
+IP_SET = {
+    ".",
+    ":",
+    "0",
+    "1",
+    "2",
+    "3",
+    "4",
+    "5",
+    "6",
+    "7",
+    "8",
+    "9",
+    "a",
+    "b",
+    "c",
+    "d",
+    "e",
+    "f",
+}
+
+# https://github.com/python-validators/validators/blob/master/src/validators/domain.py
+VALID_DOMAIN = re.compile(
+    # First character of the domain
+    r"^(?:[a-zA-Z0-9]"
+    # Sub domain + hostname
+    + r"(?:[a-zA-Z0-9-_]{0,61}[A-Za-z0-9])?\.)"
+    # First 61 characters of the gTLD
+    + r"+[A-Za-z0-9][A-Za-z0-9-_]{0,61}"
+    # Last character of the gTLD
+    + r"[A-Za-z]$",
+    re.IGNORECASE,
+)
+UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")
 
 
 # content filters
 SITE_STRUCTURE = re.compile(
@@ -117,8 +151,23 @@ def basic_filter(url: str) -> bool:
 
 def domain_filter(domain: str) -> bool:
     "Find invalid domain/host names"
+    # IPv4 or IPv6
+    if not set(domain).difference(IP_SET):
+        try:
+            ip_address(domain)
+        except ValueError:
+            return False
+        return True
+
+    # malformed domains
+    try:
+        if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+            return False
+    except UnicodeError:
+        return False
 
-    if UNSUITABLE_DOMAIN.search(domain):
+    # unsuitable content
+    if UNSUITABLE_DOMAIN.match(domain):
         return False
 
     if FILE_TYPE.search(domain):
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 2140d38..e8a5a87 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -40,7 +40,7 @@
     lang_filter,
 )
 from courlan.core import filter_links
-from courlan.filters import extension_filter, path_filter, type_filter
+from courlan.filters import domain_filter, extension_filter, path_filter, type_filter
 from courlan.meta import clear_caches
 from courlan.urlutils import _parse, get_tldinfo, is_known_link
 
@@ -653,12 +653,50 @@ def test_urlcheck():
     # assert check_url('http://www.immobilienscout24.de/de/ueberuns/presseservice/pressestimmen/2_halbjahr_2000.jsp;jsessionid=287EC625A45BD5A243352DD8C86D25CC.worker2', language='de', strict=True) is not None
 
     # domain name
-    assert check_url("http://`$smarty.server.server_name`") is None
-    assert check_url("http://$`)}if(a.tryconvertencoding)trycatch(e)const") is None
-    assert check_url("http://00x200.jpg,") is None
     assert check_url("http://-100x100.webp") is None
     assert check_url("http://0.gravata.html") is None
     assert check_url("http://https:") is None
+    assert check_url("http://127.0.0.1") is not None
+    assert check_url("http://111.111.111.111") is not None
+    assert check_url("http://0127.0.0.1") is None
+    # assert check_url("http://::1") is not None
+    assert check_url("http://2001:0db8:85a3:0000:0000:8a2e:0370:7334") is not None
+    assert check_url("http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]") is None
+    assert check_url("http://1:2:3:4:5:6:7:8:9") is None
+
+
+def test_domain_filter():
+    "Test filters related to domain and hostnames."
+    assert domain_filter("") is False
+    assert domain_filter("too-long" + "g" * 60 + ".org") is False
+    assert domain_filter("long" + "g" * 50 + ".org") is True
+    assert domain_filter("example.-com") is False
+    assert domain_filter("example.") is False
+    assert domain_filter("-example.com") is False
+    assert domain_filter("_example.com") is False
+    assert domain_filter("example.com:") is False
+    assert domain_filter("a......b.com") is False
+    assert domain_filter("*.example.com") is False
+    assert domain_filter("exa-mple.co.uk") is True
+    assert domain_filter("kräuter.de") is True
+    assert domain_filter("xn--h1aagokeh.xn--p1ai") is True
+    assert domain_filter("`$smarty.server.server_name`") is False
+    assert domain_filter("$`)}if(a.tryconvertencoding)trycatch(e)const") is False
+    assert domain_filter("00x200.jpg,") is False
+    assert domain_filter("-100x100.webp") is False
+    assert domain_filter("0.gravata.html") is False
+    assert domain_filter("https:") is False
+
+    assert domain_filter("127.0.0.1") is True
+    assert domain_filter("900.200.100.75") is False
+    assert domain_filter("111.111.111") is False
+    assert domain_filter("0127.0.0.1") is False
+
+    assert domain_filter("example.jpg") is False
+    assert domain_filter("example.html") is False
+    assert domain_filter("0.gravatar.com") is False
+    assert domain_filter("12345.org") is False
+    # assert domain_filter("test.invalidtld") is False
 
 
 def test_urlcheck_redirects():
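
Note: a minimal usage sketch of the hardened filter, for reference only. The import path
and the expected results are taken directly from the tests added in this patch; the
inline comments describe which new check handles each case.

    from courlan.filters import domain_filter

    assert domain_filter("127.0.0.1") is True       # well-formed IPv4 literal accepted via ip_address()
    assert domain_filter("0127.0.0.1") is False     # malformed IPv4 literal rejected
    assert domain_filter("kräuter.de") is True      # IDN hostname accepted after IDNA encoding
    assert domain_filter("example.-com") is False   # malformed hostname rejected by VALID_DOMAIN
    assert domain_filter("12345.org") is False      # purely numeric label caught by UNSUITABLE_DOMAIN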