From 9530f33a6309bd0326039abf9d1c86d929f89968 Mon Sep 17 00:00:00 2001
From: naz
Date: Thu, 13 Jun 2024 16:52:19 +0900
Subject: [PATCH] validate netloc with port number

---
 courlan/filters.py  | 11 +++++++----
 tests/unit_tests.py |  4 ++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/courlan/filters.py b/courlan/filters.py
index 51e052c..8158481 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -41,7 +41,7 @@
 }
 
 # https://github.com/python-validators/validators/blob/master/src/validators/domain.py
-VALID_DOMAIN = re.compile(
+VALID_DOMAIN_PORT = re.compile(
     # First character of the domain
     r"^(?:[a-zA-Z0-9]"
     # Sub domain + hostname
@@ -49,7 +49,10 @@
     # First 61 characters of the gTLD
     + r"+[A-Za-z0-9][A-Za-z0-9-_]{0,61}"
     # Last character of the gTLD
-    + r"[A-Za-z]$",
+    + r"[A-Za-z]"
+    # Port number
+    + r"(\:(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|"
+    + r"6[0-4][0-9]{3}|[1-5][0-9]{4}|[1-9][0-9]{0,3}))?$",
     re.IGNORECASE,
 )
 
@@ -151,9 +154,9 @@ def domain_filter(domain: str) -> bool:
         return True
 
     # malformed domains
-    if not VALID_DOMAIN.match(domain):
+    if not VALID_DOMAIN_PORT.match(domain):
         try:
-            if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+            if not VALID_DOMAIN_PORT.match(domain.encode("idna").decode("utf-8")):
                 return False
         except UnicodeError:
             return False
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 307fa43..4bc2557 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -706,6 +706,10 @@ def test_urlcheck():
     assert check_url("http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]") is None
     assert check_url("http://1:2:3:4:5:6:7:8:9") is None
 
+    # port
+    assert check_url("http://example.com:80") is not None
+    assert check_url("http://example.com:80:80") is None
+
 
 def test_domain_filter():
     "Test filters related to domain and hostnames."