From 138f63ca33d426e891e99b9273b9b295baacd84b Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 30 Apr 2024 14:55:04 +0200 Subject: [PATCH 1/2] network: use urllib only --- courlan/network.py | 59 ++++++++++++--------------------------------- setup.py | 3 +-- tests/unit_tests.py | 8 +++--- 3 files changed, 21 insertions(+), 49 deletions(-) diff --git a/courlan/network.py b/courlan/network.py index b637ff5..5338545 100644 --- a/courlan/network.py +++ b/courlan/network.py @@ -3,47 +3,22 @@ """ import logging +import ssl +import urllib.request -import urllib3 +from typing import Optional +from urllib.error import HTTPError +import certifi -LOGGER = logging.getLogger(__name__) -urllib3.disable_warnings() +CERTIFI_CONTEXT = ssl.create_default_context(cafile=certifi.where()) -RETRY_STRATEGY = urllib3.util.Retry( - total=2, - redirect=2, - raise_on_redirect=False, - status_forcelist=[ - 429, - 499, - 500, - 502, - 503, - 504, - 509, - 520, - 521, - 522, - 523, - 524, - 525, - 526, - 527, - 530, - 598, - ], # unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes - backoff_factor=1, -) -HTTP_POOL = urllib3.PoolManager( - cert_reqs="CERT_NONE", num_pools=100, retries=RETRY_STRATEGY, timeout=10 -) +LOGGER = logging.getLogger(__name__) ACCEPTABLE_CODES = {200, 300, 301, 302, 303, 304, 305, 306, 307, 308} -# Test redirects def redirection_test(url: str) -> str: """Test final URL to handle redirects Args: @@ -55,17 +30,15 @@ def redirection_test(url: str) -> str: Raises: Nothing. """ - # headers.update({ - # "User-Agent" : str(sample(settings.USER_AGENTS, 1)), # select a random user agent - # }) try: - rhead = HTTP_POOL.request("HEAD", url) # type:ignore[no-untyped-call] + req = urllib.request.Request(url, method="HEAD") + rhead = urllib.request.urlopen(req, context=CERTIFI_CONTEXT) + except HTTPError as error: + if error.status in ACCEPTABLE_CODES: + return error.url except Exception as err: - LOGGER.exception("unknown error: %s %s", url, err) + LOGGER.warning("unknown error: %s %s", url, err) else: - # response - if rhead.status in ACCEPTABLE_CODES: - LOGGER.debug("result found: %s %s", rhead.geturl(), rhead.status) - return rhead.geturl() # type: ignore - # else: - raise ValueError(f"cannot reach URL: ${url}") + return rhead.url + + raise ValueError(f"cannot reach URL: {url}") diff --git a/setup.py b/setup.py index d7ff402..811099e 100644 --- a/setup.py +++ b/setup.py @@ -108,10 +108,9 @@ def get_long_description(): python_requires=">=3.6", install_requires=[ "babel >= 2.11.0", + "certifi", "tld == 0.12.6; python_version < '3.7'", "tld >= 0.13; python_version >= '3.7'", - "urllib3 >= 1.26, < 2; python_version < '3.7'", - "urllib3 >= 1.26, < 3; python_version >= '3.7'", ], # extras_require=extras, entry_points={ diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 307fa43..f5cce1a 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -743,11 +743,11 @@ def test_domain_filter(): def test_urlcheck_redirects(): "Test redirection checks." - assert check_url("https://www.httpbun.com/status/200", with_redirects=True) == ( - "https://httpbun.com", - "httpbun.com", + assert check_url("https://www.httpbin.org/status/301", with_redirects=True) == ( + "https://www.httpbin.org/get", + "httpbin.org", ) - assert check_url("https://www.httpbin.org/status/404", with_redirects=True) is None + assert check_url("https://httpbun.com/status/404", with_redirects=True) is None assert check_url("https://www.ht.or", with_redirects=True) is None From 689a517561c4b66c7d4f4d01a8e5fe627db2a1a6 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 31 May 2024 14:14:36 +0200 Subject: [PATCH 2/2] simplify code --- courlan/network.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/courlan/network.py b/courlan/network.py index 5338545..a57e65f 100644 --- a/courlan/network.py +++ b/courlan/network.py @@ -4,10 +4,8 @@ import logging import ssl -import urllib.request -from typing import Optional -from urllib.error import HTTPError +from urllib import request import certifi @@ -31,14 +29,12 @@ def redirection_test(url: str) -> str: Nothing. """ try: - req = urllib.request.Request(url, method="HEAD") - rhead = urllib.request.urlopen(req, context=CERTIFI_CONTEXT) - except HTTPError as error: - if error.status in ACCEPTABLE_CODES: - return error.url + req = request.Request(url, method="HEAD") + with request.urlopen(req, context=CERTIFI_CONTEXT) as f: + pass + if f.status in ACCEPTABLE_CODES: + return f.url except Exception as err: LOGGER.warning("unknown error: %s %s", url, err) - else: - return rhead.url raise ValueError(f"cannot reach URL: {url}")