diff --git a/courlan/meta.py b/courlan/meta.py index b82c502..9254b7a 100644 --- a/courlan/meta.py +++ b/courlan/meta.py @@ -5,12 +5,10 @@ from urllib.parse import clear_cache as urllib_clear_cache # type: ignore[attr-defined] from .filters import langcodes_score -from .urlutils import get_tldinfo def clear_caches() -> None: """Reset all known LRU caches used to speed up processing. This may release some memory.""" urllib_clear_cache() - get_tldinfo.cache_clear() langcodes_score.cache_clear() diff --git a/courlan/urlutils.py b/courlan/urlutils.py index 1865fae..5c69ce3 100644 --- a/courlan/urlutils.py +++ b/courlan/urlutils.py @@ -4,7 +4,6 @@ import re -from functools import lru_cache from html import unescape from typing import Any, List, Optional, Set, Tuple, Union from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult @@ -20,14 +19,11 @@ r"[0-9a-f:]{16,})" # IPv6 r"(?:/|$)" # slash or end of string ) -NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)") -STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+") +STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+") CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.") -INNER_SLASH_REGEX = re.compile(r"(.+/)+") FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I) -@lru_cache(maxsize=1024) def get_tldinfo( url: str, fast: bool = False ) -> Union[Tuple[None, None], Tuple[str, str]]: @@ -38,10 +34,10 @@ def get_tldinfo( # try with regexes domain_match = DOMAIN_REGEX.match(url) if domain_match: - full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1]) - clean_match = NO_EXTENSION_REGEX.match(full_domain) + full_domain = STRIP_PORT_REGEX.sub("", domain_match[1].split("@")[-1]) + clean_match = full_domain.split(".")[0] if clean_match: - return clean_match[0], full_domain + return clean_match, full_domain # fallback tldinfo = get_tld(url, as_object=True, fail_silently=True) if tldinfo is None: diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 0b06af6..6529557 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -45,7 +45,7 @@ type_filter, ) from courlan.meta import clear_caches -from courlan.urlutils import _parse, get_tldinfo, is_known_link +from courlan.urlutils import _parse, is_known_link logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) @@ -1184,16 +1184,18 @@ def test_examples(): def test_meta(): "Test package meta functions." - url = "https://example.net/123/abc" - _ = get_tldinfo(url) - _ = _parse(url) - assert get_tldinfo.cache_info().currsize > 0 + _ = langcodes_score("en", "en_HK", 0) + _ = _parse("https://example.net/123/abc") + + assert langcodes_score.cache_info().currsize > 0 try: urlsplit_lrucache = True assert urlsplit.cache_info().currsize > 0 except AttributeError: # newer Python versions only urlsplit_lrucache = False + clear_caches() - assert get_tldinfo.cache_info().currsize == 0 + + assert langcodes_score.cache_info().currsize == 0 if urlsplit_lrucache: assert urlsplit.cache_info().currsize == 0