utils: faster tldinfo (#93)

* utils: faster tldinfo
* remove lru decorator
* simplify
adbar · Apr 25, 2024 · af2b3a4
1 parent f2f976c
commit af2b3a4
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 16 deletions.
diff --git a/courlan/meta.py b/courlan/meta.py
@@ -5,12 +5,10 @@
 from urllib.parse import clear_cache as urllib_clear_cache  # type: ignore[attr-defined]
 
 from .filters import langcodes_score
-from .urlutils import get_tldinfo
 
 
 def clear_caches() -> None:
     """Reset all known LRU caches used to speed up processing.
     This may release some memory."""
     urllib_clear_cache()
-    get_tldinfo.cache_clear()
     langcodes_score.cache_clear()
diff --git a/courlan/urlutils.py b/courlan/urlutils.py
@@ -4,7 +4,6 @@
 
 import re
 
-from functools import lru_cache
 from html import unescape
 from typing import Any, List, Optional, Set, Tuple, Union
 from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
@@ -20,14 +19,11 @@
     r"[0-9a-f:]{16,})"  # IPv6
     r"(?:/|$)"  # slash or end of string
 )
-NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
-STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+")
+STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")
 CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
-INNER_SLASH_REGEX = re.compile(r"(.+/)+")
 FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I)
 
 
-@lru_cache(maxsize=1024)
 def get_tldinfo(
     url: str, fast: bool = False
 ) -> Union[Tuple[None, None], Tuple[str, str]]:
@@ -38,10 +34,10 @@ def get_tldinfo(
         # try with regexes
         domain_match = DOMAIN_REGEX.match(url)
         if domain_match:
-            full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
-            clean_match = NO_EXTENSION_REGEX.match(full_domain)
+            full_domain = STRIP_PORT_REGEX.sub("", domain_match[1].split("@")[-1])
+            clean_match = full_domain.split(".")[0]
             if clean_match:
-                return clean_match[0], full_domain
+                return clean_match, full_domain
     # fallback
     tldinfo = get_tld(url, as_object=True, fail_silently=True)
     if tldinfo is None:

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -45,7 +45,7 @@
     type_filter,
 )
 from courlan.meta import clear_caches
-from courlan.urlutils import _parse, get_tldinfo, is_known_link
+from courlan.urlutils import _parse, is_known_link
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -1184,16 +1184,18 @@ def test_examples():
 
 def test_meta():
     "Test package meta functions."
-    url = "https://example.net/123/abc"
-    _ = get_tldinfo(url)
-    _ = _parse(url)
-    assert get_tldinfo.cache_info().currsize > 0
+    _ = langcodes_score("en", "en_HK", 0)
+    _ = _parse("https://example.net/123/abc")
+
+    assert langcodes_score.cache_info().currsize > 0
     try:
         urlsplit_lrucache = True
         assert urlsplit.cache_info().currsize > 0
     except AttributeError:  # newer Python versions only
         urlsplit_lrucache = False
+
     clear_caches()
-    assert get_tldinfo.cache_info().currsize == 0
+
+    assert langcodes_score.cache_info().currsize == 0
     if urlsplit_lrucache:
         assert urlsplit.cache_info().currsize == 0