Skip to content

Commit

Permalink
utils: faster tldinfo (#93)
Browse files (browse the repository at this point in the history)
* utils: faster tldinfo

* remove lru decorator

* simplify
  • Loading branch information
adbar committed Apr 25, 2024
1 parent f2f976c commit af2b3a4
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 16 deletions.
2 changes: 0 additions & 2 deletions courlan/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
from urllib.parse import clear_cache as urllib_clear_cache # type: ignore[attr-defined]

from .filters import langcodes_score
from .urlutils import get_tldinfo


def clear_caches() -> None:
"""Reset all known LRU caches used to speed up processing.
This may release some memory."""
# Calls urllib.parse.clear_cache, imported here under the alias urllib_clear_cache.
urllib_clear_cache()
# NOTE(review): cache_clear() only exists while get_tldinfo is wrapped by
# functools.lru_cache. The commit message says "remove lru decorator", so this
# line is presumably on the removed side of the diff - confirm before relying on it.
get_tldinfo.cache_clear()
# Empties the cache attached to langcodes_score (imported from .filters).
langcodes_score.cache_clear()
12 changes: 4 additions & 8 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import re

from functools import lru_cache
from html import unescape
from typing import Any, List, Optional, Set, Tuple, Union
from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
Expand All @@ -20,14 +19,11 @@
r"[0-9a-f:]{16,})" # IPv6
r"(?:/|$)" # slash or end of string
)
# Captures the leading run of characters that precedes the first dot.
NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
# Matches either a leading "<something>:<something>@" prefix, or a ":<digits>"
# run that directly follows a non-digit character.
# NOTE(review): this and STRIP_PORT_REGEX look like the old/new sides of the same
# diff line - confirm which one the current module actually defines.
STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+")
# Matches only the ":<digits>" run that directly follows a non-digit character.
STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")
# Matches a leading "www", optionally followed by digits, then a dot.
CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
# Matches one or more consecutive groups of characters that each end in "/".
INNER_SLASH_REGEX = re.compile(r"(.+/)+")
# Case-insensitive match for "feedburner" or "feedproxy".
FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I)


@lru_cache(maxsize=1024)
def get_tldinfo(
url: str, fast: bool = False
) -> Union[Tuple[None, None], Tuple[str, str]]:
Expand All @@ -38,10 +34,10 @@ def get_tldinfo(
# try with regexes
domain_match = DOMAIN_REGEX.match(url)
if domain_match:
full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
clean_match = NO_EXTENSION_REGEX.match(full_domain)
full_domain = STRIP_PORT_REGEX.sub("", domain_match[1].split("@")[-1])
clean_match = full_domain.split(".")[0]
if clean_match:
return clean_match[0], full_domain
return clean_match, full_domain
# fallback
tldinfo = get_tld(url, as_object=True, fail_silently=True)
if tldinfo is None:
Expand Down
14 changes: 8 additions & 6 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
type_filter,
)
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link
from courlan.urlutils import _parse, is_known_link


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
Expand Down Expand Up @@ -1184,16 +1184,18 @@ def test_examples():

def test_meta():
"Test package meta functions."
# NOTE(review): this listing appears to be a diff rendered without +/- markers, so
# statements from before and after the change are presumably interleaved here -
# confirm against the repository before running it as written.
url = "https://example.net/123/abc"
_ = get_tldinfo(url)
_ = _parse(url)
# Only valid while get_tldinfo exposes the functools.lru_cache statistics API.
assert get_tldinfo.cache_info().currsize > 0
# Fill the langcodes_score cache so that clear_caches() has something to empty.
_ = langcodes_score("en", "en_HK", 0)
# Presumably exercises urlsplit so that its cache (if any) is filled - TODO confirm.
_ = _parse("https://example.net/123/abc")

assert langcodes_score.cache_info().currsize > 0
# Remember whether urlsplit offers cache_info() so the post-clear check can be skipped.
try:
urlsplit_lrucache = True
assert urlsplit.cache_info().currsize > 0
except AttributeError: # urlsplit has no cache_info() on newer Python versions
urlsplit_lrucache = False

clear_caches()
# Same caveat as above: requires an lru_cache-wrapped get_tldinfo.
assert get_tldinfo.cache_info().currsize == 0

# After clearing, every cache that was filled above must report zero entries.
assert langcodes_score.cache_info().currsize == 0
if urlsplit_lrucache:
assert urlsplit.cache_info().currsize == 0

0 comments on commit af2b3a4

Please sign in to comment.