Merge branch 'master' into simplify_filters
adbar committed Apr 26, 2024
2 parents e7b308a + be3071b commit e61526a
Showing 5 changed files with 62 additions and 64 deletions.
86 changes: 43 additions & 43 deletions courlan/clean.py
@@ -9,7 +9,7 @@
from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult

from .filters import is_valid_url
from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
from .settings import ALLOWED_PARAMS, LANG_PARAMS, TARGET_LANGS
from .urlutils import _parse


@@ -40,7 +40,8 @@
r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
r"^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"
r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|"
r"kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
)


@@ -54,25 +55,25 @@ def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:

def scrub_url(url: str) -> str:
"Strip unnecessary parts and make sure only one URL is considered"
# trim
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# remove leading and trailing white space and unescaped control chars
url = url.strip(
# remove leading/trailing space and unescaped control chars
# strip space in input string
url = "".join(url.split()).strip(
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f \r\n"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
)
# strip space in input string
url = "".join(url.split())
# <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>

# <![CDATA[http://...]]>
if url.startswith("<![CDATA["):
url = url.replace("<![CDATA[", "") # url = re.sub(r'^<!\[CDATA\[', '', url)
url = url.replace("]]>", "") # url = re.sub(r'\]\]>$', '', url)
url = url.replace("<![CDATA[", "").replace("]]>", "")

# markup rests
url = REMAINING_MARKUP.sub("", url)

# & and &amp;
if "&amp;" in url:
url = url.replace("&amp;", "&")
url = TRAILING_AMP.sub("", url)

# if '"' in link:
# link = link.split('"')[0]
# double/faulty URLs
@@ -88,53 +89,52 @@ def scrub_url(url: str) -> str:
if match and is_valid_url(match[1]):
url = match[1]
LOGGER.debug("taking url: %s", url)

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500: # arbitrary choice
match = TRAILING_PARTS.match(url)
if match:
url = match[1]
if len(url) > 500:
if len(url) > 500: # arbitrary choice
LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))

# trailing slashes in URLs without path or in embedded URLs
if url.count("/") == 3 or url.count("://") > 1:
url = url.rstrip("/")
# lower
# url = url.lower()

return url


def clean_query(
querystring: str, strict: bool = False, language: Optional[str] = None
) -> str:
"Strip unwanted query elements"
if querystring:
qdict = parse_qs(querystring)
newqdict = {}
for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
continue
# get rid of trackers
elif TRACKERS_RE.search(teststr):
if not querystring:
return ""

qdict = parse_qs(querystring)
newqdict = {}

for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in LANG_PARAMS:
continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
found_lang = str(qdict[qelem][0])
if (
(language == "de" and found_lang not in TARGET_LANG_DE)
or (language == "en" and found_lang not in TARGET_LANG_EN)
or found_lang != language
):
LOGGER.info("bad lang: %s %s %s", language, qelem, found_lang)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]
return urlencode(newqdict, doseq=True)
return querystring
# get rid of trackers
elif TRACKERS_RE.search(teststr):
continue
# control language
if (
language in TARGET_LANGS
and teststr in LANG_PARAMS
and str(qdict[qelem][0]) not in TARGET_LANGS[language]
):
LOGGER.debug("bad lang: %s %s", language, qelem)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]

return urlencode(newqdict, doseq=True)


def decode_punycode(string: str) -> str:
@@ -208,4 +208,4 @@ def normalize_url(
# fragment
newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
# rebuild
return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
return urlunsplit((scheme, netloc, newpath, newquery, newfragment))
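
Note (not part of the diff): a minimal usage sketch of the refactored helpers above. The outputs in the comments are indicative only and follow from the logic shown in this hunk; clean_query and scrub_url are assumed to be importable from courlan.clean as defined here.

from courlan.clean import clean_query, scrub_url

# Whitespace, CDATA wrappers and markup are stripped; a trailing slash on a
# bare domain is removed as well.
print(scrub_url(" <![CDATA[https://example.org/]]> "))  # expected: https://example.org

# Tracker parameters such as utm_* are dropped; with a target language set,
# a conflicting lang/language value now raises ValueError via TARGET_LANGS.
print(clean_query("utm_source=feed&lang=en&id=5", language="en"))  # expected: id=5&lang=en
try:
    clean_query("lang=fr", language="en")
except ValueError:
    print("rejected: query targets another language")
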
2 changes: 0 additions & 2 deletions courlan/meta.py
@@ -5,12 +5,10 @@
from urllib.parse import clear_cache as urllib_clear_cache # type: ignore[attr-defined]

from .filters import langcodes_score
from .urlutils import get_tldinfo


def clear_caches() -> None:
"""Reset all known LRU caches used to speed up processing.
This may release some memory."""
urllib_clear_cache()
get_tldinfo.cache_clear()
langcodes_score.cache_clear()
12 changes: 7 additions & 5 deletions courlan/settings.py
@@ -2,7 +2,6 @@
General settings for package execution.
"""

# https://www.alexa.com/topsites
# https://www.alexa.com/topsites/countries/DE
# https://www.alexa.com/topsites/countries/US
BLACKLIST = {
@@ -101,7 +100,10 @@
"postid",
"product_id",
}
CONTROL_PARAMS = {"lang", "language"}
TARGET_LANG_DE = {"de", "deutsch", "ger", "german"}
TARGET_LANG_EN = {"en", "english", "eng"} # 'en_US', ''
# accepted_lang = ('en')

LANG_PARAMS = {"lang", "language"}

TARGET_LANGS = {
"de": {"de", "deutsch", "ger", "german"},
"en": {"en", "english", "eng"}, # 'en_US'
}
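
Note (not part of the diff): the separate TARGET_LANG_DE/TARGET_LANG_EN sets are folded into one TARGET_LANGS mapping keyed by language code, which is what clean_query consults above. A small illustrative check (lang_param_ok is a hypothetical helper, not part of the package):

from courlan.settings import LANG_PARAMS, TARGET_LANGS

def lang_param_ok(param: str, value: str, target: str) -> bool:
    "Return False only if a language parameter contradicts the target language."
    if target not in TARGET_LANGS or param.lower() not in LANG_PARAMS:
        return True  # nothing to check
    return value.lower() in TARGET_LANGS[target]

print(lang_param_ok("lang", "german", "de"))   # True
print(lang_param_ok("language", "fr", "en"))   # False
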
12 changes: 4 additions & 8 deletions courlan/urlutils.py
@@ -4,7 +4,6 @@

import re

from functools import lru_cache
from html import unescape
from typing import Any, List, Optional, Set, Tuple, Union
from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
@@ -20,14 +19,11 @@
r"[0-9a-f:]{16,})" # IPv6
r"(?:/|$)" # slash or end of string
)
NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+")
STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")
CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
INNER_SLASH_REGEX = re.compile(r"(.+/)+")
FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I)


@lru_cache(maxsize=1024)
def get_tldinfo(
url: str, fast: bool = False
) -> Union[Tuple[None, None], Tuple[str, str]]:
@@ -38,10 +34,10 @@ def get_tldinfo(
# try with regexes
domain_match = DOMAIN_REGEX.match(url)
if domain_match:
full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
clean_match = NO_EXTENSION_REGEX.match(full_domain)
full_domain = STRIP_PORT_REGEX.sub("", domain_match[1].split("@")[-1])
clean_match = full_domain.split(".")[0]
if clean_match:
return clean_match[0], full_domain
return clean_match, full_domain
# fallback
tldinfo = get_tld(url, as_object=True, fail_silently=True)
if tldinfo is None:
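
Note (not part of the diff): the regex-based credential and label stripping in get_tldinfo is replaced by plain string operations plus a port-stripping regex. A standalone sketch of the equivalent logic (split_domain and the sample input are illustrative):

import re
from typing import Tuple

STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")

def split_domain(domain_part: str) -> Tuple[str, str]:
    "Drop credentials and port, then return (first label, full domain)."
    full_domain = STRIP_PORT_REGEX.sub("", domain_part.split("@")[-1])
    return full_domain.split(".")[0], full_domain

print(split_domain("user:pass@www2.example.co.uk:8080"))
# expected: ('www2', 'www2.example.co.uk')
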
14 changes: 8 additions & 6 deletions tests/unit_tests.py
Expand Up @@ -45,7 +45,7 @@
type_filter,
)
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link
from courlan.urlutils import _parse, is_known_link


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -1184,16 +1184,18 @@ def test_examples():

def test_meta():
"Test package meta functions."
url = "https://example.net/123/abc"
_ = get_tldinfo(url)
_ = _parse(url)
assert get_tldinfo.cache_info().currsize > 0
_ = langcodes_score("en", "en_HK", 0)
_ = _parse("https://example.net/123/abc")

assert langcodes_score.cache_info().currsize > 0
try:
urlsplit_lrucache = True
assert urlsplit.cache_info().currsize > 0
except AttributeError: # newer Python versions only
urlsplit_lrucache = False

clear_caches()
assert get_tldinfo.cache_info().currsize == 0

assert langcodes_score.cache_info().currsize == 0
if urlsplit_lrucache:
assert urlsplit.cache_info().currsize == 0
