simplify cleaning functions

adbar · Apr 25, 2024 · 6871d96 · 6871d96
1 parent f2f976c
commit 6871d96
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 37 deletions.
diff --git a/courlan/clean.py b/courlan/clean.py
@@ -9,7 +9,7 @@
 from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult
 
 from .filters import is_valid_url
-from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
+from .settings import ALLOWED_PARAMS, LANG_PARAMS, TARGET_LANGS
 from .urlutils import _parse
 
 
@@ -57,22 +57,23 @@ def scrub_url(url: str) -> str:
     # trim
     # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
     # remove leading and trailing white space and unescaped control chars
-    url = url.strip(
+    # strip space in input string
+    url = "".join(url.split()).strip(
         "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
         "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f \r\n"
     )
-    # strip space in input string
-    url = "".join(url.split())
+    # url = "".join([c for c in "".join(url.split()) if c.isprintable()])
+
     # <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>
     if url.startswith("<![CDATA["):
-        url = url.replace("<![CDATA[", "")  # url = re.sub(r'^<!\[CDATA\[', '', url)
-        url = url.replace("]]>", "")  # url = re.sub(r'\]\]>$', '', url)
+        url = url.replace("<![CDATA[", "").replace("]]>", "")
+
     # markup rests
     url = REMAINING_MARKUP.sub("", url)
+
     # & and &amp;
-    if "&amp;" in url:
-        url = url.replace("&amp;", "&")
-    url = TRAILING_AMP.sub("", url)
+    url = TRAILING_AMP.sub("", url.replace("&amp;", "&"))
+
     # if '"' in link:
     #    link = link.split('"')[0]
     # double/faulty URLs
@@ -88,6 +89,7 @@ def scrub_url(url: str) -> str:
             if match and is_valid_url(match[1]):
                 url = match[1]
                 LOGGER.debug("taking url: %s", url)
+
     # too long and garbled URLs e.g. due to quotes URLs
     # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
     # if len(url) > 500:  # arbitrary choice
@@ -109,32 +111,33 @@ def clean_query(
     querystring: str, strict: bool = False, language: Optional[str] = None
 ) -> str:
     "Strip unwanted query elements"
-    if querystring:
-        qdict = parse_qs(querystring)
-        newqdict = {}
-        for qelem in sorted(qdict):
-            teststr = qelem.lower()
-            # control param
-            if strict:
-                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
-                    continue
-            # get rid of trackers
-            elif TRACKERS_RE.search(teststr):
+    qdict = parse_qs(querystring)
+    if not qdict:
+        return ""
+
+    newqdict = {}
+
+    for qelem in sorted(qdict):
+        teststr = qelem.lower()
+        # control param
+        if strict:
+            if teststr not in ALLOWED_PARAMS and teststr not in LANG_PARAMS:
                 continue
-            # control language
-            if language is not None and teststr in CONTROL_PARAMS:
-                found_lang = str(qdict[qelem][0])
-                if (
-                    (language == "de" and found_lang not in TARGET_LANG_DE)
-                    or (language == "en" and found_lang not in TARGET_LANG_EN)
-                    or found_lang != language
-                ):
-                    LOGGER.info("bad lang: %s %s %s", language, qelem, found_lang)
-                    raise ValueError
-            # insert
-            newqdict[qelem] = qdict[qelem]
-        return urlencode(newqdict, doseq=True)
-    return querystring
+        # get rid of trackers
+        elif TRACKERS_RE.search(teststr):
+            continue
+        # control language
+        if teststr in LANG_PARAMS:
+            if (
+                language in TARGET_LANGS
+                and str(qdict[qelem][0]) not in TARGET_LANGS[language]
+            ):
+                LOGGER.debug("bad lang: %s %s", language, qelem)
+                raise ValueError
+        # insert
+        newqdict[qelem] = qdict[qelem]
+
+    return urlencode(newqdict, doseq=True)
 
 
 def decode_punycode(string: str) -> str:

diff --git a/courlan/settings.py b/courlan/settings.py
@@ -101,7 +101,11 @@
     "postid",
     "product_id",
 }
-CONTROL_PARAMS = {"lang", "language"}
-TARGET_LANG_DE = {"de", "deutsch", "ger", "german"}
-TARGET_LANG_EN = {"en", "english", "eng"}  # 'en_US', ''
+
+LANG_PARAMS = {"lang", "language"}
+
+TARGET_LANGS = {
+    "de": {"de", "deutsch", "ger", "german"},
+    "en": {"en", "english", "eng"},  # 'en_US'
+}
 # accepted_lang = ('en')