filters: simplify code (#94)

* filters: simplify code
* lint
* re-organize regexes
adbar · Apr 26, 2024 · 7e84223
1 parent be3071b
commit 7e84223
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 34 deletions.
diff --git a/courlan/filters.py b/courlan/filters.py
@@ -16,6 +16,8 @@
 LOGGER = logging.getLogger(__name__)
 
 
+PROTOCOLS = {"http", "https"}
+
 # domain/host names
 IP_SET = {
     ".",
@@ -51,27 +53,27 @@
     re.IGNORECASE,
 )
 
-UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")
-
 # content filters
 SITE_STRUCTURE = re.compile(
     # wordpress
-    r"/(?:paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|archives|uploads|modules|attachment|wp-admin|wp-content|wp-includes|wp-json|wp-themes|oembed)/|"
+    r"/(?:wp-(?:admin|content|includes|json|themes)|"
+    r"paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|"
+    r"archives|uploads|modules|attachment|oembed)/|"
     # wordpress + short URL
     r"[/_-](?:tags?|schlagwort|[ck]ategor[a-z]{1,2}|[ck]at|auth?or|user)/[^/]+/?$|"
     # mixed/blogspot
-    r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$|"
-    # blogspot
-    r"_archive\.html$",
+    r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$",
     re.IGNORECASE,
 )
 FILE_TYPE = re.compile(
-    r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
+    r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|"
+    r"avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
     r"[/-](img|jpg|png)(\b|_)",
     re.IGNORECASE,
 )  # (?=[&?])
 ADULT_AND_VIDEOS = re.compile(
-    r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
+    r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|"
+    r"porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
     re.IGNORECASE,
 )
 
@@ -132,15 +134,6 @@
     ".xml",
 }
 
-# territories whitelist
-# see also: https://babel.pocoo.org/en/latest/api/languages.html
-# get_official_languages('ch')
-LANGUAGE_MAPPINGS = {
-    "de": {"at", "ch", "de", "li"},  # 'be', 'it'
-    "en": {"au", "ca", "en", "gb", "ie", "nz", "us"},
-    "fr": {"be", "ca", "ch", "fr", "tn"},  # , 'lu', ...
-}
-
 
 def basic_filter(url: str) -> bool:
     "Filter URLs based on basic formal characteristics."
@@ -158,14 +151,15 @@ def domain_filter(domain: str) -> bool:
         return True
 
     # malformed domains
-    try:
-        if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+    if not VALID_DOMAIN.match(domain):
+        try:
+            if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+                return False
+        except UnicodeError:
             return False
-    except UnicodeError:
-        return False
 
     # unsuitable content
-    if UNSUITABLE_DOMAIN.match(domain) or FILE_TYPE.search(domain):
+    if domain.split(".")[0].isdigit() or FILE_TYPE.search(domain):
         return False
 
     # extensions
@@ -220,12 +214,11 @@ def lang_filter(
                 score = langcodes_score(language, occurrence, score)
         # don't perform the test if there are too many candidates: > 2
     # second test: prepended language cues
-    if strict and language in LANGUAGE_MAPPINGS:
+    if strict:
         match = HOST_LANG_FILTER.match(url)
         if match:
             candidate = match[1].lower()
-            LOGGER.debug("candidate lang %s found in URL", candidate)
-            if candidate in LANGUAGE_MAPPINGS[language]:
+            if candidate == language:
                 score += 1
             else:
                 score -= 1
@@ -244,8 +237,8 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
     """Make sure the target URL is from a suitable type (HTML page with primarily text).
     Strict: Try to filter out other document types, spam, video and adult websites."""
     try:
-        # feeds
-        if url.endswith(("/feed", "/rss")):
+        # feeds + blogspot
+        if url.endswith(("/feed", "/rss", "_archive.html")):
             raise ValueError
         # website structure
         if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
@@ -265,19 +258,16 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
         parsed_url = urlsplit(url)
     except ValueError:
         return False, None
-    if not bool(parsed_url.scheme) or parsed_url.scheme not in (
-        "http",
-        "https",
-    ):
+
+    if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
         return False, None
-    # fmt: off
+
     if len(parsed_url.netloc) < 5 or (
         parsed_url.netloc.startswith("www.")  # type: ignore
         and len(parsed_url.netloc) < 8
     ):
         return False, None
-    # fmt: on
-    # default
+
     return True, parsed_url
 
 

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -249,6 +249,9 @@ def test_spam_filter():
 
 def test_type_filter():
     assert type_filter("http://www.example.org/feed") is False
+    # wp
+    assert type_filter("http://www.example.org/wp-admin/") is False
+    assert type_filter("http://www.example.org/wp-includes/this") is False
     # straight category
     assert type_filter("http://www.example.org/category/123") is False
     assert type_filter("http://www.example.org/product-category/123") is False