Skip to content

Commit

Permalink
filters: simplify code (#94)
Browse files Browse the repository at this point in the history
* filters: simplify code

* lint

* re-organize regexes
  • Loading branch information
adbar committed Apr 26, 2024
1 parent be3071b commit 7e84223
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 34 deletions.
58 changes: 24 additions & 34 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
LOGGER = logging.getLogger(__name__)


PROTOCOLS = {"http", "https"}

# domain/host names
IP_SET = {
".",
Expand Down Expand Up @@ -51,27 +53,27 @@
re.IGNORECASE,
)

UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")

# content filters
SITE_STRUCTURE = re.compile(
# wordpress
r"/(?:paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|archives|uploads|modules|attachment|wp-admin|wp-content|wp-includes|wp-json|wp-themes|oembed)/|"
r"/(?:wp-(?:admin|content|includes|json|themes)|"
r"paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|"
r"archives|uploads|modules|attachment|oembed)/|"
# wordpress + short URL
r"[/_-](?:tags?|schlagwort|[ck]ategor[a-z]{1,2}|[ck]at|auth?or|user)/[^/]+/?$|"
# mixed/blogspot
r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$|"
# blogspot
r"_archive\.html$",
r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$",
re.IGNORECASE,
)
FILE_TYPE = re.compile(
r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|"
r"avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
r"[/-](img|jpg|png)(\b|_)",
re.IGNORECASE,
) # (?=[&?])
ADULT_AND_VIDEOS = re.compile(
r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|"
r"porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
re.IGNORECASE,
)

Expand Down Expand Up @@ -132,15 +134,6 @@
".xml",
}

# territories whitelist
# see also: https://babel.pocoo.org/en/latest/api/languages.html
# get_official_languages('ch')
LANGUAGE_MAPPINGS = {
"de": {"at", "ch", "de", "li"}, # 'be', 'it'
"en": {"au", "ca", "en", "gb", "ie", "nz", "us"},
"fr": {"be", "ca", "ch", "fr", "tn"}, # , 'lu', ...
}


def basic_filter(url: str) -> bool:
"Filter URLs based on basic formal characteristics."
Expand All @@ -158,14 +151,15 @@ def domain_filter(domain: str) -> bool:
return True

# malformed domains
try:
if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
if not VALID_DOMAIN.match(domain):
try:
if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
return False
except UnicodeError:
return False
except UnicodeError:
return False

# unsuitable content
if UNSUITABLE_DOMAIN.match(domain) or FILE_TYPE.search(domain):
if domain.split(".")[0].isdigit() or FILE_TYPE.search(domain):
return False

# extensions
Expand Down Expand Up @@ -220,12 +214,11 @@ def lang_filter(
score = langcodes_score(language, occurrence, score)
# don't perform the test if there are too many candidates: > 2
# second test: prepended language cues
if strict and language in LANGUAGE_MAPPINGS:
if strict:
match = HOST_LANG_FILTER.match(url)
if match:
candidate = match[1].lower()
LOGGER.debug("candidate lang %s found in URL", candidate)
if candidate in LANGUAGE_MAPPINGS[language]:
if candidate == language:
score += 1
else:
score -= 1
Expand All @@ -244,8 +237,8 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
"""Make sure the target URL is from a suitable type (HTML page with primarily text).
Strict: Try to filter out other document types, spam, video and adult websites."""
try:
# feeds
if url.endswith(("/feed", "/rss")):
# feeds + blogspot
if url.endswith(("/feed", "/rss", "_archive.html")):
raise ValueError
# website structure
if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
Expand All @@ -265,19 +258,16 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
parsed_url = urlsplit(url)
except ValueError:
return False, None
if not bool(parsed_url.scheme) or parsed_url.scheme not in (
"http",
"https",
):

if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
return False, None
# fmt: off

if len(parsed_url.netloc) < 5 or (
parsed_url.netloc.startswith("www.") # type: ignore
and len(parsed_url.netloc) < 8
):
return False, None
# fmt: on
# default

return True, parsed_url


Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ def test_spam_filter():

def test_type_filter():
assert type_filter("http://www.example.org/feed") is False
# wp
assert type_filter("http://www.example.org/wp-admin/") is False
assert type_filter("http://www.example.org/wp-includes/this") is False
# straight category
assert type_filter("http://www.example.org/category/123") is False
assert type_filter("http://www.example.org/product-category/123") is False
Expand Down

0 comments on commit 7e84223

Please sign in to comment.