Merge branch 'master' into simplify_filters
adbar committed Apr 26, 2024
2 parents e7b308a + be3071b commit e61526a
Showing 5 changed files with 62 additions and 64 deletions.
86 changes: 43 additions & 43 deletions courlan/clean.py
@@ -9,7 +9,7 @@
from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult

from .filters import is_valid_url
from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
from .settings import ALLOWED_PARAMS, LANG_PARAMS, TARGET_LANGS
from .urlutils import _parse


@@ -40,7 +40,8 @@
r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
r"^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"
r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|"
r"kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)"
)


@@ -54,25 +55,25 @@ def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:

def scrub_url(url: str) -> str:
"Strip unnecessary parts and make sure only one URL is considered"
# trim
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# remove leading and trailing white space and unescaped control chars
url = url.strip(
# remove leading/trailing space and unescaped control chars
# strip space in input string
url = "".join(url.split()).strip(
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f \r\n"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
)
# strip space in input string
url = "".join(url.split())
# <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>

# <![CDATA[http://...]]>
if url.startswith("<![CDATA["):
url = url.replace("<![CDATA[", "") # url = re.sub(r'^<!\[CDATA\[', '', url)
url = url.replace("]]>", "") # url = re.sub(r'\]\]>$', '', url)
url = url.replace("<![CDATA[", "").replace("]]>", "")

# markup rests
url = REMAINING_MARKUP.sub("", url)

# & and &amp;
if "&amp;" in url:
url = url.replace("&amp;", "&")
url = TRAILING_AMP.sub("", url)

# if '"' in link:
# link = link.split('"')[0]
# double/faulty URLs
@@ -88,53 +89,52 @@ def scrub_url(url: str) -> str:
if match and is_valid_url(match[1]):
url = match[1]
LOGGER.debug("taking url: %s", url)

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500: # arbitrary choice
match = TRAILING_PARTS.match(url)
if match:
url = match[1]
if len(url) > 500:
if len(url) > 500: # arbitrary choice
LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))

# trailing slashes in URLs without path or in embedded URLs
if url.count("/") == 3 or url.count("://") > 1:
url = url.rstrip("/")
# lower
# url = url.lower()

return url


def clean_query(
querystring: str, strict: bool = False, language: Optional[str] = None
) -> str:
"Strip unwanted query elements"
if querystring:
qdict = parse_qs(querystring)
newqdict = {}
for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
continue
# get rid of trackers
elif TRACKERS_RE.search(teststr):
if not querystring:
return ""

qdict = parse_qs(querystring)
newqdict = {}

for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in LANG_PARAMS:
continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
found_lang = str(qdict[qelem][0])
if (
(language == "de" and found_lang not in TARGET_LANG_DE)
or (language == "en" and found_lang not in TARGET_LANG_EN)
or found_lang != language
):
LOGGER.info("bad lang: %s %s %s", language, qelem, found_lang)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]
return urlencode(newqdict, doseq=True)
return querystring
# get rid of trackers
elif TRACKERS_RE.search(teststr):
continue
# control language
if (
language in TARGET_LANGS
and teststr in LANG_PARAMS
and str(qdict[qelem][0]) not in TARGET_LANGS[language]
):
LOGGER.debug("bad lang: %s %s", language, qelem)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]

return urlencode(newqdict, doseq=True)


def decode_punycode(string: str) -> str:
@@ -208,4 +208,4 @@ def normalize_url(
# fragment
newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
# rebuild
return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
return urlunsplit((scheme, netloc, newpath, newquery, newfragment))
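
Note (not part of the diff): a minimal usage sketch of the refactored helpers above. The outputs in the comments are indicative only and follow from the logic shown in this hunk; clean_query and scrub_url are assumed to be importable from courlan.clean as defined here.

from courlan.clean import clean_query, scrub_url

# Whitespace, CDATA wrappers and markup are stripped; a trailing slash on a
# bare domain is removed as well.
print(scrub_url(" <![CDATA[https://example.org/]]> "))  # expected: https://example.org

# Tracker parameters such as utm_* are dropped; with a target language set,
# a conflicting lang/language value now raises ValueError via TARGET_LANGS.
print(clean_query("utm_source=feed&lang=en&id=5", language="en"))  # expected: id=5&lang=en
try:
    clean_query("lang=fr", language="en")
except ValueError:
    print("rejected: query targets another language")
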
2 changes: 0 additions & 2 deletions courlan/meta.py
@@ -5,12 +5,10 @@
from urllib.parse import clear_cache as urllib_clear_cache # type: ignore[attr-defined]

from .filters import langcodes_score
from .urlutils import get_tldinfo


def clear_caches() -> None:
"""Reset all known LRU caches used to speed up processing.
This may release some memory."""
urllib_clear_cache()
get_tldinfo.cache_clear()
langcodes_score.cache_clear()
12 changes: 7 additions & 5 deletions courlan/settings.py
@@ -2,7 +2,6 @@
General settings for package execution.
"""

# https://www.alexa.com/topsites
# https://www.alexa.com/topsites/countries/DE
# https://www.alexa.com/topsites/countries/US
BLACKLIST = {
@@ -101,7 +100,10 @@
"postid",
"product_id",
}
CONTROL_PARAMS = {"lang", "language"}
TARGET_LANG_DE = {"de", "deutsch", "ger", "german"}
TARGET_LANG_EN = {"en", "english", "eng"} # 'en_US', ''
# accepted_lang = ('en')

LANG_PARAMS = {"lang", "language"}

TARGET_LANGS = {
"de": {"de", "deutsch", "ger", "german"},
"en": {"en", "english", "eng"}, # 'en_US'
}
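
Note (not part of the diff): the separate TARGET_LANG_DE/TARGET_LANG_EN sets are folded into one TARGET_LANGS mapping keyed by language code, which is what clean_query consults above. A small illustrative check (lang_param_ok is a hypothetical helper, not part of the package):

from courlan.settings import LANG_PARAMS, TARGET_LANGS

def lang_param_ok(param: str, value: str, target: str) -> bool:
    "Return False only if a language parameter contradicts the target language."
    if target not in TARGET_LANGS or param.lower() not in LANG_PARAMS:
        return True  # nothing to check
    return value.lower() in TARGET_LANGS[target]

print(lang_param_ok("lang", "german", "de"))   # True
print(lang_param_ok("language", "fr", "en"))   # False
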
12 changes: 4 additions & 8 deletions courlan/urlutils.py
@@ -4,7 +4,6 @@

import re

from functools import lru_cache
from html import unescape
from typing import Any, List, Optional, Set, Tuple, Union
from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
@@ -20,14 +19,11 @@
r"[0-9a-f:]{16,})" # IPv6
r"(?:/|$)" # slash or end of string
)
NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+")
STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")
CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
INNER_SLASH_REGEX = re.compile(r"(.+/)+")
FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I)


@lru_cache(maxsize=1024)
def get_tldinfo(
url: str, fast: bool = False
) -> Union[Tuple[None, None], Tuple[str, str]]:
@@ -38,10 +34,10 @@ def get_tldinfo(
# try with regexes
domain_match = DOMAIN_REGEX.match(url)
if domain_match:
full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
clean_match = NO_EXTENSION_REGEX.match(full_domain)
full_domain = STRIP_PORT_REGEX.sub("", domain_match[1].split("@")[-1])
clean_match = full_domain.split(".")[0]
if clean_match:
return clean_match[0], full_domain
return clean_match, full_domain
# fallback
tldinfo = get_tld(url, as_object=True, fail_silently=True)
if tldinfo is None:
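
Note (not part of the diff): the regex-based credential and label stripping in get_tldinfo is replaced by plain string operations plus a port-stripping regex. A standalone sketch of the equivalent logic (split_domain and the sample input are illustrative):

import re
from typing import Tuple

STRIP_PORT_REGEX = re.compile(r"(?<=\D):\d+")

def split_domain(domain_part: str) -> Tuple[str, str]:
    "Drop credentials and port, then return (first label, full domain)."
    full_domain = STRIP_PORT_REGEX.sub("", domain_part.split("@")[-1])
    return full_domain.split(".")[0], full_domain

print(split_domain("user:pass@www2.example.co.uk:8080"))
# expected: ('www2', 'www2.example.co.uk')
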
14 changes: 8 additions & 6 deletions tests/unit_tests.py
Expand Up @@ -45,7 +45,7 @@
type_filter,
)
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link
from courlan.urlutils import _parse, is_known_link


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -1184,16 +1184,18 @@ def test_examples():

def test_meta():
"Test package meta functions."
url = "https://example.net/123/abc"
_ = get_tldinfo(url)
_ = _parse(url)
assert get_tldinfo.cache_info().currsize > 0
_ = langcodes_score("en", "en_HK", 0)
_ = _parse("https://example.net/123/abc")

assert langcodes_score.cache_info().currsize > 0
try:
urlsplit_lrucache = True
assert urlsplit.cache_info().currsize > 0
except AttributeError: # newer Python versions only
urlsplit_lrucache = False

clear_caches()
assert get_tldinfo.cache_info().currsize == 0

assert langcodes_score.cache_info().currsize == 0
if urlsplit_lrucache:
assert urlsplit.cache_info().currsize == 0
