Skip to content

Commit

Permalink
simplify cleaning functions
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 25, 2024
1 parent f2f976c commit 6871d96
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 37 deletions.
71 changes: 37 additions & 34 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult

from .filters import is_valid_url
from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
from .settings import ALLOWED_PARAMS, LANG_PARAMS, TARGET_LANGS
from .urlutils import _parse


Expand Down Expand Up @@ -57,22 +57,23 @@ def scrub_url(url: str) -> str:
# trim
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# remove leading and trailing white space and unescaped control chars
url = url.strip(
# strip space in input string
url = "".join(url.split()).strip(
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f \r\n"
)
# strip space in input string
url = "".join(url.split())
# url = "".join([c for c in "".join(url.split()) if c.isprintable()])

# <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>
if url.startswith("<![CDATA["):
url = url.replace("<![CDATA[", "") # url = re.sub(r'^<!\[CDATA\[', '', url)
url = url.replace("]]>", "") # url = re.sub(r'\]\]>$', '', url)
url = url.replace("<![CDATA[", "").replace("]]>", "")

# markup rests
url = REMAINING_MARKUP.sub("", url)

# & and &amp;
if "&amp;" in url:
url = url.replace("&amp;", "&")
url = TRAILING_AMP.sub("", url)
url = TRAILING_AMP.sub("", url.replace("&amp;", "&"))

# if '"' in link:
# link = link.split('"')[0]
# double/faulty URLs
Expand All @@ -88,6 +89,7 @@ def scrub_url(url: str) -> str:
if match and is_valid_url(match[1]):
url = match[1]
LOGGER.debug("taking url: %s", url)

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500: # arbitrary choice
Expand All @@ -109,32 +111,33 @@ def clean_query(
querystring: str, strict: bool = False, language: Optional[str] = None
) -> str:
"Strip unwanted query elements"
if querystring:
qdict = parse_qs(querystring)
newqdict = {}
for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
continue
# get rid of trackers
elif TRACKERS_RE.search(teststr):
qdict = parse_qs(querystring)
if not qdict:
return ""

newqdict = {}

for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in LANG_PARAMS:
continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
found_lang = str(qdict[qelem][0])
if (
(language == "de" and found_lang not in TARGET_LANG_DE)
or (language == "en" and found_lang not in TARGET_LANG_EN)
or found_lang != language
):
LOGGER.info("bad lang: %s %s %s", language, qelem, found_lang)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]
return urlencode(newqdict, doseq=True)
return querystring
# get rid of trackers
elif TRACKERS_RE.search(teststr):
continue
# control language
if teststr in LANG_PARAMS:
if (
language in TARGET_LANGS
and str(qdict[qelem][0]) not in TARGET_LANGS[language]
):
LOGGER.debug("bad lang: %s %s", language, qelem)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]

return urlencode(newqdict, doseq=True)


def decode_punycode(string: str) -> str:
Expand Down
10 changes: 7 additions & 3 deletions courlan/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,11 @@
"postid",
"product_id",
}
CONTROL_PARAMS = {"lang", "language"}
TARGET_LANG_DE = {"de", "deutsch", "ger", "german"}
TARGET_LANG_EN = {"en", "english", "eng"} # 'en_US', ''

LANG_PARAMS = {"lang", "language"}

TARGET_LANGS = {
"de": {"de", "deutsch", "ger", "german"},
"en": {"en", "english", "eng"}, # 'en_US'
}
# accepted_lang = ('en')

0 comments on commit 6871d96

Please sign in to comment.