Skip to content

Commit

Permalink
logging: reviewed options
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Jul 27, 2022
1 parent 596d872 commit 0618f0b
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 12 deletions.
15 changes: 8 additions & 7 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from .filters import validate_url
from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN


+ LOGGER = logging.getLogger(__name__)

# parsing
PROTOCOLS = re.compile(r"https?://")
SELECTION = re.compile(
Expand Down Expand Up @@ -66,26 +69,24 @@ def scrub_url(url: str) -> str:
# double/faulty URLs
protocols = PROTOCOLS.findall(url)
if len(protocols) > 1 and "web.archive.org" not in url:
- logging.debug("double url: %s %s", len(protocols), url)
+ LOGGER.debug("double url: %s %s", len(protocols), url)
match = SELECTION.match(url)
if match and validate_url(match[1])[0] is True:
url = match[1]
- logging.debug("taking url: %s", url)
+ LOGGER.debug("taking url: %s", url)
else:
match = MIDDLE_URL.match(url)
if match and validate_url(match[1])[0] is True:
url = match[1]
- logging.debug("taking url: %s", url)
+ LOGGER.debug("taking url: %s", url)
# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
if len(url) > 500: # arbitrary choice
match = TRAILING_PARTS.match(url)
if match:
url = match[1]
if len(url) > 500:
- logging.debug(
- "invalid-looking link %s of length %d", f"{url[:50]}...", len(url)
- )
+ LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))

# trailing slashes in URLs without path or in embedded URLs
if url.count("/") == 3 or url.count("://") > 1:
Expand Down Expand Up @@ -119,7 +120,7 @@ def clean_query(
or (language == "en" and found_lang not in TARGET_LANG_EN)
or found_lang != language
):
- logging.debug("bad lang: %s %s %s", language, qelem, found_lang)
+ LOGGER.info("bad lang: %s %s %s", language, qelem, found_lang)
raise ValueError
# insert
newqdict[qelem] = qdict[qelem]
Expand Down
5 changes: 2 additions & 3 deletions courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# import locale
import logging
import re
- import sys

# from functools import cmp_to_key
from random import sample
Expand Down Expand Up @@ -130,9 +129,9 @@ def sample_urls(
"""Sample a list of URLs by domain name, optionally using constraints on their number"""
# logging
if verbose is True:
- logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+ LOGGER.setLevel(logging.DEBUG)
else:
- logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
+ LOGGER.setLevel(logging.ERROR)
# deduplicate
input_urls = list(dict.fromkeys(input_urls))
# validate
Expand Down
6 changes: 4 additions & 2 deletions courlan/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import urllib3 # type: ignore


+ LOGGER = logging.getLogger(__name__)

RETRY_STRATEGY = urllib3.util.Retry(
total=5,
redirect=5,
Expand Down Expand Up @@ -38,11 +40,11 @@ def redirection_test(url: str) -> str:
try:
rhead = HTTP_POOL.request("HEAD", url)
except Exception as err:
- logging.error("unknown: %s %s", url, err) # sys.exc_info()[0]
+ LOGGER.exception("unknown error: %s %s", url, err)
else:
# response
if rhead.status in ACCEPTABLE_CODES:
- logging.debug("result found: %s %s", rhead.geturl(), rhead.status)
+ LOGGER.debug("result found: %s %s", rhead.geturl(), rhead.status)
return rhead.geturl() # type: ignore
# else:
raise ValueError("cannot reach URL: %s", url)

0 comments on commit 0618f0b

Please sign in to comment.