Skip to content

Commit

Permalink
Merge branch 'master' into prepare_v1.2.0
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Jun 4, 2024
2 parents 03aab9f + c9a41ba commit 0736d5d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 32 deletions.
13 changes: 5 additions & 8 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ def scrub_url(url: str) -> str:
url = REMAINING_MARKUP.sub("", url)

# & and &
if "&amp;" in url:
url = url.replace("&amp;", "&")
url = TRAILING_AMP.sub("", url)
url = TRAILING_AMP.sub("", url.replace("&amp;", "&"))

# if '"' in link:
# link = link.split('"')[0]

# double/faulty URLs
protocols = PROTOCOLS.findall(url)
if len(protocols) > 1 and "web.archive.org" not in url:
Expand Down Expand Up @@ -182,21 +181,19 @@ def normalize_url(
parsed_url = _parse(parsed_url)
# lowercase + remove fragments + normalize punycode
scheme = parsed_url.scheme.lower()
netloc = parsed_url.netloc.lower()
netloc = decode_punycode(parsed_url.netloc.lower())
# port
try:
if parsed_url.port and parsed_url.port in (80, 443):
if parsed_url.port in (80, 443):
netloc = NETLOC_RE.sub("", netloc)
except ValueError:
pass # Port could not be cast to integer value
# lowercase + remove fragments + normalize punycode
netloc = decode_punycode(netloc)
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
# leading /../'s in the path are removed
newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
# strip unwanted query elements
newquery = clean_query(parsed_url.query, strict, language) or ""
if newquery and newpath == "":
if newquery and not newpath:
newpath = "/"
elif (
not trailing_slash
Expand Down
23 changes: 9 additions & 14 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,7 @@ def lang_filter(
if strict:
match = HOST_LANG_FILTER.match(url)
if match:
candidate = match[1].lower()
if candidate == language:
score += 1
else:
score -= 1
score += 1 if match[1].lower() == language else -1
# determine test result
return score >= 0

Expand All @@ -236,17 +232,16 @@ def path_filter(urlpath: str, query: str) -> bool:
def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
"""Make sure the target URL is from a suitable type (HTML page with primarily text).
Strict: Try to filter out other document types, spam, video and adult websites."""
try:
if (
# feeds + blogspot
if url.endswith(("/feed", "/rss", "_archive.html")):
raise ValueError
url.endswith(("/feed", "/rss", "_archive.html"))
or
# website structure
if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
raise ValueError
(SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)))
or
# type (also hidden in parameters), videos, adult content
if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
raise ValueError
except ValueError:
(strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)))
):
return False
# default
return True
Expand All @@ -259,7 +254,7 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
except ValueError:
return False, None

if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
if not parsed_url.scheme or parsed_url.scheme not in PROTOCOLS:
return False, None

if len(parsed_url.netloc) < 5 or (
Expand Down
24 changes: 14 additions & 10 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,15 @@ def fix_relative_urls(baseurl: str, url: str) -> str:
"Prepend protocol and host information to relative links."
if url.startswith("{"):
return url

base_netloc = urlsplit(baseurl).netloc
split_url = urlsplit(url)

if split_url.netloc not in (base_netloc, ""):
if split_url.scheme:
return url
return urlunsplit(split_url._replace(scheme="http"))

return urljoin(baseurl, url)


Expand Down Expand Up @@ -150,20 +153,21 @@ def is_known_link(link: str, known_links: Set[str]) -> bool:
return True

# check link and variants with trailing slashes
test_links = [link.rstrip("/"), link.rstrip("/") + "/"]
if any(test_link in known_links for test_link in test_links):
slash_test = link.rstrip("/") if link[-1] == "/" else link + "/"
if slash_test in known_links:
return True

# check link and variants with modified protocol
if link.startswith("http"):
if link.startswith("https"):
testlink = link[:4] + link[5:]
else:
testlink = "".join([link[:4], "s", link[4:]])
if any(
test in known_links
for test in [testlink, testlink.rstrip("/"), testlink.rstrip("/") + "/"]
):
protocol_test = (
"http" + link[:5] if link.startswith("https") else "https" + link[4:]
)
slash_test = (
protocol_test.rstrip("/")
if protocol_test[-1] == "/"
else protocol_test + "/"
)
if protocol_test in known_links or slash_test in known_links:
return True

return False

0 comments on commit 0736d5d

Please sign in to comment.