Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 1 addition & 24 deletions compass/scripts/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,6 @@

import logging
from contextlib import AsyncExitStack
from urllib.parse import (
parse_qsl,
quote,
unquote,
urlencode,
urlparse,
urlunparse,
)

from elm.web.document import PDFDocument
from elm.web.search.run import (
Expand All @@ -32,6 +24,7 @@
JurisdictionWebsiteValidator,
)
from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer
from compass.web.url_utils import _sanitize_url
from compass.utilities.enums import LLMTasks
from compass.utilities.io import load_local_docs
from compass.pb import COMPASS_PB
Expand Down Expand Up @@ -804,22 +797,6 @@ async def _contains_relevant_text(
return found_text


def _sanitize_url(url):
    """Return `url` with its path and query string re-encoded

    The path is percent-decoded and then percent-encoded again with
    only ``/`` left unescaped. The query string is parsed into
    key/value pairs (blank values are kept) and serialized back with
    ``urlencode``. Scheme, netloc, params and fragment are passed
    through unchanged.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    decoded_path = unquote(path)
    pairs = parse_qsl(query, keep_blank_values=True)
    return urlunparse((
        scheme,
        netloc,
        quote(decoded_path, safe="/"),
        params,
        urlencode(pairs, doseq=True),  # cspell: disable-line
        fragment,
    ))


def _sanitize_doc_sources(docs):
"""Rewrite source attrs on documents returned by ELMWebsiteCrawler

Expand Down
12 changes: 12 additions & 0 deletions compass/web/url_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Shared URL utilities for COMPASS web modules"""

from urllib.parse import quote, urlsplit, urlunsplit


def _sanitize_url(url):
    """Encode unsafe URL characters while preserving URL semantics

    The URL is split into its components. The path, query and fragment
    are percent-encoded independently; the scheme and network location
    are passed through untouched.

    ``%`` is treated as safe in every encoded component so that escapes
    already present in the input (e.g. ``%20``) are kept as-is instead
    of being double-encoded (``%2520``). As a result the function is
    idempotent: sanitizing an already-sanitized URL returns it
    unchanged.

    Parameters
    ----------
    url : str
        URL to sanitize.

    Returns
    -------
    str
        The URL with characters outside each component's safe set
        percent-encoded.
    """
    parsed = urlsplit(url)
    # "%" must be safe here too, otherwise "/a%20b" becomes "/a%2520b"
    path = quote(parsed.path, safe="%/:@-._~!$&'()*+,;=")
    query = quote(parsed.query, safe="%/?:@-._~!$&'()*+,;=")
    # Everything but existing escapes is encoded in the fragment
    fragment = quote(parsed.fragment, safe="%")
    return urlunsplit((parsed.scheme, parsed.netloc, path, query, fragment))
37 changes: 2 additions & 35 deletions compass/web/website_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,7 @@
import operator
from collections import Counter
from contextlib import AsyncExitStack
from urllib.parse import (
urlparse,
urlunparse,
quote,
unquote,
parse_qsl,
urlencode,
urljoin,
)
from urllib.parse import urljoin

from crawl4ai.models import Link as c4AILink
from bs4 import BeautifulSoup
Expand All @@ -28,6 +20,7 @@
from elm.web.document import PDFDocument, HTMLDocument
from elm.web.file_loader import AsyncWebFileLoader
from elm.web.website_crawl import ELMLinkScorer, _SCORE_KEY # noqa: PLC2701
from compass.web.url_utils import _sanitize_url


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -495,32 +488,6 @@ def _debug_info_on_links(links):
logger.debug(" ...")


def _sanitize_url(url):
    """Normalize the path and query string of a URL

    - Decode the path, then percent-encode it again keeping only
      ``/`` unescaped (spaces and other unsafe characters are encoded)
    - Parse the query into key/value pairs (blank values kept) and
      serialize it back with ``urlencode``
    - Keep scheme, netloc, params and fragment as parsed
    """
    parts = urlparse(url)
    query_pairs = parse_qsl(parts.query, keep_blank_values=True)
    cleaned = parts._replace(
        path=quote(unquote(parts.path), safe="/"),
        query=urlencode(query_pairs, doseq=True),  # cspell: disable-line
    )
    return urlunparse(cleaned)


def _extract_links_from_html(text, base_url):
"""Parse HTML and extract all links"""
soup = BeautifulSoup(text, "html.parser")
Expand Down
1 change: 1 addition & 0 deletions tests/python/unit/web/test_web_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ def test_extract_links_from_html_sets_text_from_anchor():
<a href="/doc.pdf">Permit Standards</a>
"""
links = _extract_links_from_html(html, base_url="https://example.com")
assert len(links) == 1
link = next(iter(links))
assert link.title == "Permit Standards"
assert link.text == "Permit Standards"
Expand Down
Loading