diff --git a/compass/scripts/download.py b/compass/scripts/download.py
index 9199eed8..213ada92 100644
--- a/compass/scripts/download.py
+++ b/compass/scripts/download.py
@@ -2,14 +2,6 @@
 import logging
 
 from contextlib import AsyncExitStack
-from urllib.parse import (
-    parse_qsl,
-    quote,
-    unquote,
-    urlencode,
-    urlparse,
-    urlunparse,
-)
 
 from elm.web.document import PDFDocument
 from elm.web.search.run import (
@@ -32,6 +24,7 @@
     JurisdictionWebsiteValidator,
 )
 from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer
+from compass.web.url_utils import _sanitize_url
 from compass.utilities.enums import LLMTasks
 from compass.utilities.io import load_local_docs
 from compass.pb import COMPASS_PB
@@ -804,22 +797,6 @@ async def _contains_relevant_text(
     return found_text
 
 
-def _sanitize_url(url):
-    """Percent-encode spaces and unsafe characters in a URL path"""
-    parsed = urlparse(url)
-    safe_path = quote(unquote(parsed.path), safe="/")
-    query_params = parse_qsl(parsed.query, keep_blank_values=True)
-    safe_query = urlencode(query_params, doseq=True)  # cspell: disable-line
-    return urlunparse((
-        parsed.scheme,
-        parsed.netloc,
-        safe_path,
-        parsed.params,
-        safe_query,
-        parsed.fragment,
-    ))
-
-
 def _sanitize_doc_sources(docs):
     """Rewrite source attrs on documents returned by ELMWebsiteCrawler
 
diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py
new file mode 100644
index 00000000..2eeb464d
--- /dev/null
+++ b/compass/web/url_utils.py
@@ -0,0 +1,14 @@
+"""Shared URL utilities for COMPASS web modules"""
+
+from urllib.parse import quote, urlsplit, urlunsplit
+
+
+def _sanitize_url(url):
+    """Encode unsafe URL characters while preserving URL semantics"""
+    parsed = urlsplit(url)
+    # "%" is kept safe in every component so URLs that already contain
+    # percent-encoding (e.g. "%20") are not double-encoded to "%2520".
+    path = quote(parsed.path, safe="/%:@-._~!$&'()*+,;=")
+    query = quote(parsed.query, safe="/?%:@-._~!$&'()*+,;=")
+    fragment = quote(parsed.fragment, safe="%")
+    return urlunsplit((parsed.scheme, parsed.netloc, path, query, fragment))
diff --git a/compass/web/website_crawl.py b/compass/web/website_crawl.py
index a5fd2093..ccda23ff 100644
--- a/compass/web/website_crawl.py
+++ b/compass/web/website_crawl.py
@@ -9,15 +9,7 @@
 import operator
 from collections import Counter
 from contextlib import AsyncExitStack
-from urllib.parse import (
-    urlparse,
-    urlunparse,
-    quote,
-    unquote,
-    parse_qsl,
-    urlencode,
-    urljoin,
-)
+from urllib.parse import urljoin
 
 from crawl4ai.models import Link as c4AILink
 from bs4 import BeautifulSoup
@@ -28,6 +20,7 @@
 from elm.web.document import PDFDocument, HTMLDocument
 from elm.web.file_loader import AsyncWebFileLoader
 from elm.web.website_crawl import ELMLinkScorer, _SCORE_KEY  # noqa: PLC2701
+from compass.web.url_utils import _sanitize_url
 
 
 logger = logging.getLogger(__name__)
@@ -495,32 +488,6 @@ def _debug_info_on_links(links):
        logger.debug(" ...")
 
 
-def _sanitize_url(url):
-    """Fix common URL issues
-
-    - Encode spaces and unsafe characters in the path
-    - Encode query parameters safely
-    - Leave existing percent-encoding intact
-    """
-    parsed = urlparse(url)
-
-    safe_path = quote(unquote(parsed.path), safe="/")
-
-    query_params = parse_qsl(parsed.query, keep_blank_values=True)
-    safe_query = urlencode(query_params, doseq=True)  # cspell: disable-line
-
-    return urlunparse(
-        (
-            parsed.scheme,
-            parsed.netloc,
-            safe_path,
-            parsed.params,
-            safe_query,
-            parsed.fragment,
-        )
-    )
-
-
 def _extract_links_from_html(text, base_url):
     """Parse HTML and extract all links"""
     soup = BeautifulSoup(text, "html.parser")
diff --git a/tests/python/unit/web/test_web_crawl.py b/tests/python/unit/web/test_web_crawl.py
index 24ae0e61..30dd2a86 100644
--- a/tests/python/unit/web/test_web_crawl.py
+++ b/tests/python/unit/web/test_web_crawl.py
@@ -267,6 +267,7 @@ def test_extract_links_from_html_sets_text_from_anchor():
     Permit Standards
     """
     links = _extract_links_from_html(html, base_url="https://example.com")
+    assert len(links) == 1
     link = next(iter(links))
     assert link.title == "Permit Standards"
     assert link.text == "Permit Standards"