Commit 79972a4 (parent: 8d0e4d2)

Add url, urlPrefix, ts filters and more sort options to crawl pages endpoint

2 files changed: +80, -2 lines


backend/btrixcloud/pages.py

Lines changed: 30 additions & 1 deletion

@@ -501,6 +501,9 @@ async def list_pages(
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        url: Optional[str] = None,
+        url_prefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
         qa_run_id: Optional[str] = None,
         qa_filter_by: Optional[str] = None,
         qa_gte: Optional[float] = None,
@@ -527,6 +530,17 @@ async def list_pages(
         if org:
             query["oid"] = org.id
 
+        if url_prefix:
+            url_prefix = urllib.parse.unquote(url_prefix)
+            regex_pattern = f"^{re.escape(url_prefix)}"
+            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+
+        elif url:
+            query["url"] = urllib.parse.unquote(url)
+
+        if ts:
+            query["ts"] = ts
+
         if reviewed:
             query["$or"] = [
                 {"approved": {"$ne": None}},
@@ -571,7 +585,16 @@ async def list_pages(
         # Sorting options to add:
         # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
         # - Ensure notes sorting works okay with notes in list
-        sort_fields = ("url", "title", "notes", "approved")
+        sort_fields = (
+            "url",
+            "title",
+            "notes",
+            "approved",
+            "ts",
+            "status",
+            "mime",
+            "filename",
+        )
         qa_sort_fields = ("screenshotMatch", "textMatch")
         if sort_by not in sort_fields and sort_by not in qa_sort_fields:
             raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -1004,6 +1027,9 @@ async def delete_page_notes(
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
         reviewed: Optional[bool] = None,
         approved: Optional[str] = None,
         hasNotes: Optional[bool] = None,
@@ -1020,6 +1046,9 @@ async def get_crawl_pages_list(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
             reviewed=reviewed,
             approved=formatted_approved,
             has_notes=hasNotes,
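Taken together, the urlPrefix branch URL-decodes the prefix, regex-escapes it, and anchors it, so Mongo performs a case-insensitive starts-with match, while the exact url filter only URL-decodes. A minimal standalone sketch of that query construction (build_url_query is a hypothetical helper extracted for illustration; it is not part of this diff):

    import re
    import urllib.parse

    def build_url_query(url=None, url_prefix=None):
        # Mirrors the filter logic above: urlPrefix takes precedence over
        # url, and both values are URL-decoded before use
        query = {}
        if url_prefix:
            decoded = urllib.parse.unquote(url_prefix)
            # re.escape keeps ".", "?", etc. literal; "^" anchors the match
            # at the start of the URL; "$options": "i" makes it case-insensitive
            query["url"] = {"$regex": f"^{re.escape(decoded)}", "$options": "i"}
        elif url:
            query["url"] = urllib.parse.unquote(url)
        return query

    # build_url_query(url_prefix="https://example.com/docs") returns
    # {"url": {"$regex": "^https://example\\.com/docs", "$options": "i"}}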

backend/test/test_run_crawl.py

Lines changed: 50 additions & 1 deletion

@@ -682,7 +682,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
 
     # Test GET page endpoint
     global page_id
-    page_id = pages[0]["id"]
+    test_page = pages[0]
+    page_id = test_page["id"]
+    test_page_url = test_page["url"]
+    test_page_ts = test_page["ts"]
+
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
         headers=crawler_auth_headers,
@@ -710,6 +714,51 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("modified") is None
     assert page.get("approved") is None
 
+    # Test exact url filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+
+    # Test exact url and ts filters together
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+        assert matching_page["ts"] == test_page_ts
+
+    # Test urlPrefix filter
+    url_prefix = test_page_url[:8]
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+
+    found_matching_page = False
+    for page in data["items"]:
+        if page["id"] == page_id and page["url"] == test_page_url:
+            found_matching_page = True
+
+    assert found_matching_page
+
+
+def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
     # Test reviewed filter (page has no notes or approved so should show up in false)
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",