fix page quotas:
- add ensure_quota_page_limit(), which caps the workflow's page limit at the org quota (sketched below)
- apply the page limit when: creating a new crawl, creating the configmap, returning the crawl workflow
- don't persist the quota page limit on new or existing crawl workflows (remove setting it on new workflows) so that updated quotas take effect on the next crawl
- fixes #2369
ikreymer committed Feb 10, 2025
1 parent 83211b2 commit e1ba768
Showing 4 changed files with 34 additions and 24 deletions.
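
The clamping rule the new helper implements, paraphrased as a pure function over plain ints (a minimal sketch only: the real ensure_quota_page_limit() mutates crawlconfig.config.limit in place, and a limit or quota of 0/None is treated as unlimited):

from typing import Optional

# Sketch of the quota clamp from ensure_quota_page_limit(), as a pure function.
def effective_page_limit(configured_limit: Optional[int], quota: Optional[int]) -> Optional[int]:
    """Return the page limit a crawl should actually run with."""
    if not quota or quota <= 0:
        # No org quota: the workflow's own limit (possibly unlimited) stands.
        return configured_limit
    if configured_limit and configured_limit > 0:
        # Both are set: the stricter (smaller) value wins.
        return min(configured_limit, quota)
    # Workflow is unlimited but the org has a quota: the quota applies.
    return quota

assert effective_page_limit(None, 100) == 100  # unlimited workflow, quota caps it
assert effective_page_limit(50, 100) == 50     # workflow already stricter than quota
assert effective_page_limit(500, 100) == 100   # quota stricter than workflow
assert effective_page_limit(500, 0) == 500     # no quota set, workflow limit stands

Because the clamp is applied when a crawl is launched, when the configmap is built, and when the workflow is returned over the API, rather than being written back to the stored workflow, a later change to maxPagesPerCrawl takes effect on the next crawl without editing the workflow.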
28 changes: 21 additions & 7 deletions backend/btrixcloud/crawlconfigs.py
@@ -251,11 +251,6 @@ async def add_crawl_config(
crawlconfig.lastStartedBy = user.id
crawlconfig.lastStartedByName = user.name

# Ensure page limit is below org maxPagesPerCall if set
max_pages = org.quotas.maxPagesPerCrawl or 0
if max_pages > 0:
crawlconfig.config.limit = max_pages

# add CrawlConfig to DB here
result = await self.crawl_configs.insert_one(crawlconfig.to_dict())

@@ -286,13 +281,30 @@ async def add_crawl_config(
execMinutesQuotaReached=exec_mins_quota_reached,
)

def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
"""ensure page limit is set to smaller or existing limit or quota limit"""
if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
if crawlconfig.config.limit and crawlconfig.config.limit > 0:
crawlconfig.config.limit = min(
org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
)
else:
crawlconfig.config.limit = org.quotas.maxPagesPerCrawl

async def add_new_crawl(
self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
self,
crawl_id: str,
crawlconfig: CrawlConfig,
user: User,
org: Organization,
manual: bool,
) -> None:
"""increments crawl count for this config and adds new crawl"""

started = dt_now()

self.ensure_quota_page_limit(crawlconfig, org)

inc = self.inc_crawl_count(crawlconfig.id)
add = self.crawl_ops.add_new_crawl(
crawl_id, crawlconfig, user.id, started, manual
@@ -695,6 +707,8 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):

crawlconfig.config.seeds = None

self.ensure_quota_page_limit(crawlconfig, org)

return crawlconfig

async def get_crawl_config_seed_count(self, cid: UUID, org: Organization):
@@ -892,7 +906,7 @@ async def run_now_internal(
storage_filename=storage_filename,
profile_filename=profile_filename or "",
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
return crawl_id

except Exception as exc:
26 changes: 10 additions & 16 deletions backend/btrixcloud/operator/crawls.py
@@ -6,6 +6,7 @@
from pprint import pprint
from typing import Optional, Any, Sequence
from datetime import datetime
from uuid import UUID

import json

@@ -29,7 +30,6 @@
CrawlFile,
CrawlCompleteIn,
StorageRef,
Organization,
)

from btrixcloud.utils import str_to_date, date_to_str, dt_now
@@ -145,11 +145,13 @@ async def sync_crawls(self, data: MCSyncData):
params["userid"] = spec.get("userid", "")

pods = data.children[POD]
org = await self.org_ops.get_org_by_id(UUID(oid))

crawl = CrawlSpec(
id=crawl_id,
cid=cid,
oid=oid,
org=org,
storage=StorageRef(spec["storageName"]),
crawler_channel=spec.get("crawlerChannel"),
proxy_id=spec.get("proxyId"),
@@ -204,8 +206,6 @@ async def sync_crawls(self, data: MCSyncData):
await self.k8s.delete_crawl_job(crawl.id)
return {"status": status.dict(exclude_none=True), "children": []}

org = None

# first, check storage quota, and fail immediately if quota reached
if status.state in (
"starting",
@@ -215,7 +215,6 @@
# only check on very first run, before any pods/pvcs created
# for now, allow if crawl has already started (pods/pvcs created)
if not pods and not data.children[PVC]:
org = await self.org_ops.get_org_by_id(crawl.oid)
if self.org_ops.storage_quota_reached(org):
await self.mark_finished(
crawl, status, "skipped_storage_quota_reached"
@@ -229,7 +228,7 @@
return self._empty_response(status)

if status.state in ("starting", "waiting_org_limit"):
if not await self.can_start_new(crawl, data, status, org):
if not await self.can_start_new(crawl, data, status):
return self._empty_response(status)

await self.set_state(
@@ -382,8 +381,9 @@ async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):

crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

raw_config = crawlconfig.get_raw_config()
self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)

raw_config = crawlconfig.get_raw_config()
raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)
@@ -637,14 +637,10 @@ async def can_start_new(
crawl: CrawlSpec,
data: MCSyncData,
status: CrawlStatus,
org: Optional[Organization] = None,
):
"""return true if crawl can start, otherwise set crawl to 'queued' state
until more crawls for org finish"""
if not org:
org = await self.org_ops.get_org_by_id(crawl.oid)

max_crawls = org.quotas.maxConcurrentCrawls or 0
max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
if not max_crawls:
return True

@@ -1238,15 +1234,13 @@ def get_log_line(self, message, details):
}
return json.dumps(err)

async def add_file_to_crawl(self, cc_data, crawl, redis):
async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
"""Handle finished CrawlFile to db"""

filecomplete = CrawlCompleteIn(**cc_data)

org = await self.org_ops.get_org_by_id(crawl.oid)

filename = self.storage_ops.get_org_relative_path(
org, crawl.storage, filecomplete.filename
crawl.org, crawl.storage, filecomplete.filename
)

crawl_file = CrawlFile(
Expand Down Expand Up @@ -1299,7 +1293,7 @@ async def is_crawl_stopping(
return "size-limit"

# gracefully stop crawl if current running crawl sizes reach storage quota
org = await self.org_ops.get_org_by_id(crawl.oid)
org = crawl.org

if org.readOnly:
return "stopped_org_readonly"
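
On the operator side the change follows a fetch-once pattern: sync_crawls now looks the Organization up a single time and stores it on CrawlSpec, so can_start_new, add_file_to_crawl, and is_crawl_stopping read crawl.org instead of each calling get_org_by_id. A toy, self-contained sketch of that shape (Org, CrawlSpec, and fetch_org below are simplified stand-ins, not the real btrixcloud classes):

import asyncio
from dataclasses import dataclass

@dataclass
class Org:
    max_concurrent_crawls: int = 2
    read_only: bool = False

@dataclass
class CrawlSpec:
    id: str
    org: Org  # the spec now carries the org fetched during sync

async def fetch_org(oid: str) -> Org:
    # stand-in for org_ops.get_org_by_id(): one DB round-trip per reconcile
    return Org()

async def sync_crawl(oid: str) -> CrawlSpec:
    org = await fetch_org(oid)  # fetched exactly once
    return CrawlSpec(id="crawl-1", org=org)

def can_start_new(crawl: CrawlSpec, running: int) -> bool:
    # downstream checks reuse crawl.org instead of re-fetching the org
    max_crawls = crawl.org.max_concurrent_crawls or 0
    return not max_crawls or running < max_crawls

crawl = asyncio.run(sync_crawl("oid-123"))
print(can_start_new(crawl, running=1))  # True: 1 running crawl < limit of 2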
1 change: 1 addition & 0 deletions backend/btrixcloud/operator/cronjobs.py
@@ -112,6 +112,7 @@ async def make_new_crawljob(
crawl_id,
crawlconfig,
user,
org,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)
3 changes: 2 additions & 1 deletion backend/btrixcloud/operator/models.py
@@ -5,7 +5,7 @@
from typing import Optional, DefaultDict, Literal, Annotated, Any
from pydantic import BaseModel, Field
from kubernetes.utils import parse_quantity
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization


BTRIX_API = "btrix.cloud/v1"
@@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
id: str
cid: UUID
oid: UUID
org: Organization
scale: int = 1
storage: StorageRef
started: str
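
The models.py change makes org a required field on CrawlSpec, so the operator's sync_crawls must construct the spec with org=org (and the scheduled-crawl path in cronjobs.py likewise passes org through to add_new_crawl). A toy Pydantic sketch of the required-field effect, using stand-in fields rather than the real btrixcloud models:

from pydantic import BaseModel, ValidationError

class Organization(BaseModel):
    id: str
    readOnly: bool = False

class CrawlSpec(BaseModel):
    id: str
    oid: str
    org: Organization  # newly required: construction fails without it

try:
    CrawlSpec(id="c1", oid="o1")  # old call shape, missing org
except ValidationError as err:
    print("org is now required:", len(err.errors()), "validation error")

spec = CrawlSpec(id="c1", oid="o1", org=Organization(id="o1"))
print(spec.org.readOnly)  # downstream code reads crawl.org directly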
