From eee21174e127562a031e820d96082e582b39def9 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 11:15:49 -0700 Subject: [PATCH 01/20] docs(retention): design for scheduled task-retention cleanup workflow --- ...scheduled-task-retention-cleanup-design.md | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md diff --git a/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md b/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md new file mode 100644 index 00000000..a50594ab --- /dev/null +++ b/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md @@ -0,0 +1,181 @@ +# Scheduled Task-Retention Cleanup Workflow — Design + +**Date:** 2026-06-03 +**Status:** Approved (pending spec review) +**Author:** Stas Moreinis + +## Background + +A data-retention requirement calls for isolating project data and avoiding +long-lived chat/task data in the shared setup: keep Agentex chat/task data only +while a conversation is "active", and auto-clean it after a configurable idle +window (default 7 days since last interaction). + +The export / clean / rehydrate building blocks already landed (PR #243): +`TaskRetentionUseCase` and `TaskRetentionService.clean_task(...)` are written so +that the same logic backs both the HTTP admin endpoints and a scheduled cleanup +caller. `clean_task` is **idempotent**, performs its own **authoritative idle +check** (`max(task.updated_at, latest_message.created_at) < now - idle_days`), +and **refuses** (raises `ClientError`) for three safety/policy cases: the task is +`RUNNING`, it is not idle long enough (when the threshold is enforced), or it has +unprocessed events past the `agent_task_tracker` cursors. If the task is already +cleaned (`cleaned_at IS NOT NULL`) it returns an empty result rather than raising. 
+ +This document designs the missing piece: a **regularly scheduled sweep** that +discovers idle tasks and drives them through `clean_task`. + +### Scope (v1) + +- **In scope:** A Temporal Schedule + sweep workflow that finds idle tasks + belonging to an allowlisted set of agents and cleans them, gated by a feature + flag. Clean only. +- **Out of scope (explicitly):** + - Exporting task content to an external sink before cleanup. Per the retention + discussion, v1 cleanup does **not** export anywhere; persisted chat history + lives in the consuming product's approved store and the export/rehydrate APIs + remain available for manual testing and a later full-restore path. + - Rehydrate wiring. + - **Deploying the Temporal worker in the target (k8s) environment.** The worker + process and Schedule infrastructure exist in code and docker-compose, but the + deployed environment may not yet run a backend Temporal worker. That is an + infra prerequisite tracked separately; this design is the code change. + +## Goals + +1. Periodically clean idle tasks for an allowlisted set of agents, with zero + behavior change in any environment until explicitly enabled. +2. Reuse the existing, idempotent `clean_task` path verbatim — no duplicated + deletion logic. +3. Be resilient: one task's failure or refusal never aborts the sweep; the run + is safe to retry and safe to replay after a worker crash. +4. Bound resource usage (Temporal history, concurrent deletes) regardless of + backlog size. + +## Configuration + +All configuration is via environment variables (consistent with the existing +`ENABLE_HEALTH_CHECK_WORKFLOW` pattern). The master flag and cron are read at +**schedule-bootstrap** time; the allowlist and idle threshold are passed into the +scheduled workflow as input args so the schedule encodes the policy it runs with. + +| Env var | Meaning | Default | +|---|---|---| +| `RETENTION_CLEANUP_ENABLED` | Master on/off. 
When false, the schedule is not created and the sweep is a no-op. | `false` | +| `RETENTION_CLEANUP_AGENT_ALLOWLIST` | Comma-separated agent **names**. Only tasks owned by these agents are eligible. Empty ⇒ nothing eligible (fail-closed). | `""` | +| `RETENTION_CLEANUP_IDLE_DAYS` | Idle threshold in days. | `7` | +| `RETENTION_CLEANUP_CRON` | Cron expression for the schedule. | `0 4 * * *` (daily 04:00) | +| `RETENTION_CLEANUP_PAGE_SIZE` | Candidate page size per discovery activity call. | `200` | +| `RETENTION_CLEANUP_MAX_IN_FLIGHT` | Max concurrent per-task child workflows. | `20` | + +**Fail-closed:** an empty allowlist cleans nothing. The allowlist scopes the +blast radius to named agents only. + +## Architecture + +### Components + +| Component | File | Responsibility | +|---|---|---| +| `RetentionCleanupSweepWorkflow` | `agentex/src/temporal/workflows/retention_cleanup_workflow.py` | Paginate candidates → fan out child workflows in bounded batches → aggregate summary → `continue_as_new` across pages. | +| `RetentionCleanupTaskWorkflow` | same file | Per-task child workflow: invoke the clean activity, return a structured outcome. | +| `RetentionCleanupActivities` | `agentex/src/temporal/activities/retention_cleanup_activities.py` | `find_cleanup_candidates(...)` and `clean_task(...)` (the latter catches `ClientError` and maps it to a `skipped` outcome). | +| Discovery query | `agentex/src/domain/repositories/task_repository.py` (extend) | Keyset-paginated query for idle, uncleaned candidate task ids filtered by agent name. | +| Schedule bootstrap | `agentex/src/temporal/run_retention_cleanup_schedule.py` | On startup, when enabled, create/update the Temporal Schedule (mirrors `run_healthcheck_workflow.py`). | +| Worker registration | `agentex/src/temporal/run_worker.py` (edit) | Register both workflows + the activities on the `agentex-server` task queue. 
| + +### Data flow + +``` +Temporal Schedule (cron; created at bootstrap only when RETENTION_CLEANUP_ENABLED) + └─> RetentionCleanupSweepWorkflow(idle_days, allowlist, page_size, max_in_flight) + ├─ activity find_cleanup_candidates(cursor, limit, idle_days, allowlist) -> [task_id...] + ├─ for each batch (size ≤ max_in_flight) of task_ids: + │ start child RetentionCleanupTaskWorkflow(task_id, idle_days) + │ └─ activity clean_task(task_id, idle_days) # enforce_idle_threshold=True + │ -> outcome: cleaned{counts} | skipped{reason} + │ (raises only on transient/infra errors -> Temporal retries) + ├─ accumulate running totals: cleaned / skipped(by reason) / failed + └─ if another page exists: continue_as_new(next_cursor, running_totals) + else: emit structured summary log and complete +``` + +### Discovery query + +```sql +SELECT DISTINCT t.id -- one row per task even if several allowlisted agents match +FROM tasks t +JOIN task_agents ta ON ta.task_id = t.id +JOIN agents a ON a.id = ta.agent_id +WHERE t.cleaned_at IS NULL + AND t.updated_at < (now() - make_interval(days => :idle_days)) + AND a.name = ANY(:allowlist) + AND t.id > :cursor -- keyset pagination +ORDER BY t.id +LIMIT :page_size; +``` + +Notes: + +- **No `status` filter.** `status` is the race-prone dimension — a task can flip + to `RUNNING` between this query and the clean call, so filtering it here gives + only a false sense of safety. The trustworthy RUNNING check is the + authoritative guard inside `clean_task` (evaluated at clean-time). Discovery is + therefore limited to stable, index-friendly columns (`cleaned_at`, + `updated_at`) plus the allowlist join; a rare RUNNING-but-stale task surfaces as + a candidate and is absorbed as `skipped{reason=running}` by the backstop. +- The `updated_at < cutoff` pre-filter is a **correct superset** of genuinely-idle + tasks: true idleness requires both `updated_at` **and** the latest Mongo message + to predate the cutoff, so the Postgres pre-filter can never exclude a truly-idle + task. 
It only over-includes (caught at clean-time), never under-includes. +- Keyset pagination by `id` (not OFFSET) keeps each page cheap and stable as rows + are cleaned mid-sweep. + +## Idleness & correctness + +- **Pre-filter (cheap, in discovery):** `cleaned_at IS NULL AND updated_at < cutoff`. +- **Authoritative (correctness-critical, in `clean_task`):** idle check including + the latest Mongo message timestamp, the RUNNING guard, and the unprocessed-events + guard. The sweep always runs with `enforce_idle_threshold=True` and **never** + forces. +- **Idempotency / replay safety:** `clean_task` no-ops on already-cleaned tasks and + is idempotent across all stores, so child-workflow retries and worker-crash + replays are safe. + +## Error handling + +- The `clean_task` activity **catches `ClientError`** (the three refusals) and + returns a structured `skipped{reason}` outcome; the child workflow completes + successfully. Pre-filtering keeps these rare; the catch handles the unavoidable + races (a message/event landing between discovery and clean). +- Genuine transient errors (Postgres/Mongo) **propagate**, so Temporal's + RetryPolicy retries the activity. Because Temporal's default activity RetryPolicy has no attempt limit, the activity must be given a bound (`maximum_attempts` or a schedule-to-close timeout); a child that still fails once that bound is reached is + counted as `failed` by the parent and **does not abort the sweep**. +- The parent emits a structured summary log (`cleaned`, `skipped` by reason, + `failed`) for Datadog faceting, consistent with the existing + `task_cleanup_completed` forensic log emitted by `clean_task`. + +## Scale & safety + +- `continue_as_new` per page bounds workflow history irrespective of backlog size. +- `max_in_flight` caps concurrent child workflows to avoid a thundering herd of + deletes against Mongo/Postgres. +- Feature flag ⇒ no behavior change anywhere until explicitly enabled. +- Allowlist (fail-closed) ⇒ blast radius limited to named agents. 
+ +## Testing + +- **Unit:** discovery query filters and keyset paging; activity skip-mapping + (`ClientError` → `skipped{reason}`); parent summary aggregation; fail-closed on + empty allowlist. +- **Integration (testcontainers — Postgres/Mongo):** seed idle, active, + already-cleaned, and not-yet-idle tasks across allowlisted and non-allowlisted + agents; run the activity layer; assert only the right tasks are cleaned and that + counts/skips match. +- **Workflow (Temporal `WorkflowEnvironment`):** fan-out correctness, + `continue_as_new` paging across multiple pages, and that a failed child does not + abort the sweep. + +## Open prerequisites (not built here) + +- Backend Temporal worker must actually run in the target deployed environment for + the Schedule to execute. Tracked as an infra change separate from this code. From 7336eb573939420a695b3bd2f4865798525f28fa Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 11:25:17 -0700 Subject: [PATCH 02/20] docs(retention): implementation plan for scheduled cleanup workflow --- ...-06-03-scheduled-task-retention-cleanup.md | 1134 +++++++++++++++++ 1 file changed, 1134 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md diff --git a/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md b/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md new file mode 100644 index 00000000..40ac5374 --- /dev/null +++ b/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md @@ -0,0 +1,1134 @@ +# Scheduled Task-Retention Cleanup — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add a Temporal Schedule + sweep workflow to the agentex backend that periodically discovers idle tasks for an allowlisted set of agents and runs the existing idempotent `clean_task` path against them, gated by a feature flag. + +**Architecture:** A daily Temporal Schedule starts a `RetentionCleanupSweepWorkflow`. The sweep calls a `find_cleanup_candidates` activity (cheap, index-friendly Postgres pre-filter: `cleaned_at IS NULL AND updated_at < cutoff`, joined to an agent-name allowlist, keyset-paginated), then fans out one `RetentionCleanupTaskWorkflow` child per task. Each child calls a `clean_task` activity that delegates to the already-merged `TaskRetentionUseCase.clean_task`; the activity catches `ClientError` (the three safety/policy refusals) and maps it to a `skipped` outcome, so only genuine transient errors retry. The parent aggregates `cleaned`/`skipped`/`failed` counts and `continue_as_new`s per page to bound history. + +**Tech Stack:** Python 3.12, Temporal (`temporalio`), SQLAlchemy async, FastAPI DI patterns, pytest (`pytest-asyncio`), testcontainers for integration. + +**Spec:** `docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md` + +**Conventions to follow:** +- Run a single test: `make test FILE=tests/unit/path/test_foo.py NAME=test_name` (from `agentex/`). +- Lint before commit: `uv run ruff check src/ --fix && uv run ruff format src/` (from `agentex/`). +- Activity/workflow boundary carries **only JSON-native types** (`str`, `int`, `bool`, `list`, `dict`). The backend's Temporal data converter only adds datetime support (`client_factory.py:DateTimePayloadConverter`); it does NOT serialize Pydantic models. Cross the activity boundary with dicts; build Pydantic models (if any) inside domain code only. +- Commit messages: no Claude attribution (repo is public — see `CLAUDE.md`). 
+ +--- + +## File Structure + +| File | Create/Modify | Responsibility | +|---|---|---| +| `agentex/src/config/environment_variables.py` | Modify | Add the 6 `RETENTION_CLEANUP_*` config fields + parsing. | +| `agentex/src/domain/repositories/task_repository.py` | Modify | Add `list_cleanup_candidate_ids(...)` keyset-paginated discovery query. | +| `agentex/src/temporal/task_retention_factory.py` | Create | `build_task_retention_use_case(...)` — wires `TaskRetentionUseCase` outside FastAPI DI. | +| `agentex/src/temporal/activities/retention_cleanup_activities.py` | Create | `RetentionCleanupActivities` with `find_cleanup_candidates` + `clean_task` activities. | +| `agentex/src/temporal/workflows/retention_cleanup_workflow.py` | Create | `RetentionCleanupSweepWorkflow` (fan-out + paging) + `RetentionCleanupTaskWorkflow` (per-task child). | +| `agentex/src/temporal/run_worker.py` | Modify | Register the new workflows + activities on the `agentex-server` queue. | +| `agentex/src/temporal/run_retention_cleanup_schedule.py` | Create | Startup script: create/update the Temporal Schedule when enabled. | +| `agentex/docker-compose.yml` | Modify | Add the schedule-bootstrap step + env vars for local dev. | +| `agentex/tests/unit/temporal/test_retention_cleanup_activities.py` | Create | Unit tests for the activities (mocked use case / repo). | +| `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py` | Create | Workflow tests via `WorkflowEnvironment` with mocked activities. | +| `agentex/tests/unit/config/test_retention_cleanup_env.py` | Create | Env-var parsing test. | +| `agentex/tests/integration/test_retention_cleanup_discovery.py` | Create | Integration test for `list_cleanup_candidate_ids` against real Postgres. 
| + +--- + +## Task 1: Add retention-cleanup configuration + +**Files:** +- Modify: `agentex/src/config/environment_variables.py` +- Test: `agentex/tests/unit/config/test_retention_cleanup_env.py` + +- [ ] **Step 1: Write the failing test** + +Create `agentex/tests/unit/config/test_retention_cleanup_env.py`: + +```python +import pytest + +from src.config.environment_variables import EnvironmentVariables + + +@pytest.mark.unit +def test_retention_cleanup_env_parses_enabled_and_allowlist(monkeypatch): + monkeypatch.setenv("RETENTION_CLEANUP_ENABLED", "true") + monkeypatch.setenv("RETENTION_CLEANUP_AGENT_ALLOWLIST", "agent-a, agent-b ,agent-c") + monkeypatch.setenv("RETENTION_CLEANUP_IDLE_DAYS", "14") + monkeypatch.setenv("RETENTION_CLEANUP_CRON", "0 3 * * *") + monkeypatch.setenv("RETENTION_CLEANUP_PAGE_SIZE", "50") + monkeypatch.setenv("RETENTION_CLEANUP_MAX_IN_FLIGHT", "5") + + env = EnvironmentVariables.refresh(force_refresh=True) + + assert env.RETENTION_CLEANUP_ENABLED is True + # Allowlist is parsed into a trimmed, non-empty list of names. 
+ assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == ["agent-a", "agent-b", "agent-c"] + assert env.RETENTION_CLEANUP_IDLE_DAYS == 14 + assert env.RETENTION_CLEANUP_CRON == "0 3 * * *" + assert env.RETENTION_CLEANUP_PAGE_SIZE == 50 + assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 5 + + +@pytest.mark.unit +def test_retention_cleanup_env_defaults(monkeypatch): + for key in ( + "RETENTION_CLEANUP_ENABLED", + "RETENTION_CLEANUP_AGENT_ALLOWLIST", + "RETENTION_CLEANUP_IDLE_DAYS", + "RETENTION_CLEANUP_CRON", + "RETENTION_CLEANUP_PAGE_SIZE", + "RETENTION_CLEANUP_MAX_IN_FLIGHT", + ): + monkeypatch.delenv(key, raising=False) + + env = EnvironmentVariables.refresh(force_refresh=True) + + assert env.RETENTION_CLEANUP_ENABLED is False + assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == [] # fail-closed + assert env.RETENTION_CLEANUP_IDLE_DAYS == 7 + assert env.RETENTION_CLEANUP_CRON == "0 4 * * *" + assert env.RETENTION_CLEANUP_PAGE_SIZE == 200 + assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 20 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `make test FILE=tests/unit/config/test_retention_cleanup_env.py` +Expected: FAIL — `AttributeError` / unexpected attribute on `EnvironmentVariables`. 
+ +- [ ] **Step 3: Implement the config** + +In `agentex/src/config/environment_variables.py`, add to the `EnvVarKeys` class (near `ENABLE_HEALTH_CHECK_WORKFLOW`, line ~59): + +```python + RETENTION_CLEANUP_ENABLED = "RETENTION_CLEANUP_ENABLED" + RETENTION_CLEANUP_AGENT_ALLOWLIST = "RETENTION_CLEANUP_AGENT_ALLOWLIST" + RETENTION_CLEANUP_IDLE_DAYS = "RETENTION_CLEANUP_IDLE_DAYS" + RETENTION_CLEANUP_CRON = "RETENTION_CLEANUP_CRON" + RETENTION_CLEANUP_PAGE_SIZE = "RETENTION_CLEANUP_PAGE_SIZE" + RETENTION_CLEANUP_MAX_IN_FLIGHT = "RETENTION_CLEANUP_MAX_IN_FLIGHT" +``` + +Add the fields to the `EnvironmentVariables` model (near `ENABLE_HEALTH_CHECK_WORKFLOW`, line ~115): + +```python + RETENTION_CLEANUP_ENABLED: bool = False + RETENTION_CLEANUP_AGENT_ALLOWLIST: list[str] = [] + RETENTION_CLEANUP_IDLE_DAYS: int = 7 + RETENTION_CLEANUP_CRON: str = "0 4 * * *" + RETENTION_CLEANUP_PAGE_SIZE: int = 200 + RETENTION_CLEANUP_MAX_IN_FLIGHT: int = 20 +``` + +Add the parsing inside `refresh()` where the `EnvironmentVariables(...)` instance is built (alongside `ENABLE_HEALTH_CHECK_WORKFLOW=...`, line ~199): + +```python + RETENTION_CLEANUP_ENABLED=( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_ENABLED, "false") == "true" + ), + RETENTION_CLEANUP_AGENT_ALLOWLIST=[ + name.strip() + for name in os.environ.get( + EnvVarKeys.RETENTION_CLEANUP_AGENT_ALLOWLIST, "" + ).split(",") + if name.strip() + ], + RETENTION_CLEANUP_IDLE_DAYS=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_IDLE_DAYS, "7") + ), + RETENTION_CLEANUP_CRON=os.environ.get( + EnvVarKeys.RETENTION_CLEANUP_CRON, "0 4 * * *" + ), + RETENTION_CLEANUP_PAGE_SIZE=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_PAGE_SIZE, "200") + ), + RETENTION_CLEANUP_MAX_IN_FLIGHT=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_MAX_IN_FLIGHT, "20") + ), +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `make test FILE=tests/unit/config/test_retention_cleanup_env.py` +Expected: PASS (both tests). 
+ +- [ ] **Step 5: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/config/environment_variables.py tests/unit/config/test_retention_cleanup_env.py +git commit -m "feat(retention): add scheduled-cleanup configuration env vars" +``` + +--- + +## Task 2: Discovery query — `list_cleanup_candidate_ids` + +**Files:** +- Modify: `agentex/src/domain/repositories/task_repository.py` +- Test: `agentex/tests/integration/test_retention_cleanup_discovery.py` + +- [ ] **Step 1: Write the failing integration test** + +Create `agentex/tests/integration/test_retention_cleanup_discovery.py`. It seeds rows directly via SQLAlchemy core (so we can control `updated_at` / `cleaned_at`), then asserts the query's filtering and keyset paging. + +```python +from datetime import UTC, datetime, timedelta + +import pytest +from sqlalchemy import insert + +from src.adapters.orm import AgentORM, TaskAgentORM, TaskORM +from src.domain.entities.tasks import TaskStatus + + +async def _seed_agent(session, agent_id: str, name: str) -> None: + await session.execute( + insert(AgentORM).values( + id=agent_id, + name=name, + description="seed", + acp_url=f"http://{agent_id}:8000", + acp_type="sync", + ) + ) + + +async def _seed_task( + session, + *, + task_id: str, + agent_id: str, + updated_at: datetime, + cleaned_at: datetime | None, + status: TaskStatus = TaskStatus.COMPLETED, +) -> None: + await session.execute( + insert(TaskORM).values( + id=task_id, + name=task_id, + status=status, + updated_at=updated_at, + cleaned_at=cleaned_at, + ) + ) + await session.execute( + insert(TaskAgentORM).values(task_id=task_id, agent_id=agent_id) + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_discovery_filters_and_keyset_paging(isolated_repositories): + repo = isolated_repositories["task_repository"] + now = datetime.now(UTC) + old = now - timedelta(days=30) + + async with isolated_repositories["postgres_rw_session_factory"]() as 
session: + await _seed_agent(session, "agent-allowed", "allowed-agent") + await _seed_agent(session, "agent-other", "other-agent") + # idle + allowlisted + not cleaned -> eligible + await _seed_task(session, task_id="t-aaa", agent_id="agent-allowed", updated_at=old, cleaned_at=None) + await _seed_task(session, task_id="t-bbb", agent_id="agent-allowed", updated_at=old, cleaned_at=None) + # recently active (updated_at recent) -> excluded by pre-filter + await _seed_task(session, task_id="t-fresh", agent_id="agent-allowed", updated_at=now, cleaned_at=None) + # already cleaned -> excluded + await _seed_task(session, task_id="t-clean", agent_id="agent-allowed", updated_at=old, cleaned_at=old) + # idle but NOT on allowlist -> excluded + await _seed_task(session, task_id="t-other", agent_id="agent-other", updated_at=old, cleaned_at=None) + await session.commit() + + # Full page: only the two eligible ids, ordered by id ascending. + ids = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=100 + ) + assert ids == ["t-aaa", "t-bbb"] + + # Keyset paging: limit=1 then resume after the first id. 
+ page1 = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=1 + ) + assert page1 == ["t-aaa"] + page2 = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=["allowed-agent"], after_id="t-aaa", limit=1 + ) + assert page2 == ["t-bbb"] + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_discovery_empty_allowlist_returns_nothing(isolated_repositories): + repo = isolated_repositories["task_repository"] + ids = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=[], after_id=None, limit=100 + ) + assert ids == [] +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `make test FILE=tests/integration/test_retention_cleanup_discovery.py` +Expected: FAIL — `AttributeError: 'TaskRepository' object has no attribute 'list_cleanup_candidate_ids'`. + +- [ ] **Step 3: Implement the query method** + +In `agentex/src/domain/repositories/task_repository.py`, add the import at the top (the module already imports `select`, `update` from sqlalchemy and `Sequence` from collections.abc): + +```python +from datetime import UTC, datetime, timedelta +``` + +Add this method to `TaskRepository` (e.g. right after `list_with_join`): + +```python + async def list_cleanup_candidate_ids( + self, + *, + idle_days: int, + agent_names: Sequence[str], + after_id: str | None, + limit: int, + ) -> list[str]: + """ + Return ids of tasks eligible for scheduled retention cleanup. + + Cheap, index-friendly PRE-FILTER only — the authoritative idle / status / + unprocessed-events checks live in TaskRetentionService.clean_task. This + deliberately omits a status filter: status is race-prone (a task can flip + to RUNNING between this query and the clean call), so the trustworthy + RUNNING guard is enforced at clean-time. `updated_at < cutoff` is a correct + superset of truly-idle tasks (true idleness also requires the latest Mongo + message to predate the cutoff), so we never under-include. 
+ + Keyset-paginated by id ascending; pass the last returned id as `after_id` + to fetch the next page. Fail-closed: empty `agent_names` returns []. + """ + if not agent_names: + return [] + + cutoff = datetime.now(UTC) - timedelta(days=idle_days) + query = ( + select(TaskORM.id) + .join(TaskAgentORM, TaskORM.id == TaskAgentORM.task_id) + .join(AgentORM, TaskAgentORM.agent_id == AgentORM.id) + .where( + TaskORM.cleaned_at.is_(None), + TaskORM.updated_at < cutoff, + AgentORM.name.in_(list(agent_names)), + ) + .order_by(TaskORM.id.asc()) + .limit(limit) + .distinct() + ) + if after_id is not None: + query = query.where(TaskORM.id > after_id) + + async with self.start_async_db_session(False) as session: + result = await session.execute(query) + return [row[0] for row in result.all()] +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `make test FILE=tests/integration/test_retention_cleanup_discovery.py` +Expected: PASS (both tests). + +> If the `AgentORM` / `TaskORM` inserts fail on a NOT NULL column, add the missing +> column to the corresponding `_seed_*` helper, giving it a valid value for that column as declared in +> `src/adapters/orm.py` — do not change the query. + +- [ ] **Step 5: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/domain/repositories/task_repository.py tests/integration/test_retention_cleanup_discovery.py +git commit -m "feat(retention): add keyset-paginated cleanup-candidate discovery query" +``` + +--- + +## Task 3: Use-case factory for worker context + +**Files:** +- Create: `agentex/src/temporal/task_retention_factory.py` + +No dedicated test (it's pure wiring; Task 4's unit tests inject mocks and do not call it, so it is checked by the import smoke test in Step 2 below and exercised at runtime). 
Verified against real constructor signatures: +`TaskRepository`/`EventRepository`/`AgentTaskTrackerRepository(rw_maker, ro_maker)`, +`TaskMessageRepository(db)`, `TaskStateRepository(db)`, `TaskMessageService(message_repository=...)`, +`TemporalAdapter(temporal_client=...)`, `TaskRetentionService(...)`, `TaskRetentionUseCase(retention_service=...)`. + +- [ ] **Step 1: Create the factory** + +Create `agentex/src/temporal/task_retention_factory.py`: + +```python +""" +Construct a TaskRetentionUseCase outside FastAPI's Depends DI, for use inside +Temporal worker processes. Mirrors the manual-wiring pattern in +run_healthcheck_workflow.py (repositories built from session makers). +""" + +from src.adapters.temporal.adapter_temporal import TemporalAdapter +from src.config.dependencies import ( + GlobalDependencies, + database_async_read_only_session_maker, + database_async_read_write_engine, + database_async_read_write_session_maker, + httpx_client, +) +from src.domain.repositories.agent_task_tracker_repository import ( + AgentTaskTrackerRepository, +) +from src.domain.repositories.event_repository import EventRepository +from src.domain.repositories.task_message_repository import TaskMessageRepository +from src.domain.repositories.task_repository import TaskRepository +from src.domain.repositories.task_state_repository import TaskStateRepository +from src.domain.services.task_message_service import TaskMessageService +from src.domain.services.task_retention_service import TaskRetentionService +from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase + + +def build_task_retention_use_case( + global_dependencies: GlobalDependencies, +) -> TaskRetentionUseCase: + """Wire a TaskRetentionUseCase from an already-loaded GlobalDependencies.""" + engine = database_async_read_write_engine() + rw_session_maker = database_async_read_write_session_maker(engine) + ro_session_maker = database_async_read_only_session_maker(engine) + + task_repository = 
TaskRepository(rw_session_maker, ro_session_maker) + event_repository = EventRepository(rw_session_maker, ro_session_maker) + agent_task_tracker_repository = AgentTaskTrackerRepository( + rw_session_maker, ro_session_maker + ) + + task_message_repository = TaskMessageRepository(global_dependencies.mongodb_database) + task_state_repository = TaskStateRepository(global_dependencies.mongodb_database) + task_message_service = TaskMessageService(message_repository=task_message_repository) + + temporal_adapter = TemporalAdapter( + temporal_client=global_dependencies.temporal_client + ) + + retention_service = TaskRetentionService( + task_repository=task_repository, + task_message_service=task_message_service, + task_message_repository=task_message_repository, + task_state_repository=task_state_repository, + event_repository=event_repository, + agent_task_tracker_repository=agent_task_tracker_repository, + temporal_adapter=temporal_adapter, + httpx_client=httpx_client(), + ) + return TaskRetentionUseCase(retention_service=retention_service) +``` + +- [ ] **Step 2: Verify it imports** + +Run: `cd agentex && uv run python -c "from src.temporal.task_retention_factory import build_task_retention_use_case; print('ok')"` +Expected: prints `ok` (no ImportError). + +- [ ] **Step 3: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/temporal/task_retention_factory.py +git commit -m "feat(retention): add worker-context factory for TaskRetentionUseCase" +``` + +--- + +## Task 4: Cleanup activities + +**Files:** +- Create: `agentex/src/temporal/activities/retention_cleanup_activities.py` +- Test: `agentex/tests/unit/temporal/test_retention_cleanup_activities.py` + +The activities cross the Temporal boundary with JSON-native types only. `clean_task` +returns a dict: `{"task_id", "status": "cleaned"|"skipped", "reason", "messages_deleted", "task_states_deleted", "events_deleted"}`. 
+ +- [ ] **Step 1: Write the failing tests** + +Create `agentex/tests/unit/temporal/test_retention_cleanup_activities.py`: + +```python +from datetime import UTC, datetime +from unittest.mock import AsyncMock + +import pytest + +from src.domain.entities.task_retention import TaskCleanupResultEntity +from src.domain.exceptions import ClientError +from src.temporal.activities.retention_cleanup_activities import ( + RetentionCleanupActivities, +) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_find_cleanup_candidates_delegates_to_repo(): + repo = AsyncMock() + repo.list_cleanup_candidate_ids.return_value = ["t1", "t2"] + activities = RetentionCleanupActivities(task_repository=repo, use_case=AsyncMock()) + + result = await activities.find_cleanup_candidates( + after_id=None, limit=200, idle_days=7, agent_names=["a"] + ) + + assert result == ["t1", "t2"] + repo.list_cleanup_candidate_ids.assert_awaited_once_with( + idle_days=7, agent_names=["a"], after_id=None, limit=200 + ) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_cleaned_outcome(): + use_case = AsyncMock() + use_case.clean_task.return_value = TaskCleanupResultEntity( + task_id="t1", + cleaned_at=datetime.now(UTC), + messages_deleted=3, + task_states_deleted=1, + events_deleted=2, + ) + activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) + + outcome = await activities.clean_task(task_id="t1", idle_days=7) + + assert outcome["status"] == "cleaned" + assert outcome["task_id"] == "t1" + assert outcome["messages_deleted"] == 3 + use_case.clean_task.assert_awaited_once_with(task_id="t1", force=False, idle_days=7) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_clienterror_maps_to_skipped(): + use_case = AsyncMock() + use_case.clean_task.side_effect = ClientError("Cannot clean task t1: status is RUNNING (active)") + activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) + + outcome = await 
activities.clean_task(task_id="t1", idle_days=7) + + assert outcome["status"] == "skipped" + assert "RUNNING" in outcome["reason"] + assert outcome["task_id"] == "t1" + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_unexpected_error_propagates(): + use_case = AsyncMock() + use_case.clean_task.side_effect = RuntimeError("mongo timeout") + activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) + + with pytest.raises(RuntimeError): + await activities.clean_task(task_id="t1", idle_days=7) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py` +Expected: FAIL — module `retention_cleanup_activities` does not exist. + +- [ ] **Step 3: Implement the activities** + +Create `agentex/src/temporal/activities/retention_cleanup_activities.py`: + +```python +""" +Temporal activities for the scheduled task-retention cleanup sweep. + +Two activities: +- find_cleanup_candidates: cheap pre-filtered, keyset-paginated discovery. +- clean_task: delegates to TaskRetentionUseCase.clean_task; catches ClientError + (the three policy/safety refusals) and maps it to a 'skipped' outcome so the + caller's child workflow completes cleanly. Genuine transient errors propagate + so Temporal retries them. + +Boundary types are JSON-native (the backend data converter does not serialize +Pydantic models). 
+""" + +from src.domain.exceptions import ClientError +from src.domain.repositories.task_repository import TaskRepository +from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase +from src.utils.logging import make_logger +from temporalio import activity + +logger = make_logger(__name__) + +FIND_CLEANUP_CANDIDATES_ACTIVITY = "find_cleanup_candidates_activity" +CLEAN_TASK_ACTIVITY = "clean_task_activity" + + +class RetentionCleanupActivities: + def __init__( + self, + task_repository: TaskRepository, + use_case: TaskRetentionUseCase, + ): + self.task_repository = task_repository + self.use_case = use_case + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def find_cleanup_candidates( + self, + after_id: str | None, + limit: int, + idle_days: int, + agent_names: list[str], + ) -> list[str]: + return await self.task_repository.list_cleanup_candidate_ids( + idle_days=idle_days, + agent_names=agent_names, + after_id=after_id, + limit=limit, + ) + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def clean_task(self, task_id: str, idle_days: int) -> dict: + try: + result = await self.use_case.clean_task( + task_id=task_id, force=False, idle_days=idle_days + ) + return { + "task_id": result.task_id, + "status": "cleaned", + "reason": None, + "messages_deleted": result.messages_deleted, + "task_states_deleted": result.task_states_deleted, + "events_deleted": result.events_deleted, + } + except ClientError as e: + # Expected policy/safety refusal (RUNNING / not idle / unprocessed + # events). Backstop for the rare race the pre-filter can't catch. 
+ logger.info( + "task_cleanup_skipped", + extra={"task_id": task_id, "reason": str(e)}, + ) + return { + "task_id": task_id, + "status": "skipped", + "reason": str(e), + "messages_deleted": 0, + "task_states_deleted": 0, + "events_deleted": 0, + } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py` +Expected: PASS (all four). + +- [ ] **Step 5: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/temporal/activities/retention_cleanup_activities.py tests/unit/temporal/test_retention_cleanup_activities.py +git commit -m "feat(retention): add cleanup discovery + clean activities" +``` + +--- + +## Task 5: Sweep + per-task child workflows + +**Files:** +- Create: `agentex/src/temporal/workflows/retention_cleanup_workflow.py` +- Test: `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py` + +- [ ] **Step 1: Write the failing workflow test** + +Create `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py`. It runs the real workflows in a time-skipping `WorkflowEnvironment` with **mocked activities** so no DB is needed. + +```python +import uuid + +import pytest +from temporalio import activity +from temporalio.testing import WorkflowEnvironment +from temporalio.worker import UnsandboxedWorkflowRunner, Worker + +from src.temporal.activities.retention_cleanup_activities import ( + CLEAN_TASK_ACTIVITY, + FIND_CLEANUP_CANDIDATES_ACTIVITY, +) +from src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, + RetentionCleanupTaskWorkflow, +) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_sweep_cleans_all_pages_and_aggregates(): + # Two pages of candidates then empty; one task is skipped, one fails once then is counted failed. 
+ pages = {None: ["t1", "t2"], "t2": ["t3"], "t3": []} + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def fake_find(after_id, limit, idle_days, agent_names) -> list[str]: + return pages[after_id] + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def fake_clean(task_id: str, idle_days: int) -> dict: + if task_id == "t2": + return {"task_id": task_id, "status": "skipped", "reason": "RUNNING", + "messages_deleted": 0, "task_states_deleted": 0, "events_deleted": 0} + if task_id == "t3": + raise RuntimeError("permanent failure") + return {"task_id": task_id, "status": "cleaned", "reason": None, + "messages_deleted": 1, "task_states_deleted": 0, "events_deleted": 0} + + async with await WorkflowEnvironment.start_time_skipping() as env: + async with Worker( + env.client, + task_queue="test-retention", + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[fake_find, fake_clean], + workflow_runner=UnsandboxedWorkflowRunner(), + ): + summary = await env.client.execute_workflow( + RetentionCleanupSweepWorkflow.run, + { + "idle_days": 7, + "agent_names": ["a"], + "page_size": 2, + "max_in_flight": 2, + }, + id=f"sweep-{uuid.uuid4()}", + task_queue="test-retention", + ) + + assert summary["cleaned"] == 1 # t1 + assert summary["skipped"] == 1 # t2 + assert summary["failed"] == 1 # t3 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py` +Expected: FAIL — module `retention_cleanup_workflow` does not exist. + +- [ ] **Step 3: Implement the workflows** + +Create `agentex/src/temporal/workflows/retention_cleanup_workflow.py`: + +```python +""" +Scheduled task-retention cleanup workflows. + +RetentionCleanupSweepWorkflow: started by a Temporal Schedule. 
Pulls one page of +candidate task ids, fans out one child workflow per task (bounded by +max_in_flight), aggregates cleaned/skipped/failed counts, then continue_as_new's +to the next page so workflow history stays bounded regardless of backlog size. + +RetentionCleanupTaskWorkflow: per-task child. Invokes the clean activity, which +already maps the policy/safety ClientError refusals to a 'skipped' outcome; only +genuine transient errors surface as activity failures (and are retried). +""" + +import asyncio +from datetime import timedelta + +from src.temporal.activities.retention_cleanup_activities import ( + CLEAN_TASK_ACTIVITY, + FIND_CLEANUP_CANDIDATES_ACTIVITY, +) +from src.utils.logging import make_logger +from temporalio import workflow +from temporalio.common import RetryPolicy + +logger = make_logger(__name__) + + +def _chunked(items: list[str], size: int) -> list[list[str]]: + return [items[i : i + size] for i in range(0, len(items), size)] + + +@workflow.defn +class RetentionCleanupTaskWorkflow: + @workflow.run + async def run(self, args: dict) -> dict: + return await workflow.execute_activity( + CLEAN_TASK_ACTIVITY, + args=[args["task_id"], args["idle_days"]], + start_to_close_timeout=timedelta(seconds=60), + retry_policy=RetryPolicy( + maximum_attempts=3, + initial_interval=timedelta(seconds=1), + backoff_coefficient=2.0, + ), + ) + + +@workflow.defn +class RetentionCleanupSweepWorkflow: + @workflow.run + async def run(self, args: dict) -> dict: + idle_days = args["idle_days"] + agent_names = args["agent_names"] + page_size = args.get("page_size", 200) + max_in_flight = args.get("max_in_flight", 20) + after_id = args.get("after_id") + totals = args.get("totals", {"cleaned": 0, "skipped": 0, "failed": 0}) + + task_ids = await workflow.execute_activity( + FIND_CLEANUP_CANDIDATES_ACTIVITY, + args=[after_id, page_size, idle_days, agent_names], + start_to_close_timeout=timedelta(seconds=30), + retry_policy=RetryPolicy( + maximum_attempts=3, + 
initial_interval=timedelta(seconds=1), + backoff_coefficient=2.0, + ), + ) + + if not task_ids: + logger.info("retention_cleanup_sweep_completed", extra=totals) + return totals + + for batch in _chunked(task_ids, max_in_flight): + results = await asyncio.gather( + *[ + workflow.execute_child_workflow( + RetentionCleanupTaskWorkflow.run, + {"task_id": task_id, "idle_days": idle_days}, + id=f"retention-cleanup-task-{task_id}", + retry_policy=RetryPolicy(maximum_attempts=1), + ) + for task_id in batch + ], + return_exceptions=True, + ) + for result in results: + if isinstance(result, BaseException): + totals["failed"] += 1 + else: + status = result.get("status", "failed") + totals[status] = totals.get(status, 0) + 1 + + # Bound history: hand the next page to a fresh run. + workflow.continue_as_new( + { + "idle_days": idle_days, + "agent_names": agent_names, + "page_size": page_size, + "max_in_flight": max_in_flight, + "after_id": task_ids[-1], + "totals": totals, + } + ) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py` +Expected: PASS. (Note: `WorkflowEnvironment.start_time_skipping()` downloads a test server on first run; ensure network access. If `temporalio` test deps are missing, install with `uv sync`.) 
+ +- [ ] **Step 5: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/temporal/workflows/retention_cleanup_workflow.py tests/unit/temporal/test_retention_cleanup_workflow.py +git commit -m "feat(retention): add sweep + per-task cleanup workflows" +``` + +--- + +## Task 6: Register workflows + activities on the worker + +**Files:** +- Modify: `agentex/src/temporal/run_worker.py` + +- [ ] **Step 1: Add a worker factory for retention cleanup** + +In `agentex/src/temporal/run_worker.py`, add imports near the existing imports: + +```python +from src.config.dependencies import GlobalDependencies +from src.temporal.activities.retention_cleanup_activities import ( + RetentionCleanupActivities, +) +from src.temporal.task_retention_factory import build_task_retention_use_case +from src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, + RetentionCleanupTaskWorkflow, +) +from src.domain.repositories.task_repository import TaskRepository +``` + +Add this factory function (mirrors `create_health_check_worker`): + +```python +def create_retention_cleanup_worker( + global_dependencies: GlobalDependencies, +) -> asyncio.Task: + """Create a worker that serves the retention-cleanup workflows + activities.""" + task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) + + engine = database_async_read_write_engine() + rw_session_maker = database_async_read_write_session_maker(engine) + ro_session_maker = database_async_read_only_session_maker(engine) + task_repository = TaskRepository(rw_session_maker, ro_session_maker) + use_case = build_task_retention_use_case(global_dependencies) + + retention_activities = RetentionCleanupActivities( + task_repository=task_repository, + use_case=use_case, + ) + + return asyncio.create_task( + run_worker( + task_queue=task_queue, + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[ + 
retention_activities.find_cleanup_candidates, + retention_activities.clean_task, + ], + max_workers=50, + max_concurrent_activities=50, + ) + ) +``` + +> **Note on `run_worker`'s global `health_check_worker`:** the existing `run_worker` +> assigns the worker to a module global and shuts that single global down in +> `finally`. Running two `run_worker` tasks in one process would clobber that +> global. For v1 the retention worker runs as its **own process** (see Task 8 — +> a separate docker-compose service / k8s deployment invoking +> `create_retention_cleanup_worker` via a `main()`), so it does not share the +> health-check process. Add a `main()` to this module that loads +> `GlobalDependencies` and awaits `create_retention_cleanup_worker(...)`, guarded +> so it only runs when invoked as the retention entrypoint. + +Add a retention entrypoint `main` (separate from the health-check `main`): + +```python +async def run_retention_cleanup_worker_main() -> None: + global_dependencies = GlobalDependencies() + await global_dependencies.load() + worker_task = create_retention_cleanup_worker(global_dependencies) + await worker_task +``` + +- [ ] **Step 2: Verify imports/compile** + +Run: `cd agentex && uv run python -c "from src.temporal.run_worker import create_retention_cleanup_worker, run_retention_cleanup_worker_main; print('ok')"` +Expected: prints `ok`. + +- [ ] **Step 3: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/temporal/run_worker.py +git commit -m "feat(retention): register cleanup workflows + activities on worker" +``` + +--- + +## Task 7: Schedule bootstrap script + +**Files:** +- Create: `agentex/src/temporal/run_retention_cleanup_schedule.py` + +This mirrors `run_healthcheck_workflow.py`: a startup script that, when enabled, +creates (or no-ops if already present) the Temporal Schedule. 
+ +- [ ] **Step 1: Create the bootstrap script** + +Create `agentex/src/temporal/run_retention_cleanup_schedule.py`: + +```python +""" +Create the Temporal Schedule that drives the scheduled task-retention cleanup. + +Runs at startup (mirrors run_healthcheck_workflow.py). No-op unless +RETENTION_CLEANUP_ENABLED is true and an agent allowlist is configured +(fail-closed). Idempotent: if the schedule already exists, it is left as-is. +""" + +import asyncio + +from src.adapters.temporal.adapter_temporal import TemporalAdapter +from src.adapters.temporal.client_factory import TemporalClientFactory +from src.adapters.temporal.exceptions import TemporalScheduleAlreadyExistsError +from src.config.dependencies import GlobalDependencies +from src.config.environment_variables import EnvironmentVariables +from src.temporal.run_worker import AGENTEX_SERVER_TASK_QUEUE +from src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, +) +from src.utils.logging import make_logger + +logger = make_logger(__name__) + +SCHEDULE_ID = "retention-cleanup-sweep" +WORKFLOW_ID = "retention-cleanup-sweep" + + +async def main() -> None: + global_dependencies = GlobalDependencies() + await global_dependencies.load() + + env = EnvironmentVariables.refresh() + if not env or not env.RETENTION_CLEANUP_ENABLED: + logger.info("Retention cleanup is not enabled; skipping schedule creation") + return + if not env.RETENTION_CLEANUP_AGENT_ALLOWLIST: + logger.warning( + "Retention cleanup enabled but agent allowlist is empty (fail-closed); " + "skipping schedule creation" + ) + return + if not TemporalClientFactory.is_temporal_configured(env): + logger.error("Temporal is not configured; skipping schedule creation") + return + + task_queue = env.AGENTEX_SERVER_TASK_QUEUE or AGENTEX_SERVER_TASK_QUEUE + adapter = TemporalAdapter(temporal_client=global_dependencies.temporal_client) + + workflow_args = { + "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, + "agent_names": 
env.RETENTION_CLEANUP_AGENT_ALLOWLIST, + "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, + "max_in_flight": env.RETENTION_CLEANUP_MAX_IN_FLIGHT, + } + + try: + await adapter.create_schedule( + schedule_id=SCHEDULE_ID, + workflow=RetentionCleanupSweepWorkflow.run, + workflow_id=WORKFLOW_ID, + args=[workflow_args], + task_queue=task_queue, + cron_expressions=[env.RETENTION_CLEANUP_CRON], + ) + logger.info( + "Created retention-cleanup schedule", + extra={"cron": env.RETENTION_CLEANUP_CRON, "args": workflow_args}, + ) + except TemporalScheduleAlreadyExistsError: + logger.info("Retention-cleanup schedule already exists; leaving as-is") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +- [ ] **Step 2: Verify imports/compile** + +Run: `cd agentex && uv run python -c "import src.temporal.run_retention_cleanup_schedule as m; print(m.SCHEDULE_ID)"` +Expected: prints `retention-cleanup-sweep`. + +- [ ] **Step 3: Lint + commit** + +```bash +cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ +git add src/temporal/run_retention_cleanup_schedule.py +git commit -m "feat(retention): add cleanup schedule bootstrap script" +``` + +--- + +## Task 8: Wire local dev (docker-compose) + docs + +**Files:** +- Modify: `agentex/docker-compose.yml` + +- [ ] **Step 1: Add the retention worker service + schedule bootstrap** + +In `agentex/docker-compose.yml`, add a worker service modeled on the existing +`agentex-temporal-worker` service, but running the retention entrypoint, and add a +schedule-bootstrap invocation. Use the same image/build, env, and `depends_on` as +`agentex-temporal-worker`. 
Set its command to run the retention worker: + +```yaml + agentex-retention-cleanup-worker: + # (copy build/image/env/depends_on/networks from agentex-temporal-worker) + command: >- + python -c "import asyncio; from src.temporal.run_worker import run_retention_cleanup_worker_main; asyncio.run(run_retention_cleanup_worker_main())" + environment: + # inherit the temporal-worker env, plus: + RETENTION_CLEANUP_ENABLED: "${RETENTION_CLEANUP_ENABLED:-false}" + RETENTION_CLEANUP_AGENT_ALLOWLIST: "${RETENTION_CLEANUP_AGENT_ALLOWLIST:-}" + RETENTION_CLEANUP_IDLE_DAYS: "${RETENTION_CLEANUP_IDLE_DAYS:-7}" + RETENTION_CLEANUP_CRON: "${RETENTION_CLEANUP_CRON:-0 4 * * *}" + RETENTION_CLEANUP_PAGE_SIZE: "${RETENTION_CLEANUP_PAGE_SIZE:-200}" + RETENTION_CLEANUP_MAX_IN_FLIGHT: "${RETENTION_CLEANUP_MAX_IN_FLIGHT:-20}" +``` + +Add the schedule bootstrap to the `agentex` API service startup command, right +after the health-check workflow bootstrap (the existing command runs +`python src/temporal/run_healthcheck_workflow.py`); append: + +``` +python src/temporal/run_retention_cleanup_schedule.py +``` + +so the schedule is created on API startup if it does not already exist (an existing schedule is left as-is), gated by `RETENTION_CLEANUP_ENABLED`. The `agentex` service therefore needs the same `RETENTION_CLEANUP_*` variables as the worker service, because the bootstrap script reads them. + +- [ ] **Step 2: Validate compose syntax** + +Run: `cd agentex && docker compose config >/dev/null && echo "compose ok"` +Expected: prints `compose ok` (no YAML/interpolation errors). + +- [ ] **Step 3: Commit** + +```bash +git add agentex/docker-compose.yml +git commit -m "feat(retention): wire cleanup worker + schedule bootstrap for local dev" +``` + +--- + +## Task 9: Full test sweep + final verification + +- [ ] **Step 1: Run the full new test surface** + +```bash +cd agentex +make test FILE=tests/unit/config/test_retention_cleanup_env.py +make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py +make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py +make test FILE=tests/integration/test_retention_cleanup_discovery.py +``` +Expected: all PASS.
+ +- [ ] **Step 2: Lint the whole changed surface** + +```bash +cd agentex && uv run ruff check src/ && uv run ruff format --check src/ +``` +Expected: no errors. + +- [ ] **Step 3: Manual smoke (optional, local)** + +With `RETENTION_CLEANUP_ENABLED=true` and `RETENTION_CLEANUP_AGENT_ALLOWLIST=my-agent` (the name of a local agent; an empty allowlist is fail-closed and no Schedule is created), +`make dev`, then in the Temporal UI (http://localhost:8080) confirm a Schedule +`retention-cleanup-sweep` exists; trigger it manually and confirm a +`RetentionCleanupSweepWorkflow` run completes with a summary. + +- [ ] **Step 4: Final commit (if any lint/fixups remain)** + +```bash +cd agentex && git add -A && git commit -m "chore(retention): lint + test fixups for scheduled cleanup" || echo "nothing to commit" +``` + +--- + +## Notes / Out of Scope + +- **Worker must run in the deployed env.** This plan wires the worker for local + docker-compose. The deployed (k8s) environment must run an agentex-backend + Temporal worker on the `agentex-server` queue for the Schedule to execute — + tracked as a separate infra change (see spec "Open prerequisites"). +- **No export sink** in v1 (clean only), per the spec. +- **Audit trail** is the existing `task_cleanup_completed` structured log (emitted + inside `clean_task`) plus the new `task_cleanup_skipped` and + `retention_cleanup_sweep_completed` logs — faceted in Datadog. No new table. From b2e86b7369b7c83cf77dc3b13f14ed1c706b402a Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 11:37:47 -0700 Subject: [PATCH 03/20] feat(retention): add scheduled-cleanup configuration env vars Adds RETENTION_CLEANUP_ENABLED, RETENTION_CLEANUP_AGENT_ALLOWLIST, RETENTION_CLEANUP_IDLE_DAYS, RETENTION_CLEANUP_CRON, RETENTION_CLEANUP_PAGE_SIZE, and RETENTION_CLEANUP_MAX_IN_FLIGHT to EnvVarKeys, EnvironmentVariables, and the refresh() parser, following the existing ENABLE_HEALTH_CHECK_WORKFLOW pattern. Includes unit tests covering both explicit values and defaults.
--- agentex/src/config/environment_variables.py | 34 +++++++++++++++ .../unit/config/test_retention_cleanup_env.py | 43 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 agentex/tests/unit/config/test_retention_cleanup_env.py diff --git a/agentex/src/config/environment_variables.py b/agentex/src/config/environment_variables.py index 2d41740a..7d1f2a57 100644 --- a/agentex/src/config/environment_variables.py +++ b/agentex/src/config/environment_variables.py @@ -58,6 +58,12 @@ class EnvVarKeys(str, Enum): AGENTEX_SERVER_TASK_QUEUE = "AGENTEX_SERVER_TASK_QUEUE" ENABLE_HEALTH_CHECK_WORKFLOW = "ENABLE_HEALTH_CHECK_WORKFLOW" WEBHOOK_REQUEST_TIMEOUT = "WEBHOOK_REQUEST_TIMEOUT" + RETENTION_CLEANUP_ENABLED = "RETENTION_CLEANUP_ENABLED" + RETENTION_CLEANUP_AGENT_ALLOWLIST = "RETENTION_CLEANUP_AGENT_ALLOWLIST" + RETENTION_CLEANUP_IDLE_DAYS = "RETENTION_CLEANUP_IDLE_DAYS" + RETENTION_CLEANUP_CRON = "RETENTION_CLEANUP_CRON" + RETENTION_CLEANUP_PAGE_SIZE = "RETENTION_CLEANUP_PAGE_SIZE" + RETENTION_CLEANUP_MAX_IN_FLIGHT = "RETENTION_CLEANUP_MAX_IN_FLIGHT" class Environment(str, Enum): @@ -114,6 +120,12 @@ class EnvironmentVariables(BaseModel): AGENTEX_SERVER_TASK_QUEUE: str | None = None ENABLE_HEALTH_CHECK_WORKFLOW: bool = False WEBHOOK_REQUEST_TIMEOUT: float = 15.0 # Webhook request timeout in seconds + RETENTION_CLEANUP_ENABLED: bool = False + RETENTION_CLEANUP_AGENT_ALLOWLIST: list[str] = [] + RETENTION_CLEANUP_IDLE_DAYS: int = 7 + RETENTION_CLEANUP_CRON: str = "0 4 * * *" + RETENTION_CLEANUP_PAGE_SIZE: int = 200 + RETENTION_CLEANUP_MAX_IN_FLIGHT: int = 20 @classmethod def refresh(cls, force_refresh: bool = False) -> EnvironmentVariables | None: @@ -203,6 +215,28 @@ def refresh(cls, force_refresh: bool = False) -> EnvironmentVariables | None: WEBHOOK_REQUEST_TIMEOUT=float( os.environ.get(EnvVarKeys.WEBHOOK_REQUEST_TIMEOUT, "15.0") ), + RETENTION_CLEANUP_ENABLED=( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_ENABLED, "false") == "true" + ), + 
RETENTION_CLEANUP_AGENT_ALLOWLIST=[ + name.strip() + for name in os.environ.get( + EnvVarKeys.RETENTION_CLEANUP_AGENT_ALLOWLIST, "" + ).split(",") + if name.strip() + ], + RETENTION_CLEANUP_IDLE_DAYS=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_IDLE_DAYS, "7") + ), + RETENTION_CLEANUP_CRON=os.environ.get( + EnvVarKeys.RETENTION_CLEANUP_CRON, "0 4 * * *" + ), + RETENTION_CLEANUP_PAGE_SIZE=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_PAGE_SIZE, "200") + ), + RETENTION_CLEANUP_MAX_IN_FLIGHT=int( + os.environ.get(EnvVarKeys.RETENTION_CLEANUP_MAX_IN_FLIGHT, "20") + ), ) refreshed_environment_variables = environment_variables return refreshed_environment_variables diff --git a/agentex/tests/unit/config/test_retention_cleanup_env.py b/agentex/tests/unit/config/test_retention_cleanup_env.py new file mode 100644 index 00000000..15c2791d --- /dev/null +++ b/agentex/tests/unit/config/test_retention_cleanup_env.py @@ -0,0 +1,43 @@ +import pytest +from src.config.environment_variables import EnvironmentVariables + + +@pytest.mark.unit +def test_retention_cleanup_env_parses_enabled_and_allowlist(monkeypatch): + monkeypatch.setenv("RETENTION_CLEANUP_ENABLED", "true") + monkeypatch.setenv("RETENTION_CLEANUP_AGENT_ALLOWLIST", "agent-a, agent-b ,agent-c") + monkeypatch.setenv("RETENTION_CLEANUP_IDLE_DAYS", "14") + monkeypatch.setenv("RETENTION_CLEANUP_CRON", "0 3 * * *") + monkeypatch.setenv("RETENTION_CLEANUP_PAGE_SIZE", "50") + monkeypatch.setenv("RETENTION_CLEANUP_MAX_IN_FLIGHT", "5") + + env = EnvironmentVariables.refresh(force_refresh=True) + + assert env.RETENTION_CLEANUP_ENABLED is True + assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == ["agent-a", "agent-b", "agent-c"] + assert env.RETENTION_CLEANUP_IDLE_DAYS == 14 + assert env.RETENTION_CLEANUP_CRON == "0 3 * * *" + assert env.RETENTION_CLEANUP_PAGE_SIZE == 50 + assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 5 + + +@pytest.mark.unit +def test_retention_cleanup_env_defaults(monkeypatch): + for key in ( + 
"RETENTION_CLEANUP_ENABLED", + "RETENTION_CLEANUP_AGENT_ALLOWLIST", + "RETENTION_CLEANUP_IDLE_DAYS", + "RETENTION_CLEANUP_CRON", + "RETENTION_CLEANUP_PAGE_SIZE", + "RETENTION_CLEANUP_MAX_IN_FLIGHT", + ): + monkeypatch.delenv(key, raising=False) + + env = EnvironmentVariables.refresh(force_refresh=True) + + assert env.RETENTION_CLEANUP_ENABLED is False + assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == [] # fail-closed + assert env.RETENTION_CLEANUP_IDLE_DAYS == 7 + assert env.RETENTION_CLEANUP_CRON == "0 4 * * *" + assert env.RETENTION_CLEANUP_PAGE_SIZE == 200 + assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 20 From 38c46b3d84fe2312792d760b3bcda6c64bc1fbf9 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 11:47:10 -0700 Subject: [PATCH 04/20] feat(retention): add keyset-paginated cleanup-candidate discovery query --- .../domain/repositories/task_repository.py | 47 +++++++ .../test_retention_cleanup_discovery.py | 115 ++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 agentex/tests/integration/test_retention_cleanup_discovery.py diff --git a/agentex/src/domain/repositories/task_repository.py b/agentex/src/domain/repositories/task_repository.py index fd67ea97..0f41dc89 100644 --- a/agentex/src/domain/repositories/task_repository.py +++ b/agentex/src/domain/repositories/task_repository.py @@ -1,4 +1,5 @@ from collections.abc import Sequence +from datetime import UTC, datetime, timedelta from typing import Annotated, Literal from fastapi import Depends @@ -94,6 +95,52 @@ async def list_with_join( relationships=relationships, ) + async def list_cleanup_candidate_ids( + self, + *, + idle_days: int, + agent_names: Sequence[str], + after_id: str | None, + limit: int, + ) -> list[str]: + """ + Return ids of tasks eligible for scheduled retention cleanup. + + Cheap, index-friendly PRE-FILTER only — the authoritative idle / status / + unprocessed-events checks live in TaskRetentionService.clean_task. 
This + deliberately omits a status filter: status is race-prone (a task can flip + to RUNNING between this query and the clean call), so the trustworthy + RUNNING guard is enforced at clean-time. `updated_at < cutoff` is a correct + superset of truly-idle tasks (true idleness also requires the latest Mongo + message to predate the cutoff), so we never under-include. + + Keyset-paginated by id ascending; pass the last returned id as `after_id` + to fetch the next page. Fail-closed: empty `agent_names` returns []. + """ + if not agent_names: + return [] + + cutoff = datetime.now(UTC) - timedelta(days=idle_days) + query = ( + select(TaskORM.id) + .join(TaskAgentORM, TaskORM.id == TaskAgentORM.task_id) + .join(AgentORM, TaskAgentORM.agent_id == AgentORM.id) + .where( + TaskORM.cleaned_at.is_(None), + TaskORM.updated_at < cutoff, + AgentORM.name.in_(list(agent_names)), + ) + .order_by(TaskORM.id.asc()) + .limit(limit) + .distinct() + ) + if after_id is not None: + query = query.where(TaskORM.id > after_id) + + async with self.start_async_db_session(False) as session: + result = await session.execute(query) + return [row[0] for row in result.all()] + async def create(self, agent_id: str, task: TaskEntity) -> TaskEntity: """Create task and establish agent relationships""" diff --git a/agentex/tests/integration/test_retention_cleanup_discovery.py b/agentex/tests/integration/test_retention_cleanup_discovery.py new file mode 100644 index 00000000..bccab933 --- /dev/null +++ b/agentex/tests/integration/test_retention_cleanup_discovery.py @@ -0,0 +1,115 @@ +from datetime import UTC, datetime, timedelta + +import pytest +from sqlalchemy import insert +from src.adapters.orm import AgentORM, TaskAgentORM, TaskORM +from src.domain.entities.agents import AgentStatus +from src.domain.entities.tasks import TaskStatus + + +async def _seed_agent(session, agent_id: str, name: str) -> None: + await session.execute( + insert(AgentORM).values( + id=agent_id, + name=name, + 
description="seed", + acp_url=f"http://{agent_id}:8000", + acp_type="sync", + status=AgentStatus.READY, + ) + ) + + +async def _seed_task( + session, + *, + task_id: str, + agent_id: str, + updated_at: datetime, + cleaned_at: datetime | None, + status: TaskStatus = TaskStatus.COMPLETED, +) -> None: + await session.execute( + insert(TaskORM).values( + id=task_id, + name=task_id, + status=status, + updated_at=updated_at, + cleaned_at=cleaned_at, + ) + ) + await session.execute( + insert(TaskAgentORM).values(task_id=task_id, agent_id=agent_id) + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_discovery_filters_and_keyset_paging(isolated_repositories): + repo = isolated_repositories["task_repository"] + now = datetime.now(UTC) + old = now - timedelta(days=30) + + async with isolated_repositories["postgres_rw_session_factory"]() as session: + await _seed_agent(session, "agent-allowed", "allowed-agent") + await _seed_agent(session, "agent-other", "other-agent") + await _seed_task( + session, + task_id="t-aaa", + agent_id="agent-allowed", + updated_at=old, + cleaned_at=None, + ) + await _seed_task( + session, + task_id="t-bbb", + agent_id="agent-allowed", + updated_at=old, + cleaned_at=None, + ) + await _seed_task( + session, + task_id="t-fresh", + agent_id="agent-allowed", + updated_at=now, + cleaned_at=None, + ) + await _seed_task( + session, + task_id="t-clean", + agent_id="agent-allowed", + updated_at=old, + cleaned_at=old, + ) + await _seed_task( + session, + task_id="t-other", + agent_id="agent-other", + updated_at=old, + cleaned_at=None, + ) + await session.commit() + + ids = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=100 + ) + assert ids == ["t-aaa", "t-bbb"] + + page1 = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=1 + ) + assert page1 == ["t-aaa"] + page2 = await repo.list_cleanup_candidate_ids( + idle_days=7, 
agent_names=["allowed-agent"], after_id="t-aaa", limit=1 + ) + assert page2 == ["t-bbb"] + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_discovery_empty_allowlist_returns_nothing(isolated_repositories): + repo = isolated_repositories["task_repository"] + ids = await repo.list_cleanup_candidate_ids( + idle_days=7, agent_names=[], after_id=None, limit=100 + ) + assert ids == [] From 20f19c0cad207771252f3228f18a68bd93282e38 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 12:29:20 -0700 Subject: [PATCH 05/20] refactor(retention): use keyword arg for read-only session in discovery query --- agentex/src/domain/repositories/task_repository.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agentex/src/domain/repositories/task_repository.py b/agentex/src/domain/repositories/task_repository.py index 0f41dc89..03f53d49 100644 --- a/agentex/src/domain/repositories/task_repository.py +++ b/agentex/src/domain/repositories/task_repository.py @@ -128,7 +128,7 @@ async def list_cleanup_candidate_ids( .where( TaskORM.cleaned_at.is_(None), TaskORM.updated_at < cutoff, - AgentORM.name.in_(list(agent_names)), + AgentORM.name.in_(agent_names), ) .order_by(TaskORM.id.asc()) .limit(limit) @@ -137,7 +137,7 @@ async def list_cleanup_candidate_ids( if after_id is not None: query = query.where(TaskORM.id > after_id) - async with self.start_async_db_session(False) as session: + async with self.start_async_db_session(allow_writes=False) as session: result = await session.execute(query) return [row[0] for row in result.all()] From 9ef7e5fb022931c41cca3956ef7c77029ce0808d Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 12:30:56 -0700 Subject: [PATCH 06/20] feat(retention): add worker-context factory for TaskRetentionUseCase --- .../src/temporal/task_retention_factory.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 agentex/src/temporal/task_retention_factory.py diff --git 
a/agentex/src/temporal/task_retention_factory.py b/agentex/src/temporal/task_retention_factory.py new file mode 100644 index 00000000..0f9290f6 --- /dev/null +++ b/agentex/src/temporal/task_retention_factory.py @@ -0,0 +1,63 @@ +""" +Construct a TaskRetentionUseCase outside FastAPI's Depends DI, for use inside +Temporal worker processes. Mirrors the manual-wiring pattern in +run_healthcheck_workflow.py (repositories built from session makers). +""" + +from src.adapters.temporal.adapter_temporal import TemporalAdapter +from src.config.dependencies import ( + GlobalDependencies, + database_async_read_only_session_maker, + database_async_read_write_engine, + database_async_read_write_session_maker, + httpx_client, +) +from src.domain.repositories.agent_task_tracker_repository import ( + AgentTaskTrackerRepository, +) +from src.domain.repositories.event_repository import EventRepository +from src.domain.repositories.task_message_repository import TaskMessageRepository +from src.domain.repositories.task_repository import TaskRepository +from src.domain.repositories.task_state_repository import TaskStateRepository +from src.domain.services.task_message_service import TaskMessageService +from src.domain.services.task_retention_service import TaskRetentionService +from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase + + +def build_task_retention_use_case( + global_dependencies: GlobalDependencies, +) -> TaskRetentionUseCase: + """Wire a TaskRetentionUseCase from an already-loaded GlobalDependencies.""" + engine = database_async_read_write_engine() + rw_session_maker = database_async_read_write_session_maker(engine) + ro_session_maker = database_async_read_only_session_maker(engine) + + task_repository = TaskRepository(rw_session_maker, ro_session_maker) + event_repository = EventRepository(rw_session_maker, ro_session_maker) + agent_task_tracker_repository = AgentTaskTrackerRepository( + rw_session_maker, ro_session_maker + ) + + 
task_message_repository = TaskMessageRepository( + global_dependencies.mongodb_database + ) + task_state_repository = TaskStateRepository(global_dependencies.mongodb_database) + task_message_service = TaskMessageService( + message_repository=task_message_repository + ) + + temporal_adapter = TemporalAdapter( + temporal_client=global_dependencies.temporal_client + ) + + retention_service = TaskRetentionService( + task_repository=task_repository, + task_message_service=task_message_service, + task_message_repository=task_message_repository, + task_state_repository=task_state_repository, + event_repository=event_repository, + agent_task_tracker_repository=agent_task_tracker_repository, + temporal_adapter=temporal_adapter, + httpx_client=httpx_client(), + ) + return TaskRetentionUseCase(retention_service=retention_service) From 488106c3a53202401ba05715cb75a66c9d8970fe Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 12:46:57 -0700 Subject: [PATCH 07/20] feat(retention): add cleanup discovery + clean activities --- .../retention_cleanup_activities.py | 77 ++++++++++++++++++ agentex/tests/unit/temporal/__init__.py | 0 .../test_retention_cleanup_activities.py | 80 +++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 agentex/src/temporal/activities/retention_cleanup_activities.py create mode 100644 agentex/tests/unit/temporal/__init__.py create mode 100644 agentex/tests/unit/temporal/test_retention_cleanup_activities.py diff --git a/agentex/src/temporal/activities/retention_cleanup_activities.py b/agentex/src/temporal/activities/retention_cleanup_activities.py new file mode 100644 index 00000000..c8b35454 --- /dev/null +++ b/agentex/src/temporal/activities/retention_cleanup_activities.py @@ -0,0 +1,77 @@ +""" +Temporal activities for the scheduled task-retention cleanup sweep. + +Two activities: +- find_cleanup_candidates: cheap pre-filtered, keyset-paginated discovery. 
+- clean_task: delegates to TaskRetentionUseCase.clean_task; catches ClientError + (the three policy/safety refusals) and maps it to a 'skipped' outcome so the + caller's child workflow completes cleanly. Genuine transient errors propagate + so Temporal retries them. + +Boundary types are JSON-native (the backend data converter does not serialize +Pydantic models). +""" + +from src.domain.exceptions import ClientError +from src.domain.repositories.task_repository import TaskRepository +from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase +from src.utils.logging import make_logger +from temporalio import activity + +logger = make_logger(__name__) + +FIND_CLEANUP_CANDIDATES_ACTIVITY = "find_cleanup_candidates_activity" +CLEAN_TASK_ACTIVITY = "clean_task_activity" + + +class RetentionCleanupActivities: + def __init__( + self, + task_repository: TaskRepository, + use_case: TaskRetentionUseCase, + ): + self.task_repository = task_repository + self.use_case = use_case + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def find_cleanup_candidates( + self, + after_id: str | None, + limit: int, + idle_days: int, + agent_names: list[str], + ) -> list[str]: + return await self.task_repository.list_cleanup_candidate_ids( + idle_days=idle_days, + agent_names=agent_names, + after_id=after_id, + limit=limit, + ) + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def clean_task(self, task_id: str, idle_days: int) -> dict: + try: + result = await self.use_case.clean_task( + task_id=task_id, force=False, idle_days=idle_days + ) + return { + "task_id": result.task_id, + "status": "cleaned", + "reason": None, + "messages_deleted": result.messages_deleted, + "task_states_deleted": result.task_states_deleted, + "events_deleted": result.events_deleted, + } + except ClientError as e: + logger.info( + "task_cleanup_skipped", + extra={"task_id": task_id, "reason": str(e)}, + ) + return { + "task_id": task_id, + "status": "skipped", + "reason": 
str(e), + "messages_deleted": 0, + "task_states_deleted": 0, + "events_deleted": 0, + } diff --git a/agentex/tests/unit/temporal/__init__.py b/agentex/tests/unit/temporal/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agentex/tests/unit/temporal/test_retention_cleanup_activities.py b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py new file mode 100644 index 00000000..92d1efdb --- /dev/null +++ b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py @@ -0,0 +1,80 @@ +from datetime import UTC, datetime +from unittest.mock import AsyncMock + +import pytest +from src.domain.entities.task_retention import TaskCleanupResultEntity +from src.domain.exceptions import ClientError +from src.temporal.activities.retention_cleanup_activities import ( + RetentionCleanupActivities, +) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_find_cleanup_candidates_delegates_to_repo(): + repo = AsyncMock() + repo.list_cleanup_candidate_ids.return_value = ["t1", "t2"] + activities = RetentionCleanupActivities(task_repository=repo, use_case=AsyncMock()) + + result = await activities.find_cleanup_candidates( + after_id=None, limit=200, idle_days=7, agent_names=["a"] + ) + + assert result == ["t1", "t2"] + repo.list_cleanup_candidate_ids.assert_awaited_once_with( + idle_days=7, agent_names=["a"], after_id=None, limit=200 + ) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_cleaned_outcome(): + use_case = AsyncMock() + use_case.clean_task.return_value = TaskCleanupResultEntity( + task_id="t1", + cleaned_at=datetime.now(UTC), + messages_deleted=3, + task_states_deleted=1, + events_deleted=2, + ) + activities = RetentionCleanupActivities( + task_repository=AsyncMock(), use_case=use_case + ) + + outcome = await activities.clean_task(task_id="t1", idle_days=7) + + assert outcome["status"] == "cleaned" + assert outcome["task_id"] == "t1" + assert outcome["messages_deleted"] == 3 + 
use_case.clean_task.assert_awaited_once_with(task_id="t1", force=False, idle_days=7) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_clienterror_maps_to_skipped(): + use_case = AsyncMock() + use_case.clean_task.side_effect = ClientError( + "Cannot clean task t1: status is RUNNING (active)" + ) + activities = RetentionCleanupActivities( + task_repository=AsyncMock(), use_case=use_case + ) + + outcome = await activities.clean_task(task_id="t1", idle_days=7) + + assert outcome["status"] == "skipped" + assert "RUNNING" in outcome["reason"] + assert outcome["task_id"] == "t1" + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_clean_task_unexpected_error_propagates(): + use_case = AsyncMock() + use_case.clean_task.side_effect = RuntimeError("mongo timeout") + activities = RetentionCleanupActivities( + task_repository=AsyncMock(), use_case=use_case + ) + + with pytest.raises(RuntimeError): + await activities.clean_task(task_id="t1", idle_days=7) From 20cd7375b02b5f7aef164b8ce6b50b4ea75c5c5c Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 12:55:12 -0700 Subject: [PATCH 08/20] refactor(retention): typed clean outcome + discovery logging --- .../retention_cleanup_activities.py | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/agentex/src/temporal/activities/retention_cleanup_activities.py b/agentex/src/temporal/activities/retention_cleanup_activities.py index c8b35454..efbd0ced 100644 --- a/agentex/src/temporal/activities/retention_cleanup_activities.py +++ b/agentex/src/temporal/activities/retention_cleanup_activities.py @@ -12,6 +12,8 @@ Pydantic models). 
""" +from typing import TypedDict + from src.domain.exceptions import ClientError from src.domain.repositories.task_repository import TaskRepository from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase @@ -24,6 +26,15 @@ CLEAN_TASK_ACTIVITY = "clean_task_activity" +class CleanTaskOutcome(TypedDict): + task_id: str + status: str # "cleaned" | "skipped" + reason: str | None + messages_deleted: int + task_states_deleted: int + events_deleted: int + + class RetentionCleanupActivities: def __init__( self, @@ -41,15 +52,48 @@ async def find_cleanup_candidates( idle_days: int, agent_names: list[str], ) -> list[str]: - return await self.task_repository.list_cleanup_candidate_ids( + """ + Return a page of task IDs that are eligible for content cleanup. + + Args: + after_id: Keyset cursor — return only IDs strictly after this value, + or None to start from the beginning. + limit: Maximum number of IDs to return. + idle_days: Minimum number of days a task must have been idle to qualify. + agent_names: Restrict candidates to tasks belonging to these agents. + + Returns: + list[str]: Up to *limit* task IDs ordered by ID, suitable for passing + back as *after_id* on the next page. + """ + logger.info( + "find_cleanup_candidates_started", + extra={"after_id": after_id, "limit": limit}, + ) + result = await self.task_repository.list_cleanup_candidate_ids( idle_days=idle_days, agent_names=agent_names, after_id=after_id, limit=limit, ) + logger.info("find_cleanup_candidates_completed", extra={"count": len(result)}) + return result @activity.defn(name=CLEAN_TASK_ACTIVITY) - async def clean_task(self, task_id: str, idle_days: int) -> dict: + async def clean_task(self, task_id: str, idle_days: int) -> CleanTaskOutcome: + """ + Delete the stored content (messages, states, events) for a single task. + + Args: + task_id: ID of the task to clean. + idle_days: Passed through to the use case for policy checks. 
+ + Returns: + CleanTaskOutcome with ``status`` set to ``"cleaned"`` when content was + deleted, or ``"skipped"`` when the use case refused via ``ClientError`` + (e.g. task is still active, not yet idle long enough, or has + unprocessed events). Other exceptions propagate so Temporal can retry them. + """ try: result = await self.use_case.clean_task( task_id=task_id, force=False, idle_days=idle_days From 557e3d8c65d3696a4534ab78c2272ffaeaff1253 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 12:59:56 -0700 Subject: [PATCH 09/20] feat(retention): add sweep + per-task cleanup workflows --- .../workflows/retention_cleanup_workflow.py | 103 ++++++++++++++++++ .../test_retention_cleanup_workflow.py | 70 ++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 agentex/src/temporal/workflows/retention_cleanup_workflow.py create mode 100644 agentex/tests/unit/temporal/test_retention_cleanup_workflow.py diff --git a/agentex/src/temporal/workflows/retention_cleanup_workflow.py b/agentex/src/temporal/workflows/retention_cleanup_workflow.py new file mode 100644 index 00000000..8b7baa83 --- /dev/null +++ b/agentex/src/temporal/workflows/retention_cleanup_workflow.py @@ -0,0 +1,103 @@ +""" +Scheduled task-retention cleanup workflows. + +RetentionCleanupSweepWorkflow: started by a Temporal Schedule. Pulls one page of +candidate task ids, fans out one child workflow per task (bounded by +max_in_flight), aggregates cleaned/skipped/failed counts, then continue_as_new's +to the next page so workflow history stays bounded regardless of backlog size. + +RetentionCleanupTaskWorkflow: per-task child. Invokes the clean activity, which +already maps the policy/safety ClientError refusals to a 'skipped' outcome; only +genuine transient errors surface as activity failures (and are retried). 
+""" + +import asyncio +from datetime import timedelta + +from src.temporal.activities.retention_cleanup_activities import ( + CLEAN_TASK_ACTIVITY, + FIND_CLEANUP_CANDIDATES_ACTIVITY, +) +from src.utils.logging import make_logger +from temporalio import workflow +from temporalio.common import RetryPolicy + +logger = make_logger(__name__) + + +def _chunked(items: list[str], size: int) -> list[list[str]]: + return [items[i : i + size] for i in range(0, len(items), size)] + + +@workflow.defn +class RetentionCleanupTaskWorkflow: + @workflow.run + async def run(self, args: dict) -> dict: + return await workflow.execute_activity( + CLEAN_TASK_ACTIVITY, + args=[args["task_id"], args["idle_days"]], + start_to_close_timeout=timedelta(seconds=60), + retry_policy=RetryPolicy( + maximum_attempts=3, + initial_interval=timedelta(seconds=1), + backoff_coefficient=2.0, + ), + ) + + +@workflow.defn +class RetentionCleanupSweepWorkflow: + @workflow.run + async def run(self, args: dict) -> dict: + idle_days = args["idle_days"] + agent_names = args["agent_names"] + page_size = args.get("page_size", 200) + max_in_flight = args.get("max_in_flight", 20) + after_id = args.get("after_id") + totals = args.get("totals", {"cleaned": 0, "skipped": 0, "failed": 0}) + + task_ids = await workflow.execute_activity( + FIND_CLEANUP_CANDIDATES_ACTIVITY, + args=[after_id, page_size, idle_days, agent_names], + start_to_close_timeout=timedelta(seconds=30), + retry_policy=RetryPolicy( + maximum_attempts=3, + initial_interval=timedelta(seconds=1), + backoff_coefficient=2.0, + ), + ) + + if not task_ids: + logger.info("retention_cleanup_sweep_completed", extra=totals) + return totals + + for batch in _chunked(task_ids, max_in_flight): + results = await asyncio.gather( + *[ + workflow.execute_child_workflow( + RetentionCleanupTaskWorkflow.run, + {"task_id": task_id, "idle_days": idle_days}, + id=f"retention-cleanup-task-{task_id}", + retry_policy=RetryPolicy(maximum_attempts=1), + ) + for task_id in batch + 
], + return_exceptions=True, + ) + for result in results: + if isinstance(result, BaseException): + totals["failed"] += 1 + else: + status = result.get("status", "failed") + totals[status] = totals.get(status, 0) + 1 + + workflow.continue_as_new( + arg={ + "idle_days": idle_days, + "agent_names": agent_names, + "page_size": page_size, + "max_in_flight": max_in_flight, + "after_id": task_ids[-1], + "totals": totals, + } + ) diff --git a/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py new file mode 100644 index 00000000..55496cf9 --- /dev/null +++ b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py @@ -0,0 +1,70 @@ +import uuid + +import pytest +from src.temporal.activities.retention_cleanup_activities import ( + CLEAN_TASK_ACTIVITY, + FIND_CLEANUP_CANDIDATES_ACTIVITY, +) +from src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, + RetentionCleanupTaskWorkflow, +) +from temporalio import activity +from temporalio.testing import WorkflowEnvironment +from temporalio.worker import UnsandboxedWorkflowRunner, Worker + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_sweep_cleans_all_pages_and_aggregates(): + pages = {None: ["t1", "t2"], "t2": ["t3"], "t3": []} + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def fake_find(after_id, limit, idle_days, agent_names) -> list[str]: + return pages[after_id] + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def fake_clean(task_id: str, idle_days: int) -> dict: + if task_id == "t2": + return { + "task_id": task_id, + "status": "skipped", + "reason": "RUNNING", + "messages_deleted": 0, + "task_states_deleted": 0, + "events_deleted": 0, + } + if task_id == "t3": + raise RuntimeError("permanent failure") + return { + "task_id": task_id, + "status": "cleaned", + "reason": None, + "messages_deleted": 1, + "task_states_deleted": 0, + "events_deleted": 0, + } + + async with 
await WorkflowEnvironment.start_time_skipping() as env: + async with Worker( + env.client, + task_queue="test-retention", + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[fake_find, fake_clean], + workflow_runner=UnsandboxedWorkflowRunner(), + ): + summary = await env.client.execute_workflow( + RetentionCleanupSweepWorkflow.run, + { + "idle_days": 7, + "agent_names": ["a"], + "page_size": 2, + "max_in_flight": 2, + }, + id=f"sweep-{uuid.uuid4()}", + task_queue="test-retention", + ) + + assert summary["cleaned"] == 1 # t1 + assert summary["skipped"] == 1 # t2 + assert summary["failed"] == 1 # t3 From 9c00819dae4fea589c6320c1a1c57eab46da1958 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 13:05:03 -0700 Subject: [PATCH 10/20] feat(retention): register cleanup workflows + activities on worker --- agentex/src/temporal/run_worker.py | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/agentex/src/temporal/run_worker.py b/agentex/src/temporal/run_worker.py index d3665094..00f3d5ff 100644 --- a/agentex/src/temporal/run_worker.py +++ b/agentex/src/temporal/run_worker.py @@ -14,6 +14,7 @@ from src.adapters.temporal.client_factory import TemporalClientFactory from src.config.dependencies import ( + GlobalDependencies, database_async_read_only_session_maker, database_async_read_write_engine, database_async_read_write_session_maker, @@ -22,8 +23,17 @@ ) from src.config.environment_variables import EnvironmentVariables from src.domain.repositories.agent_repository import AgentRepository +from src.domain.repositories.task_repository import TaskRepository from src.temporal.activities.healthcheck_activities import HealthCheckActivities +from src.temporal.activities.retention_cleanup_activities import ( + RetentionCleanupActivities, +) +from src.temporal.task_retention_factory import build_task_retention_use_case from src.temporal.workflows.healthcheck_workflow import HealthCheckWorkflow +from 
src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, + RetentionCleanupTaskWorkflow, +) from src.utils.logging import make_logger logger = make_logger(__name__) @@ -157,6 +167,44 @@ def create_health_check_worker( ) +def create_retention_cleanup_worker( + global_dependencies: GlobalDependencies, +) -> asyncio.Task: + """Create a worker that serves the retention-cleanup workflows + activities.""" + task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) + + engine = database_async_read_write_engine() + rw_session_maker = database_async_read_write_session_maker(engine) + ro_session_maker = database_async_read_only_session_maker(engine) + task_repository = TaskRepository(rw_session_maker, ro_session_maker) + use_case = build_task_retention_use_case(global_dependencies) + + retention_activities = RetentionCleanupActivities( + task_repository=task_repository, + use_case=use_case, + ) + + return asyncio.create_task( + run_worker( + task_queue=task_queue, + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[ + retention_activities.find_cleanup_candidates, + retention_activities.clean_task, + ], + max_workers=50, + max_concurrent_activities=50, + ) + ) + + +async def run_retention_cleanup_worker_main() -> None: + global_dependencies = GlobalDependencies() + await global_dependencies.load() + worker_task = create_retention_cleanup_worker(global_dependencies) + await worker_task + + async def main() -> None: """ Main entry point for the Health Check worker. 
From fcf82c6a95dd8aeaca797072ce9ab4d85ff6ee8a Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 14:08:07 -0700 Subject: [PATCH 11/20] refactor(retention): reuse factory-built task repository in cleanup worker --- agentex/src/temporal/run_worker.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/agentex/src/temporal/run_worker.py b/agentex/src/temporal/run_worker.py index 00f3d5ff..d4fd8929 100644 --- a/agentex/src/temporal/run_worker.py +++ b/agentex/src/temporal/run_worker.py @@ -23,7 +23,6 @@ ) from src.config.environment_variables import EnvironmentVariables from src.domain.repositories.agent_repository import AgentRepository -from src.domain.repositories.task_repository import TaskRepository from src.temporal.activities.healthcheck_activities import HealthCheckActivities from src.temporal.activities.retention_cleanup_activities import ( RetentionCleanupActivities, @@ -173,11 +172,10 @@ def create_retention_cleanup_worker( """Create a worker that serves the retention-cleanup workflows + activities.""" task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) - engine = database_async_read_write_engine() - rw_session_maker = database_async_read_write_session_maker(engine) - ro_session_maker = database_async_read_only_session_maker(engine) - task_repository = TaskRepository(rw_session_maker, ro_session_maker) use_case = build_task_retention_use_case(global_dependencies) + # Reuse the repository the factory already built (avoids a second TaskRepository + # / connection pool for the same database). 
+ task_repository = use_case.retention_service.task_repository retention_activities = RetentionCleanupActivities( task_repository=task_repository, @@ -199,6 +197,7 @@ def create_retention_cleanup_worker( async def run_retention_cleanup_worker_main() -> None: + """Entrypoint: run the retention-cleanup worker as its own process.""" global_dependencies = GlobalDependencies() await global_dependencies.load() worker_task = create_retention_cleanup_worker(global_dependencies) From 616085ae0b57ad44de8102b20b4d7b1411085e69 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 14:09:36 -0700 Subject: [PATCH 12/20] feat(retention): add cleanup schedule bootstrap script --- .../run_retention_cleanup_schedule.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 agentex/src/temporal/run_retention_cleanup_schedule.py diff --git a/agentex/src/temporal/run_retention_cleanup_schedule.py b/agentex/src/temporal/run_retention_cleanup_schedule.py new file mode 100644 index 00000000..98186384 --- /dev/null +++ b/agentex/src/temporal/run_retention_cleanup_schedule.py @@ -0,0 +1,74 @@ +""" +Create the Temporal Schedule that drives the scheduled task-retention cleanup. + +Runs at startup (mirrors run_healthcheck_workflow.py). No-op unless +RETENTION_CLEANUP_ENABLED is true and an agent allowlist is configured +(fail-closed). Idempotent: if the schedule already exists, it is left as-is. 
+""" + +import asyncio + +from src.adapters.temporal.adapter_temporal import TemporalAdapter +from src.adapters.temporal.client_factory import TemporalClientFactory +from src.adapters.temporal.exceptions import TemporalScheduleAlreadyExistsError +from src.config.dependencies import GlobalDependencies +from src.config.environment_variables import EnvironmentVariables +from src.temporal.run_worker import AGENTEX_SERVER_TASK_QUEUE +from src.temporal.workflows.retention_cleanup_workflow import ( + RetentionCleanupSweepWorkflow, +) +from src.utils.logging import make_logger + +logger = make_logger(__name__) + +SCHEDULE_ID = "retention-cleanup-sweep" +WORKFLOW_ID = "retention-cleanup-sweep" + + +async def main() -> None: + global_dependencies = GlobalDependencies() + await global_dependencies.load() + + env = EnvironmentVariables.refresh() + if not env or not env.RETENTION_CLEANUP_ENABLED: + logger.info("Retention cleanup is not enabled; skipping schedule creation") + return + if not env.RETENTION_CLEANUP_AGENT_ALLOWLIST: + logger.warning( + "Retention cleanup enabled but agent allowlist is empty (fail-closed); " + "skipping schedule creation" + ) + return + if not TemporalClientFactory.is_temporal_configured(env): + logger.error("Temporal is not configured; skipping schedule creation") + return + + task_queue = env.AGENTEX_SERVER_TASK_QUEUE or AGENTEX_SERVER_TASK_QUEUE + adapter = TemporalAdapter(temporal_client=global_dependencies.temporal_client) + + workflow_args = { + "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, + "agent_names": env.RETENTION_CLEANUP_AGENT_ALLOWLIST, + "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, + "max_in_flight": env.RETENTION_CLEANUP_MAX_IN_FLIGHT, + } + + try: + await adapter.create_schedule( + schedule_id=SCHEDULE_ID, + workflow=RetentionCleanupSweepWorkflow.run, + workflow_id=WORKFLOW_ID, + args=[workflow_args], + task_queue=task_queue, + cron_expressions=[env.RETENTION_CLEANUP_CRON], + ) + logger.info( + "Created retention-cleanup 
schedule", + extra={"cron": env.RETENTION_CLEANUP_CRON, "args": workflow_args}, + ) + except TemporalScheduleAlreadyExistsError: + logger.info("Retention-cleanup schedule already exists; leaving as-is") + + +if __name__ == "__main__": + asyncio.run(main()) From 11c04067aab8e9b22efde264f13d414851f01956 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 14:17:05 -0700 Subject: [PATCH 13/20] feat(retention): wire cleanup worker + schedule bootstrap for local dev --- agentex/docker-compose.yml | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/agentex/docker-compose.yml b/agentex/docker-compose.yml index b3a935b3..f82c51d4 100644 --- a/agentex/docker-compose.yml +++ b/agentex/docker-compose.yml @@ -170,6 +170,12 @@ services: - ALLOWED_ORIGINS=http://localhost:3000 - OTEL_EXPORTER_OTLP_ENDPOINT=http://agentex-otel-collector:4317 - OTEL_SERVICE_NAME=agentex-api + - RETENTION_CLEANUP_ENABLED=${RETENTION_CLEANUP_ENABLED:-false} + - RETENTION_CLEANUP_AGENT_ALLOWLIST=${RETENTION_CLEANUP_AGENT_ALLOWLIST:-} + - RETENTION_CLEANUP_IDLE_DAYS=${RETENTION_CLEANUP_IDLE_DAYS:-7} + - RETENTION_CLEANUP_CRON=${RETENTION_CLEANUP_CRON:-0 4 * * *} + - RETENTION_CLEANUP_PAGE_SIZE=${RETENTION_CLEANUP_PAGE_SIZE:-200} + - RETENTION_CLEANUP_MAX_IN_FLIGHT=${RETENTION_CLEANUP_MAX_IN_FLIGHT:-20} ports: - "5003:5003" volumes: @@ -200,6 +206,7 @@ services: alembic upgrade head && popd && python src/temporal/run_healthcheck_workflow.py && + python src/temporal/run_retention_cleanup_schedule.py && echo 'Starting API server...' && uvicorn src.api.app:app --host 0.0.0.0 --port 5003 --reload --reload-dir /app/src --reload-include '*.py' --workers 1 " @@ -252,6 +259,53 @@ services: user: root restart: unless-stopped + agentex-retention-cleanup-worker: + container_name: agentex-retention-cleanup-worker + build: + context: .. 
+ dockerfile: agentex/Dockerfile + target: dev + args: + SOURCE_DIR: agentex + environment: + - ENVIRONMENT=development + - DATABASE_URL=postgresql://postgres:postgres@agentex-postgres:5432/agentex + - TEMPORAL_ADDRESS=agentex-temporal:7233 + - TEMPORAL_HOST=agentex-temporal + - REDIS_URL=redis://agentex-redis:6379 + - MONGODB_URI=mongodb://agentex-mongodb:27017 + - MONGODB_DATABASE_NAME=agentex + - AGENTEX_SERVER_TASK_QUEUE=agentex-server + - RETENTION_CLEANUP_ENABLED=${RETENTION_CLEANUP_ENABLED:-false} + - RETENTION_CLEANUP_AGENT_ALLOWLIST=${RETENTION_CLEANUP_AGENT_ALLOWLIST:-} + - RETENTION_CLEANUP_IDLE_DAYS=${RETENTION_CLEANUP_IDLE_DAYS:-7} + - RETENTION_CLEANUP_CRON=${RETENTION_CLEANUP_CRON:-0 4 * * *} + - RETENTION_CLEANUP_PAGE_SIZE=${RETENTION_CLEANUP_PAGE_SIZE:-200} + - RETENTION_CLEANUP_MAX_IN_FLIGHT=${RETENTION_CLEANUP_MAX_IN_FLIGHT:-20} + volumes: + - .:/app:cached + depends_on: + agentex-temporal: + condition: service_healthy + agentex-redis: + condition: service_healthy + agentex-postgres: + condition: service_healthy + agentex-mongodb: + condition: service_healthy + networks: + - agentex-network + command: | + bash -c " + echo 'Starting Retention Cleanup Worker...' 
&& + export TEMPORAL_ADDRESS=agentex-temporal:7233 && + export TEMPORAL_HOST=agentex-temporal && + export MONGODB_URI=mongodb://agentex-mongodb:27017 && + python -c \"import asyncio; from src.temporal.run_worker import run_retention_cleanup_worker_main; asyncio.run(run_retention_cleanup_worker_main())\" + " + user: root + restart: unless-stopped + volumes: agentex-temporal-postgres-data: agentex-postgres-data: From 08155187dd873d4e04d894dbd840a8c779a6cb74 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 15:53:51 -0700 Subject: [PATCH 14/20] feat(retention): load cleanup policy at runtime instead of baking into schedule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a load_cleanup_config activity that reads RETENTION_CLEANUP_* env vars at sweep runtime so changing policy and restarting the worker takes effect on the next scheduled run without recreating the Temporal Schedule. The schedule now carries no policy args — only cron + workflow identity. The sweep loads config once on the first page and carries it across continue_as_new pages for consistency. The allowlist gate in the schedule bootstrap is removed; fail-closed is preserved at runtime (empty allowlist → discovery returns no candidates). 
--- .../retention_cleanup_activities.py | 24 +++++++++- .../run_retention_cleanup_schedule.py | 38 ++++++++------- agentex/src/temporal/run_worker.py | 1 + .../workflows/retention_cleanup_workflow.py | 20 +++++++- .../test_retention_cleanup_activities.py | 19 ++++++++ .../test_retention_cleanup_workflow.py | 47 +++++++++++++++++++ 6 files changed, 129 insertions(+), 20 deletions(-) diff --git a/agentex/src/temporal/activities/retention_cleanup_activities.py b/agentex/src/temporal/activities/retention_cleanup_activities.py index efbd0ced..4dc9d953 100644 --- a/agentex/src/temporal/activities/retention_cleanup_activities.py +++ b/agentex/src/temporal/activities/retention_cleanup_activities.py @@ -1,7 +1,9 @@ """ Temporal activities for the scheduled task-retention cleanup sweep. -Two activities: +Three activities: +- load_cleanup_config: reads RETENTION_CLEANUP_* env vars at run time so policy + changes take effect on the next scheduled run without recreating the schedule. - find_cleanup_candidates: cheap pre-filtered, keyset-paginated discovery. 
- clean_task: delegates to TaskRetentionUseCase.clean_task; catches ClientError (the three policy/safety refusals) and maps it to a 'skipped' outcome so the @@ -14,6 +16,7 @@ from typing import TypedDict +from src.config.environment_variables import EnvironmentVariables from src.domain.exceptions import ClientError from src.domain.repositories.task_repository import TaskRepository from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase @@ -22,6 +25,7 @@ logger = make_logger(__name__) +LOAD_CLEANUP_CONFIG_ACTIVITY = "load_cleanup_config_activity" FIND_CLEANUP_CANDIDATES_ACTIVITY = "find_cleanup_candidates_activity" CLEAN_TASK_ACTIVITY = "clean_task_activity" @@ -44,6 +48,24 @@ def __init__( self.task_repository = task_repository self.use_case = use_case + @activity.defn(name=LOAD_CLEANUP_CONFIG_ACTIVITY) + async def load_cleanup_config(self) -> dict: + """ + Read the current retention-cleanup policy from the environment. + + Policy (allowlist, idle threshold, paging) is intentionally NOT baked into + the Temporal Schedule. The sweep loads it here at run time so changing a + RETENTION_CLEANUP_* env var and restarting the worker takes effect on the + next scheduled run without recreating the schedule. + """ + env = EnvironmentVariables.refresh(force_refresh=True) + return { + "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, + "agent_names": env.RETENTION_CLEANUP_AGENT_ALLOWLIST, + "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, + "max_in_flight": env.RETENTION_CLEANUP_MAX_IN_FLIGHT, + } + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) async def find_cleanup_candidates( self, diff --git a/agentex/src/temporal/run_retention_cleanup_schedule.py b/agentex/src/temporal/run_retention_cleanup_schedule.py index 98186384..6ea789df 100644 --- a/agentex/src/temporal/run_retention_cleanup_schedule.py +++ b/agentex/src/temporal/run_retention_cleanup_schedule.py @@ -2,8 +2,17 @@ Create the Temporal Schedule that drives the scheduled task-retention cleanup. 
Runs at startup (mirrors run_healthcheck_workflow.py). No-op unless -RETENTION_CLEANUP_ENABLED is true and an agent allowlist is configured -(fail-closed). Idempotent: if the schedule already exists, it is left as-is. +RETENTION_CLEANUP_ENABLED is true and Temporal is configured. +Idempotent: if the schedule already exists, it is left as-is. + +The schedule carries NO policy args (no allowlist, idle_days, page_size, or +max_in_flight). Those are read at sweep runtime via the load_cleanup_config +activity so changing a RETENTION_CLEANUP_* env var and restarting the worker +takes effect on the next scheduled run without recreating the schedule. +Only the cron expression and workflow identity are baked into the schedule. + +Fail-closed behaviour is preserved at runtime: if the allowlist is empty the +discovery activity returns no candidates and the sweep completes immediately. """ import asyncio @@ -33,38 +42,31 @@ async def main() -> None: if not env or not env.RETENTION_CLEANUP_ENABLED: logger.info("Retention cleanup is not enabled; skipping schedule creation") return - if not env.RETENTION_CLEANUP_AGENT_ALLOWLIST: - logger.warning( - "Retention cleanup enabled but agent allowlist is empty (fail-closed); " - "skipping schedule creation" - ) - return if not TemporalClientFactory.is_temporal_configured(env): logger.error("Temporal is not configured; skipping schedule creation") return + if not env.RETENTION_CLEANUP_AGENT_ALLOWLIST: + logger.info( + "Retention cleanup agent allowlist is empty; the sweep will discover " + "no candidates at runtime (fail-closed by policy, not by schedule)" + ) + task_queue = env.AGENTEX_SERVER_TASK_QUEUE or AGENTEX_SERVER_TASK_QUEUE adapter = TemporalAdapter(temporal_client=global_dependencies.temporal_client) - workflow_args = { - "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, - "agent_names": env.RETENTION_CLEANUP_AGENT_ALLOWLIST, - "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, - "max_in_flight": env.RETENTION_CLEANUP_MAX_IN_FLIGHT, - } 
- try: await adapter.create_schedule( schedule_id=SCHEDULE_ID, workflow=RetentionCleanupSweepWorkflow.run, workflow_id=WORKFLOW_ID, - args=[workflow_args], + args=[], task_queue=task_queue, cron_expressions=[env.RETENTION_CLEANUP_CRON], ) logger.info( - "Created retention-cleanup schedule", - extra={"cron": env.RETENTION_CLEANUP_CRON, "args": workflow_args}, + "Created retention-cleanup schedule (policy read at runtime)", + extra={"cron": env.RETENTION_CLEANUP_CRON}, ) except TemporalScheduleAlreadyExistsError: logger.info("Retention-cleanup schedule already exists; leaving as-is") diff --git a/agentex/src/temporal/run_worker.py b/agentex/src/temporal/run_worker.py index d4fd8929..d6c2c766 100644 --- a/agentex/src/temporal/run_worker.py +++ b/agentex/src/temporal/run_worker.py @@ -187,6 +187,7 @@ def create_retention_cleanup_worker( task_queue=task_queue, workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], activities=[ + retention_activities.load_cleanup_config, retention_activities.find_cleanup_candidates, retention_activities.clean_task, ], diff --git a/agentex/src/temporal/workflows/retention_cleanup_workflow.py b/agentex/src/temporal/workflows/retention_cleanup_workflow.py index 8b7baa83..3650b1e3 100644 --- a/agentex/src/temporal/workflows/retention_cleanup_workflow.py +++ b/agentex/src/temporal/workflows/retention_cleanup_workflow.py @@ -17,6 +17,7 @@ from src.temporal.activities.retention_cleanup_activities import ( CLEAN_TASK_ACTIVITY, FIND_CLEANUP_CANDIDATES_ACTIVITY, + LOAD_CLEANUP_CONFIG_ACTIVITY, ) from src.utils.logging import make_logger from temporalio import workflow @@ -48,7 +49,24 @@ async def run(self, args: dict) -> dict: @workflow.defn class RetentionCleanupSweepWorkflow: @workflow.run - async def run(self, args: dict) -> dict: + async def run(self, args: dict | None = None) -> dict: + args = args or {} + + # First page of a sweep: load policy from env (via activity) and carry it + # across continue_as_new pages so a single 
sweep stays consistent even if + # env changes mid-run. Subsequent pages already have it in args. + if "idle_days" not in args: + config = await workflow.execute_activity( + LOAD_CLEANUP_CONFIG_ACTIVITY, + start_to_close_timeout=timedelta(seconds=15), + retry_policy=RetryPolicy( + maximum_attempts=3, + initial_interval=timedelta(seconds=1), + backoff_coefficient=2.0, + ), + ) + args = {**args, **config} + idle_days = args["idle_days"] agent_names = args["agent_names"] page_size = args.get("page_size", 200) diff --git a/agentex/tests/unit/temporal/test_retention_cleanup_activities.py b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py index 92d1efdb..53e79cc9 100644 --- a/agentex/tests/unit/temporal/test_retention_cleanup_activities.py +++ b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py @@ -78,3 +78,22 @@ async def test_clean_task_unexpected_error_propagates(): with pytest.raises(RuntimeError): await activities.clean_task(task_id="t1", idle_days=7) + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_load_cleanup_config_reads_env(monkeypatch): + monkeypatch.setenv("RETENTION_CLEANUP_AGENT_ALLOWLIST", "x,y") + monkeypatch.setenv("RETENTION_CLEANUP_IDLE_DAYS", "9") + monkeypatch.setenv("RETENTION_CLEANUP_PAGE_SIZE", "33") + monkeypatch.setenv("RETENTION_CLEANUP_MAX_IN_FLIGHT", "4") + activities = RetentionCleanupActivities( + task_repository=AsyncMock(), use_case=AsyncMock() + ) + config = await activities.load_cleanup_config() + assert config == { + "idle_days": 9, + "agent_names": ["x", "y"], + "page_size": 33, + "max_in_flight": 4, + } diff --git a/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py index 55496cf9..61fea670 100644 --- a/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py +++ b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py @@ -4,6 +4,7 @@ from src.temporal.activities.retention_cleanup_activities import ( 
CLEAN_TASK_ACTIVITY, FIND_CLEANUP_CANDIDATES_ACTIVITY, + LOAD_CLEANUP_CONFIG_ACTIVITY, ) from src.temporal.workflows.retention_cleanup_workflow import ( RetentionCleanupSweepWorkflow, @@ -68,3 +69,49 @@ async def fake_clean(task_id: str, idle_days: int) -> dict: assert summary["cleaned"] == 1 # t1 assert summary["skipped"] == 1 # t2 assert summary["failed"] == 1 # t3 + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_sweep_loads_config_from_activity_when_no_args(): + pages = {None: ["t1"], "t1": []} + + @activity.defn(name=LOAD_CLEANUP_CONFIG_ACTIVITY) + async def fake_load() -> dict: + return { + "idle_days": 7, + "agent_names": ["a"], + "page_size": 2, + "max_in_flight": 2, + } + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def fake_find(after_id, limit, idle_days, agent_names) -> list[str]: + assert agent_names == ["a"] # policy from load activity flowed through + return pages[after_id] + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def fake_clean(task_id: str, idle_days: int) -> dict: + return { + "task_id": task_id, + "status": "cleaned", + "reason": None, + "messages_deleted": 1, + "task_states_deleted": 0, + "events_deleted": 0, + } + + async with await WorkflowEnvironment.start_time_skipping() as env: + async with Worker( + env.client, + task_queue="test-retention-load", + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[fake_load, fake_find, fake_clean], + workflow_runner=UnsandboxedWorkflowRunner(), + ): + summary = await env.client.execute_workflow( + RetentionCleanupSweepWorkflow.run, + id=f"sweep-{uuid.uuid4()}", + task_queue="test-retention-load", + ) + assert summary["cleaned"] == 1 From 652ad55cd0ed32e94c8829f4e303e623301379d2 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 15:56:46 -0700 Subject: [PATCH 15/20] docs(retention): clarify why load_cleanup_config lives on the activities class --- .../src/temporal/activities/retention_cleanup_activities.py | 3 +++ 
1 file changed, 3 insertions(+) diff --git a/agentex/src/temporal/activities/retention_cleanup_activities.py b/agentex/src/temporal/activities/retention_cleanup_activities.py index 4dc9d953..c3735029 100644 --- a/agentex/src/temporal/activities/retention_cleanup_activities.py +++ b/agentex/src/temporal/activities/retention_cleanup_activities.py @@ -58,6 +58,9 @@ async def load_cleanup_config(self) -> dict: RETENTION_CLEANUP_* env var and restarting the worker takes effect on the next scheduled run without recreating the schedule. """ + # Lives on this class (rather than as a free function) only so the worker + # can register it alongside the other activities; it intentionally uses + # none of the injected repositories/use case. env = EnvironmentVariables.refresh(force_refresh=True) return { "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, From 08329942e437aa8009e144be1ae53e1a4ce0c086 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 15:57:43 -0700 Subject: [PATCH 16/20] docs(retention): update design for runtime policy loading --- ...scheduled-task-retention-cleanup-design.md | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md b/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md index a50594ab..ae819d7f 100644 --- a/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md +++ b/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md @@ -54,9 +54,16 @@ discovers idle tasks and drives them through `clean_task`. ## Configuration All configuration is via environment variables (consistent with the existing -`ENABLE_HEALTH_CHECK_WORKFLOW` pattern). The master flag and cron are read at -**schedule-bootstrap** time; the allowlist and idle threshold are passed into the -scheduled workflow as input args so the schedule encodes the policy it runs with. 
+`ENABLE_HEALTH_CHECK_WORKFLOW` pattern). The master flag (`RETENTION_CLEANUP_ENABLED`) +and `RETENTION_CLEANUP_CRON` are read at **schedule-bootstrap** time — they are +inherent schedule properties (whether the schedule exists, and its cadence). The +**policy** (allowlist, idle threshold, paging) is deliberately **not** baked into +the schedule: it is read at **sweep run time** by a `load_cleanup_config` activity +from the worker's environment, then carried across the sweep's `continue_as_new` +pages. This means changing the allowlist / idle days / paging is a matter of +editing the env var and restarting the worker — the next scheduled run picks up +the new policy with **no schedule recreation**. (Changing the *cron* cadence still +requires updating/recreating the schedule, since cron is a schedule property.) | Env var | Meaning | Default | |---|---|---| @@ -78,16 +85,18 @@ blast radius to named agents only. |---|---|---| | `RetentionCleanupSweepWorkflow` | `agentex/src/temporal/workflows/retention_cleanup_workflow.py` | Paginate candidates → fan out child workflows in bounded batches → aggregate summary → `continue_as_new` across pages. | | `RetentionCleanupTaskWorkflow` | same file | Per-task child workflow: invoke the clean activity, return a structured outcome. | -| `RetentionCleanupActivities` | `agentex/src/temporal/activities/retention_cleanup_activities.py` | `find_cleanup_candidates(...)` and `clean_task(...)` (the latter catches `ClientError` and maps it to a `skipped` outcome). | +| `RetentionCleanupActivities` | `agentex/src/temporal/activities/retention_cleanup_activities.py` | `load_cleanup_config(...)` (reads policy from env), `find_cleanup_candidates(...)`, and `clean_task(...)` (the latter catches `ClientError` and maps it to a `skipped` outcome). | | Discovery query | `agentex/src/domain/repositories/task_repository.py` (extend) | Keyset-paginated query for idle, uncleaned candidate task ids filtered by agent name. 
| -| Schedule bootstrap | `agentex/src/temporal/run_retention_cleanup_schedule.py` | On startup, when enabled, create/update the Temporal Schedule (mirrors `run_healthcheck_workflow.py`). | +| Schedule bootstrap | `agentex/src/temporal/run_retention_cleanup_schedule.py` | On startup, when enabled + Temporal configured, create the Temporal Schedule with **no policy args** (cron + identity only; idempotent, leaves an existing schedule as-is). Mirrors `run_healthcheck_workflow.py`. | | Worker registration | `agentex/src/temporal/run_worker.py` (edit) | Register both workflows + the activities on the `agentex-server` task queue. | ### Data flow ``` -Temporal Schedule (cron; created at bootstrap only when RETENTION_CLEANUP_ENABLED) - └─> RetentionCleanupSweepWorkflow(idle_days, allowlist, page_size, max_in_flight) +Temporal Schedule (cron; created at bootstrap only when RETENTION_CLEANUP_ENABLED; no policy args) + └─> RetentionCleanupSweepWorkflow() # no policy baked in + ├─ (first page only) activity load_cleanup_config() -> {idle_days, allowlist, page_size, max_in_flight} + │ (read from worker env; carried across continue_as_new pages so a sweep stays consistent) ├─ activity find_cleanup_candidates(cursor, limit, idle_days, allowlist) -> [task_id...] 
├─ for each batch (size ≤ max_in_flight) of task_ids: │ start child RetentionCleanupTaskWorkflow(task_id, idle_days) From c9ebddad17bcc7abc4a118bf23d0c810039aacde Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 17:06:17 -0700 Subject: [PATCH 17/20] fix(retention): honor cleanup flag at runtime --- .../retention_cleanup_activities.py | 1 + .../workflows/retention_cleanup_workflow.py | 4 + .../test_retention_cleanup_activities.py | 2 + .../test_retention_cleanup_workflow.py | 38 + ...-06-03-scheduled-task-retention-cleanup.md | 1134 ----------------- ...scheduled-task-retention-cleanup-design.md | 190 --- 6 files changed, 45 insertions(+), 1324 deletions(-) delete mode 100644 docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md delete mode 100644 docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md diff --git a/agentex/src/temporal/activities/retention_cleanup_activities.py b/agentex/src/temporal/activities/retention_cleanup_activities.py index c3735029..4ebdef03 100644 --- a/agentex/src/temporal/activities/retention_cleanup_activities.py +++ b/agentex/src/temporal/activities/retention_cleanup_activities.py @@ -63,6 +63,7 @@ async def load_cleanup_config(self) -> dict: # none of the injected repositories/use case. 
env = EnvironmentVariables.refresh(force_refresh=True) return { + "enabled": env.RETENTION_CLEANUP_ENABLED, "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, "agent_names": env.RETENTION_CLEANUP_AGENT_ALLOWLIST, "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, diff --git a/agentex/src/temporal/workflows/retention_cleanup_workflow.py b/agentex/src/temporal/workflows/retention_cleanup_workflow.py index 3650b1e3..4ade1b7f 100644 --- a/agentex/src/temporal/workflows/retention_cleanup_workflow.py +++ b/agentex/src/temporal/workflows/retention_cleanup_workflow.py @@ -74,6 +74,10 @@ async def run(self, args: dict | None = None) -> dict: after_id = args.get("after_id") totals = args.get("totals", {"cleaned": 0, "skipped": 0, "failed": 0}) + if not args.get("enabled", True): + logger.info("retention_cleanup_sweep_disabled", extra=totals) + return totals + task_ids = await workflow.execute_activity( FIND_CLEANUP_CANDIDATES_ACTIVITY, args=[after_id, page_size, idle_days, agent_names], diff --git a/agentex/tests/unit/temporal/test_retention_cleanup_activities.py b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py index 53e79cc9..34362e6f 100644 --- a/agentex/tests/unit/temporal/test_retention_cleanup_activities.py +++ b/agentex/tests/unit/temporal/test_retention_cleanup_activities.py @@ -83,6 +83,7 @@ async def test_clean_task_unexpected_error_propagates(): @pytest.mark.unit @pytest.mark.asyncio async def test_load_cleanup_config_reads_env(monkeypatch): + monkeypatch.setenv("RETENTION_CLEANUP_ENABLED", "true") monkeypatch.setenv("RETENTION_CLEANUP_AGENT_ALLOWLIST", "x,y") monkeypatch.setenv("RETENTION_CLEANUP_IDLE_DAYS", "9") monkeypatch.setenv("RETENTION_CLEANUP_PAGE_SIZE", "33") @@ -92,6 +93,7 @@ async def test_load_cleanup_config_reads_env(monkeypatch): ) config = await activities.load_cleanup_config() assert config == { + "enabled": True, "idle_days": 9, "agent_names": ["x", "y"], "page_size": 33, diff --git 
a/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py index 61fea670..7c89bea6 100644 --- a/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py +++ b/agentex/tests/unit/temporal/test_retention_cleanup_workflow.py @@ -115,3 +115,41 @@ async def fake_clean(task_id: str, idle_days: int) -> dict: task_queue="test-retention-load", ) assert summary["cleaned"] == 1 + + +@pytest.mark.unit +@pytest.mark.asyncio +async def test_sweep_noops_when_runtime_config_disabled(): + @activity.defn(name=LOAD_CLEANUP_CONFIG_ACTIVITY) + async def fake_load() -> dict: + return { + "enabled": False, + "idle_days": 7, + "agent_names": ["a"], + "page_size": 2, + "max_in_flight": 2, + } + + @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) + async def fake_find(after_id, limit, idle_days, agent_names) -> list[str]: + raise AssertionError("disabled cleanup should not discover candidates") + + @activity.defn(name=CLEAN_TASK_ACTIVITY) + async def fake_clean(task_id: str, idle_days: int) -> dict: + raise AssertionError("disabled cleanup should not clean tasks") + + async with await WorkflowEnvironment.start_time_skipping() as env: + async with Worker( + env.client, + task_queue="test-retention-disabled", + workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + activities=[fake_load, fake_find, fake_clean], + workflow_runner=UnsandboxedWorkflowRunner(), + ): + summary = await env.client.execute_workflow( + RetentionCleanupSweepWorkflow.run, + id=f"sweep-{uuid.uuid4()}", + task_queue="test-retention-disabled", + ) + + assert summary == {"cleaned": 0, "skipped": 0, "failed": 0} diff --git a/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md b/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md deleted file mode 100644 index 40ac5374..00000000 --- a/docs/superpowers/plans/2026-06-03-scheduled-task-retention-cleanup.md +++ /dev/null @@ -1,1134 +0,0 @@ -# 
Scheduled Task-Retention Cleanup — Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add a Temporal Schedule + sweep workflow to the agentex backend that periodically discovers idle tasks for an allowlisted set of agents and runs the existing idempotent `clean_task` path against them, gated by a feature flag. - -**Architecture:** A daily Temporal Schedule starts a `RetentionCleanupSweepWorkflow`. The sweep calls a `find_cleanup_candidates` activity (cheap, index-friendly Postgres pre-filter: `cleaned_at IS NULL AND updated_at < cutoff`, joined to an agent-name allowlist, keyset-paginated), then fans out one `RetentionCleanupTaskWorkflow` child per task. Each child calls a `clean_task` activity that delegates to the already-merged `TaskRetentionUseCase.clean_task`; the activity catches `ClientError` (the three safety/policy refusals) and maps it to a `skipped` outcome, so only genuine transient errors retry. The parent aggregates `cleaned`/`skipped`/`failed` counts and `continue_as_new`s per page to bound history. - -**Tech Stack:** Python 3.12, Temporal (`temporalio`), SQLAlchemy async, FastAPI DI patterns, pytest (`pytest-asyncio`), testcontainers for integration. - -**Spec:** `docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md` - -**Conventions to follow:** -- Run a single test: `make test FILE=tests/unit/path/test_foo.py NAME=test_name` (from `agentex/`). -- Lint before commit: `uv run ruff check src/ --fix && uv run ruff format src/` (from `agentex/`). -- Activity/workflow boundary carries **only JSON-native types** (`str`, `int`, `bool`, `list`, `dict`). The backend's Temporal data converter only adds datetime support (`client_factory.py:DateTimePayloadConverter`); it does NOT serialize Pydantic models. 
Cross the activity boundary with dicts; build Pydantic models (if any) inside domain code only. -- Commit messages: no Claude attribution (repo is public — see `CLAUDE.md`). - ---- - -## File Structure - -| File | Create/Modify | Responsibility | -|---|---|---| -| `agentex/src/config/environment_variables.py` | Modify | Add the 6 `RETENTION_CLEANUP_*` config fields + parsing. | -| `agentex/src/domain/repositories/task_repository.py` | Modify | Add `list_cleanup_candidate_ids(...)` keyset-paginated discovery query. | -| `agentex/src/temporal/task_retention_factory.py` | Create | `build_task_retention_use_case(...)` — wires `TaskRetentionUseCase` outside FastAPI DI. | -| `agentex/src/temporal/activities/retention_cleanup_activities.py` | Create | `RetentionCleanupActivities` with `find_cleanup_candidates` + `clean_task` activities. | -| `agentex/src/temporal/workflows/retention_cleanup_workflow.py` | Create | `RetentionCleanupSweepWorkflow` (fan-out + paging) + `RetentionCleanupTaskWorkflow` (per-task child). | -| `agentex/src/temporal/run_worker.py` | Modify | Register the new workflows + activities on the `agentex-server` queue. | -| `agentex/src/temporal/run_retention_cleanup_schedule.py` | Create | Startup script: create/update the Temporal Schedule when enabled. | -| `agentex/docker-compose.yml` | Modify | Add the schedule-bootstrap step + env vars for local dev. | -| `agentex/tests/unit/temporal/test_retention_cleanup_activities.py` | Create | Unit tests for the activities (mocked use case / repo). | -| `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py` | Create | Workflow tests via `WorkflowEnvironment` with mocked activities. | -| `agentex/tests/unit/config/test_retention_cleanup_env.py` | Create | Env-var parsing test. | -| `agentex/tests/integration/test_retention_cleanup_discovery.py` | Create | Integration test for `list_cleanup_candidate_ids` against real Postgres. 
| - ---- - -## Task 1: Add retention-cleanup configuration - -**Files:** -- Modify: `agentex/src/config/environment_variables.py` -- Test: `agentex/tests/unit/config/test_retention_cleanup_env.py` - -- [ ] **Step 1: Write the failing test** - -Create `agentex/tests/unit/config/test_retention_cleanup_env.py`: - -```python -import pytest - -from src.config.environment_variables import EnvironmentVariables - - -@pytest.mark.unit -def test_retention_cleanup_env_parses_enabled_and_allowlist(monkeypatch): - monkeypatch.setenv("RETENTION_CLEANUP_ENABLED", "true") - monkeypatch.setenv("RETENTION_CLEANUP_AGENT_ALLOWLIST", "agent-a, agent-b ,agent-c") - monkeypatch.setenv("RETENTION_CLEANUP_IDLE_DAYS", "14") - monkeypatch.setenv("RETENTION_CLEANUP_CRON", "0 3 * * *") - monkeypatch.setenv("RETENTION_CLEANUP_PAGE_SIZE", "50") - monkeypatch.setenv("RETENTION_CLEANUP_MAX_IN_FLIGHT", "5") - - env = EnvironmentVariables.refresh(force_refresh=True) - - assert env.RETENTION_CLEANUP_ENABLED is True - # Allowlist is parsed into a trimmed, non-empty list of names. 
- assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == ["agent-a", "agent-b", "agent-c"] - assert env.RETENTION_CLEANUP_IDLE_DAYS == 14 - assert env.RETENTION_CLEANUP_CRON == "0 3 * * *" - assert env.RETENTION_CLEANUP_PAGE_SIZE == 50 - assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 5 - - -@pytest.mark.unit -def test_retention_cleanup_env_defaults(monkeypatch): - for key in ( - "RETENTION_CLEANUP_ENABLED", - "RETENTION_CLEANUP_AGENT_ALLOWLIST", - "RETENTION_CLEANUP_IDLE_DAYS", - "RETENTION_CLEANUP_CRON", - "RETENTION_CLEANUP_PAGE_SIZE", - "RETENTION_CLEANUP_MAX_IN_FLIGHT", - ): - monkeypatch.delenv(key, raising=False) - - env = EnvironmentVariables.refresh(force_refresh=True) - - assert env.RETENTION_CLEANUP_ENABLED is False - assert env.RETENTION_CLEANUP_AGENT_ALLOWLIST == [] # fail-closed - assert env.RETENTION_CLEANUP_IDLE_DAYS == 7 - assert env.RETENTION_CLEANUP_CRON == "0 4 * * *" - assert env.RETENTION_CLEANUP_PAGE_SIZE == 200 - assert env.RETENTION_CLEANUP_MAX_IN_FLIGHT == 20 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `make test FILE=tests/unit/config/test_retention_cleanup_env.py` -Expected: FAIL — `AttributeError` / unexpected attribute on `EnvironmentVariables`. 
- -- [ ] **Step 3: Implement the config** - -In `agentex/src/config/environment_variables.py`, add to the `EnvVarKeys` class (near `ENABLE_HEALTH_CHECK_WORKFLOW`, line ~59): - -```python - RETENTION_CLEANUP_ENABLED = "RETENTION_CLEANUP_ENABLED" - RETENTION_CLEANUP_AGENT_ALLOWLIST = "RETENTION_CLEANUP_AGENT_ALLOWLIST" - RETENTION_CLEANUP_IDLE_DAYS = "RETENTION_CLEANUP_IDLE_DAYS" - RETENTION_CLEANUP_CRON = "RETENTION_CLEANUP_CRON" - RETENTION_CLEANUP_PAGE_SIZE = "RETENTION_CLEANUP_PAGE_SIZE" - RETENTION_CLEANUP_MAX_IN_FLIGHT = "RETENTION_CLEANUP_MAX_IN_FLIGHT" -``` - -Add the fields to the `EnvironmentVariables` model (near `ENABLE_HEALTH_CHECK_WORKFLOW`, line ~115): - -```python - RETENTION_CLEANUP_ENABLED: bool = False - RETENTION_CLEANUP_AGENT_ALLOWLIST: list[str] = [] - RETENTION_CLEANUP_IDLE_DAYS: int = 7 - RETENTION_CLEANUP_CRON: str = "0 4 * * *" - RETENTION_CLEANUP_PAGE_SIZE: int = 200 - RETENTION_CLEANUP_MAX_IN_FLIGHT: int = 20 -``` - -Add the parsing inside `refresh()` where the `EnvironmentVariables(...)` instance is built (alongside `ENABLE_HEALTH_CHECK_WORKFLOW=...`, line ~199): - -```python - RETENTION_CLEANUP_ENABLED=( - os.environ.get(EnvVarKeys.RETENTION_CLEANUP_ENABLED, "false") == "true" - ), - RETENTION_CLEANUP_AGENT_ALLOWLIST=[ - name.strip() - for name in os.environ.get( - EnvVarKeys.RETENTION_CLEANUP_AGENT_ALLOWLIST, "" - ).split(",") - if name.strip() - ], - RETENTION_CLEANUP_IDLE_DAYS=int( - os.environ.get(EnvVarKeys.RETENTION_CLEANUP_IDLE_DAYS, "7") - ), - RETENTION_CLEANUP_CRON=os.environ.get( - EnvVarKeys.RETENTION_CLEANUP_CRON, "0 4 * * *" - ), - RETENTION_CLEANUP_PAGE_SIZE=int( - os.environ.get(EnvVarKeys.RETENTION_CLEANUP_PAGE_SIZE, "200") - ), - RETENTION_CLEANUP_MAX_IN_FLIGHT=int( - os.environ.get(EnvVarKeys.RETENTION_CLEANUP_MAX_IN_FLIGHT, "20") - ), -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `make test FILE=tests/unit/config/test_retention_cleanup_env.py` -Expected: PASS (both tests). 
- -- [ ] **Step 5: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/config/environment_variables.py tests/unit/config/test_retention_cleanup_env.py -git commit -m "feat(retention): add scheduled-cleanup configuration env vars" -``` - ---- - -## Task 2: Discovery query — `list_cleanup_candidate_ids` - -**Files:** -- Modify: `agentex/src/domain/repositories/task_repository.py` -- Test: `agentex/tests/integration/test_retention_cleanup_discovery.py` - -- [ ] **Step 1: Write the failing integration test** - -Create `agentex/tests/integration/test_retention_cleanup_discovery.py`. It seeds rows directly via SQLAlchemy core (so we can control `updated_at` / `cleaned_at`), then asserts the query's filtering and keyset paging. - -```python -from datetime import UTC, datetime, timedelta - -import pytest -from sqlalchemy import insert - -from src.adapters.orm import AgentORM, TaskAgentORM, TaskORM -from src.domain.entities.tasks import TaskStatus - - -async def _seed_agent(session, agent_id: str, name: str) -> None: - await session.execute( - insert(AgentORM).values( - id=agent_id, - name=name, - description="seed", - acp_url=f"http://{agent_id}:8000", - acp_type="sync", - ) - ) - - -async def _seed_task( - session, - *, - task_id: str, - agent_id: str, - updated_at: datetime, - cleaned_at: datetime | None, - status: TaskStatus = TaskStatus.COMPLETED, -) -> None: - await session.execute( - insert(TaskORM).values( - id=task_id, - name=task_id, - status=status, - updated_at=updated_at, - cleaned_at=cleaned_at, - ) - ) - await session.execute( - insert(TaskAgentORM).values(task_id=task_id, agent_id=agent_id) - ) - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_discovery_filters_and_keyset_paging(isolated_repositories): - repo = isolated_repositories["task_repository"] - now = datetime.now(UTC) - old = now - timedelta(days=30) - - async with isolated_repositories["postgres_rw_session_factory"]() as 
session: - await _seed_agent(session, "agent-allowed", "allowed-agent") - await _seed_agent(session, "agent-other", "other-agent") - # idle + allowlisted + not cleaned -> eligible - await _seed_task(session, task_id="t-aaa", agent_id="agent-allowed", updated_at=old, cleaned_at=None) - await _seed_task(session, task_id="t-bbb", agent_id="agent-allowed", updated_at=old, cleaned_at=None) - # recently active (updated_at recent) -> excluded by pre-filter - await _seed_task(session, task_id="t-fresh", agent_id="agent-allowed", updated_at=now, cleaned_at=None) - # already cleaned -> excluded - await _seed_task(session, task_id="t-clean", agent_id="agent-allowed", updated_at=old, cleaned_at=old) - # idle but NOT on allowlist -> excluded - await _seed_task(session, task_id="t-other", agent_id="agent-other", updated_at=old, cleaned_at=None) - await session.commit() - - # Full page: only the two eligible ids, ordered by id ascending. - ids = await repo.list_cleanup_candidate_ids( - idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=100 - ) - assert ids == ["t-aaa", "t-bbb"] - - # Keyset paging: limit=1 then resume after the first id. 
- page1 = await repo.list_cleanup_candidate_ids( - idle_days=7, agent_names=["allowed-agent"], after_id=None, limit=1 - ) - assert page1 == ["t-aaa"] - page2 = await repo.list_cleanup_candidate_ids( - idle_days=7, agent_names=["allowed-agent"], after_id="t-aaa", limit=1 - ) - assert page2 == ["t-bbb"] - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_discovery_empty_allowlist_returns_nothing(isolated_repositories): - repo = isolated_repositories["task_repository"] - ids = await repo.list_cleanup_candidate_ids( - idle_days=7, agent_names=[], after_id=None, limit=100 - ) - assert ids == [] -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `make test FILE=tests/integration/test_retention_cleanup_discovery.py` -Expected: FAIL — `AttributeError: 'TaskRepository' object has no attribute 'list_cleanup_candidate_ids'`. - -- [ ] **Step 3: Implement the query method** - -In `agentex/src/domain/repositories/task_repository.py`, add the import at the top (the module already imports `select`, `update` from sqlalchemy and `Sequence` from collections.abc): - -```python -from datetime import UTC, datetime, timedelta -``` - -Add this method to `TaskRepository` (e.g. right after `list_with_join`): - -```python - async def list_cleanup_candidate_ids( - self, - *, - idle_days: int, - agent_names: Sequence[str], - after_id: str | None, - limit: int, - ) -> list[str]: - """ - Return ids of tasks eligible for scheduled retention cleanup. - - Cheap, index-friendly PRE-FILTER only — the authoritative idle / status / - unprocessed-events checks live in TaskRetentionService.clean_task. This - deliberately omits a status filter: status is race-prone (a task can flip - to RUNNING between this query and the clean call), so the trustworthy - RUNNING guard is enforced at clean-time. `updated_at < cutoff` is a correct - superset of truly-idle tasks (true idleness also requires the latest Mongo - message to predate the cutoff), so we never under-include. 
- - Keyset-paginated by id ascending; pass the last returned id as `after_id` - to fetch the next page. Fail-closed: empty `agent_names` returns []. - """ - if not agent_names: - return [] - - cutoff = datetime.now(UTC) - timedelta(days=idle_days) - query = ( - select(TaskORM.id) - .join(TaskAgentORM, TaskORM.id == TaskAgentORM.task_id) - .join(AgentORM, TaskAgentORM.agent_id == AgentORM.id) - .where( - TaskORM.cleaned_at.is_(None), - TaskORM.updated_at < cutoff, - AgentORM.name.in_(list(agent_names)), - ) - .order_by(TaskORM.id.asc()) - .limit(limit) - .distinct() - ) - if after_id is not None: - query = query.where(TaskORM.id > after_id) - - async with self.start_async_db_session(False) as session: - result = await session.execute(query) - return [row[0] for row in result.all()] -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `make test FILE=tests/integration/test_retention_cleanup_discovery.py` -Expected: PASS (both tests). - -> If the `AgentORM` / `TaskORM` inserts fail on a NOT NULL column, add the missing -> column to the corresponding `_seed_*` helper using the column's value from -> `src/adapters/orm.py` — do not change the query. - -- [ ] **Step 5: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/domain/repositories/task_repository.py tests/integration/test_retention_cleanup_discovery.py -git commit -m "feat(retention): add keyset-paginated cleanup-candidate discovery query" -``` - ---- - -## Task 3: Use-case factory for worker context - -**Files:** -- Create: `agentex/src/temporal/task_retention_factory.py` - -No dedicated test (it's pure wiring exercised by Task 4's tests and at runtime). 
Verified against real constructor signatures: -`TaskRepository`/`EventRepository`/`AgentTaskTrackerRepository(rw_maker, ro_maker)`, -`TaskMessageRepository(db)`, `TaskStateRepository(db)`, `TaskMessageService(message_repository=...)`, -`TemporalAdapter(temporal_client=...)`, `TaskRetentionService(...)`, `TaskRetentionUseCase(retention_service=...)`. - -- [ ] **Step 1: Create the factory** - -Create `agentex/src/temporal/task_retention_factory.py`: - -```python -""" -Construct a TaskRetentionUseCase outside FastAPI's Depends DI, for use inside -Temporal worker processes. Mirrors the manual-wiring pattern in -run_healthcheck_workflow.py (repositories built from session makers). -""" - -from src.adapters.temporal.adapter_temporal import TemporalAdapter -from src.config.dependencies import ( - GlobalDependencies, - database_async_read_only_session_maker, - database_async_read_write_engine, - database_async_read_write_session_maker, - httpx_client, -) -from src.domain.repositories.agent_task_tracker_repository import ( - AgentTaskTrackerRepository, -) -from src.domain.repositories.event_repository import EventRepository -from src.domain.repositories.task_message_repository import TaskMessageRepository -from src.domain.repositories.task_repository import TaskRepository -from src.domain.repositories.task_state_repository import TaskStateRepository -from src.domain.services.task_message_service import TaskMessageService -from src.domain.services.task_retention_service import TaskRetentionService -from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase - - -def build_task_retention_use_case( - global_dependencies: GlobalDependencies, -) -> TaskRetentionUseCase: - """Wire a TaskRetentionUseCase from an already-loaded GlobalDependencies.""" - engine = database_async_read_write_engine() - rw_session_maker = database_async_read_write_session_maker(engine) - ro_session_maker = database_async_read_only_session_maker(engine) - - task_repository = 
TaskRepository(rw_session_maker, ro_session_maker) - event_repository = EventRepository(rw_session_maker, ro_session_maker) - agent_task_tracker_repository = AgentTaskTrackerRepository( - rw_session_maker, ro_session_maker - ) - - task_message_repository = TaskMessageRepository(global_dependencies.mongodb_database) - task_state_repository = TaskStateRepository(global_dependencies.mongodb_database) - task_message_service = TaskMessageService(message_repository=task_message_repository) - - temporal_adapter = TemporalAdapter( - temporal_client=global_dependencies.temporal_client - ) - - retention_service = TaskRetentionService( - task_repository=task_repository, - task_message_service=task_message_service, - task_message_repository=task_message_repository, - task_state_repository=task_state_repository, - event_repository=event_repository, - agent_task_tracker_repository=agent_task_tracker_repository, - temporal_adapter=temporal_adapter, - httpx_client=httpx_client(), - ) - return TaskRetentionUseCase(retention_service=retention_service) -``` - -- [ ] **Step 2: Verify it imports** - -Run: `cd agentex && uv run python -c "from src.temporal.task_retention_factory import build_task_retention_use_case; print('ok')"` -Expected: prints `ok` (no ImportError). - -- [ ] **Step 3: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/temporal/task_retention_factory.py -git commit -m "feat(retention): add worker-context factory for TaskRetentionUseCase" -``` - ---- - -## Task 4: Cleanup activities - -**Files:** -- Create: `agentex/src/temporal/activities/retention_cleanup_activities.py` -- Test: `agentex/tests/unit/temporal/test_retention_cleanup_activities.py` - -The activities cross the Temporal boundary with JSON-native types only. `clean_task` -returns a dict: `{"task_id", "status": "cleaned"|"skipped", "reason", "messages_deleted", "task_states_deleted", "events_deleted"}`. 
- -- [ ] **Step 1: Write the failing tests** - -Create `agentex/tests/unit/temporal/test_retention_cleanup_activities.py`: - -```python -from datetime import UTC, datetime -from unittest.mock import AsyncMock - -import pytest - -from src.domain.entities.task_retention import TaskCleanupResultEntity -from src.domain.exceptions import ClientError -from src.temporal.activities.retention_cleanup_activities import ( - RetentionCleanupActivities, -) - - -@pytest.mark.unit -@pytest.mark.asyncio -async def test_find_cleanup_candidates_delegates_to_repo(): - repo = AsyncMock() - repo.list_cleanup_candidate_ids.return_value = ["t1", "t2"] - activities = RetentionCleanupActivities(task_repository=repo, use_case=AsyncMock()) - - result = await activities.find_cleanup_candidates( - after_id=None, limit=200, idle_days=7, agent_names=["a"] - ) - - assert result == ["t1", "t2"] - repo.list_cleanup_candidate_ids.assert_awaited_once_with( - idle_days=7, agent_names=["a"], after_id=None, limit=200 - ) - - -@pytest.mark.unit -@pytest.mark.asyncio -async def test_clean_task_cleaned_outcome(): - use_case = AsyncMock() - use_case.clean_task.return_value = TaskCleanupResultEntity( - task_id="t1", - cleaned_at=datetime.now(UTC), - messages_deleted=3, - task_states_deleted=1, - events_deleted=2, - ) - activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) - - outcome = await activities.clean_task(task_id="t1", idle_days=7) - - assert outcome["status"] == "cleaned" - assert outcome["task_id"] == "t1" - assert outcome["messages_deleted"] == 3 - use_case.clean_task.assert_awaited_once_with(task_id="t1", force=False, idle_days=7) - - -@pytest.mark.unit -@pytest.mark.asyncio -async def test_clean_task_clienterror_maps_to_skipped(): - use_case = AsyncMock() - use_case.clean_task.side_effect = ClientError("Cannot clean task t1: status is RUNNING (active)") - activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) - - outcome = await 
activities.clean_task(task_id="t1", idle_days=7) - - assert outcome["status"] == "skipped" - assert "RUNNING" in outcome["reason"] - assert outcome["task_id"] == "t1" - - -@pytest.mark.unit -@pytest.mark.asyncio -async def test_clean_task_unexpected_error_propagates(): - use_case = AsyncMock() - use_case.clean_task.side_effect = RuntimeError("mongo timeout") - activities = RetentionCleanupActivities(task_repository=AsyncMock(), use_case=use_case) - - with pytest.raises(RuntimeError): - await activities.clean_task(task_id="t1", idle_days=7) -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py` -Expected: FAIL — module `retention_cleanup_activities` does not exist. - -- [ ] **Step 3: Implement the activities** - -Create `agentex/src/temporal/activities/retention_cleanup_activities.py`: - -```python -""" -Temporal activities for the scheduled task-retention cleanup sweep. - -Two activities: -- find_cleanup_candidates: cheap pre-filtered, keyset-paginated discovery. -- clean_task: delegates to TaskRetentionUseCase.clean_task; catches ClientError - (the three policy/safety refusals) and maps it to a 'skipped' outcome so the - caller's child workflow completes cleanly. Genuine transient errors propagate - so Temporal retries them. - -Boundary types are JSON-native (the backend data converter does not serialize -Pydantic models). 
-""" - -from src.domain.exceptions import ClientError -from src.domain.repositories.task_repository import TaskRepository -from src.domain.use_cases.task_retention_use_case import TaskRetentionUseCase -from src.utils.logging import make_logger -from temporalio import activity - -logger = make_logger(__name__) - -FIND_CLEANUP_CANDIDATES_ACTIVITY = "find_cleanup_candidates_activity" -CLEAN_TASK_ACTIVITY = "clean_task_activity" - - -class RetentionCleanupActivities: - def __init__( - self, - task_repository: TaskRepository, - use_case: TaskRetentionUseCase, - ): - self.task_repository = task_repository - self.use_case = use_case - - @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) - async def find_cleanup_candidates( - self, - after_id: str | None, - limit: int, - idle_days: int, - agent_names: list[str], - ) -> list[str]: - return await self.task_repository.list_cleanup_candidate_ids( - idle_days=idle_days, - agent_names=agent_names, - after_id=after_id, - limit=limit, - ) - - @activity.defn(name=CLEAN_TASK_ACTIVITY) - async def clean_task(self, task_id: str, idle_days: int) -> dict: - try: - result = await self.use_case.clean_task( - task_id=task_id, force=False, idle_days=idle_days - ) - return { - "task_id": result.task_id, - "status": "cleaned", - "reason": None, - "messages_deleted": result.messages_deleted, - "task_states_deleted": result.task_states_deleted, - "events_deleted": result.events_deleted, - } - except ClientError as e: - # Expected policy/safety refusal (RUNNING / not idle / unprocessed - # events). Backstop for the rare race the pre-filter can't catch. 
- logger.info( - "task_cleanup_skipped", - extra={"task_id": task_id, "reason": str(e)}, - ) - return { - "task_id": task_id, - "status": "skipped", - "reason": str(e), - "messages_deleted": 0, - "task_states_deleted": 0, - "events_deleted": 0, - } -``` - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py` -Expected: PASS (all four). - -- [ ] **Step 5: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/temporal/activities/retention_cleanup_activities.py tests/unit/temporal/test_retention_cleanup_activities.py -git commit -m "feat(retention): add cleanup discovery + clean activities" -``` - ---- - -## Task 5: Sweep + per-task child workflows - -**Files:** -- Create: `agentex/src/temporal/workflows/retention_cleanup_workflow.py` -- Test: `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py` - -- [ ] **Step 1: Write the failing workflow test** - -Create `agentex/tests/unit/temporal/test_retention_cleanup_workflow.py`. It runs the real workflows in a time-skipping `WorkflowEnvironment` with **mocked activities** so no DB is needed. - -```python -import uuid - -import pytest -from temporalio import activity -from temporalio.testing import WorkflowEnvironment -from temporalio.worker import UnsandboxedWorkflowRunner, Worker - -from src.temporal.activities.retention_cleanup_activities import ( - CLEAN_TASK_ACTIVITY, - FIND_CLEANUP_CANDIDATES_ACTIVITY, -) -from src.temporal.workflows.retention_cleanup_workflow import ( - RetentionCleanupSweepWorkflow, - RetentionCleanupTaskWorkflow, -) - - -@pytest.mark.unit -@pytest.mark.asyncio -async def test_sweep_cleans_all_pages_and_aggregates(): - # Two pages of candidates then empty; one task is skipped, one fails on every attempt (retries exhausted) and is counted failed. 
- pages = {None: ["t1", "t2"], "t2": ["t3"], "t3": []} - - @activity.defn(name=FIND_CLEANUP_CANDIDATES_ACTIVITY) - async def fake_find(after_id, limit, idle_days, agent_names) -> list[str]: - return pages[after_id] - - @activity.defn(name=CLEAN_TASK_ACTIVITY) - async def fake_clean(task_id: str, idle_days: int) -> dict: - if task_id == "t2": - return {"task_id": task_id, "status": "skipped", "reason": "RUNNING", - "messages_deleted": 0, "task_states_deleted": 0, "events_deleted": 0} - if task_id == "t3": - raise RuntimeError("permanent failure") - return {"task_id": task_id, "status": "cleaned", "reason": None, - "messages_deleted": 1, "task_states_deleted": 0, "events_deleted": 0} - - async with await WorkflowEnvironment.start_time_skipping() as env: - async with Worker( - env.client, - task_queue="test-retention", - workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], - activities=[fake_find, fake_clean], - workflow_runner=UnsandboxedWorkflowRunner(), - ): - summary = await env.client.execute_workflow( - RetentionCleanupSweepWorkflow.run, - { - "idle_days": 7, - "agent_names": ["a"], - "page_size": 2, - "max_in_flight": 2, - }, - id=f"sweep-{uuid.uuid4()}", - task_queue="test-retention", - ) - - assert summary["cleaned"] == 1 # t1 - assert summary["skipped"] == 1 # t2 - assert summary["failed"] == 1 # t3 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py` -Expected: FAIL — module `retention_cleanup_workflow` does not exist. - -- [ ] **Step 3: Implement the workflows** - -Create `agentex/src/temporal/workflows/retention_cleanup_workflow.py`: - -```python -""" -Scheduled task-retention cleanup workflows. - -RetentionCleanupSweepWorkflow: started by a Temporal Schedule. 
Pulls one page of -candidate task ids, fans out one child workflow per task (bounded by -max_in_flight), aggregates cleaned/skipped/failed counts, then continue_as_new's -to the next page so workflow history stays bounded regardless of backlog size. - -RetentionCleanupTaskWorkflow: per-task child. Invokes the clean activity, which -already maps the policy/safety ClientError refusals to a 'skipped' outcome; only -genuine transient errors surface as activity failures (and are retried). -""" - -import asyncio -from datetime import timedelta - -from src.temporal.activities.retention_cleanup_activities import ( - CLEAN_TASK_ACTIVITY, - FIND_CLEANUP_CANDIDATES_ACTIVITY, -) -from src.utils.logging import make_logger -from temporalio import workflow -from temporalio.common import RetryPolicy - -logger = make_logger(__name__) - - -def _chunked(items: list[str], size: int) -> list[list[str]]: - return [items[i : i + size] for i in range(0, len(items), size)] - - -@workflow.defn -class RetentionCleanupTaskWorkflow: - @workflow.run - async def run(self, args: dict) -> dict: - return await workflow.execute_activity( - CLEAN_TASK_ACTIVITY, - args=[args["task_id"], args["idle_days"]], - start_to_close_timeout=timedelta(seconds=60), - retry_policy=RetryPolicy( - maximum_attempts=3, - initial_interval=timedelta(seconds=1), - backoff_coefficient=2.0, - ), - ) - - -@workflow.defn -class RetentionCleanupSweepWorkflow: - @workflow.run - async def run(self, args: dict) -> dict: - idle_days = args["idle_days"] - agent_names = args["agent_names"] - page_size = args.get("page_size", 200) - max_in_flight = args.get("max_in_flight", 20) - after_id = args.get("after_id") - totals = args.get("totals", {"cleaned": 0, "skipped": 0, "failed": 0}) - - task_ids = await workflow.execute_activity( - FIND_CLEANUP_CANDIDATES_ACTIVITY, - args=[after_id, page_size, idle_days, agent_names], - start_to_close_timeout=timedelta(seconds=30), - retry_policy=RetryPolicy( - maximum_attempts=3, - 
initial_interval=timedelta(seconds=1), - backoff_coefficient=2.0, - ), - ) - - if not task_ids: - logger.info("retention_cleanup_sweep_completed", extra=totals) - return totals - - for batch in _chunked(task_ids, max_in_flight): - results = await asyncio.gather( - *[ - workflow.execute_child_workflow( - RetentionCleanupTaskWorkflow.run, - {"task_id": task_id, "idle_days": idle_days}, - id=f"retention-cleanup-task-{task_id}", - retry_policy=RetryPolicy(maximum_attempts=1), - ) - for task_id in batch - ], - return_exceptions=True, - ) - for result in results: - if isinstance(result, BaseException): - totals["failed"] += 1 - else: - status = result.get("status", "failed") - totals[status] = totals.get(status, 0) + 1 - - # Bound history: hand the next page to a fresh run. - workflow.continue_as_new( - { - "idle_days": idle_days, - "agent_names": agent_names, - "page_size": page_size, - "max_in_flight": max_in_flight, - "after_id": task_ids[-1], - "totals": totals, - } - ) -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py` -Expected: PASS. (Note: `WorkflowEnvironment.start_time_skipping()` downloads a test server on first run; ensure network access. If `temporalio` test deps are missing, install with `uv sync`.) 
- -- [ ] **Step 5: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/temporal/workflows/retention_cleanup_workflow.py tests/unit/temporal/test_retention_cleanup_workflow.py -git commit -m "feat(retention): add sweep + per-task cleanup workflows" -``` - ---- - -## Task 6: Register workflows + activities on the worker - -**Files:** -- Modify: `agentex/src/temporal/run_worker.py` - -- [ ] **Step 1: Add a worker factory for retention cleanup** - -In `agentex/src/temporal/run_worker.py`, add imports near the existing imports: - -```python -from src.config.dependencies import GlobalDependencies -from src.temporal.activities.retention_cleanup_activities import ( - RetentionCleanupActivities, -) -from src.temporal.task_retention_factory import build_task_retention_use_case -from src.temporal.workflows.retention_cleanup_workflow import ( - RetentionCleanupSweepWorkflow, - RetentionCleanupTaskWorkflow, -) -from src.domain.repositories.task_repository import TaskRepository -``` - -Add this factory function (mirrors `create_health_check_worker`): - -```python -def create_retention_cleanup_worker( - global_dependencies: GlobalDependencies, -) -> asyncio.Task: - """Create a worker that serves the retention-cleanup workflows + activities.""" - task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) - - engine = database_async_read_write_engine() - rw_session_maker = database_async_read_write_session_maker(engine) - ro_session_maker = database_async_read_only_session_maker(engine) - task_repository = TaskRepository(rw_session_maker, ro_session_maker) - use_case = build_task_retention_use_case(global_dependencies) - - retention_activities = RetentionCleanupActivities( - task_repository=task_repository, - use_case=use_case, - ) - - return asyncio.create_task( - run_worker( - task_queue=task_queue, - workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], - activities=[ - 
retention_activities.find_cleanup_candidates, - retention_activities.clean_task, - ], - max_workers=50, - max_concurrent_activities=50, - ) - ) -``` - -> **Note on `run_worker`'s global `health_check_worker`:** the existing `run_worker` -> assigns the worker to a module global and shuts that single global down in -> `finally`. Running two `run_worker` tasks in one process would clobber that -> global. For v1 the retention worker runs as its **own process** (see Task 8 — -> a separate docker-compose service / k8s deployment invoking -> `create_retention_cleanup_worker` via a `main()`), so it does not share the -> health-check process. Add a `main()` to this module that loads -> `GlobalDependencies` and awaits `create_retention_cleanup_worker(...)`, guarded -> so it only runs when invoked as the retention entrypoint. - -Add a retention entrypoint `main` (separate from the health-check `main`): - -```python -async def run_retention_cleanup_worker_main() -> None: - global_dependencies = GlobalDependencies() - await global_dependencies.load() - worker_task = create_retention_cleanup_worker(global_dependencies) - await worker_task -``` - -- [ ] **Step 2: Verify imports/compile** - -Run: `cd agentex && uv run python -c "from src.temporal.run_worker import create_retention_cleanup_worker, run_retention_cleanup_worker_main; print('ok')"` -Expected: prints `ok`. - -- [ ] **Step 3: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/temporal/run_worker.py -git commit -m "feat(retention): register cleanup workflows + activities on worker" -``` - ---- - -## Task 7: Schedule bootstrap script - -**Files:** -- Create: `agentex/src/temporal/run_retention_cleanup_schedule.py` - -This mirrors `run_healthcheck_workflow.py`: a startup script that, when enabled, -creates (or no-ops if already present) the Temporal Schedule. 
- -- [ ] **Step 1: Create the bootstrap script** - -Create `agentex/src/temporal/run_retention_cleanup_schedule.py`: - -```python -""" -Create the Temporal Schedule that drives the scheduled task-retention cleanup. - -Runs at startup (mirrors run_healthcheck_workflow.py). No-op unless -RETENTION_CLEANUP_ENABLED is true and an agent allowlist is configured -(fail-closed). Idempotent: if the schedule already exists, it is left as-is. -""" - -import asyncio - -from src.adapters.temporal.adapter_temporal import TemporalAdapter -from src.adapters.temporal.client_factory import TemporalClientFactory -from src.adapters.temporal.exceptions import TemporalScheduleAlreadyExistsError -from src.config.dependencies import GlobalDependencies -from src.config.environment_variables import EnvironmentVariables -from src.temporal.run_worker import AGENTEX_SERVER_TASK_QUEUE -from src.temporal.workflows.retention_cleanup_workflow import ( - RetentionCleanupSweepWorkflow, -) -from src.utils.logging import make_logger - -logger = make_logger(__name__) - -SCHEDULE_ID = "retention-cleanup-sweep" -WORKFLOW_ID = "retention-cleanup-sweep" - - -async def main() -> None: - global_dependencies = GlobalDependencies() - await global_dependencies.load() - - env = EnvironmentVariables.refresh() - if not env or not env.RETENTION_CLEANUP_ENABLED: - logger.info("Retention cleanup is not enabled; skipping schedule creation") - return - if not env.RETENTION_CLEANUP_AGENT_ALLOWLIST: - logger.warning( - "Retention cleanup enabled but agent allowlist is empty (fail-closed); " - "skipping schedule creation" - ) - return - if not TemporalClientFactory.is_temporal_configured(env): - logger.error("Temporal is not configured; skipping schedule creation") - return - - task_queue = env.AGENTEX_SERVER_TASK_QUEUE or AGENTEX_SERVER_TASK_QUEUE - adapter = TemporalAdapter(temporal_client=global_dependencies.temporal_client) - - workflow_args = { - "idle_days": env.RETENTION_CLEANUP_IDLE_DAYS, - "agent_names": 
env.RETENTION_CLEANUP_AGENT_ALLOWLIST, - "page_size": env.RETENTION_CLEANUP_PAGE_SIZE, - "max_in_flight": env.RETENTION_CLEANUP_MAX_IN_FLIGHT, - } - - try: - await adapter.create_schedule( - schedule_id=SCHEDULE_ID, - workflow=RetentionCleanupSweepWorkflow.run, - workflow_id=WORKFLOW_ID, - args=[workflow_args], - task_queue=task_queue, - cron_expressions=[env.RETENTION_CLEANUP_CRON], - ) - logger.info( - "Created retention-cleanup schedule", - extra={"cron": env.RETENTION_CLEANUP_CRON, "args": workflow_args}, - ) - except TemporalScheduleAlreadyExistsError: - logger.info("Retention-cleanup schedule already exists; leaving as-is") - - -if __name__ == "__main__": - asyncio.run(main()) -``` - -- [ ] **Step 2: Verify imports/compile** - -Run: `cd agentex && uv run python -c "import src.temporal.run_retention_cleanup_schedule as m; print(m.SCHEDULE_ID)"` -Expected: prints `retention-cleanup-sweep`. - -- [ ] **Step 3: Lint + commit** - -```bash -cd agentex && uv run ruff check src/ --fix && uv run ruff format src/ -git add src/temporal/run_retention_cleanup_schedule.py -git commit -m "feat(retention): add cleanup schedule bootstrap script" -``` - ---- - -## Task 8: Wire local dev (docker-compose) + docs - -**Files:** -- Modify: `agentex/docker-compose.yml` - -- [ ] **Step 1: Add the retention worker service + schedule bootstrap** - -In `agentex/docker-compose.yml`, add a worker service modeled on the existing -`agentex-temporal-worker` service, but running the retention entrypoint, and add a -schedule-bootstrap invocation. Use the same image/build, env, and `depends_on` as -`agentex-temporal-worker`. 
Set its command to run the retention worker: - -```yaml - agentex-retention-cleanup-worker: - # (copy build/image/env/depends_on/networks from agentex-temporal-worker) - command: >- - python -c "import asyncio; from src.temporal.run_worker import run_retention_cleanup_worker_main; asyncio.run(run_retention_cleanup_worker_main())" - environment: - # inherit the temporal-worker env, plus: - RETENTION_CLEANUP_ENABLED: "${RETENTION_CLEANUP_ENABLED:-false}" - RETENTION_CLEANUP_AGENT_ALLOWLIST: "${RETENTION_CLEANUP_AGENT_ALLOWLIST:-}" - RETENTION_CLEANUP_IDLE_DAYS: "${RETENTION_CLEANUP_IDLE_DAYS:-7}" - RETENTION_CLEANUP_CRON: "${RETENTION_CLEANUP_CRON:-0 4 * * *}" - RETENTION_CLEANUP_PAGE_SIZE: "${RETENTION_CLEANUP_PAGE_SIZE:-200}" - RETENTION_CLEANUP_MAX_IN_FLIGHT: "${RETENTION_CLEANUP_MAX_IN_FLIGHT:-20}" -``` - -Add the schedule bootstrap to the `agentex` API service startup command, right -after the health-check workflow bootstrap (the existing command runs -`python src/temporal/run_healthcheck_workflow.py`); append: - -``` -python src/temporal/run_retention_cleanup_schedule.py -``` - -so the schedule is (re)asserted on API startup, gated by `RETENTION_CLEANUP_ENABLED`. - -- [ ] **Step 2: Validate compose syntax** - -Run: `cd agentex && docker compose config >/dev/null && echo "compose ok"` -Expected: prints `compose ok` (no YAML/interpolation errors). - -- [ ] **Step 3: Commit** - -```bash -git add agentex/docker-compose.yml -git commit -m "feat(retention): wire cleanup worker + schedule bootstrap for local dev" -``` - ---- - -## Task 9: Full test sweep + final verification - -- [ ] **Step 1: Run the full new test surface** - -```bash -cd agentex -make test FILE=tests/unit/config/test_retention_cleanup_env.py -make test FILE=tests/unit/temporal/test_retention_cleanup_activities.py -make test FILE=tests/unit/temporal/test_retention_cleanup_workflow.py -make test FILE=tests/integration/test_retention_cleanup_discovery.py -``` -Expected: all PASS. 
- - [ ] **Step 2: Lint the whole changed surface** - -```bash -cd agentex && uv run ruff check src/ && uv run ruff format --check src/ -``` -Expected: no errors. - -- [ ] **Step 3: Manual smoke (optional, local)** - -With `RETENTION_CLEANUP_ENABLED=true` and `RETENTION_CLEANUP_AGENT_ALLOWLIST=<agent-name>` -(must be non-empty: an empty allowlist is fail-closed and no schedule is created), -`make dev`, then in the Temporal UI (http://localhost:8080) confirm a Schedule -`retention-cleanup-sweep` exists; trigger it manually and confirm a -`RetentionCleanupSweepWorkflow` run completes with a summary. - -- [ ] **Step 4: Final commit (if any lint/fixups remain)** - -```bash -cd agentex && git add -A && git commit -m "chore(retention): lint + test fixups for scheduled cleanup" || echo "nothing to commit" -``` - ---- - -## Notes / Out of Scope - -- **Worker must run in the deployed env.** This plan wires the worker for local - docker-compose. The deployed (k8s) environment must run an agentex-backend - Temporal worker on the `agentex-server` queue for the Schedule to execute — - tracked as a separate infra change (see spec "Open prerequisites"). -- **No export sink** in v1 (clean only), per the spec. -- **Audit trail** is the existing `task_cleanup_completed` structured log (emitted - inside `clean_task`) plus the new `task_cleanup_skipped` and - `retention_cleanup_sweep_completed` logs — faceted in Datadog. No new table. 
diff --git a/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md b/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md deleted file mode 100644 index ae819d7f..00000000 --- a/docs/superpowers/specs/2026-06-03-scheduled-task-retention-cleanup-design.md +++ /dev/null @@ -1,190 +0,0 @@ -# Scheduled Task-Retention Cleanup Workflow — Design - -**Date:** 2026-06-03 -**Status:** Approved (pending spec review) -**Author:** Stas Moreinis - -## Background - -A data-retention requirement calls for isolating project data and avoiding -long-lived chat/task data in the shared setup: keep Agentex chat/task data only -while a conversation is "active", and auto-clean it after a configurable idle -window (default 7 days since last interaction). - -The export / clean / rehydrate building blocks already landed (PR #243): -`TaskRetentionUseCase` and `TaskRetentionService.clean_task(...)` are written so -that the same logic backs both the HTTP admin endpoints and a scheduled cleanup -caller. `clean_task` is **idempotent**, performs its own **authoritative idle -check** (`max(task.updated_at, latest_message.created_at) < now - idle_days`), -and **refuses** (raises `ClientError`) for three safety/policy cases: the task is -`RUNNING`, it is not idle long enough (when the threshold is enforced), or it has -unprocessed events past the `agent_task_tracker` cursors. If the task is already -cleaned (`cleaned_at IS NOT NULL`) it returns an empty result rather than raising. - -This document designs the missing piece: a **regularly scheduled sweep** that -discovers idle tasks and drives them through `clean_task`. - -### Scope (v1) - -- **In scope:** A Temporal Schedule + sweep workflow that finds idle tasks - belonging to an allowlisted set of agents and cleans them, gated by a feature - flag. Clean only. -- **Out of scope (explicitly):** - - Exporting task content to an external sink before cleanup. 
Per the retention - discussion, v1 cleanup does **not** export anywhere; persisted chat history - lives in the consuming product's approved store and the export/rehydrate APIs - remain available for manual testing and a later full-restore path. - - Rehydrate wiring. - - **Deploying the Temporal worker in the target (k8s) environment.** The worker - process and Schedule infrastructure exist in code and docker-compose, but the - deployed environment may not yet run a backend Temporal worker. That is an - infra prerequisite tracked separately; this design is the code change. - -## Goals - -1. Periodically clean idle tasks for an allowlisted set of agents, with zero - behavior change in any environment until explicitly enabled. -2. Reuse the existing, idempotent `clean_task` path verbatim — no duplicated - deletion logic. -3. Be resilient: one task's failure or refusal never aborts the sweep; the run - is safe to retry and safe to replay after a worker crash. -4. Bound resource usage (Temporal history, concurrent deletes) regardless of - backlog size. - -## Configuration - -All configuration is via environment variables (consistent with the existing -`ENABLE_HEALTH_CHECK_WORKFLOW` pattern). The master flag (`RETENTION_CLEANUP_ENABLED`) -and `RETENTION_CLEANUP_CRON` are read at **schedule-bootstrap** time — they are -inherent schedule properties (whether the schedule exists, and its cadence). The -**policy** (allowlist, idle threshold, paging) is deliberately **not** baked into -the schedule: it is read at **sweep run time** by a `load_cleanup_config` activity -from the worker's environment, then carried across the sweep's `continue_as_new` -pages. This means changing the allowlist / idle days / paging is a matter of -editing the env var and restarting the worker — the next scheduled run picks up -the new policy with **no schedule recreation**. (Changing the *cron* cadence still -requires updating/recreating the schedule, since cron is a schedule property.) 
- -| Env var | Meaning | Default | -|---|---|---| -| `RETENTION_CLEANUP_ENABLED` | Master on/off. When false, the schedule is not created and the sweep is a no-op. | `false` | -| `RETENTION_CLEANUP_AGENT_ALLOWLIST` | Comma-separated agent **names**. Only tasks owned by these agents are eligible. Empty ⇒ nothing eligible (fail-closed). | `""` | -| `RETENTION_CLEANUP_IDLE_DAYS` | Idle threshold in days. | `7` | -| `RETENTION_CLEANUP_CRON` | Cron expression for the schedule. | `0 4 * * *` (daily 04:00) | -| `RETENTION_CLEANUP_PAGE_SIZE` | Candidate page size per discovery activity call. | `200` | -| `RETENTION_CLEANUP_MAX_IN_FLIGHT` | Max concurrent per-task child workflows. | `20` | - -**Fail-closed:** an empty allowlist cleans nothing. The allowlist scopes the -blast radius to named agents only. - -## Architecture - -### Components - -| Component | File | Responsibility | -|---|---|---| -| `RetentionCleanupSweepWorkflow` | `agentex/src/temporal/workflows/retention_cleanup_workflow.py` | Paginate candidates → fan out child workflows in bounded batches → aggregate summary → `continue_as_new` across pages. | -| `RetentionCleanupTaskWorkflow` | same file | Per-task child workflow: invoke the clean activity, return a structured outcome. | -| `RetentionCleanupActivities` | `agentex/src/temporal/activities/retention_cleanup_activities.py` | `load_cleanup_config(...)` (reads policy from env), `find_cleanup_candidates(...)`, and `clean_task(...)` (the latter catches `ClientError` and maps it to a `skipped` outcome). | -| Discovery query | `agentex/src/domain/repositories/task_repository.py` (extend) | Keyset-paginated query for idle, uncleaned candidate task ids filtered by agent name. | -| Schedule bootstrap | `agentex/src/temporal/run_retention_cleanup_schedule.py` | On startup, when enabled + Temporal configured, create the Temporal Schedule with **no policy args** (cron + identity only; idempotent, leaves an existing schedule as-is). 
Mirrors `run_healthcheck_workflow.py`. | -| Worker registration | `agentex/src/temporal/run_worker.py` (edit) | Register both workflows + the activities on the `agentex-server` task queue. | - -### Data flow - -``` -Temporal Schedule (cron; created at bootstrap only when RETENTION_CLEANUP_ENABLED; no policy args) - └─> RetentionCleanupSweepWorkflow() # no policy baked in - ├─ (first page only) activity load_cleanup_config() -> {idle_days, allowlist, page_size, max_in_flight} - │ (read from worker env; carried across continue_as_new pages so a sweep stays consistent) - ├─ activity find_cleanup_candidates(cursor, limit, idle_days, allowlist) -> [task_id...] - ├─ for each batch (size ≤ max_in_flight) of task_ids: - │ start child RetentionCleanupTaskWorkflow(task_id, idle_days) - │ └─ activity clean_task(task_id, idle_days) # enforce_idle_threshold=True - │ -> outcome: cleaned{counts} | skipped{reason} - │ (raises only on transient/infra errors -> Temporal retries) - ├─ accumulate running totals: cleaned / skipped(by reason) / failed - └─ if another page exists: continue_as_new(next_cursor, running_totals) - else: emit structured summary log and complete -``` - -### Discovery query - -```sql -SELECT t.id -FROM tasks t -JOIN task_agents ta ON ta.task_id = t.id -JOIN agents a ON a.id = ta.agent_id -WHERE t.cleaned_at IS NULL - AND t.updated_at < (now() - make_interval(days => :idle_days)) - AND a.name = ANY(:allowlist) - AND t.id > :cursor -- keyset pagination -ORDER BY t.id -LIMIT :page_size; -``` - -Notes: - -- **No `status` filter.** `status` is the race-prone dimension — a task can flip - to `RUNNING` between this query and the clean call, so filtering it here gives - only a false sense of safety. The trustworthy RUNNING check is the - authoritative guard inside `clean_task` (evaluated at clean-time). 
Discovery is - therefore limited to stable, index-friendly columns (`cleaned_at`, - `updated_at`) plus the allowlist join; a rare RUNNING-but-stale task surfaces as - a candidate and is absorbed as `skipped{reason=running}` by the backstop. -- The `updated_at < cutoff` pre-filter is a **correct superset** of genuinely-idle - tasks: true idleness requires both `updated_at` **and** the latest Mongo message - to predate the cutoff, so the Postgres pre-filter can never exclude a truly-idle - task. It only over-includes (caught at clean-time), never under-includes. -- Keyset pagination by `id` (not OFFSET) keeps each page cheap and stable as rows - are cleaned mid-sweep. - -## Idleness & correctness - -- **Pre-filter (cheap, in discovery):** `cleaned_at IS NULL AND updated_at < cutoff`. -- **Authoritative (correctness-critical, in `clean_task`):** idle check including - the latest Mongo message timestamp, the RUNNING guard, and the unprocessed-events - guard. The sweep always runs with `enforce_idle_threshold=True` and **never** - forces. -- **Idempotency / replay safety:** `clean_task` no-ops on already-cleaned tasks and - is idempotent across all stores, so child-workflow retries and worker-crash - replays are safe. - -## Error handling - -- The `clean_task` activity **catches `ClientError`** (the three refusals) and - returns a structured `skipped{reason}` outcome; the child workflow completes - successfully. Pre-filtering keeps these rare; the catch handles the unavoidable - races (a message/event landing between discovery and clean). -- Genuine transient errors (Postgres/Mongo) **propagate**, so Temporal's default - RetryPolicy retries the activity. A child that still fails after retries is - counted as `failed` by the parent and **does not abort the sweep**. -- The parent emits a structured summary log (`cleaned`, `skipped` by reason, - `failed`) for Datadog faceting, consistent with the existing - `task_cleanup_completed` forensic log emitted by `clean_task`. 
- -## Scale & safety - -- `continue_as_new` per page bounds workflow history irrespective of backlog size. -- `max_in_flight` caps concurrent child workflows to avoid a thundering herd of - deletes against Mongo/Postgres. -- Feature flag ⇒ no behavior change anywhere until explicitly enabled. -- Allowlist (fail-closed) ⇒ blast radius limited to named agents. - -## Testing - -- **Unit:** discovery query filters and keyset paging; activity skip-mapping - (`ClientError` → `skipped{reason}`); parent summary aggregation; fail-closed on - empty allowlist. -- **Integration (testcontainers — Postgres/Mongo):** seed idle, active, - already-cleaned, and not-yet-idle tasks across allowlisted and non-allowlisted - agents; run the activity layer; assert only the right tasks are cleaned and that - counts/skips match. -- **Workflow (Temporal `WorkflowEnvironment`):** fan-out correctness, - `continue_as_new` paging across multiple pages, and that a failed child does not - abort the sweep. - -## Open prerequisites (not built here) - -- Backend Temporal worker must actually run in the target deployed environment for - the Schedule to execute. Tracked as an infra change separate from this code. From 935f7cdadac167315c0a0a97d4eb48c57589f4da Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 17:15:02 -0700 Subject: [PATCH 18/20] refactor(retention): serve cleanup on the single agentex-server worker Merge the health-check and retention-cleanup Temporal workers into one process/container. Both sets of workflows and activities are now registered on the same `agentex-server` task queue worker, preventing tasks from landing on a worker that has no handler for their type. Remove the separate `agentex-retention-cleanup-worker` compose service and `run_retention_cleanup_worker_main` entrypoint; add the four runtime-policy env vars to the surviving `agentex-temporal-worker` service. 
--- agentex/docker-compose.yml | 47 +---------------- agentex/src/temporal/run_worker.py | 85 +++++++++++------------------- 2 files changed, 32 insertions(+), 100 deletions(-) diff --git a/agentex/docker-compose.yml b/agentex/docker-compose.yml index f82c51d4..10ca14c3 100644 --- a/agentex/docker-compose.yml +++ b/agentex/docker-compose.yml @@ -235,51 +235,8 @@ services: - MONGODB_URI=mongodb://agentex-mongodb:27017 - MONGODB_DATABASE_NAME=agentex - AGENTEX_SERVER_TASK_QUEUE=agentex-server - volumes: - - .:/app:cached - depends_on: - agentex-temporal: - condition: service_healthy - agentex-redis: - condition: service_healthy - agentex-postgres: - condition: service_healthy - agentex-mongodb: - condition: service_healthy - networks: - - agentex-network - command: | - bash -c " - echo 'Starting Temporal Worker...' && - export TEMPORAL_ADDRESS=agentex-temporal:7233 && - export TEMPORAL_HOST=agentex-temporal && - export MONGODB_URI=mongodb://agentex-mongodb:27017 && - python src/temporal/run_worker.py - " - user: root - restart: unless-stopped - - agentex-retention-cleanup-worker: - container_name: agentex-retention-cleanup-worker - build: - context: .. 
- dockerfile: agentex/Dockerfile - target: dev - args: - SOURCE_DIR: agentex - environment: - - ENVIRONMENT=development - - DATABASE_URL=postgresql://postgres:postgres@agentex-postgres:5432/agentex - - TEMPORAL_ADDRESS=agentex-temporal:7233 - - TEMPORAL_HOST=agentex-temporal - - REDIS_URL=redis://agentex-redis:6379 - - MONGODB_URI=mongodb://agentex-mongodb:27017 - - MONGODB_DATABASE_NAME=agentex - - AGENTEX_SERVER_TASK_QUEUE=agentex-server - - RETENTION_CLEANUP_ENABLED=${RETENTION_CLEANUP_ENABLED:-false} - RETENTION_CLEANUP_AGENT_ALLOWLIST=${RETENTION_CLEANUP_AGENT_ALLOWLIST:-} - RETENTION_CLEANUP_IDLE_DAYS=${RETENTION_CLEANUP_IDLE_DAYS:-7} - - RETENTION_CLEANUP_CRON=${RETENTION_CLEANUP_CRON:-0 4 * * *} - RETENTION_CLEANUP_PAGE_SIZE=${RETENTION_CLEANUP_PAGE_SIZE:-200} - RETENTION_CLEANUP_MAX_IN_FLIGHT=${RETENTION_CLEANUP_MAX_IN_FLIGHT:-20} volumes: @@ -297,11 +254,11 @@ services: - agentex-network command: | bash -c " - echo 'Starting Retention Cleanup Worker...' && + echo 'Starting Temporal Worker...' && export TEMPORAL_ADDRESS=agentex-temporal:7233 && export TEMPORAL_HOST=agentex-temporal && export MONGODB_URI=mongodb://agentex-mongodb:27017 && - python -c \"import asyncio; from src.temporal.run_worker import run_retention_cleanup_worker_main; asyncio.run(run_retention_cleanup_worker_main())\" + python src/temporal/run_worker.py " user: root restart: unless-stopped diff --git a/agentex/src/temporal/run_worker.py b/agentex/src/temporal/run_worker.py index d6c2c766..e826bcc1 100644 --- a/agentex/src/temporal/run_worker.py +++ b/agentex/src/temporal/run_worker.py @@ -130,63 +130,47 @@ async def run_worker( await health_check_worker.shutdown() -def create_health_check_worker( - agent_repo: AgentRepository, http_client: httpx.AsyncClient +def create_agentex_server_worker( + agent_repo: AgentRepository, + http_client: httpx.AsyncClient, + global_dependencies: GlobalDependencies, ) -> asyncio.Task: """ - Create a Health Check worker. 
+ Create the single Temporal worker that serves the `agentex-server` task queue. + + Registers ALL workflows + activities that run on this queue — health checks + AND retention cleanup — in one worker. Workers polling the same task queue + must register the same set of types (the queue is not typed), so these live + together in one worker rather than as separate processes/containers. """ - # Get task queue from environment or use default task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) - logger.info("Starting Temporal Health Check Worker") + logger.info("Starting agentex-server Temporal worker") logger.info(f"Task queue: {task_queue}") - # Create activities instance with dependencies health_check_activities = HealthCheckActivities( agent_repo=agent_repo, - http_client=httpx_client(), - ) - - # Extract activity methods - activities = [ - health_check_activities.check_status_activity, - health_check_activities.update_agent_status_activity, - ] - - # Create and run worker task - return asyncio.create_task( - run_worker( - task_queue=task_queue, - workflows=[HealthCheckWorkflow], - activities=activities, - max_workers=50, - max_concurrent_activities=50, - ) + http_client=http_client, ) - -def create_retention_cleanup_worker( - global_dependencies: GlobalDependencies, -) -> asyncio.Task: - """Create a worker that serves the retention-cleanup workflows + activities.""" - task_queue = os.environ.get("AGENTEX_SERVER_TASK_QUEUE", AGENTEX_SERVER_TASK_QUEUE) - - use_case = build_task_retention_use_case(global_dependencies) - # Reuse the repository the factory already built (avoids a second TaskRepository - # / connection pool for the same database). - task_repository = use_case.retention_service.task_repository - + retention_use_case = build_task_retention_use_case(global_dependencies) + # Reuse the repository the factory already built (one connection pool). 
retention_activities = RetentionCleanupActivities( - task_repository=task_repository, - use_case=use_case, + task_repository=retention_use_case.retention_service.task_repository, + use_case=retention_use_case, ) return asyncio.create_task( run_worker( task_queue=task_queue, - workflows=[RetentionCleanupSweepWorkflow, RetentionCleanupTaskWorkflow], + workflows=[ + HealthCheckWorkflow, + RetentionCleanupSweepWorkflow, + RetentionCleanupTaskWorkflow, + ], activities=[ + health_check_activities.check_status_activity, + health_check_activities.update_agent_status_activity, retention_activities.load_cleanup_config, retention_activities.find_cleanup_candidates, retention_activities.clean_task, @@ -197,32 +181,23 @@ def create_retention_cleanup_worker( ) -async def run_retention_cleanup_worker_main() -> None: - """Entrypoint: run the retention-cleanup worker as its own process.""" - global_dependencies = GlobalDependencies() - await global_dependencies.load() - worker_task = create_retention_cleanup_worker(global_dependencies) - await worker_task - - async def main() -> None: - """ - Main entry point for the Health Check worker. 
- """ + """Main entry point for the agentex-server Temporal worker.""" try: - # Initialize global dependencies for this thread await startup_global_dependencies() - # Create session maker + global_dependencies = GlobalDependencies() + engine = database_async_read_write_engine() session_maker = database_async_read_write_session_maker(engine) read_only_session_maker = database_async_read_only_session_maker(engine) agent_repo = AgentRepository(session_maker, read_only_session_maker) - health_check_worker_task = create_health_check_worker( + + worker_task = create_agentex_server_worker( agent_repo=agent_repo, http_client=httpx_client(), + global_dependencies=global_dependencies, ) - # Wait for the worker to complete - await health_check_worker_task + await worker_task except KeyboardInterrupt: logger.info("Received interrupt signal, shutting down worker...") From 76cdf521e5d5dfb429e815d5553ef29590c41b3a Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 17:54:55 -0700 Subject: [PATCH 19/20] fix(retention): pass RETENTION_CLEANUP_ENABLED to the worker for the runtime enabled gate --- agentex/docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/agentex/docker-compose.yml b/agentex/docker-compose.yml index 10ca14c3..4bedf1b2 100644 --- a/agentex/docker-compose.yml +++ b/agentex/docker-compose.yml @@ -235,6 +235,7 @@ services: - MONGODB_URI=mongodb://agentex-mongodb:27017 - MONGODB_DATABASE_NAME=agentex - AGENTEX_SERVER_TASK_QUEUE=agentex-server + - RETENTION_CLEANUP_ENABLED=${RETENTION_CLEANUP_ENABLED:-false} - RETENTION_CLEANUP_AGENT_ALLOWLIST=${RETENTION_CLEANUP_AGENT_ALLOWLIST:-} - RETENTION_CLEANUP_IDLE_DAYS=${RETENTION_CLEANUP_IDLE_DAYS:-7} - RETENTION_CLEANUP_PAGE_SIZE=${RETENTION_CLEANUP_PAGE_SIZE:-200} From 45802a7d2d08fbf60c62ee983eadf4aa8621c893 Mon Sep 17 00:00:00 2001 From: Stas Moreinis Date: Wed, 3 Jun 2026 18:00:28 -0700 Subject: [PATCH 20/20] fix(retention): address greptile review findings - carry the enabled flag across 
continue_as_new pages so the runtime kill switch stays consistent with the other carried policy fields - scope child workflow ids with the sweep run id to avoid collisions under a REJECT_DUPLICATE workflow-id-reuse policy - add a task_repository property to TaskRetentionUseCase instead of reaching through retention_service internals in the worker --- agentex/src/domain/use_cases/task_retention_use_case.py | 8 ++++++++ agentex/src/temporal/run_worker.py | 5 +++-- .../src/temporal/workflows/retention_cleanup_workflow.py | 7 ++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/agentex/src/domain/use_cases/task_retention_use_case.py b/agentex/src/domain/use_cases/task_retention_use_case.py index ef8ec620..3998a0df 100644 --- a/agentex/src/domain/use_cases/task_retention_use_case.py +++ b/agentex/src/domain/use_cases/task_retention_use_case.py @@ -7,6 +7,7 @@ TaskExportToUrlResultEntity, TaskSnapshotEntity, ) +from src.domain.repositories.task_repository import TaskRepository from src.domain.services.task_retention_service import DTaskRetentionService @@ -20,6 +21,13 @@ class TaskRetentionUseCase: def __init__(self, retention_service: DTaskRetentionService): self.retention_service = retention_service + @property + def task_repository(self) -> TaskRepository: + """Stable accessor for the underlying task repository so callers (e.g. 
the + Temporal worker) can reuse the same instance without reaching through the + service's internals.""" + return self.retention_service.task_repository + async def export_task(self, task_id: str) -> TaskSnapshotEntity: return await self.retention_service.export_task(task_id) diff --git a/agentex/src/temporal/run_worker.py b/agentex/src/temporal/run_worker.py index e826bcc1..091e8ca7 100644 --- a/agentex/src/temporal/run_worker.py +++ b/agentex/src/temporal/run_worker.py @@ -154,9 +154,10 @@ def create_agentex_server_worker( ) retention_use_case = build_task_retention_use_case(global_dependencies) - # Reuse the repository the factory already built (one connection pool). + # Reuse the repository the factory already built (avoids a duplicate + # TaskRepository) via the use case's stable accessor. retention_activities = RetentionCleanupActivities( - task_repository=retention_use_case.retention_service.task_repository, + task_repository=retention_use_case.task_repository, use_case=retention_use_case, ) diff --git a/agentex/src/temporal/workflows/retention_cleanup_workflow.py b/agentex/src/temporal/workflows/retention_cleanup_workflow.py index 4ade1b7f..4bc8cd32 100644 --- a/agentex/src/temporal/workflows/retention_cleanup_workflow.py +++ b/agentex/src/temporal/workflows/retention_cleanup_workflow.py @@ -93,13 +93,17 @@ async def run(self, args: dict | None = None) -> dict: logger.info("retention_cleanup_sweep_completed", extra=totals) return totals + # Scope child workflow IDs to this run so a task re-discovered in a later + # sweep (e.g. one that was skipped) doesn't collide with a prior cycle's + # completed child under a REJECT_DUPLICATE workflow-id-reuse policy. 
+ sweep_run_id = workflow.info().run_id[:8] for batch in _chunked(task_ids, max_in_flight): results = await asyncio.gather( *[ workflow.execute_child_workflow( RetentionCleanupTaskWorkflow.run, {"task_id": task_id, "idle_days": idle_days}, - id=f"retention-cleanup-task-{task_id}", + id=f"retention-cleanup-task-{sweep_run_id}-{task_id}", retry_policy=RetryPolicy(maximum_attempts=1), ) for task_id in batch @@ -115,6 +119,7 @@ async def run(self, args: dict | None = None) -> dict: workflow.continue_as_new( arg={ + "enabled": args.get("enabled", True), "idle_days": idle_days, "agent_names": agent_names, "page_size": page_size,