Skip to content

Commit 1d6eb13

Browse files
sjarmak and cursoragent committed
Updates: docs, configs, scripts, and workflow additions
- Doc and config updates (README, AGENT_INTERFACE, CONFIGS, ERROR_CATALOG, etc.)
- Script updates (aggregate_status, audit_traces, generate_manifest, and others)
- eval_matrix and selected_benchmark_tasks updates
- Add .github/workflows/docs-consistency.yml and roam.yml

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent d38c2af commit 1d6eb13

37 files changed

+589
-647
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: Docs Consistency
2+
3+
on:
4+
pull_request:
5+
push:
6+
branches:
7+
- main
8+
9+
jobs:
10+
docs-consistency:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- name: Checkout
14+
uses: actions/checkout@v4
15+
16+
- name: Setup Python
17+
uses: actions/setup-python@v5
18+
with:
19+
python-version: "3.11"
20+
21+
- name: Validate docs references
22+
run: python3 scripts/docs_consistency_check.py

.github/workflows/roam.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: Roam Code Analysis
2+
on:
3+
pull_request:
4+
branches: [main, master]
5+
permissions:
6+
contents: read
7+
pull-requests: write
8+
jobs:
9+
roam:
10+
runs-on: ubuntu-latest
11+
steps:
12+
- uses: actions/checkout@v4
13+
with:
14+
fetch-depth: 0
15+
- uses: actions/setup-python@v5
16+
with:
17+
python-version: "3.12"
18+
- run: pip install roam-code
19+
- run: roam index
20+
- run: roam fitness
21+
- run: roam pr-risk --json

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@ Eight suites organized by software development lifecycle phase:
1616
| `ccb_design` | Architecture & Design | 20 | Architecture analysis, dependency graphs, change impact |
1717
| `ccb_fix` | Bug Repair | 25 | Diagnosing and fixing real bugs across production codebases |
1818
| `ccb_build` | Feature & Refactoring | 25 | New features, refactoring, dependency management |
19-
| `ccb_test` | Testing & QA | 14 | Code review, performance testing, code search validation |
20-
| `ccb_document` | Documentation | 13 | API references, architecture docs, migration guides |
19+
| `ccb_test` | Testing & QA | 20 | Code review, performance testing, code search validation |
20+
| `ccb_document` | Documentation | 20 | API references, architecture docs, migration guides |
2121
| `ccb_secure` | Security & Compliance | 20 | CVE analysis, reachability, governance, access control |
2222
| `ccb_debug` | Debugging & Investigation | 20 | Root cause tracing, fault localization, provenance |
23-
| **Total** | | **157** | |
23+
| **Total** | | **170** | |
24+
25+
The table lists target task counts. *ccb_test* and *ccb_document* currently have 14 and 13 tasks on disk (target 20 each), so 157 of the 170 tasks exist today; see `docs/backlog_ccb_test.json` and `docs/backlog_ccb_document.json` for the growth plan.
2426

2527
See `docs/PRD_SDLC_SUITE_REORGANIZATION.md` for the reorganization rationale and task mapping.
2628

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,40 @@
11
"""OpenHands harness agent wired to Harbor's OpenHands CLI with shared baseline tooling."""
22

3+
import os
4+
5+
from harbor.agents import utils as harbor_utils
36
from harbor.agents.installed.openhands import OpenHands
47

58
from ..base import BaselineHarnessMixin
69

10+
# Codex model names (LiteLLM/Harbor don't know these); we map them to CODEX_API_KEY (falling back to OPENAI_API_KEY)
11+
# so Harbor's get_api_key_var_names_from_model_name can resolve the key.
12+
_CODEX_MODEL_PREFIXES = ("gpt-5.3-codex", "gpt53codex", "codex")


def _get_api_key_var_names_from_model_name(model_name: str) -> list[str]:
    """Resolve the API-key environment variable names for ``model_name``.

    Codex models are handled here; every other model is delegated to the
    resolver saved in the module-level ``_original_get_api_key_var_names``.

    A model counts as a Codex model when its normalized (stripped,
    lower-cased) name starts with one of ``_CODEX_MODEL_PREFIXES``, or when
    it contains both ``"codex"`` and ``"gpt"`` anywhere (which also covers
    provider-qualified names such as ``"openai/gpt-5.3-codex"``).

    Args:
        model_name: Model identifier; ``None`` or empty is treated as a
            non-Codex model and passed through to the original resolver.

    Returns:
        ``["OPENAI_API_KEY"]`` for a Codex model when only ``OPENAI_API_KEY``
        is set in the environment; ``["CODEX_API_KEY"]`` for a Codex model
        otherwise; the original resolver's result for all other models.
    """
    lower = (model_name or "").strip().lower()
    # str.startswith accepts a tuple, so this is a true prefix match; it is a
    # superset of the exact-name match and so keeps every previous hit.
    is_codex = lower.startswith(_CODEX_MODEL_PREFIXES) or (
        "codex" in lower and "gpt" in lower
    )
    if not is_codex:
        return _original_get_api_key_var_names(model_name)

    # CODEX_API_KEY wins when set. OPENAI_API_KEY is only a fallback. When
    # neither is set we still name CODEX_API_KEY so the downstream
    # "unset variable" error points the user at the preferred variable.
    if not os.environ.get("CODEX_API_KEY") and os.environ.get("OPENAI_API_KEY"):
        return ["OPENAI_API_KEY"]
    return ["CODEX_API_KEY"]
26+
27+
28+
# Apply once at import so Harbor's OpenHands sees it. Harbor's openhands.py does
29+
# "from harbor.agents.utils import get_api_key_var_names_from_model_name", so we
30+
# must patch the openhands module's reference too (utils patch alone is not seen there).
31+
import sys

# Keep a handle on Harbor's resolver before replacing it; the wrapper
# delegates to this for every non-Codex model.
_original_get_api_key_var_names = getattr(
    harbor_utils, "get_api_key_var_names_from_model_name"
)
setattr(
    harbor_utils,
    "get_api_key_var_names_from_model_name",
    _get_api_key_var_names_from_model_name,
)

# The openhands module holds its own reference to the resolver, so rebind
# that name as well when the module has already been loaded.
_openhands_mod = sys.modules.get("harbor.agents.installed.openhands")
if _openhands_mod is not None:
    setattr(
        _openhands_mod,
        "get_api_key_var_names_from_model_name",
        _get_api_key_var_names_from_model_name,
    )
37+
738

839
class OpenHandsHarnessAgent(BaselineHarnessMixin, OpenHands):
940
"""OpenHands CLI agent extended with evaluation context and MCP wiring."""

configs/eval_matrix.json

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,9 @@
66
],
77
"supported_configs": [
88
"baseline",
9-
"sourcegraph_base",
109
"sourcegraph_full",
1110
"sourcegraph_isolated",
1211
"sg_only_env",
13-
"github_base",
1412
"github_full"
1513
],
1614
"config_definitions": {
@@ -21,13 +19,6 @@
2119
"track_in_official": true,
2220
"status": "active"
2321
},
24-
"sourcegraph_base": {
25-
"baseline_mcp_type": "sourcegraph_base",
26-
"mcp_enabled": true,
27-
"provider": "sourcegraph",
28-
"track_in_official": true,
29-
"status": "legacy_or_targeted"
30-
},
3122
"sourcegraph_full": {
3223
"baseline_mcp_type": "sourcegraph_full",
3324
"mcp_enabled": true,
@@ -51,13 +42,6 @@
5142
"status": "experimental",
5243
"notes": "Environment-only variant: same agent as sourcegraph_full, Dockerfile.sg_only removes local source"
5344
},
54-
"github_base": {
55-
"baseline_mcp_type": "github_base",
56-
"mcp_enabled": true,
57-
"provider": "github",
58-
"track_in_official": false,
59-
"status": "experimental_scaffold"
60-
},
6145
"github_full": {
6246
"baseline_mcp_type": "github_full",
6347
"mcp_enabled": true,

configs/openhands_2config.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# --parallel N Max parallel task subshells (default: 1)
1717
# --category CATEGORY Run category label for jobs dir (default: staging)
1818
# --benchmark BENCH Optional benchmark filter (e.g. ccb_build, ccb_fix)
19+
# --task TASK_ID Run only this task (applied in addition to --benchmark)
1920

2021
set -e
2122

@@ -27,12 +28,14 @@ export PYTHONPATH="$(pwd):${PYTHONPATH:-}"
2728

2829
# Shared helpers (validation/reporting and run helpers)
2930
source "$SCRIPT_DIR/_common.sh"
31+
load_credentials
3032

3133
SELECTION_FILE="$SCRIPT_DIR/selected_benchmark_tasks.json"
3234
AGENT_PATH="${AGENT_PATH:-agents.harnesses.openhands:OpenHandsHarnessAgent}"
3335
MODEL="${MODEL:-anthropic/claude-opus-4-6}"
3436
CATEGORY="${CATEGORY:-staging}"
3537
BENCHMARK_FILTER=""
38+
TASK_FILTER=""
3639
CONCURRENCY=2
3740
TIMEOUT_MULTIPLIER=10
3841
RUN_BASELINE=true
@@ -68,6 +71,10 @@ while [[ $# -gt 0 ]]; do
6871
BENCHMARK_FILTER="$2"
6972
shift 2
7073
;;
74+
--task)
75+
TASK_FILTER="$2"
76+
shift 2
77+
;;
7178
*)
7279
echo "Unknown option: $1"
7380
exit 1
@@ -80,19 +87,22 @@ if [ ! -f "$SELECTION_FILE" ]; then
8087
exit 1
8188
fi
8289

83-
readarray -t TASK_ROWS < <(python3 - "$SELECTION_FILE" "$BENCHMARK_FILTER" <<'PYEOF'
90+
readarray -t TASK_ROWS < <(python3 - "$SELECTION_FILE" "$BENCHMARK_FILTER" "$TASK_FILTER" <<'PYEOF'
8491
import json
8592
import sys
8693
8794
selection_file = sys.argv[1]
8895
benchmark_filter = sys.argv[2]
96+
task_filter = sys.argv[3] if len(sys.argv) > 3 else ""
8997
9098
data = json.load(open(selection_file))
9199
for task in data.get("tasks", []):
92100
if task.get("excluded", False):
93101
continue
94102
if benchmark_filter and task.get("benchmark") != benchmark_filter:
95103
continue
104+
if task_filter and task.get("task_id") != task_filter:
105+
continue
96106
task_id = task["task_id"]
97107
task_dir = task["task_dir"]
98108
benchmark = task.get("benchmark", "")

0 commit comments

Comments
 (0)