Skip to content

Commit 2a7b7a4

Browse files
LoCoBench Bot and claude committed
feat: [US-013] - Add harness configuration capture to report
Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 6783638 commit 2a7b7a4

File tree

7 files changed

+153
-3
lines changed

7 files changed

+153
-3
lines changed

ralph/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@
226226
"Writes a separate harness_configs.json to output dir with the full config per run"
227227
],
228228
"priority": 13,
229-
"passes": false,
229+
"passes": true,
230230
"notes": "config.json is in the batch timestamp dir. claude-code.txt init line has {type: 'system', subtype: 'init', tools: [...], mcp_servers: [...], model: '...', permissionMode: '...', claude_code_version: '...'}. This is critical for reproducibility."
231231
},
232232
{

ralph/progress.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Started: 2026-02-01
1616
- Harbor run output structure: `runs/<category>/<run_name>/<config>/<batch_timestamp>/<task_id>__<hash>/`
1717
- Each `harbor run --path` invocation creates a separate batch timestamp dir (one task per batch for BigCode)
1818
- This repo has no build/lint/test CI — quality checks are manual review of created files
19+
- Batch-level config.json has `agents` (list); task-level has `agent` (dict) — always check both
20+
- claude-code.txt init line (`type: 'system', subtype: 'init'`) contains tools, mcp_servers, model, claude_code_version
1921

2022
---
2123

@@ -223,3 +225,22 @@ Started: 2026-02-01
223225
- Agent output is in `agent/solution.md` when available, else last assistant text from transcript
224226
- `scripts/__init__.py` is needed for `python3 -m scripts.ccb_metrics.judge_context` to work
225227
---
228+
229+
## 2026-02-01 - US-013
230+
- Added `extract_run_config(batch_dir, transcript_path)` to `scripts/ccb_metrics/extractors.py`
231+
- Reads batch-level `config.json` for model_name, agent_import_path, timeout_multiplier, task_source
232+
- Parses claude-code.txt system init line for claude_code_version, permissionMode, tools list, mcp_servers, model
233+
- Infers mcp_mode from mcp_servers names (none/deepsearch/sourcegraph_no_deepsearch/sourcegraph_hybrid)
234+
- Added `harness_config` optional dict field to `RunMetrics` in `models.py`
235+
- Updated `discovery.py` to extract harness config from each batch dir + first task's transcript
236+
- Updated `generate_eval_report.py`: Run Inventory table now includes MCP Mode column; writes `harness_configs.json` to output dir
237+
- Updated `__init__.py` to export `extract_run_config`
238+
- Files changed: `scripts/ccb_metrics/extractors.py` (modified), `scripts/ccb_metrics/models.py` (modified), `scripts/ccb_metrics/discovery.py` (modified), `scripts/ccb_metrics/__init__.py` (modified), `scripts/generate_eval_report.py` (modified)
239+
- **Learnings for future iterations:**
240+
- Batch-level config.json has `agents` (list), task-level config.json has `agent` (dict) — handle both
241+
- Batch-level config.json also lacks `task` key; it uses `tasks` (list) when present
242+
- claude-code.txt init line is `{type: 'system', subtype: 'init', ...}` — always the first JSONL entry
243+
- mcp_servers is a list of `{name, status}` dicts; server name determines the MCP mode
244+
- `sourcegraph_hybrid` in Harbor config dir name actually had only `sourcegraph` server (no deepsearch), so mcp_mode resolves to `sourcegraph_no_deepsearch`
245+
- `deepsearch_hybrid` in Harbor config dir name had `deepsearch` server only
246+
---

ralph/scripts/ccb_metrics/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22

33
from .models import TaskMetrics, RunMetrics, EvalReport
44
from .discovery import discover_runs
5+
from .extractors import extract_run_config
56

6-
__all__ = ["TaskMetrics", "RunMetrics", "EvalReport", "discover_runs"]
7+
__all__ = ["TaskMetrics", "RunMetrics", "EvalReport", "discover_runs", "extract_run_config"]

ralph/scripts/ccb_metrics/discovery.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
extract_tool_usage_from_transcript,
2323
extract_swebench_partial_score,
2424
extract_reward_from_file,
25+
extract_run_config,
2526
)
2627

2728

@@ -182,6 +183,7 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
182183
# Key: (benchmark, config_name) -> {task_id: TaskMetrics}
183184
grouped: dict[tuple[str, str], dict[str, TaskMetrics]] = {}
184185
run_metadata: dict[tuple[str, str], dict] = {}
186+
harness_configs: dict[tuple[str, str], dict] = {}
185187

186188
for run_dir in sorted(runs_dir.iterdir()):
187189
if not run_dir.is_dir():
@@ -216,6 +218,20 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
216218
"timestamp": timestamp,
217219
}
218220

221+
# Extract harness config from batch dir + first task transcript
222+
# Find a transcript path from the first task dir in this batch
223+
_transcript_for_config = None
224+
for _td in sorted(batch_dir.iterdir()):
225+
if _is_task_dir(_td):
226+
_candidate = _td / "agent" / "claude-code.txt"
227+
if _candidate.is_file():
228+
_transcript_for_config = _candidate
229+
break
230+
231+
hc = extract_run_config(batch_dir, _transcript_for_config)
232+
if key not in harness_configs or hc.get("model_name") is not None:
233+
harness_configs[key] = hc
234+
219235
# Process each task directory
220236
for task_dir in sorted(batch_dir.iterdir()):
221237
if not _is_task_dir(task_dir):
@@ -243,6 +259,7 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
243259
timestamp=meta.get("timestamp", "unknown"),
244260
task_count=len(tasks),
245261
tasks=tasks,
262+
harness_config=harness_configs.get((benchmark, config_name)),
246263
)
247264
results.append(run)
248265

ralph/scripts/ccb_metrics/extractors.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,102 @@ def extract_tool_usage_from_transcript(
373373
return _build_tool_usage_dict(tool_counts)
374374

375375

376+
def extract_run_config(
377+
batch_dir: str | Path,
378+
transcript_path: Optional[str | Path] = None,
379+
) -> dict:
380+
"""Extract harness configuration from a batch directory.
381+
382+
Reads config.json from the batch directory and optionally parses the
383+
system init line from claude-code.txt to capture runtime details.
384+
385+
Args:
386+
batch_dir: Path to the batch timestamp directory containing config.json.
387+
transcript_path: Optional path to agent/claude-code.txt for init data.
388+
389+
Returns:
390+
Dict with keys: model_name, agent_import_path, timeout_multiplier,
391+
mcp_mode, task_source, claude_code_version, permission_mode,
392+
tools, mcp_servers, model. Missing values are None.
393+
"""
394+
result: dict = {
395+
"model_name": None,
396+
"agent_import_path": None,
397+
"timeout_multiplier": None,
398+
"mcp_mode": None,
399+
"task_source": None,
400+
"claude_code_version": None,
401+
"permission_mode": None,
402+
"tools": None,
403+
"mcp_servers": None,
404+
"model": None,
405+
}
406+
407+
batch_dir = Path(batch_dir)
408+
config_path = batch_dir / "config.json"
409+
410+
# --- Extract from config.json ---
411+
if config_path.is_file():
412+
try:
413+
data = json.loads(config_path.read_text())
414+
except (OSError, json.JSONDecodeError):
415+
data = {}
416+
417+
# Batch-level config has "agents" (list); task-level has "agent" (dict)
418+
agents = data.get("agents") or []
419+
agent = data.get("agent") or {}
420+
if agents and isinstance(agents, list):
421+
agent = agents[0]
422+
result["model_name"] = agent.get("model_name")
423+
result["agent_import_path"] = agent.get("import_path")
424+
result["timeout_multiplier"] = data.get("timeout_multiplier")
425+
426+
task = data.get("task") or {}
427+
# Batch-level uses "tasks" list; task-level uses "task" dict
428+
tasks_list = data.get("tasks") or []
429+
if not task and tasks_list and isinstance(tasks_list, list):
430+
task = tasks_list[0]
431+
result["task_source"] = task.get("git_url") or task.get("path")
432+
433+
# --- Extract from claude-code.txt init line ---
434+
if transcript_path is not None:
435+
tp = Path(transcript_path)
436+
if tp.is_file():
437+
try:
438+
for line in tp.open():
439+
line = line.strip()
440+
if not line:
441+
continue
442+
try:
443+
entry = json.loads(line)
444+
except json.JSONDecodeError:
445+
continue
446+
if entry.get("type") == "system" and entry.get("subtype") == "init":
447+
result["claude_code_version"] = entry.get("claude_code_version")
448+
result["permission_mode"] = entry.get("permissionMode")
449+
result["tools"] = entry.get("tools")
450+
result["mcp_servers"] = entry.get("mcp_servers")
451+
result["model"] = entry.get("model")
452+
# Infer mcp_mode from mcp_servers
453+
servers = entry.get("mcp_servers") or []
454+
server_names = [s.get("name") for s in servers if isinstance(s, dict)]
455+
if not server_names:
456+
result["mcp_mode"] = "none"
457+
elif "sourcegraph" in server_names and "deepsearch" in server_names:
458+
result["mcp_mode"] = "sourcegraph_hybrid"
459+
elif "sourcegraph" in server_names:
460+
result["mcp_mode"] = "sourcegraph_no_deepsearch"
461+
elif "deepsearch" in server_names:
462+
result["mcp_mode"] = "deepsearch"
463+
else:
464+
result["mcp_mode"] = ",".join(server_names)
465+
break
466+
except OSError:
467+
pass
468+
469+
return result
470+
471+
376472
def extract_reward_from_file(
377473
reward_txt_path: str | Path,
378474
) -> Optional[float]:

ralph/scripts/ccb_metrics/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class RunMetrics:
8383
timestamp: str
8484
task_count: int
8585
tasks: list[TaskMetrics] = field(default_factory=list)
86+
harness_config: Optional[dict] = None
8687

8788
# --- computed properties ---
8889

@@ -126,6 +127,7 @@ def to_dict(self) -> dict:
126127
"timestamp": self.timestamp,
127128
"task_count": self.task_count,
128129
"tasks": [t.to_dict() for t in self.tasks],
130+
"harness_config": self.harness_config,
129131
"mean_reward": self.mean_reward,
130132
"mean_partial_score": self.mean_partial_score,
131133
"pass_rate": self.pass_rate,

ralph/scripts/generate_eval_report.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,16 @@ def _write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
102102

103103
def _build_run_inventory(runs: list[RunMetrics]) -> tuple[list[str], list[list[str]]]:
104104
"""Table 1: Run Inventory."""
105-
headers = ["Benchmark", "Config", "Model", "Tasks", "Timestamp"]
105+
headers = ["Benchmark", "Config", "Model", "MCP Mode", "Tasks", "Timestamp"]
106106
rows = []
107107
for r in runs:
108+
hc = r.harness_config or {}
109+
mcp_mode = hc.get("mcp_mode") or r.config_name
108110
rows.append([
109111
r.benchmark,
110112
r.config_name,
111113
r.model,
114+
mcp_mode,
112115
str(r.task_count),
113116
r.timestamp,
114117
])
@@ -265,6 +268,16 @@ def generate_report(
265268
report.to_json(json_path)
266269
print(f"Written: {json_path}")
267270

271+
# Write harness_configs.json
272+
harness_configs = {}
273+
for r in runs:
274+
if r.harness_config:
275+
harness_configs[r.run_id] = r.harness_config
276+
if harness_configs:
277+
hc_path = output_dir / "harness_configs.json"
278+
hc_path.write_text(json.dumps(harness_configs, indent=2) + "\n")
279+
print(f"Written: {hc_path}")
280+
268281
# Build all tables
269282
tables: list[tuple[str, str, list[str], list[list[str]]]] = []
270283

0 commit comments

Comments
 (0)