feat: [US-013] - Add harness configuration capture to report

LoCoBench Bot · claude · LoCoBench Bot · commit 2a7b7a4543f9 · 2026-02-01T04:28:52.000Z
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/ralph/prd.json b/ralph/prd.json
@@ -226,7 +226,7 @@
         "Writes a separate harness_configs.json to output dir with the full config per run"
       ],
       "priority": 13,
-      "passes": false,
+      "passes": true,
       "notes": "config.json is in the batch timestamp dir. claude-code.txt init line has {type: 'system', subtype: 'init', tools: [...], mcp_servers: [...], model: '...', permissionMode: '...', claude_code_version: '...'}. This is critical for reproducibility."
     },
     {
diff --git a/ralph/progress.txt b/ralph/progress.txt
@@ -16,6 +16,8 @@ Started: 2026-02-01
 - Harbor run output structure: `runs/<category>/<run_name>/<config>/<batch_timestamp>/<task_id>__<hash>/`
 - Each `harbor run --path` invocation creates a separate batch timestamp dir (one task per batch for BigCode)
 - This repo has no build/lint/test CI — quality checks are manual review of created files
+- Batch-level config.json has `agents` (list); task-level has `agent` (dict) — always check both
+- claude-code.txt init line (`type: 'system', subtype: 'init'`) contains tools, mcp_servers, model, claude_code_version
 
 ---
 
@@ -223,3 +225,22 @@ Started: 2026-02-01
   - Agent output is in `agent/solution.md` when available, else last assistant text from transcript
   - `scripts/__init__.py` is needed for `python3 -m scripts.ccb_metrics.judge_context` to work
 ---
+
+## 2026-02-01 - US-013
+- Added `extract_run_config(batch_dir, transcript_path)` to `scripts/ccb_metrics/extractors.py`
+- Reads batch-level `config.json` for model_name, agent_import_path, timeout_multiplier, task_source
+- Parses claude-code.txt system init line for claude_code_version, permissionMode, tools list, mcp_servers, model
+- Infers mcp_mode from mcp_servers names (none/deepsearch/sourcegraph_no_deepsearch/sourcegraph_hybrid)
+- Added `harness_config` optional dict field to `RunMetrics` in `models.py`
+- Updated `discovery.py` to extract harness config from each batch dir + first task's transcript
+- Updated `generate_eval_report.py`: Run Inventory table now includes MCP Mode column; writes `harness_configs.json` to output dir
+- Updated `__init__.py` to export `extract_run_config`
+- Files changed: `scripts/ccb_metrics/extractors.py` (modified), `scripts/ccb_metrics/models.py` (modified), `scripts/ccb_metrics/discovery.py` (modified), `scripts/ccb_metrics/__init__.py` (modified), `scripts/generate_eval_report.py` (modified)
+- **Learnings for future iterations:**
+  - Batch-level config.json has `agents` (list), task-level config.json has `agent` (dict) — handle both
+  - Batch-level config.json also lacks `task` key; it uses `tasks` (list) when present
+  - claude-code.txt init line is `{type: 'system', subtype: 'init', ...}` — always the first JSONL entry
+  - mcp_servers is a list of `{name, status}` dicts; server name determines the MCP mode
+  - `sourcegraph_hybrid` in Harbor config dir name actually had only `sourcegraph` server (no deepsearch), so mcp_mode resolves to `sourcegraph_no_deepsearch`
+  - `deepsearch_hybrid` in Harbor config dir name had `deepsearch` server only
+---
diff --git a/ralph/scripts/ccb_metrics/__init__.py b/ralph/scripts/ccb_metrics/__init__.py
@@ -2,5 +2,6 @@
 
 from .models import TaskMetrics, RunMetrics, EvalReport
 from .discovery import discover_runs
+from .extractors import extract_run_config
 
-__all__ = ["TaskMetrics", "RunMetrics", "EvalReport", "discover_runs"]
+__all__ = ["TaskMetrics", "RunMetrics", "EvalReport", "discover_runs", "extract_run_config"]
diff --git a/ralph/scripts/ccb_metrics/discovery.py b/ralph/scripts/ccb_metrics/discovery.py
@@ -22,6 +22,7 @@
     extract_tool_usage_from_transcript,
     extract_swebench_partial_score,
     extract_reward_from_file,
+    extract_run_config,
 )
 
 
@@ -182,6 +183,7 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
     # Key: (benchmark, config_name) -> {task_id: TaskMetrics}
     grouped: dict[tuple[str, str], dict[str, TaskMetrics]] = {}
     run_metadata: dict[tuple[str, str], dict] = {}
+    harness_configs: dict[tuple[str, str], dict] = {}
 
     for run_dir in sorted(runs_dir.iterdir()):
         if not run_dir.is_dir():
@@ -216,6 +218,20 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
                         "timestamp": timestamp,
                     }
 
+                # Extract harness config from batch dir + first task transcript
+                # Find a transcript path from the first task dir in this batch
+                _transcript_for_config = None
+                for _td in sorted(batch_dir.iterdir()):
+                    if _is_task_dir(_td):
+                        _candidate = _td / "agent" / "claude-code.txt"
+                        if _candidate.is_file():
+                            _transcript_for_config = _candidate
+                            break
+
+                hc = extract_run_config(batch_dir, _transcript_for_config)
+                if key not in harness_configs or hc.get("model_name") is not None:
+                    harness_configs[key] = hc
+
                 # Process each task directory
                 for task_dir in sorted(batch_dir.iterdir()):
                     if not _is_task_dir(task_dir):
@@ -243,6 +259,7 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
             timestamp=meta.get("timestamp", "unknown"),
             task_count=len(tasks),
             tasks=tasks,
+            harness_config=harness_configs.get((benchmark, config_name)),
         )
         results.append(run)
 
diff --git a/ralph/scripts/ccb_metrics/extractors.py b/ralph/scripts/ccb_metrics/extractors.py
@@ -373,6 +373,102 @@ def extract_tool_usage_from_transcript(
     return _build_tool_usage_dict(tool_counts)
 
 
+def extract_run_config(
+    batch_dir: str | Path,
+    transcript_path: Optional[str | Path] = None,
+) -> dict:
+    """Extract harness configuration from a batch directory.
+
+    Reads ``config.json`` from the batch directory and, when a transcript
+    path is given, the first ``{"type": "system", "subtype": "init"}`` JSONL
+    entry of that transcript. Extraction is best-effort: a missing,
+    unreadable, or malformed file leaves the corresponding keys as ``None``
+    instead of raising.
+
+    Args:
+        batch_dir: Path to the batch timestamp directory containing config.json.
+        transcript_path: Optional path to agent/claude-code.txt for init data.
+
+    Returns:
+        Dict with keys: model_name, agent_import_path, timeout_multiplier,
+        mcp_mode, task_source, claude_code_version, permission_mode,
+        tools, mcp_servers, model. Missing values are None. ``mcp_mode`` is
+        only set when an init entry was found in the transcript.
+    """
+    result: dict = {
+        "model_name": None,
+        "agent_import_path": None,
+        "timeout_multiplier": None,
+        "mcp_mode": None,
+        "task_source": None,
+        "claude_code_version": None,
+        "permission_mode": None,
+        "tools": None,
+        "mcp_servers": None,
+        "model": None,
+    }
+
+    # --- Extract from config.json ---
+    config_path = Path(batch_dir) / "config.json"
+    if config_path.is_file():
+        try:
+            data = json.loads(config_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # ValueError covers both json.JSONDecodeError and decode errors.
+            data = {}
+        # Valid JSON is not necessarily an object (e.g. a list or number).
+        data = _dict_or_empty(data)
+
+        # Batch-level config has "agents" (list); task-level has "agent" (dict).
+        # A usable first list entry wins over the single "agent" dict.
+        agents = data.get("agents")
+        if isinstance(agents, list) and agents and isinstance(agents[0], dict):
+            agent = agents[0]
+        else:
+            agent = _dict_or_empty(data.get("agent"))
+        result["model_name"] = agent.get("model_name")
+        result["agent_import_path"] = agent.get("import_path")
+        result["timeout_multiplier"] = data.get("timeout_multiplier")
+
+        # Batch-level uses "tasks" list; task-level uses "task" dict.
+        # The list is consulted only when "task" is absent or empty.
+        task = _dict_or_empty(data.get("task"))
+        tasks_list = data.get("tasks")
+        if not task and isinstance(tasks_list, list) and tasks_list:
+            task = _dict_or_empty(tasks_list[0])
+        result["task_source"] = task.get("git_url") or task.get("path")
+
+    # --- Extract from claude-code.txt init line ---
+    if transcript_path is not None:
+        init = _read_transcript_init_entry(Path(transcript_path))
+        if init is not None:
+            result["claude_code_version"] = init.get("claude_code_version")
+            result["permission_mode"] = init.get("permissionMode")
+            result["tools"] = init.get("tools")
+            result["mcp_servers"] = init.get("mcp_servers")
+            result["model"] = init.get("model")
+            result["mcp_mode"] = _infer_mcp_mode(init.get("mcp_servers"))
+
+    return result
+
+
+def _dict_or_empty(value) -> dict:
+    """Return *value* when it is a dict, otherwise a new empty dict."""
+    return value if isinstance(value, dict) else {}
+
+
+def _read_transcript_init_entry(transcript_path: Path) -> Optional[dict]:
+    """Return the first system/init JSON object in a JSONL transcript.
+
+    Blank lines, lines that are not valid JSON, and JSON values that are not
+    objects are skipped. Returns None when the path is not a file, cannot be
+    read, or contains no init entry.
+    """
+    if not transcript_path.is_file():
+        return None
+    try:
+        # The context manager closes the handle even on early return;
+        # errors="replace" keeps stray non-UTF-8 bytes from aborting the scan.
+        with transcript_path.open(encoding="utf-8", errors="replace") as handle:
+            for raw_line in handle:
+                raw_line = raw_line.strip()
+                if not raw_line:
+                    continue
+                try:
+                    entry = json.loads(raw_line)
+                except json.JSONDecodeError:
+                    continue
+                if (
+                    isinstance(entry, dict)
+                    and entry.get("type") == "system"
+                    and entry.get("subtype") == "init"
+                ):
+                    return entry
+    except OSError:
+        # Best-effort: an unreadable transcript simply yields no init data.
+        pass
+    return None
+
+
+def _infer_mcp_mode(servers) -> str:
+    """Map an init entry's ``mcp_servers`` value to an MCP mode label.
+
+    Only list entries that are dicts with a string ``name`` are considered.
+    Returns "none" when no such entry exists, one of the known labels for
+    the sourcegraph/deepsearch combinations, or the comma-joined server
+    names for any other set of servers.
+    """
+    server_names = [
+        s["name"]
+        for s in (servers if isinstance(servers, list) else [])
+        if isinstance(s, dict) and isinstance(s.get("name"), str)
+    ]
+    if not server_names:
+        return "none"
+    has_sourcegraph = "sourcegraph" in server_names
+    has_deepsearch = "deepsearch" in server_names
+    if has_sourcegraph and has_deepsearch:
+        return "sourcegraph_hybrid"
+    if has_sourcegraph:
+        return "sourcegraph_no_deepsearch"
+    if has_deepsearch:
+        return "deepsearch"
+    return ",".join(server_names)
+
+
 def extract_reward_from_file(
     reward_txt_path: str | Path,
 ) -> Optional[float]:
diff --git a/ralph/scripts/ccb_metrics/models.py b/ralph/scripts/ccb_metrics/models.py
@@ -83,6 +83,7 @@ class RunMetrics:
     timestamp: str
     task_count: int
     tasks: list[TaskMetrics] = field(default_factory=list)
+    harness_config: Optional[dict] = None
 
     # --- computed properties ---
 
@@ -126,6 +127,7 @@ def to_dict(self) -> dict:
             "timestamp": self.timestamp,
             "task_count": self.task_count,
             "tasks": [t.to_dict() for t in self.tasks],
+            "harness_config": self.harness_config,
             "mean_reward": self.mean_reward,
             "mean_partial_score": self.mean_partial_score,
             "pass_rate": self.pass_rate,
diff --git a/ralph/scripts/generate_eval_report.py b/ralph/scripts/generate_eval_report.py
@@ -102,13 +102,16 @@ def _write_csv(path: Path, headers: list[str], rows: list[list[str]]) -> None:
 
 def _build_run_inventory(runs: list[RunMetrics]) -> tuple[list[str], list[list[str]]]:
     """Table 1: Run Inventory."""
-    headers = ["Benchmark", "Config", "Model", "Tasks", "Timestamp"]
+    headers = ["Benchmark", "Config", "Model", "MCP Mode", "Tasks", "Timestamp"]
     rows = []
     for r in runs:
+        hc = r.harness_config or {}
+        mcp_mode = hc.get("mcp_mode") or r.config_name
         rows.append([
             r.benchmark,
             r.config_name,
             r.model,
+            mcp_mode,
             str(r.task_count),
             r.timestamp,
         ])
@@ -265,6 +268,16 @@ def generate_report(
     report.to_json(json_path)
     print(f"Written: {json_path}")
 
+    # Write harness_configs.json
+    harness_configs = {}
+    for r in runs:
+        if r.harness_config:
+            harness_configs[r.run_id] = r.harness_config
+    if harness_configs:
+        hc_path = output_dir / "harness_configs.json"
+        hc_path.write_text(json.dumps(harness_configs, indent=2) + "\n")
+        print(f"Written: {hc_path}")
+
     # Build all tables
     tables: list[tuple[str, str, list[str], list[list[str]]]] = []
 

Original file line number	Diff line number	Diff line change
`@@ -226,7 +226,7 @@`
`226`	`226`	`"Writes a separate harness_configs.json to output dir with the full config per run"`
`227`	`227`	`],`
`228`	`228`	`"priority": 13,`
`229`		`- "passes": false,`
	`229`	`+ "passes": true,`
`230`	`230`	`"notes": "config.json is in the batch timestamp dir. claude-code.txt init line has {type: 'system', subtype: 'init', tools: [...], mcp_servers: [...], model: '...', permissionMode: '...', claude_code_version: '...'}. This is critical for reproducibility."`
`231`	`231`	`},`
`232`	`232`	`{`