Commit 72a33c0

Author: LoCoBench Bot (committed)

Add run curation guardrails, config matrix, and docs consistency checks

1 parent 802dbc6

20 files changed: +3006 −42 lines
Lines changed: 22 additions & 0 deletions (new workflow file)

```yaml
name: Docs Consistency

on:
  pull_request:
  push:
    branches:
      - main

jobs:
  docs-consistency:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Validate docs references
        run: python3 scripts/docs_consistency_check.py
```
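The contents of `scripts/docs_consistency_check.py` are not shown in this commit view. As a rough, hypothetical sketch of what such a check might do, the snippet below scans Markdown docs for `scripts/*` and `configs/*` path references and verifies that each referenced file exists; all function names and the regex here are assumptions, not the script's actual implementation.

```python
import re
import sys
from pathlib import Path

# Hypothetical sketch: verify that every scripts/* or configs/* path
# mentioned in the Markdown docs actually exists in the repository.
REF_PATTERN = re.compile(r"\b(?:scripts|configs)/[\w./-]+\.(?:py|sh|json)\b")


def find_references(doc_text: str) -> set[str]:
    """Collect repo-relative file paths referenced in a document."""
    return set(REF_PATTERN.findall(doc_text))


def check_docs(repo_root: Path, doc_names: list[str]) -> list[str]:
    """Return 'doc: missing-path' strings for references to absent files."""
    errors = []
    for name in doc_names:
        doc = repo_root / name
        if not doc.exists():
            continue
        for ref in sorted(find_references(doc.read_text())):
            if not (repo_root / ref).exists():
                errors.append(f"{name}: {ref}")
    return errors


if __name__ == "__main__":
    problems = check_docs(Path("."), ["README.md", "AGENTS.md", "CLAUDE.md"])
    for p in problems:
        print("missing:", p)
    sys.exit(1 if problems else 0)
```

A check of this shape would catch exactly the class of drift this commit fixes, such as docs still pointing at a renamed `generate_report.py`.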

AGENTS.md

Lines changed: 15 additions & 15 deletions

````diff
@@ -16,27 +16,27 @@ Benchmark tasks are executed via **Harbor** (Docker container-based runner) with

 ```bash
 # Sequential (default)
-./configs/swebenchpro_3config.sh
+./configs/swebenchpro_2config.sh

 # Parallel with auto-detected concurrency
-./configs/swebenchpro_3config.sh --parallel
+./configs/swebenchpro_2config.sh --parallel

 # Parallel with explicit job count
-./configs/swebenchpro_3config.sh --parallel 4
+./configs/swebenchpro_2config.sh --parallel 4
 ```

 All 11 benchmark config scripts accept the `--parallel` flag:
-- `swebenchpro_3config.sh` — SWE-bench Pro (36 tasks)
-- `pytorch_3config.sh` — PyTorch (12 tasks)
-- `locobench_3config.sh` — LoCoBench (25 tasks)
-- `repoqa_3config.sh` — RepoQA (10 tasks)
-- `k8s_docs_3config.sh` — Kubernetes Docs (5 tasks)
-- `crossrepo_3config.sh` — Cross-Repo (4-5 tasks)
-- `largerepo_3config.sh` — Large Repo (4 tasks)
-- `tac_3config.sh` — TAC (8 tasks)
-- `dibench_3config.sh` — DIBench (8 tasks)
-- `sweperf_3config.sh` — SWE-Perf (3 tasks)
-- `linuxflbench_3config.sh` — LinuxFLBench (5 tasks)
+- `swebenchpro_2config.sh` — SWE-bench Pro (36 tasks)
+- `pytorch_2config.sh` — PyTorch (12 tasks)
+- `locobench_2config.sh` — LoCoBench (25 tasks)
+- `repoqa_2config.sh` — RepoQA (10 tasks)
+- `k8s_docs_2config.sh` — Kubernetes Docs (5 tasks)
+- `crossrepo_2config.sh` — Cross-Repo (4-5 tasks)
+- `largerepo_2config.sh` — Large Repo (4 tasks)
+- `tac_2config.sh` — TAC (8 tasks)
+- `dibench_2config.sh` — DIBench (8 tasks)
+- `sweperf_2config.sh` — SWE-Perf (3 tasks)
+- `linuxflbench_2config.sh` — LinuxFLBench (5 tasks)

 ### Config Scripts Structure
````
````diff
@@ -230,7 +230,7 @@ After all runs complete:

 ```bash
 python3 scripts/generate_manifest.py     # Regenerate MANIFEST.json
-python3 scripts/generate_report.py       # Aggregate results into report
+python3 scripts/generate_eval_report.py  # Aggregate results into report
 ```

 The MANIFEST tracks all runs, task counts, pass/fail rates, and mean rewards.
````
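The MANIFEST aggregation the docs describe (per-suite task counts, pass/fail rates, mean rewards) can be sketched as follows. This is an illustrative reconstruction only; `RunRecord` and its field names are assumptions, since `scripts/generate_manifest.py` itself is not part of this diff.

```python
from dataclasses import dataclass


# Hypothetical record shape; the real manifest generator is not shown here.
@dataclass
class RunRecord:
    suite: str
    task_id: str
    reward: float  # assume 1.0 = pass, 0.0 = fail, fractional = partial credit


def summarize(runs: list[RunRecord]) -> dict[str, dict[str, float]]:
    """Group runs by suite and compute task count, pass rate, mean reward."""
    by_suite: dict[str, list[float]] = {}
    for run in runs:
        by_suite.setdefault(run.suite, []).append(run.reward)
    return {
        suite: {
            "tasks": len(rewards),
            "pass_rate": sum(1 for r in rewards if r >= 1.0) / len(rewards),
            "mean_reward": sum(rewards) / len(rewards),
        }
        for suite, rewards in by_suite.items()
    }
```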

CLAUDE.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -107,7 +107,7 @@ The script generates an OAuth URL — open it in your local browser, log in, pas
 python3 scripts/generate_manifest.py

 # Generate evaluation report
-python3 scripts/generate_report.py
+python3 scripts/generate_eval_report.py

 # Select benchmark tasks
 python3 scripts/select_benchmark_tasks.py
@@ -163,7 +163,7 @@ MAINTENANCE
 |-------|--------|---------|
 | `/compare-configs` | `scripts/compare_configs.py` | Show divergent tasks across baseline/SG_full, "MCP helps" vs "MCP hurts". Now includes optional MCP-conditioned analysis. |
 | `/cost-report` | `scripts/cost_report.py` | Token usage and estimated cost by suite/config, most expensive tasks |
-| `/generate-report` | `scripts/generate_report.py` | Aggregate CCB evaluation report from completed runs |
+| `/generate-report` | `scripts/generate_eval_report.py` | Aggregate CCB evaluation report from completed runs |
 | `/evaluate-traces` | `scripts/audit_traces.py` | Comprehensive trace evaluation: data integrity, output quality, efficiency analysis. Includes zero-MCP vs used-MCP classification. |
 | `/mcp-audit` | `scripts/mcp_audit.py` | MCP usage patterns: used vs zero-MCP, intensity buckets, reward/time deltas conditioned on actual MCP adoption |
```
README.md

Lines changed: 25 additions & 25 deletions

````diff
@@ -60,19 +60,19 @@ benchmarks/ # Task definitions organized by benchmark suite
     ccb_tac/                  # TheAgentCompany tasks (8 tasks)
 configs/                      # 3-config comparison shell runners + task selection
     _common.sh                # Shared infra: token refresh, parallel execution, multi-account
-    codereview_3config.sh     # Per-suite runner: CodeReview (3 tasks)
-    crossrepo_3config.sh      # Per-suite runner: CrossRepo (5 tasks)
-    dibench_3config.sh        # Per-suite runner: DIBench (8 tasks)
-    k8s_docs_3config.sh       # Per-suite runner: K8s Docs (5 tasks)
-    largerepo_3config.sh      # Per-suite runner: Large Repo (4 tasks)
-    linuxflbench_3config.sh   # Per-suite runner: LinuxFLBench (5 tasks)
-    locobench_3config.sh      # Per-suite runner: LoCoBench (25 tasks)
-    pytorch_3config.sh        # Per-suite runner: PyTorch (12 tasks)
-    repoqa_3config.sh         # Per-suite runner: RepoQA (10 tasks)
+    codereview_2config.sh     # Per-suite runner: CodeReview (3 tasks)
+    crossrepo_2config.sh      # Per-suite runner: CrossRepo (5 tasks)
+    dibench_2config.sh        # Per-suite runner: DIBench (8 tasks)
+    k8s_docs_2config.sh       # Per-suite runner: K8s Docs (5 tasks)
+    largerepo_2config.sh      # Per-suite runner: Large Repo (4 tasks)
+    linuxflbench_2config.sh   # Per-suite runner: LinuxFLBench (5 tasks)
+    locobench_2config.sh      # Per-suite runner: LoCoBench (25 tasks)
+    pytorch_2config.sh        # Per-suite runner: PyTorch (12 tasks)
+    dependeval_2config.sh     # Per-suite runner: DependEval (32 tasks)
     run_selected_tasks.sh     # Unified runner for all tasks
-    swebenchpro_3config.sh    # Per-suite runner: SWE-Bench Pro (36 tasks)
-    sweperf_3config.sh        # Per-suite runner: SWE-Perf (3 tasks)
-    tac_3config.sh            # Per-suite runner: TheAgentCompany (8 tasks)
+    swebenchpro_2config.sh    # Per-suite runner: SWE-Bench Pro (36 tasks)
+    sweperf_2config.sh        # Per-suite runner: SWE-Perf (3 tasks)
+    tac_2config.sh            # Per-suite runner: TheAgentCompany (8 tasks)
     selected_benchmark_tasks.json # Canonical task selection with metadata
 scripts/                      # Metrics extraction, evaluation, and operational tooling
     ccb_metrics/              # Python package: models, extractors, discovery, judge context
@@ -148,25 +148,25 @@ bash configs/run_selected_tasks.sh --dry-run
 Per-suite runners are also available for individual benchmarks:

 ```bash
-bash configs/swebenchpro_3config.sh   # 36 SWE-Bench Pro tasks
-bash configs/locobench_3config.sh     # 25 LoCoBench tasks
-bash configs/pytorch_3config.sh       # 12 PyTorch tasks
-bash configs/repoqa_3config.sh        # 10 RepoQA tasks
-bash configs/tac_3config.sh           # 8 TheAgentCompany tasks
-bash configs/dibench_3config.sh       # 8 DIBench tasks
-bash configs/crossrepo_3config.sh     # 5 CrossRepo tasks
-bash configs/k8s_docs_3config.sh      # 5 K8s Docs tasks
-bash configs/linuxflbench_3config.sh  # 5 LinuxFLBench tasks (see note below)
-bash configs/largerepo_3config.sh     # 4 Large Repo tasks
-bash configs/sweperf_3config.sh       # 3 SWE-Perf tasks
-bash configs/codereview_3config.sh    # 3 CodeReview tasks
+bash configs/swebenchpro_2config.sh   # 36 SWE-Bench Pro tasks
+bash configs/locobench_2config.sh     # 25 LoCoBench tasks
+bash configs/pytorch_2config.sh       # 12 PyTorch tasks
+bash configs/dependeval_2config.sh    # 32 DependEval tasks
+bash configs/tac_2config.sh           # 8 TheAgentCompany tasks
+bash configs/dibench_2config.sh       # 8 DIBench tasks
+bash configs/crossrepo_2config.sh     # 5 CrossRepo tasks
+bash configs/k8s_docs_2config.sh      # 5 K8s Docs tasks
+bash configs/linuxflbench_2config.sh  # 5 LinuxFLBench tasks (see note below)
+bash configs/largerepo_2config.sh     # 4 Large Repo tasks
+bash configs/sweperf_2config.sh       # 3 SWE-Perf tasks
+bash configs/codereview_2config.sh    # 3 CodeReview tasks
 ```

 All runners support `--baseline-only` and `--full-only` flags.

 **LinuxFLBench note:** Docker image build is slow (~10 min) due to Linux kernel partial clone (~2GB). Pre-build images before running to save time.

-**DependEval note:** DependEval tasks use `--path` mode with local task directories. There is no unified `dependeval_3config.sh` yet; tasks are tracked via `configs/dependeval_selected_instances.json`.
+**DependEval note:** DependEval runs use local task directories and are handled by `configs/dependeval_2config.sh`.

 Requires [Harbor](https://github.com/laude-institute/harbor/tree/main) installed and configured with a Claude API key.
````
configs/eval_matrix.json

Lines changed: 72 additions & 0 deletions (new file)

```json
{
  "description": "Canonical benchmark config matrix and extension registry for CodeContextBench.",
  "official_default_configs": [
    "baseline",
    "sourcegraph_full"
  ],
  "supported_configs": [
    "baseline",
    "sourcegraph_base",
    "sourcegraph_full",
    "sourcegraph_isolated",
    "github_base",
    "github_full"
  ],
  "config_definitions": {
    "baseline": {
      "baseline_mcp_type": "none",
      "mcp_enabled": false,
      "provider": "none",
      "track_in_official": true,
      "status": "active"
    },
    "sourcegraph_base": {
      "baseline_mcp_type": "sourcegraph_base",
      "mcp_enabled": true,
      "provider": "sourcegraph",
      "track_in_official": true,
      "status": "legacy_or_targeted"
    },
    "sourcegraph_full": {
      "baseline_mcp_type": "sourcegraph_full",
      "mcp_enabled": true,
      "provider": "sourcegraph",
      "track_in_official": true,
      "status": "active"
    },
    "sourcegraph_isolated": {
      "baseline_mcp_type": "sourcegraph_isolated",
      "mcp_enabled": true,
      "provider": "sourcegraph",
      "track_in_official": true,
      "status": "experimental"
    },
    "sourcegraph_only": {
      "baseline_mcp_type": "sourcegraph_only",
      "mcp_enabled": true,
      "provider": "sourcegraph",
      "track_in_official": true,
      "status": "experimental"
    },
    "github_base": {
      "baseline_mcp_type": "github_base",
      "mcp_enabled": true,
      "provider": "github",
      "track_in_official": false,
      "status": "experimental_scaffold"
    },
    "github_full": {
      "baseline_mcp_type": "github_full",
      "mcp_enabled": true,
      "provider": "github",
      "track_in_official": false,
      "status": "experimental_scaffold"
    }
  },
  "provider_templates": {
    "github": {
      "example_config_name": "github_full",
      "notes": "Reserved template for future GitHub MCP integration."
    }
  }
}
```
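The matrix carries a few implicit invariants: every official default config must be a supported config, and every supported config needs a definition. The snippet below is a minimal sketch of how a guardrail script could check them; the `validate` function and the abridged inline copy of the matrix are illustrations, not code from this commit.

```python
import json

# Sketch (not part of this commit): check the implicit invariants of
# configs/eval_matrix.json. EVAL_MATRIX is an abridged inline copy.
EVAL_MATRIX = json.loads("""
{
  "official_default_configs": ["baseline", "sourcegraph_full"],
  "supported_configs": ["baseline", "sourcegraph_base", "sourcegraph_full",
                        "sourcegraph_isolated", "github_base", "github_full"],
  "config_definitions": {
    "baseline":             {"mcp_enabled": false, "provider": "none"},
    "sourcegraph_base":     {"mcp_enabled": true,  "provider": "sourcegraph"},
    "sourcegraph_full":     {"mcp_enabled": true,  "provider": "sourcegraph"},
    "sourcegraph_isolated": {"mcp_enabled": true,  "provider": "sourcegraph"},
    "sourcegraph_only":     {"mcp_enabled": true,  "provider": "sourcegraph"},
    "github_base":          {"mcp_enabled": true,  "provider": "github"},
    "github_full":          {"mcp_enabled": true,  "provider": "github"}
  }
}
""")


def validate(matrix: dict) -> list[str]:
    """Return human-readable violations of the matrix's invariants."""
    errors = []
    supported = set(matrix["supported_configs"])
    defined = set(matrix["config_definitions"])
    for cfg in matrix["official_default_configs"]:
        if cfg not in supported:
            errors.append(f"official config {cfg!r} is not in supported_configs")
    for cfg in sorted(supported - defined):
        errors.append(f"supported config {cfg!r} has no definition")
    return errors
```

Note that a definition without a `supported_configs` entry (here `sourcegraph_only`) passes this check, which matches the file's apparent intent of keeping experimental definitions registered without declaring them supported.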
