Document task environment variants and preserve k8sdocs sg-only files

LoCoBench Bot · LoCoBench Bot · commit 73c5ad0c19bd · 2026-02-16T18:54:39.000Z
diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
@@ -74,14 +74,14 @@
 {"id":"CodeContextBench-b41","title":"Add DependEval and LinuxFLBench to TASK_CATALOG.md","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:26:04.732271812Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T14:30:46.293910463Z","closed_at":"2026-02-06T14:30:46.293910463Z","close_reason":"Closed"}
 {"id":"CodeContextBench-bgq","title":"US-013: Ensure all new tasks have Dockerfiles","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T00:42:19.445469836Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.281410025Z","closed_at":"2026-02-16T00:50:42.281410025Z","close_reason":"US-013 complete: duplicate issue, all Dockerfiles verified"}
 {"id":"CodeContextBench-c0h","title":"US-002: Create docgen-arch-002 Istio Pilot discovery architecture doc","status":"in_progress","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T16:02:21.400634759Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:02:27.452076279Z"}
-{"id":"CodeContextBench-c6m","title":"Phase 3: Dockerfile.sg_only for write-only suites","status":"open","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T18:42:40.749772061Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T18:42:40.749772061Z","dependencies":[{"issue_id":"CodeContextBench-c6m","depends_on_id":"CodeContextBench-zku","type":"blocks","created_at":"2026-02-16T18:42:50.554721937Z","created_by":"LoCoBench Bot"}]}
+{"id":"CodeContextBench-c6m","title":"Phase 3: Dockerfile.sg_only for write-only suites","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T18:42:40.749772061Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T18:51:31.383411361Z","closed_at":"2026-02-16T18:51:31.383411361Z","close_reason":"14 Dockerfile.sg_only files (5 K8s, 5 LFL, 4 INV) + sgonly_writeonly_2config.sh run script created","dependencies":[{"issue_id":"CodeContextBench-c6m","depends_on_id":"CodeContextBench-zku","type":"blocks","created_at":"2026-02-16T18:42:50.554721937Z","created_by":"LoCoBench Bot"}]}
 {"id":"CodeContextBench-cey","title":"US-012: Build failure analysis engine","status":"closed","priority":1,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-15T13:53:47.854221697Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T13:57:20.769673188Z","closed_at":"2026-02-15T13:57:20.769673188Z","close_reason":"US-012 implemented and all ACs verified"}
 {"id":"CodeContextBench-d00","title":"US-001: Create inv-deep-001 Envoy filter chain deep causal task","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:08:15.0008813Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:13:36.330715999Z","closed_at":"2026-02-16T15:13:36.330715999Z","close_reason":"US-001 complete: inv-deep-001 Envoy deep causal chain task created and committed"}
 {"id":"CodeContextBench-d5q","title":"US-003: Create inv-deep-003 - Deep causal chain in Terraform","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:28:45.184016129Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:40:01.995896974Z","closed_at":"2026-02-16T15:40:01.995896974Z","close_reason":"US-003 complete: inv-deep-003 created with Terraform sensitive marks bug"}
 {"id":"CodeContextBench-dfp","title":"Run LoCoBench baseline and SG_full configs","description":"QA audit H2: LoCoBench only has SG_base results in MANIFEST (25/25 tasks). Need baseline and SG_full runs for complete 3-config comparison. SG_full should use the updated Deep Search preamble.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:17.265852053Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:33:58.368434048Z","closed_at":"2026-02-16T01:33:58.368434048Z","close_reason":"Stale - LoCoBench dropped in favor of enterprise largerepo tasks (25 tasks across 5 categories). New tasks don't need separate LoCoBench runs.","dependencies":[{"issue_id":"CodeContextBench-dfp","depends_on_id":"CodeContextBench-17e","type":"blocks","created_at":"2026-02-06T21:09:35.481295416Z","created_by":"LoCoBench Bot"}]}
 {"id":"CodeContextBench-ega","title":"US-008b: Scaffold remaining 3 governance tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T14:39:32.981506882Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T14:45:09.007512651Z","closed_at":"2026-02-15T14:45:09.007512651Z","close_reason":"US-008b complete: 3 governance tasks scaffolded (cross-team-boundary, audit-trail, degraded-context)"}
 {"id":"CodeContextBench-f0x","title":"US-001: Create nlqa-arch-001 Envoy HTTP filter chain task","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:58:19.87022273Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:01:50.336839468Z","closed_at":"2026-02-16T16:01:50.336839468Z","close_reason":"US-001 complete: nlqa-arch-001 task created with all acceptance criteria passing"}
-{"id":"CodeContextBench-f2q","title":"Phase 4: sg_only for build-requiring suites","status":"open","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T18:42:43.836243874Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T18:42:43.836243874Z","dependencies":[{"issue_id":"CodeContextBench-f2q","depends_on_id":"CodeContextBench-zku","type":"blocks","created_at":"2026-02-16T18:42:50.620537139Z","created_by":"LoCoBench Bot"}]}
+{"id":"CodeContextBench-f2q","title":"Phase 4: sg_only for build-requiring suites","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T18:42:43.836243874Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T18:54:26.909852825Z","closed_at":"2026-02-16T18:54:26.909852825Z","close_reason":"Agent _setup_sgonly_truncation() method, sgonly_verifier_wrapper.sh, sgonly_build_2config.sh run script","dependencies":[{"issue_id":"CodeContextBench-f2q","depends_on_id":"CodeContextBench-zku","type":"blocks","created_at":"2026-02-16T18:42:50.620537139Z","created_by":"LoCoBench Bot"}]}
 {"id":"CodeContextBench-fph","title":"Design blind-bug task variants — vague instructions requiring discovery","description":"For 5-10 existing SWE-bench Pro tasks, create instruction variants where the bug location is NOT given. Current instructions often point to specific files/functions. Variants use realistic user-reported symptoms: e.g., 'Fix NULL handling in album.go' becomes 'Users report albums occasionally fail to load. Investigate and fix.' Same verifier, same Dockerfile, different instruction.md. SG semantic search genuinely helps with discovery when the agent doesn't know WHERE to look. Implementation: create instruction_blind.md variants, add --instruction-variant flag to agent, compare baseline vs SG on discovery success rate. Select tasks where the original instruction reveals the bug location.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:27.062045186Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:28:43.11808592Z","closed_at":"2026-02-12T10:28:43.11808592Z","close_reason":"Design complete: docs/DESIGN_blind_bug_variants.md. Phase 1 MVP: 3 HIGH-rated tasks (qutebrowser CertificateErrorWrapper, element-web VoiceBroadcastLiveness, openlibrary WorkSearchScheme). Env var INSTRUCTION_VARIANT=blind selects instruction_blind.md. ~400 LOC across 6-8 files. Key metric: (B-A)-(Y-X) measures MCP discovery value."}
 {"id":"CodeContextBench-fvh","title":"Archive broken first-attempt governance/enterprise runs, regenerate MANIFEST","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T18:28:31.19415021Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T18:29:50.70796578Z","closed_at":"2026-02-15T18:29:50.70796578Z","close_reason":"Archived 2 broken first-attempt runs, regenerated MANIFEST: 495 tasks / 43 runs with governance and enterprise properly included"}
 {"id":"CodeContextBench-gd4","title":"US-003: Create cr-terraform-001 multi-file Go defect review","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:41:56.001924582Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:48:45.705225232Z","closed_at":"2026-02-16T15:48:45.705225232Z","close_reason":"Completed: cr-terraform-001 task created with 6 defects across 4 Go files"}
diff --git a/benchmarks/ccb_k8sdocs/README.md b/benchmarks/ccb_k8sdocs/README.md
@@ -4,6 +4,9 @@
 
 This benchmark evaluates AI coding agents on documentation generation and update tasks for the Kubernetes codebase. The design leverages the extensive Kubernetes ecosystem to test how well agents can understand complex distributed systems code and produce accurate, comprehensive documentation.
 
+Environment variants for these tasks are documented in
+`benchmarks/ccb_k8sdocs/VARIANTS.md`.
+
 ## Benchmark Design
 
 ### Core Hypothesis
diff --git a/benchmarks/ccb_k8sdocs/VARIANTS.md b/benchmarks/ccb_k8sdocs/VARIANTS.md
@@ -0,0 +1,31 @@
+# ccb_k8sdocs Environment Variants
+
+This suite keeps `environment/Dockerfile` as the canonical task definition.
+
+## Variant Policy
+
+- Never overwrite the canonical `Dockerfile`; it preserves the official task lineage.
+- Variant environments must use separate filenames in the same directory.
+- Any run using a variant Dockerfile should be tracked as a separate study.
+
+## Current Files by Task
+
+For each task under `benchmarks/ccb_k8sdocs/*/environment/`:
+
+- `Dockerfile`
+  - Canonical environment (full local checkout at pinned commit).
+  - This is the default Harbor build target.
+- `Dockerfile.isolated`
+  - Sparse-checkout environment (target package only).
+  - Intended for MCP-isolation experiments where local cross-package context is restricted.
+- `Dockerfile.sg_only`
+  - Sourcegraph-only local environment (no repo clone; target path scaffold only).
+  - Intended for strict MCP-only ablations.
+
+## Notes
+
+- `Dockerfile.isolated` and `Dockerfile.sg_only` change task conditions; do not mix
+  them into the canonical official baseline/full series without explicit variant labeling.
+- `applyconfig-doc-001` has verifier checks that assume some sibling package
+  paths exist; if running the `isolated` or `sg_only` variants, verify rubric
+  compatibility before interpreting score deltas.
diff --git a/benchmarks/ccb_k8sdocs/apiserver-doc-001/environment/Dockerfile.sg_only b/benchmarks/ccb_k8sdocs/apiserver-doc-001/environment/Dockerfile.sg_only
@@ -0,0 +1,28 @@
+# Sourcegraph-only variant: no repo clone, only a scaffold of the target path.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# OS packages; --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    npm \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# NO repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Create target directory so agent knows where to write doc.go, and an empty
+# git repo in /workspace with a repo-local commit identity.
+RUN mkdir -p staging/src/k8s.io/apiserver && \
+    git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Test directory for the verifier, plus /app for MCP config upload
+# compatibility.
+RUN mkdir -p /workspace/tests /app
diff --git a/benchmarks/ccb_k8sdocs/applyconfig-doc-001/environment/Dockerfile.sg_only b/benchmarks/ccb_k8sdocs/applyconfig-doc-001/environment/Dockerfile.sg_only
@@ -0,0 +1,28 @@
+# Sourcegraph-only variant: no repo clone, only a scaffold of the target path.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# OS packages; --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    npm \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# NO repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Create target directory so agent knows where to write doc.go, and an empty
+# git repo in /workspace with a repo-local commit identity.
+RUN mkdir -p staging/src/k8s.io/client-go/applyconfigurations && \
+    git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Test directory for the verifier, plus /app for MCP config upload
+# compatibility.
+RUN mkdir -p /workspace/tests /app
diff --git a/benchmarks/ccb_k8sdocs/client-go-doc-001/environment/Dockerfile.sg_only b/benchmarks/ccb_k8sdocs/client-go-doc-001/environment/Dockerfile.sg_only
@@ -0,0 +1,28 @@
+# Sourcegraph-only variant: no repo clone, only a scaffold of the target path.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# OS packages; --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    npm \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# NO repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Create target directory so agent knows where to write doc.go, and an empty
+# git repo in /workspace with a repo-local commit identity.
+RUN mkdir -p staging/src/k8s.io/client-go && \
+    git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Test directory for the verifier, plus /app for MCP config upload
+# compatibility.
+RUN mkdir -p /workspace/tests /app
diff --git a/benchmarks/ccb_k8sdocs/fairqueuing-doc-001/environment/Dockerfile.sg_only b/benchmarks/ccb_k8sdocs/fairqueuing-doc-001/environment/Dockerfile.sg_only
@@ -0,0 +1,28 @@
+# Sourcegraph-only variant: no repo clone, only a scaffold of the target path.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# OS packages; --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    npm \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# NO repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Create target directory so agent knows where to write doc.go, and an empty
+# git repo in /workspace with a repo-local commit identity.
+RUN mkdir -p staging/src/k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/queueset && \
+    git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Test directory for the verifier, plus /app for MCP config upload
+# compatibility.
+RUN mkdir -p /workspace/tests /app
diff --git a/benchmarks/ccb_k8sdocs/pkg-doc-001/environment/Dockerfile.sg_only b/benchmarks/ccb_k8sdocs/pkg-doc-001/environment/Dockerfile.sg_only
@@ -0,0 +1,28 @@
+# Sourcegraph-only variant: no repo clone, only a scaffold of the target path.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# OS packages; --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    npm \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# NO repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Create target directory so agent knows where to write doc.go, and an empty
+# git repo in /workspace with a repo-local commit identity.
+RUN mkdir -p pkg/kubelet/cm && \
+    git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Test directory for the verifier, plus /app for MCP config upload
+# compatibility.
+RUN mkdir -p /workspace/tests /app
diff --git a/docs/EXTENSIBILITY.md b/docs/EXTENSIBILITY.md
@@ -69,3 +69,14 @@ python3 scripts/docs_consistency_check.py
 ```
 
 This validates that core docs do not reference missing scripts/configs.
+
+## 6) Task Environment Variants
+
+When adding benchmark environment variants, keep canonical task definitions intact:
+
+1. Keep `environment/Dockerfile` as the canonical default.
+2. Add variant files with explicit names (for example `Dockerfile.isolated`,
+   `Dockerfile.sg_only`).
+3. Document variant intent and caveats in-suite (for example
+   `benchmarks/ccb_k8sdocs/VARIANTS.md`).
+4. Treat variant runs as separate studies in reporting and curation.
diff --git a/docs/VARIANTS.md b/docs/VARIANTS.md
@@ -0,0 +1,7 @@
+# Variant Conventions
+
+See `docs/EXTENSIBILITY.md` section "Task Environment Variants" for rules.
+
+Suite-specific example:
+
+- `benchmarks/ccb_k8sdocs/VARIANTS.md`