Skip to content

Commit 4ec304c

Browse files
sjarmakclaude
andcommitted
feat: build canonical 220-task core retrieval benchmark manifest
Adds the complete core benchmark pipeline: n_repos derivation from Dockerfile topology, verifier-quality classification (core_ready/conditional/extension_only) from ABC audit signals, 220-task manifest with suite allocation targets, power validation (0.696 overall, need 283 for 0.80), and canonical vs extension policy. New scripts: derive_n_repos.py, generate_verifier_labels.py, build_core_manifest.py, validate_core_manifest.py. New configs: core_benchmark_manifest.json, verifier_quality_scheme.json, verifier_quality_labels.json. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 80badf4 commit 4ec304c

11 files changed

+6780
-484
lines changed

configs/core_benchmark_manifest.json

Lines changed: 1570 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
{
2+
"manifest_total": 220,
3+
"target_total": 220,
4+
"shortfall": 0,
5+
"distribution": {
6+
"loc_bands": {
7+
"2M-8M": 101,
8+
"400K-2M": 79,
9+
"8M-40M": 32,
10+
"<400K": 5,
11+
">40M": 3
12+
},
13+
"n_repos": {
14+
"1": 111,
15+
"2": 18,
16+
"3": 31,
17+
"4": 60
18+
},
19+
"single_repo": 111,
20+
"multi_repo": 109,
21+
"retrieval_sensitive": 200,
22+
"control": 20,
23+
"suites": {
24+
"csb_org_compliance": 12,
25+
"csb_org_crossorg": 10,
26+
"csb_org_crossrepo": 8,
27+
"csb_org_crossrepo_tracing": 20,
28+
"csb_org_domain": 10,
29+
"csb_org_incident": 22,
30+
"csb_org_migration": 23,
31+
"csb_org_onboarding": 16,
32+
"csb_org_org": 10,
33+
"csb_org_platform": 6,
34+
"csb_org_security": 30,
35+
"csb_sdlc_debug": 4,
36+
"csb_sdlc_design": 4,
37+
"csb_sdlc_document": 2,
38+
"csb_sdlc_feature": 6,
39+
"csb_sdlc_fix": 18,
40+
"csb_sdlc_refactor": 4,
41+
"csb_sdlc_secure": 6,
42+
"csb_sdlc_test": 6,
43+
"csb_sdlc_understand": 3
44+
},
45+
"verifier_quality": {
46+
"conditional": 167,
47+
"core_ready": 53
48+
}
49+
},
50+
"power_analysis": {
51+
"assumptions": {
52+
"effect_size": 0.05,
53+
"alpha": 0.05,
54+
"sigma_overall": 0.3,
55+
"sigma_loc_subgroup": 0.28,
56+
"sigma_repo_subgroup": 0.32
57+
},
58+
"overall": {
59+
"n": 220,
60+
"power": 0.696
61+
},
62+
"loc_small_repos": {
63+
"n": 84,
64+
"power": 0.373
65+
},
66+
"loc_large_repos": {
67+
"n": 136,
68+
"power": 0.549
69+
},
70+
"single_repo": {
71+
"n": 111,
72+
"power": 0.377
73+
},
74+
"multi_repo": {
75+
"n": 109,
76+
"power": 0.371
77+
},
78+
"min_n_80pct_power": {
79+
"overall": 283,
80+
"subgroup_loc": 247,
81+
"subgroup_repo": 322
82+
}
83+
},
84+
"recommendations": [
85+
"Overall power 0.696 < 0.80. Need 283 tasks minimum.",
86+
"Single-repo subgroup power is low. Consider whether single-repo moderation is testable."
87+
]
88+
}

0 commit comments

Comments
 (0)