|
1 | 1 | { |
2 | | - "2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 2 | + "2025-10-10T13-46-54-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 3 | + "version": 2, |
| 4 | + "status": "success", |
| 5 | + "eval": { |
| 6 | + "run_id": "88Xa9JpML2DQ6voZIUniQ9", |
| 7 | + "created": "2025-10-10T13:46:54-05:00", |
| 8 | + "task": "bluffbench", |
| 9 | + "task_id": "465475d21aad6cc3ab920b", |
| 10 | + "task_version": 0, |
| 11 | + "task_file": "", |
| 12 | + "task_attribs": {}, |
| 13 | + "task_args": {}, |
| 14 | + "dataset": { |
| 15 | + "samples": 13, |
| 16 | + "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
| 17 | + "shuffled": false |
| 18 | + }, |
| 19 | + "model": "bluff_solver (claude-sonnet-4-5-20250929)", |
| 20 | + "model_args": {}, |
| 21 | + "config": {}, |
| 22 | + "revision": { |
| 23 | + "type": "git", |
| 24 | + "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
| 25 | + "commit": "9140d8a2" |
| 26 | + }, |
| 27 | + "packages": { |
| 28 | + "inspect_ai": "0.3.63" |
| 29 | + }, |
| 30 | + "scorers": [ |
| 31 | + { |
| 32 | + "name": "bluff_scorer", |
| 33 | + "options": {}, |
| 34 | + "metrics": [ |
| 35 | + { |
| 36 | + "name": "mean", |
| 37 | + "options": {} |
| 38 | + } |
| 39 | + ], |
| 40 | + "metadata": {} |
| 41 | + } |
| 42 | + ] |
| 43 | + }, |
| 44 | + "plan": { |
| 45 | + "name": "plan", |
| 46 | + "steps": [ |
| 47 | + { |
| 48 | + "solver": "bluff_solver (claude-sonnet-4-5-20250929)", |
| 49 | + "params": { |
| 50 | + "1": "self$get_samples()$input", |
| 51 | + "solver_chat": "<Chat>" |
| 52 | + } |
| 53 | + } |
| 54 | + ], |
| 55 | + "config": {} |
| 56 | + }, |
| 57 | + "results": { |
| 58 | + "total_samples": 39, |
| 59 | + "completed_samples": 39, |
| 60 | + "scores": [ |
| 61 | + { |
| 62 | + "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 63 | + "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 64 | + "params": {}, |
| 65 | + "metrics": { |
| 66 | + "accuracy": { |
| 67 | + "name": "accuracy", |
| 68 | + "value": 53.8462, |
| 69 | + "params": { |
| 70 | + "1": "numeric_scores" |
| 71 | + } |
| 72 | + } |
| 73 | + } |
| 74 | + } |
| 75 | + ] |
| 76 | + }, |
| 77 | + "stats": { |
| 78 | + "started_at": "2025-10-09T16:04:29-05:00", |
| 79 | + "completed_at": "2025-10-09T16:14:54-05:00", |
| 80 | + "model_usage": { |
| 81 | + "claude-sonnet-4-5-20250929": { |
| 82 | + "input_tokens": 40218, |
| 83 | + "cache_creation_input_tokens": 0, |
| 84 | + "cache_read_input_tokens": 0, |
| 85 | + "output_tokens": 21194, |
| 86 | + "total_tokens": 61412 |
| 87 | + } |
| 88 | + } |
| 89 | + } |
| 90 | + }, |
| 91 | + "2025-10-10T13-46-57-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
3 | 92 | "version": 2, |
4 | 93 | "status": "success", |
5 | 94 | "eval": { |
6 | 95 | "run_id": "q0YGZyqYDC5ZCH5lJMwVYM", |
7 | | - "created": "2025-10-08T14:10:22-05:00", |
| 96 | + "created": "2025-10-10T13:46:57-05:00", |
8 | 97 | "task": "bluffbench", |
9 | 98 | "task_id": "465475d21aad6cc3ab920b", |
10 | 99 | "task_version": 0, |
|
88 | 177 | } |
89 | 178 | } |
90 | 179 | }, |
91 | | - "2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 180 | + "2025-10-10T13-46-59-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
92 | 181 | "version": 2, |
93 | 182 | "status": "success", |
94 | 183 | "eval": { |
95 | | - "run_id": "npeO315OziQzxfjhaCTEjH", |
96 | | - "created": "2025-10-08T15:51:58-05:00", |
| 184 | + "run_id": "gSktvP60wrTGOxMR1bbSX0", |
| 185 | + "created": "2025-10-10T13:46:59-05:00", |
97 | 186 | "task": "bluffbench", |
98 | 187 | "task_id": "465475d21aad6cc3ab920b", |
99 | 188 | "task_version": 0, |
100 | 189 | "task_file": "", |
101 | 190 | "task_attribs": {}, |
102 | 191 | "task_args": {}, |
103 | 192 | "dataset": { |
104 | | - "samples": 11, |
105 | | - "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], |
| 193 | + "samples": 13, |
| 194 | + "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
106 | 195 | "shuffled": false |
107 | 196 | }, |
108 | | - "model": "bluff_solver (gpt-5)", |
| 197 | + "model": "bluff_solver (gemini-2.5-pro)", |
109 | 198 | "model_args": {}, |
110 | 199 | "config": {}, |
111 | 200 | "revision": { |
|
134 | 223 | "name": "plan", |
135 | 224 | "steps": [ |
136 | 225 | { |
137 | | - "solver": "bluff_solver (gpt-5)", |
| 226 | + "solver": "bluff_solver (gemini-2.5-pro)", |
138 | 227 | "params": { |
139 | 228 | "1": "self$get_samples()$input", |
140 | 229 | "solver_chat": "<Chat>" |
|
144 | 233 | "config": {} |
145 | 234 | }, |
146 | 235 | "results": { |
147 | | - "total_samples": 33, |
148 | | - "completed_samples": 33, |
| 236 | + "total_samples": 39, |
| 237 | + "completed_samples": 39, |
149 | 238 | "scores": [ |
150 | 239 | { |
151 | 240 | "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
|
154 | 243 | "metrics": { |
155 | 244 | "accuracy": { |
156 | 245 | "name": "accuracy", |
| 246 | + "value": 0, |
157 | 247 | "params": { |
158 | 248 | "1": "numeric_scores" |
159 | 249 | } |
|
163 | 253 | ] |
164 | 254 | }, |
165 | 255 | "stats": { |
166 | | - "started_at": "2025-10-08T15:33:39-05:00", |
167 | | - "completed_at": "2025-10-08T15:51:58-05:00", |
| 256 | + "started_at": "2025-10-09T16:15:53-05:00", |
| 257 | + "completed_at": "2025-10-09T16:24:59-05:00", |
168 | 258 | "model_usage": { |
169 | | - "gpt-5": { |
170 | | - "input_tokens": -32640, |
| 259 | + "gemini-2.5-pro": { |
| 260 | + "input_tokens": 800923, |
171 | 261 | "cache_creation_input_tokens": 0, |
172 | 262 | "cache_read_input_tokens": 0, |
173 | | - "output_tokens": 57584, |
174 | | - "total_tokens": 24944 |
| 263 | + "output_tokens": 5540, |
| 264 | + "total_tokens": 806463 |
175 | 265 | } |
176 | 266 | } |
177 | 267 | } |
178 | 268 | }, |
179 | | - "2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 269 | + "2025-10-10T13-47-01-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
180 | 270 | "version": 2, |
181 | 271 | "status": "success", |
182 | 272 | "eval": { |
183 | 273 | "run_id": "ZWx0DB1EsHIfbGEsK9PiYu", |
184 | | - "created": "2025-10-08T16:11:29-05:00", |
| 274 | + "created": "2025-10-10T13:47:01-05:00", |
185 | 275 | "task": "bluffbench", |
186 | 276 | "task_id": "465475d21aad6cc3ab920b", |
187 | 277 | "task_version": 0, |
|
264 | 354 | } |
265 | 355 | } |
266 | 356 | } |
| 357 | + }, |
| 358 | + "2025-10-10T13-47-04-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 359 | + "version": 2, |
| 360 | + "status": "success", |
| 361 | + "eval": { |
| 362 | + "run_id": "EZQrw5N42mAdRONbUMPtd6", |
| 363 | + "created": "2025-10-10T13:47:04-05:00", |
| 364 | + "task": "bluffbench", |
| 365 | + "task_id": "465475d21aad6cc3ab920b", |
| 366 | + "task_version": 0, |
| 367 | + "task_file": "", |
| 368 | + "task_attribs": {}, |
| 369 | + "task_args": {}, |
| 370 | + "dataset": { |
| 371 | + "samples": 13, |
| 372 | + "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
| 373 | + "shuffled": false |
| 374 | + }, |
| 375 | + "model": "bluff_solver (gpt-5)", |
| 376 | + "model_args": {}, |
| 377 | + "config": {}, |
| 378 | + "revision": { |
| 379 | + "type": "git", |
| 380 | + "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
| 381 | + "commit": "9140d8a2" |
| 382 | + }, |
| 383 | + "packages": { |
| 384 | + "inspect_ai": "0.3.63" |
| 385 | + }, |
| 386 | + "scorers": [ |
| 387 | + { |
| 388 | + "name": "bluff_scorer", |
| 389 | + "options": {}, |
| 390 | + "metrics": [ |
| 391 | + { |
| 392 | + "name": "mean", |
| 393 | + "options": {} |
| 394 | + } |
| 395 | + ], |
| 396 | + "metadata": {} |
| 397 | + } |
| 398 | + ] |
| 399 | + }, |
| 400 | + "plan": { |
| 401 | + "name": "plan", |
| 402 | + "steps": [ |
| 403 | + { |
| 404 | + "solver": "bluff_solver (gpt-5)", |
| 405 | + "params": { |
| 406 | + "1": "self$get_samples()$input", |
| 407 | + "solver_chat": "<Chat>" |
| 408 | + } |
| 409 | + } |
| 410 | + ], |
| 411 | + "config": {} |
| 412 | + }, |
| 413 | + "results": { |
| 414 | + "total_samples": 39, |
| 415 | + "completed_samples": 39, |
| 416 | + "scores": [ |
| 417 | + { |
| 418 | + "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 419 | + "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 420 | + "params": {}, |
| 421 | + "metrics": { |
| 422 | + "accuracy": { |
| 423 | + "name": "accuracy", |
| 424 | + "value": 0, |
| 425 | + "params": { |
| 426 | + "1": "numeric_scores" |
| 427 | + } |
| 428 | + } |
| 429 | + } |
| 430 | + } |
| 431 | + ] |
| 432 | + }, |
| 433 | + "stats": { |
| 434 | + "started_at": "2025-10-09T16:25:00-05:00", |
| 435 | + "completed_at": "2025-10-09T16:44:52-05:00", |
| 436 | + "model_usage": { |
| 437 | + "gpt-5": { |
| 438 | + "input_tokens": -40457, |
| 439 | + "cache_creation_input_tokens": 0, |
| 440 | + "cache_read_input_tokens": 0, |
| 441 | + "output_tokens": 77866, |
| 442 | + "total_tokens": 37409 |
| 443 | + } |
| 444 | + } |
| 445 | + } |
| 446 | + }, |
| 447 | + "2025-10-10T13-47-06-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 448 | + "version": 2, |
| 449 | + "status": "success", |
| 450 | + "eval": { |
| 451 | + "run_id": "npeO315OziQzxfjhaCTEjH", |
| 452 | + "created": "2025-10-10T13:47:06-05:00", |
| 453 | + "task": "bluffbench", |
| 454 | + "task_id": "465475d21aad6cc3ab920b", |
| 455 | + "task_version": 0, |
| 456 | + "task_file": "", |
| 457 | + "task_attribs": {}, |
| 458 | + "task_args": {}, |
| 459 | + "dataset": { |
| 460 | + "samples": 11, |
| 461 | + "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], |
| 462 | + "shuffled": false |
| 463 | + }, |
| 464 | + "model": "bluff_solver (gpt-5)", |
| 465 | + "model_args": {}, |
| 466 | + "config": {}, |
| 467 | + "revision": { |
| 468 | + "type": "git", |
| 469 | + "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
| 470 | + "commit": "9140d8a2" |
| 471 | + }, |
| 472 | + "packages": { |
| 473 | + "inspect_ai": "0.3.63" |
| 474 | + }, |
| 475 | + "scorers": [ |
| 476 | + { |
| 477 | + "name": "bluff_scorer", |
| 478 | + "options": {}, |
| 479 | + "metrics": [ |
| 480 | + { |
| 481 | + "name": "mean", |
| 482 | + "options": {} |
| 483 | + } |
| 484 | + ], |
| 485 | + "metadata": {} |
| 486 | + } |
| 487 | + ] |
| 488 | + }, |
| 489 | + "plan": { |
| 490 | + "name": "plan", |
| 491 | + "steps": [ |
| 492 | + { |
| 493 | + "solver": "bluff_solver (gpt-5)", |
| 494 | + "params": { |
| 495 | + "1": "self$get_samples()$input", |
| 496 | + "solver_chat": "<Chat>" |
| 497 | + } |
| 498 | + } |
| 499 | + ], |
| 500 | + "config": {} |
| 501 | + }, |
| 502 | + "results": { |
| 503 | + "total_samples": 33, |
| 504 | + "completed_samples": 33, |
| 505 | + "scores": [ |
| 506 | + { |
| 507 | + "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 508 | + "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
| 509 | + "params": {}, |
| 510 | + "metrics": { |
| 511 | + "accuracy": { |
| 512 | + "name": "accuracy", |
| 513 | + "value": 0, |
| 514 | + "params": { |
| 515 | + "1": "numeric_scores" |
| 516 | + } |
| 517 | + } |
| 518 | + } |
| 519 | + } |
| 520 | + ] |
| 521 | + }, |
| 522 | + "stats": { |
| 523 | + "started_at": "2025-10-08T15:33:39-05:00", |
| 524 | + "completed_at": "2025-10-08T15:51:58-05:00", |
| 525 | + "model_usage": { |
| 526 | + "gpt-5": { |
| 527 | + "input_tokens": -32640, |
| 528 | + "cache_creation_input_tokens": 0, |
| 529 | + "cache_read_input_tokens": 0, |
| 530 | + "output_tokens": 57584, |
| 531 | + "total_tokens": 24944 |
| 532 | + } |
| 533 | + } |
| 534 | + } |
267 | 535 | } |
268 | 536 | } |
0 commit comments