|
1 | 1 | { |
2 | | - "2025-10-10T09-57-02-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
3 | | - "version": 2, |
4 | | - "status": "success", |
5 | | - "eval": { |
6 | | - "run_id": "88Xa9JpML2DQ6voZIUniQ9", |
7 | | - "created": "2025-10-10T09:57:02-05:00", |
8 | | - "task": "bluffbench", |
9 | | - "task_id": "465475d21aad6cc3ab920b", |
10 | | - "task_version": 0, |
11 | | - "task_file": "", |
12 | | - "task_attribs": {}, |
13 | | - "task_args": {}, |
14 | | - "dataset": { |
15 | | - "samples": 13, |
16 | | - "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
17 | | - "shuffled": false |
18 | | - }, |
19 | | - "model": "bluff_solver (claude-sonnet-4-5-20250929)", |
20 | | - "model_args": {}, |
21 | | - "config": {}, |
22 | | - "revision": { |
23 | | - "type": "git", |
24 | | - "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
25 | | - "commit": "9140d8a2" |
26 | | - }, |
27 | | - "packages": { |
28 | | - "inspect_ai": "0.3.63" |
29 | | - }, |
30 | | - "scorers": [ |
31 | | - { |
32 | | - "name": "bluff_scorer", |
33 | | - "options": {}, |
34 | | - "metrics": [ |
35 | | - { |
36 | | - "name": "mean", |
37 | | - "options": {} |
38 | | - } |
39 | | - ], |
40 | | - "metadata": {} |
41 | | - } |
42 | | - ] |
43 | | - }, |
44 | | - "plan": { |
45 | | - "name": "plan", |
46 | | - "steps": [ |
47 | | - { |
48 | | - "solver": "bluff_solver (claude-sonnet-4-5-20250929)", |
49 | | - "params": { |
50 | | - "1": "self$get_samples()$input", |
51 | | - "solver_chat": "<Chat>" |
52 | | - } |
53 | | - } |
54 | | - ], |
55 | | - "config": {} |
56 | | - }, |
57 | | - "results": { |
58 | | - "total_samples": 39, |
59 | | - "completed_samples": 39, |
60 | | - "scores": [ |
61 | | - { |
62 | | - "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
63 | | - "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
64 | | - "params": {}, |
65 | | - "metrics": { |
66 | | - "accuracy": { |
67 | | - "name": "accuracy", |
68 | | - "value": 53.8462, |
69 | | - "params": { |
70 | | - "1": "numeric_scores" |
71 | | - } |
72 | | - } |
73 | | - } |
74 | | - } |
75 | | - ] |
76 | | - }, |
77 | | - "stats": { |
78 | | - "started_at": "2025-10-09T16:04:29-05:00", |
79 | | - "completed_at": "2025-10-09T16:14:54-05:00", |
80 | | - "model_usage": { |
81 | | - "claude-sonnet-4-5-20250929": { |
82 | | - "input_tokens": 40218, |
83 | | - "cache_creation_input_tokens": 0, |
84 | | - "cache_read_input_tokens": 0, |
85 | | - "output_tokens": 21194, |
86 | | - "total_tokens": 61412 |
87 | | - } |
88 | | - } |
89 | | - } |
90 | | - }, |
91 | | - "2025-10-10T09-57-04-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 2 | + "2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
92 | 3 | "version": 2, |
93 | 4 | "status": "success", |
94 | 5 | "eval": { |
95 | 6 | "run_id": "q0YGZyqYDC5ZCH5lJMwVYM", |
96 | | - "created": "2025-10-10T09:57:04-05:00", |
| 7 | + "created": "2025-10-08T14:10:22-05:00", |
97 | 8 | "task": "bluffbench", |
98 | 9 | "task_id": "465475d21aad6cc3ab920b", |
99 | 10 | "task_version": 0, |
|
177 | 88 | } |
178 | 89 | } |
179 | 90 | }, |
180 | | - "2025-10-10T09-57-06-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 91 | + "2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
181 | 92 | "version": 2, |
182 | 93 | "status": "success", |
183 | 94 | "eval": { |
184 | | - "run_id": "gSktvP60wrTGOxMR1bbSX0", |
185 | | - "created": "2025-10-10T09:57:06-05:00", |
| 95 | + "run_id": "npeO315OziQzxfjhaCTEjH", |
| 96 | + "created": "2025-10-08T15:51:58-05:00", |
186 | 97 | "task": "bluffbench", |
187 | 98 | "task_id": "465475d21aad6cc3ab920b", |
188 | 99 | "task_version": 0, |
189 | 100 | "task_file": "", |
190 | 101 | "task_attribs": {}, |
191 | 102 | "task_args": {}, |
192 | 103 | "dataset": { |
193 | | - "samples": 13, |
194 | | - "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
| 104 | + "samples": 11, |
| 105 | + "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], |
195 | 106 | "shuffled": false |
196 | 107 | }, |
197 | | - "model": "bluff_solver (gemini-2.5-pro)", |
| 108 | + "model": "bluff_solver (gpt-5)", |
198 | 109 | "model_args": {}, |
199 | 110 | "config": {}, |
200 | 111 | "revision": { |
|
223 | 134 | "name": "plan", |
224 | 135 | "steps": [ |
225 | 136 | { |
226 | | - "solver": "bluff_solver (gemini-2.5-pro)", |
| 137 | + "solver": "bluff_solver (gpt-5)", |
227 | 138 | "params": { |
228 | 139 | "1": "self$get_samples()$input", |
229 | 140 | "solver_chat": "<Chat>" |
|
233 | 144 | "config": {} |
234 | 145 | }, |
235 | 146 | "results": { |
236 | | - "total_samples": 39, |
237 | | - "completed_samples": 39, |
| 147 | + "total_samples": 33, |
| 148 | + "completed_samples": 33, |
238 | 149 | "scores": [ |
239 | 150 | { |
240 | 151 | "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
|
252 | 163 | ] |
253 | 164 | }, |
254 | 165 | "stats": { |
255 | | - "started_at": "2025-10-09T16:15:53-05:00", |
256 | | - "completed_at": "2025-10-09T16:24:59-05:00", |
| 166 | + "started_at": "2025-10-08T15:33:39-05:00", |
| 167 | + "completed_at": "2025-10-08T15:51:58-05:00", |
257 | 168 | "model_usage": { |
258 | | - "gemini-2.5-pro": { |
259 | | - "input_tokens": 800923, |
| 169 | + "gpt-5": { |
| 170 | + "input_tokens": -32640, |
260 | 171 | "cache_creation_input_tokens": 0, |
261 | 172 | "cache_read_input_tokens": 0, |
262 | | - "output_tokens": 5540, |
263 | | - "total_tokens": 806463 |
| 173 | + "output_tokens": 57584, |
| 174 | + "total_tokens": 24944 |
264 | 175 | } |
265 | 176 | } |
266 | 177 | } |
267 | 178 | }, |
268 | | - "2025-10-10T09-57-08-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
| 179 | + "2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
269 | 180 | "version": 2, |
270 | 181 | "status": "success", |
271 | 182 | "eval": { |
272 | 183 | "run_id": "ZWx0DB1EsHIfbGEsK9PiYu", |
273 | | - "created": "2025-10-10T09:57:08-05:00", |
| 184 | + "created": "2025-10-08T16:11:29-05:00", |
274 | 185 | "task": "bluffbench", |
275 | 186 | "task_id": "465475d21aad6cc3ab920b", |
276 | 187 | "task_version": 0, |
|
353 | 264 | } |
354 | 265 | } |
355 | 266 | } |
356 | | - }, |
357 | | - "2025-10-10T09-57-10-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
358 | | - "version": 2, |
359 | | - "status": "success", |
360 | | - "eval": { |
361 | | - "run_id": "EZQrw5N42mAdRONbUMPtd6", |
362 | | - "created": "2025-10-10T09:57:10-05:00", |
363 | | - "task": "bluffbench", |
364 | | - "task_id": "465475d21aad6cc3ab920b", |
365 | | - "task_version": 0, |
366 | | - "task_file": "", |
367 | | - "task_attribs": {}, |
368 | | - "task_args": {}, |
369 | | - "dataset": { |
370 | | - "samples": 13, |
371 | | - "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
372 | | - "shuffled": false |
373 | | - }, |
374 | | - "model": "bluff_solver (gpt-5)", |
375 | | - "model_args": {}, |
376 | | - "config": {}, |
377 | | - "revision": { |
378 | | - "type": "git", |
379 | | - "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
380 | | - "commit": "9140d8a2" |
381 | | - }, |
382 | | - "packages": { |
383 | | - "inspect_ai": "0.3.63" |
384 | | - }, |
385 | | - "scorers": [ |
386 | | - { |
387 | | - "name": "bluff_scorer", |
388 | | - "options": {}, |
389 | | - "metrics": [ |
390 | | - { |
391 | | - "name": "mean", |
392 | | - "options": {} |
393 | | - } |
394 | | - ], |
395 | | - "metadata": {} |
396 | | - } |
397 | | - ] |
398 | | - }, |
399 | | - "plan": { |
400 | | - "name": "plan", |
401 | | - "steps": [ |
402 | | - { |
403 | | - "solver": "bluff_solver (gpt-5)", |
404 | | - "params": { |
405 | | - "1": "self$get_samples()$input", |
406 | | - "solver_chat": "<Chat>" |
407 | | - } |
408 | | - } |
409 | | - ], |
410 | | - "config": {} |
411 | | - }, |
412 | | - "results": { |
413 | | - "total_samples": 39, |
414 | | - "completed_samples": 39, |
415 | | - "scores": [ |
416 | | - { |
417 | | - "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
418 | | - "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
419 | | - "params": {}, |
420 | | - "metrics": { |
421 | | - "accuracy": { |
422 | | - "name": "accuracy", |
423 | | - "params": { |
424 | | - "1": "numeric_scores" |
425 | | - } |
426 | | - } |
427 | | - } |
428 | | - } |
429 | | - ] |
430 | | - }, |
431 | | - "stats": { |
432 | | - "started_at": "2025-10-09T16:25:00-05:00", |
433 | | - "completed_at": "2025-10-09T16:44:52-05:00", |
434 | | - "model_usage": { |
435 | | - "gpt-5": { |
436 | | - "input_tokens": -40457, |
437 | | - "cache_creation_input_tokens": 0, |
438 | | - "cache_read_input_tokens": 0, |
439 | | - "output_tokens": 77866, |
440 | | - "total_tokens": 37409 |
441 | | - } |
442 | | - } |
443 | | - } |
444 | | - }, |
445 | | - "2025-10-10T09-57-12-05-00_bluffbench_465475d21aad6cc3ab920b.json": { |
446 | | - "version": 2, |
447 | | - "status": "success", |
448 | | - "eval": { |
449 | | - "run_id": "npeO315OziQzxfjhaCTEjH", |
450 | | - "created": "2025-10-10T09:57:12-05:00", |
451 | | - "task": "bluffbench", |
452 | | - "task_id": "465475d21aad6cc3ab920b", |
453 | | - "task_version": 0, |
454 | | - "task_file": "", |
455 | | - "task_attribs": {}, |
456 | | - "task_args": {}, |
457 | | - "dataset": { |
458 | | - "samples": 11, |
459 | | - "sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], |
460 | | - "shuffled": false |
461 | | - }, |
462 | | - "model": "bluff_solver (gpt-5)", |
463 | | - "model_args": {}, |
464 | | - "config": {}, |
465 | | - "revision": { |
466 | | - "type": "git", |
467 | | - "origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git", |
468 | | - "commit": "9140d8a2" |
469 | | - }, |
470 | | - "packages": { |
471 | | - "inspect_ai": "0.3.63" |
472 | | - }, |
473 | | - "scorers": [ |
474 | | - { |
475 | | - "name": "bluff_scorer", |
476 | | - "options": {}, |
477 | | - "metrics": [ |
478 | | - { |
479 | | - "name": "mean", |
480 | | - "options": {} |
481 | | - } |
482 | | - ], |
483 | | - "metadata": {} |
484 | | - } |
485 | | - ] |
486 | | - }, |
487 | | - "plan": { |
488 | | - "name": "plan", |
489 | | - "steps": [ |
490 | | - { |
491 | | - "solver": "bluff_solver (gpt-5)", |
492 | | - "params": { |
493 | | - "1": "self$get_samples()$input", |
494 | | - "solver_chat": "<Chat>" |
495 | | - } |
496 | | - } |
497 | | - ], |
498 | | - "config": {} |
499 | | - }, |
500 | | - "results": { |
501 | | - "total_samples": 33, |
502 | | - "completed_samples": 33, |
503 | | - "scores": [ |
504 | | - { |
505 | | - "name": "bluff_scorer (claude-sonnet-4-5-20250929)", |
506 | | - "scorer": "bluff_scorer (claude-sonnet-4-5-20250929)", |
507 | | - "params": {}, |
508 | | - "metrics": { |
509 | | - "accuracy": { |
510 | | - "name": "accuracy", |
511 | | - "params": { |
512 | | - "1": "numeric_scores" |
513 | | - } |
514 | | - } |
515 | | - } |
516 | | - } |
517 | | - ] |
518 | | - }, |
519 | | - "stats": { |
520 | | - "started_at": "2025-10-08T15:33:39-05:00", |
521 | | - "completed_at": "2025-10-08T15:51:58-05:00", |
522 | | - "model_usage": { |
523 | | - "gpt-5": { |
524 | | - "input_tokens": -32640, |
525 | | - "cache_creation_input_tokens": 0, |
526 | | - "cache_read_input_tokens": 0, |
527 | | - "output_tokens": 57584, |
528 | | - "total_tokens": 24944 |
529 | | - } |
530 | | - } |
531 | | - } |
532 | 267 | } |
533 | 268 | } |
0 commit comments