Skip to content

Commit 36e471d

Browse files
committed
correct NaN accuracies
These resulted in a log viewer crash, but only once one navigated from a specific log back to the index page.
1 parent 1009342 commit 36e471d

14 files changed

+119002
-22349
lines changed

inst/run/logs/2025-10-09T16-15-27-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-46-54-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1678 additions & 1678 deletions
Large diffs are not rendered by default.

docs/logs/logs/2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-46-57-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1450 additions & 1450 deletions
Large diffs are not rendered by default.

inst/run/logs/2025-10-09T16-24-59-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-46-59-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1661 additions & 1661 deletions
Large diffs are not rendered by default.

inst/run/logs/2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-47-01-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1462 additions & 1462 deletions
Large diffs are not rendered by default.

inst/run/logs/2025-10-09T16-44-52-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-47-04-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1679 additions & 1679 deletions
Large diffs are not rendered by default.

docs/logs/logs/2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-10T13-47-06-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 1477 additions & 1477 deletions
Large diffs are not rendered by default.

docs/logs/logs/listing.json

Lines changed: 287 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,99 @@
11
{
2-
"2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
2+
"2025-10-10T13-46-54-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
3+
"version": 2,
4+
"status": "success",
5+
"eval": {
6+
"run_id": "88Xa9JpML2DQ6voZIUniQ9",
7+
"created": "2025-10-10T13:46:54-05:00",
8+
"task": "bluffbench",
9+
"task_id": "465475d21aad6cc3ab920b",
10+
"task_version": 0,
11+
"task_file": "",
12+
"task_attribs": {},
13+
"task_args": {},
14+
"dataset": {
15+
"samples": 13,
16+
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
17+
"shuffled": false
18+
},
19+
"model": "bluff_solver (claude-sonnet-4-5-20250929)",
20+
"model_args": {},
21+
"config": {},
22+
"revision": {
23+
"type": "git",
24+
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
25+
"commit": "9140d8a2"
26+
},
27+
"packages": {
28+
"inspect_ai": "0.3.63"
29+
},
30+
"scorers": [
31+
{
32+
"name": "bluff_scorer",
33+
"options": {},
34+
"metrics": [
35+
{
36+
"name": "mean",
37+
"options": {}
38+
}
39+
],
40+
"metadata": {}
41+
}
42+
]
43+
},
44+
"plan": {
45+
"name": "plan",
46+
"steps": [
47+
{
48+
"solver": "bluff_solver (claude-sonnet-4-5-20250929)",
49+
"params": {
50+
"1": "self$get_samples()$input",
51+
"solver_chat": "<Chat>"
52+
}
53+
}
54+
],
55+
"config": {}
56+
},
57+
"results": {
58+
"total_samples": 39,
59+
"completed_samples": 39,
60+
"scores": [
61+
{
62+
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
63+
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
64+
"params": {},
65+
"metrics": {
66+
"accuracy": {
67+
"name": "accuracy",
68+
"value": 53.8462,
69+
"params": {
70+
"1": "numeric_scores"
71+
}
72+
}
73+
}
74+
}
75+
]
76+
},
77+
"stats": {
78+
"started_at": "2025-10-09T16:04:29-05:00",
79+
"completed_at": "2025-10-09T16:14:54-05:00",
80+
"model_usage": {
81+
"claude-sonnet-4-5-20250929": {
82+
"input_tokens": 40218,
83+
"cache_creation_input_tokens": 0,
84+
"cache_read_input_tokens": 0,
85+
"output_tokens": 21194,
86+
"total_tokens": 61412
87+
}
88+
}
89+
}
90+
},
91+
"2025-10-10T13-46-57-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
392
"version": 2,
493
"status": "success",
594
"eval": {
695
"run_id": "q0YGZyqYDC5ZCH5lJMwVYM",
7-
"created": "2025-10-08T14:10:22-05:00",
96+
"created": "2025-10-10T13:46:57-05:00",
897
"task": "bluffbench",
998
"task_id": "465475d21aad6cc3ab920b",
1099
"task_version": 0,
@@ -88,24 +177,24 @@
88177
}
89178
}
90179
},
91-
"2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
180+
"2025-10-10T13-46-59-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
92181
"version": 2,
93182
"status": "success",
94183
"eval": {
95-
"run_id": "npeO315OziQzxfjhaCTEjH",
96-
"created": "2025-10-08T15:51:58-05:00",
184+
"run_id": "gSktvP60wrTGOxMR1bbSX0",
185+
"created": "2025-10-10T13:46:59-05:00",
97186
"task": "bluffbench",
98187
"task_id": "465475d21aad6cc3ab920b",
99188
"task_version": 0,
100189
"task_file": "",
101190
"task_attribs": {},
102191
"task_args": {},
103192
"dataset": {
104-
"samples": 11,
105-
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
193+
"samples": 13,
194+
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
106195
"shuffled": false
107196
},
108-
"model": "bluff_solver (gpt-5)",
197+
"model": "bluff_solver (gemini-2.5-pro)",
109198
"model_args": {},
110199
"config": {},
111200
"revision": {
@@ -134,7 +223,7 @@
134223
"name": "plan",
135224
"steps": [
136225
{
137-
"solver": "bluff_solver (gpt-5)",
226+
"solver": "bluff_solver (gemini-2.5-pro)",
138227
"params": {
139228
"1": "self$get_samples()$input",
140229
"solver_chat": "<Chat>"
@@ -144,8 +233,8 @@
144233
"config": {}
145234
},
146235
"results": {
147-
"total_samples": 33,
148-
"completed_samples": 33,
236+
"total_samples": 39,
237+
"completed_samples": 39,
149238
"scores": [
150239
{
151240
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
@@ -154,6 +243,7 @@
154243
"metrics": {
155244
"accuracy": {
156245
"name": "accuracy",
246+
"value": 0,
157247
"params": {
158248
"1": "numeric_scores"
159249
}
@@ -163,25 +253,25 @@
163253
]
164254
},
165255
"stats": {
166-
"started_at": "2025-10-08T15:33:39-05:00",
167-
"completed_at": "2025-10-08T15:51:58-05:00",
256+
"started_at": "2025-10-09T16:15:53-05:00",
257+
"completed_at": "2025-10-09T16:24:59-05:00",
168258
"model_usage": {
169-
"gpt-5": {
170-
"input_tokens": -32640,
259+
"gemini-2.5-pro": {
260+
"input_tokens": 800923,
171261
"cache_creation_input_tokens": 0,
172262
"cache_read_input_tokens": 0,
173-
"output_tokens": 57584,
174-
"total_tokens": 24944
263+
"output_tokens": 5540,
264+
"total_tokens": 806463
175265
}
176266
}
177267
}
178268
},
179-
"2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
269+
"2025-10-10T13-47-01-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
180270
"version": 2,
181271
"status": "success",
182272
"eval": {
183273
"run_id": "ZWx0DB1EsHIfbGEsK9PiYu",
184-
"created": "2025-10-08T16:11:29-05:00",
274+
"created": "2025-10-10T13:47:01-05:00",
185275
"task": "bluffbench",
186276
"task_id": "465475d21aad6cc3ab920b",
187277
"task_version": 0,
@@ -264,5 +354,183 @@
264354
}
265355
}
266356
}
357+
},
358+
"2025-10-10T13-47-04-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
359+
"version": 2,
360+
"status": "success",
361+
"eval": {
362+
"run_id": "EZQrw5N42mAdRONbUMPtd6",
363+
"created": "2025-10-10T13:47:04-05:00",
364+
"task": "bluffbench",
365+
"task_id": "465475d21aad6cc3ab920b",
366+
"task_version": 0,
367+
"task_file": "",
368+
"task_attribs": {},
369+
"task_args": {},
370+
"dataset": {
371+
"samples": 13,
372+
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
373+
"shuffled": false
374+
},
375+
"model": "bluff_solver (gpt-5)",
376+
"model_args": {},
377+
"config": {},
378+
"revision": {
379+
"type": "git",
380+
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
381+
"commit": "9140d8a2"
382+
},
383+
"packages": {
384+
"inspect_ai": "0.3.63"
385+
},
386+
"scorers": [
387+
{
388+
"name": "bluff_scorer",
389+
"options": {},
390+
"metrics": [
391+
{
392+
"name": "mean",
393+
"options": {}
394+
}
395+
],
396+
"metadata": {}
397+
}
398+
]
399+
},
400+
"plan": {
401+
"name": "plan",
402+
"steps": [
403+
{
404+
"solver": "bluff_solver (gpt-5)",
405+
"params": {
406+
"1": "self$get_samples()$input",
407+
"solver_chat": "<Chat>"
408+
}
409+
}
410+
],
411+
"config": {}
412+
},
413+
"results": {
414+
"total_samples": 39,
415+
"completed_samples": 39,
416+
"scores": [
417+
{
418+
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
419+
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
420+
"params": {},
421+
"metrics": {
422+
"accuracy": {
423+
"name": "accuracy",
424+
"value": 0,
425+
"params": {
426+
"1": "numeric_scores"
427+
}
428+
}
429+
}
430+
}
431+
]
432+
},
433+
"stats": {
434+
"started_at": "2025-10-09T16:25:00-05:00",
435+
"completed_at": "2025-10-09T16:44:52-05:00",
436+
"model_usage": {
437+
"gpt-5": {
438+
"input_tokens": -40457,
439+
"cache_creation_input_tokens": 0,
440+
"cache_read_input_tokens": 0,
441+
"output_tokens": 77866,
442+
"total_tokens": 37409
443+
}
444+
}
445+
}
446+
},
447+
"2025-10-10T13-47-06-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
448+
"version": 2,
449+
"status": "success",
450+
"eval": {
451+
"run_id": "npeO315OziQzxfjhaCTEjH",
452+
"created": "2025-10-10T13:47:06-05:00",
453+
"task": "bluffbench",
454+
"task_id": "465475d21aad6cc3ab920b",
455+
"task_version": 0,
456+
"task_file": "",
457+
"task_attribs": {},
458+
"task_args": {},
459+
"dataset": {
460+
"samples": 11,
461+
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
462+
"shuffled": false
463+
},
464+
"model": "bluff_solver (gpt-5)",
465+
"model_args": {},
466+
"config": {},
467+
"revision": {
468+
"type": "git",
469+
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
470+
"commit": "9140d8a2"
471+
},
472+
"packages": {
473+
"inspect_ai": "0.3.63"
474+
},
475+
"scorers": [
476+
{
477+
"name": "bluff_scorer",
478+
"options": {},
479+
"metrics": [
480+
{
481+
"name": "mean",
482+
"options": {}
483+
}
484+
],
485+
"metadata": {}
486+
}
487+
]
488+
},
489+
"plan": {
490+
"name": "plan",
491+
"steps": [
492+
{
493+
"solver": "bluff_solver (gpt-5)",
494+
"params": {
495+
"1": "self$get_samples()$input",
496+
"solver_chat": "<Chat>"
497+
}
498+
}
499+
],
500+
"config": {}
501+
},
502+
"results": {
503+
"total_samples": 33,
504+
"completed_samples": 33,
505+
"scores": [
506+
{
507+
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
508+
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
509+
"params": {},
510+
"metrics": {
511+
"accuracy": {
512+
"name": "accuracy",
513+
"value": 0,
514+
"params": {
515+
"1": "numeric_scores"
516+
}
517+
}
518+
}
519+
}
520+
]
521+
},
522+
"stats": {
523+
"started_at": "2025-10-08T15:33:39-05:00",
524+
"completed_at": "2025-10-08T15:51:58-05:00",
525+
"model_usage": {
526+
"gpt-5": {
527+
"input_tokens": -32640,
528+
"cache_creation_input_tokens": 0,
529+
"cache_read_input_tokens": 0,
530+
"output_tokens": 57584,
531+
"total_tokens": 24944
532+
}
533+
}
534+
}
267535
}
268536
}

0 commit comments

Comments
 (0)