Skip to content

Commit 8acfe91

Browse files
committed
Revert "update bundled logs"
This reverts commit a579e92.
1 parent 3756805 commit 8acfe91

8 files changed

+32953
-106902
lines changed

docs/logs/logs/2025-10-10T09-57-04-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 2605 additions & 1617 deletions
Large diffs are not rendered by default.

docs/logs/logs/2025-10-10T09-57-12-05-00_bluffbench_465475d21aad6cc3ab920b.json renamed to docs/logs/logs/2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 6412 additions & 2221 deletions
Large diffs are not rendered by default.

docs/logs/logs/2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 23917 additions & 0 deletions
Large diffs are not rendered by default.

docs/logs/logs/2025-10-10T09-57-02-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 0 additions & 26547 deletions
This file was deleted.

docs/logs/logs/2025-10-10T09-57-06-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 0 additions & 26324 deletions
This file was deleted.

docs/logs/logs/2025-10-10T09-57-08-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 0 additions & 23362 deletions
This file was deleted.

docs/logs/logs/2025-10-10T09-57-10-05-00_bluffbench_465475d21aad6cc3ab920b.json

Lines changed: 0 additions & 26547 deletions
This file was deleted.

docs/logs/logs/listing.json

Lines changed: 19 additions & 284 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,10 @@
11
{
2-
"2025-10-10T09-57-02-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
3-
"version": 2,
4-
"status": "success",
5-
"eval": {
6-
"run_id": "88Xa9JpML2DQ6voZIUniQ9",
7-
"created": "2025-10-10T09:57:02-05:00",
8-
"task": "bluffbench",
9-
"task_id": "465475d21aad6cc3ab920b",
10-
"task_version": 0,
11-
"task_file": "",
12-
"task_attribs": {},
13-
"task_args": {},
14-
"dataset": {
15-
"samples": 13,
16-
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
17-
"shuffled": false
18-
},
19-
"model": "bluff_solver (claude-sonnet-4-5-20250929)",
20-
"model_args": {},
21-
"config": {},
22-
"revision": {
23-
"type": "git",
24-
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
25-
"commit": "9140d8a2"
26-
},
27-
"packages": {
28-
"inspect_ai": "0.3.63"
29-
},
30-
"scorers": [
31-
{
32-
"name": "bluff_scorer",
33-
"options": {},
34-
"metrics": [
35-
{
36-
"name": "mean",
37-
"options": {}
38-
}
39-
],
40-
"metadata": {}
41-
}
42-
]
43-
},
44-
"plan": {
45-
"name": "plan",
46-
"steps": [
47-
{
48-
"solver": "bluff_solver (claude-sonnet-4-5-20250929)",
49-
"params": {
50-
"1": "self$get_samples()$input",
51-
"solver_chat": "<Chat>"
52-
}
53-
}
54-
],
55-
"config": {}
56-
},
57-
"results": {
58-
"total_samples": 39,
59-
"completed_samples": 39,
60-
"scores": [
61-
{
62-
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
63-
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
64-
"params": {},
65-
"metrics": {
66-
"accuracy": {
67-
"name": "accuracy",
68-
"value": 53.8462,
69-
"params": {
70-
"1": "numeric_scores"
71-
}
72-
}
73-
}
74-
}
75-
]
76-
},
77-
"stats": {
78-
"started_at": "2025-10-09T16:04:29-05:00",
79-
"completed_at": "2025-10-09T16:14:54-05:00",
80-
"model_usage": {
81-
"claude-sonnet-4-5-20250929": {
82-
"input_tokens": 40218,
83-
"cache_creation_input_tokens": 0,
84-
"cache_read_input_tokens": 0,
85-
"output_tokens": 21194,
86-
"total_tokens": 61412
87-
}
88-
}
89-
}
90-
},
91-
"2025-10-10T09-57-04-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
2+
"2025-10-08T14-10-22-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
923
"version": 2,
934
"status": "success",
945
"eval": {
956
"run_id": "q0YGZyqYDC5ZCH5lJMwVYM",
96-
"created": "2025-10-10T09:57:04-05:00",
7+
"created": "2025-10-08T14:10:22-05:00",
978
"task": "bluffbench",
989
"task_id": "465475d21aad6cc3ab920b",
9910
"task_version": 0,
@@ -177,24 +88,24 @@
17788
}
17889
}
17990
},
180-
"2025-10-10T09-57-06-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
91+
"2025-10-08T15-51-58-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
18192
"version": 2,
18293
"status": "success",
18394
"eval": {
184-
"run_id": "gSktvP60wrTGOxMR1bbSX0",
185-
"created": "2025-10-10T09:57:06-05:00",
95+
"run_id": "npeO315OziQzxfjhaCTEjH",
96+
"created": "2025-10-08T15:51:58-05:00",
18697
"task": "bluffbench",
18798
"task_id": "465475d21aad6cc3ab920b",
18899
"task_version": 0,
189100
"task_file": "",
190101
"task_attribs": {},
191102
"task_args": {},
192103
"dataset": {
193-
"samples": 13,
194-
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
104+
"samples": 11,
105+
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
195106
"shuffled": false
196107
},
197-
"model": "bluff_solver (gemini-2.5-pro)",
108+
"model": "bluff_solver (gpt-5)",
198109
"model_args": {},
199110
"config": {},
200111
"revision": {
@@ -223,7 +134,7 @@
223134
"name": "plan",
224135
"steps": [
225136
{
226-
"solver": "bluff_solver (gemini-2.5-pro)",
137+
"solver": "bluff_solver (gpt-5)",
227138
"params": {
228139
"1": "self$get_samples()$input",
229140
"solver_chat": "<Chat>"
@@ -233,8 +144,8 @@
233144
"config": {}
234145
},
235146
"results": {
236-
"total_samples": 39,
237-
"completed_samples": 39,
147+
"total_samples": 33,
148+
"completed_samples": 33,
238149
"scores": [
239150
{
240151
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
@@ -252,25 +163,25 @@
252163
]
253164
},
254165
"stats": {
255-
"started_at": "2025-10-09T16:15:53-05:00",
256-
"completed_at": "2025-10-09T16:24:59-05:00",
166+
"started_at": "2025-10-08T15:33:39-05:00",
167+
"completed_at": "2025-10-08T15:51:58-05:00",
257168
"model_usage": {
258-
"gemini-2.5-pro": {
259-
"input_tokens": 800923,
169+
"gpt-5": {
170+
"input_tokens": -32640,
260171
"cache_creation_input_tokens": 0,
261172
"cache_read_input_tokens": 0,
262-
"output_tokens": 5540,
263-
"total_tokens": 806463
173+
"output_tokens": 57584,
174+
"total_tokens": 24944
264175
}
265176
}
266177
}
267178
},
268-
"2025-10-10T09-57-08-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
179+
"2025-10-08T16-11-29-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
269180
"version": 2,
270181
"status": "success",
271182
"eval": {
272183
"run_id": "ZWx0DB1EsHIfbGEsK9PiYu",
273-
"created": "2025-10-10T09:57:08-05:00",
184+
"created": "2025-10-08T16:11:29-05:00",
274185
"task": "bluffbench",
275186
"task_id": "465475d21aad6cc3ab920b",
276187
"task_version": 0,
@@ -353,181 +264,5 @@
353264
}
354265
}
355266
}
356-
},
357-
"2025-10-10T09-57-10-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
358-
"version": 2,
359-
"status": "success",
360-
"eval": {
361-
"run_id": "EZQrw5N42mAdRONbUMPtd6",
362-
"created": "2025-10-10T09:57:10-05:00",
363-
"task": "bluffbench",
364-
"task_id": "465475d21aad6cc3ab920b",
365-
"task_version": 0,
366-
"task_file": "",
367-
"task_attribs": {},
368-
"task_args": {},
369-
"dataset": {
370-
"samples": 13,
371-
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
372-
"shuffled": false
373-
},
374-
"model": "bluff_solver (gpt-5)",
375-
"model_args": {},
376-
"config": {},
377-
"revision": {
378-
"type": "git",
379-
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
380-
"commit": "9140d8a2"
381-
},
382-
"packages": {
383-
"inspect_ai": "0.3.63"
384-
},
385-
"scorers": [
386-
{
387-
"name": "bluff_scorer",
388-
"options": {},
389-
"metrics": [
390-
{
391-
"name": "mean",
392-
"options": {}
393-
}
394-
],
395-
"metadata": {}
396-
}
397-
]
398-
},
399-
"plan": {
400-
"name": "plan",
401-
"steps": [
402-
{
403-
"solver": "bluff_solver (gpt-5)",
404-
"params": {
405-
"1": "self$get_samples()$input",
406-
"solver_chat": "<Chat>"
407-
}
408-
}
409-
],
410-
"config": {}
411-
},
412-
"results": {
413-
"total_samples": 39,
414-
"completed_samples": 39,
415-
"scores": [
416-
{
417-
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
418-
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
419-
"params": {},
420-
"metrics": {
421-
"accuracy": {
422-
"name": "accuracy",
423-
"params": {
424-
"1": "numeric_scores"
425-
}
426-
}
427-
}
428-
}
429-
]
430-
},
431-
"stats": {
432-
"started_at": "2025-10-09T16:25:00-05:00",
433-
"completed_at": "2025-10-09T16:44:52-05:00",
434-
"model_usage": {
435-
"gpt-5": {
436-
"input_tokens": -40457,
437-
"cache_creation_input_tokens": 0,
438-
"cache_read_input_tokens": 0,
439-
"output_tokens": 77866,
440-
"total_tokens": 37409
441-
}
442-
}
443-
}
444-
},
445-
"2025-10-10T09-57-12-05-00_bluffbench_465475d21aad6cc3ab920b.json": {
446-
"version": 2,
447-
"status": "success",
448-
"eval": {
449-
"run_id": "npeO315OziQzxfjhaCTEjH",
450-
"created": "2025-10-10T09:57:12-05:00",
451-
"task": "bluffbench",
452-
"task_id": "465475d21aad6cc3ab920b",
453-
"task_version": 0,
454-
"task_file": "",
455-
"task_attribs": {},
456-
"task_args": {},
457-
"dataset": {
458-
"samples": 11,
459-
"sample_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
460-
"shuffled": false
461-
},
462-
"model": "bluff_solver (gpt-5)",
463-
"model_args": {},
464-
"config": {},
465-
"revision": {
466-
"type": "git",
467-
"origin": "https://github.com/UKGovernmentBEIS/inspect_ai.git",
468-
"commit": "9140d8a2"
469-
},
470-
"packages": {
471-
"inspect_ai": "0.3.63"
472-
},
473-
"scorers": [
474-
{
475-
"name": "bluff_scorer",
476-
"options": {},
477-
"metrics": [
478-
{
479-
"name": "mean",
480-
"options": {}
481-
}
482-
],
483-
"metadata": {}
484-
}
485-
]
486-
},
487-
"plan": {
488-
"name": "plan",
489-
"steps": [
490-
{
491-
"solver": "bluff_solver (gpt-5)",
492-
"params": {
493-
"1": "self$get_samples()$input",
494-
"solver_chat": "<Chat>"
495-
}
496-
}
497-
],
498-
"config": {}
499-
},
500-
"results": {
501-
"total_samples": 33,
502-
"completed_samples": 33,
503-
"scores": [
504-
{
505-
"name": "bluff_scorer (claude-sonnet-4-5-20250929)",
506-
"scorer": "bluff_scorer (claude-sonnet-4-5-20250929)",
507-
"params": {},
508-
"metrics": {
509-
"accuracy": {
510-
"name": "accuracy",
511-
"params": {
512-
"1": "numeric_scores"
513-
}
514-
}
515-
}
516-
}
517-
]
518-
},
519-
"stats": {
520-
"started_at": "2025-10-08T15:33:39-05:00",
521-
"completed_at": "2025-10-08T15:51:58-05:00",
522-
"model_usage": {
523-
"gpt-5": {
524-
"input_tokens": -32640,
525-
"cache_creation_input_tokens": 0,
526-
"cache_read_input_tokens": 0,
527-
"output_tokens": 57584,
528-
"total_tokens": 24944
529-
}
530-
}
531-
}
532267
}
533268
}

0 commit comments

Comments
 (0)