run_single_instance.py (forked from aorwall/SWE-bench-docker)
#!/usr/bin/env python3
"""Run evaluation"""
import argparse
import asyncio
import logging
import os
import sys
import tempfile

from swebench.metrics.getters import get_logs_eval, get_id_from_lp, get_eval_refs
from swebench.metrics.report import get_eval_report
from swebench_docker.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    MAP_REPO_TO_TEST_FRAMEWORK,
)
from swebench_docker.run_docker import run_docker_evaluation
from swebench_docker.utils import get_instances, get_test_directives

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("run_evaluation")


async def main(
    instance_id: str,
    swe_bench_tasks: str,
    namespace: str,
    predictions_path: str,
):
    """
    Runs evaluation on a single instance's prediction.

    Args:
        instance_id (str): ID of the SWE-bench instance to evaluate.
        swe_bench_tasks (str): Path to the SWE-bench tasks file OR HF dataset name.
        namespace (str): Docker repository namespace.
        predictions_path (str): Path to the predictions file. If not specified, the golden patch will be run.
    """
    tasks = get_eval_refs(swe_bench_tasks)
    task = tasks[instance_id]

    test_type = MAP_REPO_TO_TEST_FRAMEWORK[task["repo"]]
    # Show more detailed test output for troubleshooting
    if "--tb=no" in test_type:
        test_type = test_type.replace("--tb=no", "")

    test_directives = get_test_directives(task)
    test_cmd = f"{test_type} {' '.join(test_directives)}"
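    # For a pytest-based repo, test_cmd typically ends up looking something like
    # "pytest -rA tests/test_example.py" (illustrative only; the exact command prefix
    # comes from MAP_REPO_TO_TEST_FRAMEWORK and the directives from the test patch).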
    instance = {
        KEY_INSTANCE_ID: instance_id,
        "repo": task["repo"],
        "version": task["version"],
        "base_commit": task["base_commit"],
        "test_patch": task["test_patch"],
        "test_directives": test_directives,
        "test_cmd": test_cmd,
    }

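    # When --predictions_path is given, the file is expected to contain prediction
    # entries keyed by KEY_INSTANCE_ID / KEY_MODEL / KEY_PREDICTION (in SWE-bench
    # these typically map to "instance_id", "model_name_or_path" and "model_patch";
    # check swebench_docker.constants for the exact key strings).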
    if predictions_path:
        predictions_path = os.path.abspath(predictions_path)
        predictions = get_instances(predictions_path)
        prediction = [p for p in predictions if p[KEY_INSTANCE_ID] == instance_id][0]
        instance[KEY_PREDICTION] = prediction[KEY_PREDICTION]
        instance[KEY_MODEL] = prediction[KEY_MODEL]
    else:
        instance[KEY_PREDICTION] = task["patch"]
        instance[KEY_MODEL] = "golden"

    with tempfile.TemporaryDirectory() as temp_dir:
        await run_docker_evaluation(instance, namespace, temp_dir, verbose=True)

        logger.info(f"Instance {instance_id} evaluation logs:")
        eval_log = os.path.join(temp_dir, f"{instance_id}.{instance[KEY_MODEL]}.eval.log")
        with open(eval_log, "r") as f:
            logger.info(f.read())

        eval_sm, has_report = get_logs_eval(eval_log)

        eval_refs = get_eval_refs(swe_bench_tasks)
        instance_id = get_id_from_lp(eval_log)
        if instance_id not in eval_refs:
            print(f"Gold results not found for {instance_id}")
            sys.exit(1)
        gold_results = eval_refs[instance_id]

        report = get_eval_report(eval_sm, gold_results)
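        # get_eval_report is expected to return a dict roughly of the form
        #   {"FAIL_TO_PASS": {"success": [...], "failure": [...]},
        #    "PASS_TO_PASS": {"success": [...], "failure": [...]}}
        # (illustrative shape, inferred from how the report is consumed below).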
        if report["FAIL_TO_PASS"]["failure"] or report["PASS_TO_PASS"]["failure"]:
            logger.info("Found failing tests")
            logger.info("Prediction:")
            logger.info(instance[KEY_PREDICTION])

            if report["PASS_TO_PASS"]["failure"]:
                logger.info("Pass to pass:")
                for pass_ in report["PASS_TO_PASS"]["failure"]:
                    logger.info(f" - {pass_}")

            if report["FAIL_TO_PASS"]["failure"]:
                logger.info("Fail to pass:")
                for fail in report["FAIL_TO_PASS"]["failure"]:
                    logger.info(f" - {fail}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--instance_id", type=str, help="Instance ID", required=True)
    parser.add_argument("--swe_bench_tasks", type=str, help="Path to dataset file or HF datasets name", required=False, default="princeton-nlp/SWE-bench_Lite")
    parser.add_argument("--namespace", type=str, help="Docker repository namespace", required=False, default="aorwall")
    parser.add_argument("--predictions_path", type=str, help="Path to predictions file (must be .json)", required=False)
    args = parser.parse_args()

    asyncio.run(main(**vars(args)))
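
# Example invocation (the instance ID and file name below are illustrative; use any
# ID present in the chosen dataset, and omit --predictions_path to run the golden patch):
#
#   python run_single_instance.py \
#       --instance_id sympy__sympy-20590 \
#       --swe_bench_tasks princeton-nlp/SWE-bench_Lite \
#       --namespace aorwall \
#       --predictions_path predictions.json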