run_gpt4_eval.py
#!/usr/bin/env python3
"""Run LLM-based evaluation (GPT-4 or Llama 3) over all generated outputs."""
import logging
import traceback

from evaluation.evaluate import GPT4Metric, Llama3Metric

# Logging is set up by the processor; here we add a file handler on top
# so that evaluation runs are also appended to evaluation.log.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler("evaluation.log", mode="a")
fh.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
if __name__ == "__main__":
    datasets = ["ice_hockey", "gsmarena", "openweather", "owid", "wikidata"]
    models = ["zephyr", "mistral", "llama2", "gpt-3.5"]
    # splits = ["dev", "test"]
    splits = ["test"]
    # debug=True evaluates only 3 examples from each domain
    # debug = True
    debug = False
    metric = GPT4Metric()
    # metric = Llama3Metric()
    for split in splits:
        for model in models:
            for dataset in datasets:
                try:
                    metric.run(
                        model_name=model,
                        dataset_name=dataset,
                        split=split,
                        setup_name="direct",
                        base_path="data/quintd-1",
                        debug=debug,
                    )
                except Exception as exc:
                    # Do not name this variable `e`: it would shadow the metric
                    # object, and Python 3 deletes the exception variable after
                    # the block, so the next iteration would hit a NameError.
                    print(traceback.format_exc())
                    logger.error(f"Exception: {exc}")
                    logger.error(f"Dataset: {dataset}, Model: {model}")
                    logger.error(traceback.format_exc())
                    logger.error("")
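
The script assumes an evaluation.evaluate module exposing GPT4Metric and Llama3Metric with a run() method matching the keyword arguments above. For a quick dry run without that package (or without API access), a hypothetical stand-in can mirror the same interface; the class below is an illustrative sketch, not the project's actual implementation:

# Hypothetical stand-in for evaluation.evaluate.GPT4Metric (illustrative only).
# It mirrors the run() signature used by run_gpt4_eval.py; the real class
# presumably loads generated outputs under base_path and scores them with GPT-4.
class GPT4Metric:
    def run(self, model_name, dataset_name, split, setup_name, base_path, debug=False):
        n = 3 if debug else "all"
        print(
            f"Would evaluate {n} example(s): model={model_name}, "
            f"dataset={dataset_name}, split={split}, setup={setup_name}, "
            f"base_path={base_path}"
        )

Dropping this stub into a local evaluation/evaluate.py (alongside an analogous Llama3Metric) lets the loop and the logging setup be exercised end to end before wiring in the real metric.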