-
Notifications
You must be signed in to change notification settings - Fork 1
/
gen_api_answer.py
179 lines (157 loc) · 6.1 KB
/
gen_api_answer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""Generate answers with GPT-4
Usage:
python3 gen_api_answer.py --model gpt-3.5-turbo
"""
import argparse
import concurrent.futures
import json
import os
import time
import shortuuid
import tqdm
from fastchat.llm_judge.common import ANTHROPIC_MODEL_LIST
from fastchat.llm_judge.common import chat_completion_anthropic
from fastchat.llm_judge.common import chat_completion_palm
from fastchat.llm_judge.common import load_questions
from fastchat.llm_judge.common import temperature_config
from fastchat.llm_judge.gen_model_answer import reorg_answer_file
from fastchat.model.model_adapter import get_conversation_template
import utils.common
from utils.api import ANTHROPIC_MODEL_LIST_NEW
from utils.api import anthropic_chat_completion_new
from utils.api import chat_completion_openai
from utils.dedup import drop_questions_already_processed_by_question_id
def get_answer(
    question: dict,
    model: str,
    num_choices: int,
    max_tokens: int,
    answer_file: str,
    disable_strict_injection_check=False,
):
    """Generate ``num_choices`` answers for one benchmark question and
    append the result as one JSON line to ``answer_file``.

    Args:
        question: Question record; must contain "question_id", "turns" and
            "category" (optionally "required_temperature").
        model: Model identifier; routes to the matching API backend.
        num_choices: Number of independent completions to generate.
        max_tokens: Maximum number of newly generated tokens per completion.
        answer_file: Path of the JSONL file answers are appended to.
        disable_strict_injection_check: If True, a trailing conversation-role
            marker emitted by the model is stripped from the output; if False
            (default, strict mode), such an output is discarded entirely.

    Raises:
        ValueError: If both ``--force-temperature`` and a per-question
            "required_temperature" are given (the precedence is ambiguous).

    NOTE: reads the module-level ``args`` namespace (``force_temperature``),
    so it must only be called after argument parsing in ``__main__``.
    """
    # Refuse ambiguous temperature configuration instead of silently
    # preferring one source.  (Was an `assert`, which disappears under -O.)
    if args.force_temperature is not None and "required_temperature" in question:
        raise ValueError(
            "Cannot use --force-temperature together with a question that "
            "specifies 'required_temperature'."
        )
    # Temperature precedence: CLI override > per-question > per-category > 0.7.
    if args.force_temperature is not None:
        temperature = args.force_temperature
    elif "required_temperature" in question:
        temperature = question["required_temperature"]
    elif question["category"] in temperature_config:
        temperature = temperature_config[question["category"]]
    else:
        temperature = 0.7

    choices = []
    chat_state = None  # for palm-2 model
    for i in range(num_choices):
        # Newer Anthropic models reuse the claude-2 conversation template.
        if model in ANTHROPIC_MODEL_LIST_NEW:
            conv = get_conversation_template("claude-2")
        else:
            conv = get_conversation_template(model)

        turns = []
        for j in range(len(question["turns"])):
            conv.append_message(conv.roles[0], question["turns"][j])
            conv.append_message(conv.roles[1], None)

            # Route to the backend matching the model family.
            if model in ANTHROPIC_MODEL_LIST_NEW:
                output = anthropic_chat_completion_new(
                    model, conv, temperature, max_tokens
                )
            elif model in ANTHROPIC_MODEL_LIST:
                output = chat_completion_anthropic(model, conv, temperature, max_tokens)
            elif model == "palm-2-chat-bison-001":
                chat_state, output = chat_completion_palm(
                    chat_state, model, conv, temperature, max_tokens
                )
            else:
                output = chat_completion_openai(model, conv, temperature, max_tokens)

            if output:
                # Injection check: the model may continue the conversation by
                # emitting the next role marker (e.g. "### Assistant:") itself.
                stop_list = [conv.sep + conv.roles[0], conv.sep + conv.roles[1]]
                for stop_str_original in stop_list:
                    # Check both the exact marker and its whitespace-stripped form.
                    for stop_str in [stop_str_original, stop_str_original.strip()]:
                        if output[-len(stop_str) :] == stop_str:
                            if disable_strict_injection_check:
                                # Lenient mode: strip the marker, keep the rest.
                                output = output[: -len(stop_str)]
                            else:
                                # Strict mode: drop the whole answer.
                                output = ""
            conv.update_last_message(output)
            turns.append(output)
        choices.append({"index": i, "turns": turns})

    # Dump the answer.  Append mode lets parallel workers share one file;
    # each record is written as a single line.
    ans = {
        "question_id": question["question_id"],
        "answer_id": shortuuid.uuid(),
        "model_id": model,
        "choices": choices,
        "tstamp": time.time(),
    }
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:  # dirname is "" for a bare filename; makedirs("") raises
        os.makedirs(answer_dir, exist_ok=True)
    with open(answer_file, "a") as fout:
        fout.write(json.dumps(ans) + "\n")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bench-name",
        type=str,
        default="pfmt_bench_fin_ja",
        help="The name of the benchmark question set.",
    )
    parser.add_argument("--answer-file", type=str, help="The output answer file.")
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument(
        "--num-choices",
        type=int,
        default=1,
        help="How many completion choices to generate.",
    )
    parser.add_argument(
        "--force-temperature", type=float, help="Forcibly set a sampling temperature."
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=4096,
        help="The maximum number of new generated tokens.",
    )
    parser.add_argument(
        "--question-begin",
        type=int,
        help="A debug option. The begin index of questions.",
    )
    parser.add_argument(
        "--question-end", type=int, help="A debug option. The end index of questions."
    )
    parser.add_argument(
        "--parallel", type=int, default=1, help="The number of concurrent API calls."
    )
    parser.add_argument(
        "--disable-strict-injection-check",
        action="store_true",
        help=(
            "Disable the strict injection check. If it is not set, the model "
            "output is discarded when it generates the next conversation turn "
            "including a conv template marker such as '### assistant:'."
        ),
    )
    # NOTE(review): parsed but not referenced in this file — presumably read
    # via the args namespace by utils.api to override the OpenAI endpoint;
    # confirm against that module.
    parser.add_argument("--openai-api-base", type=str, default=None)
    # `args` must stay module-global: get_answer() reads args.force_temperature.
    args = parser.parse_args()

    question_file = f"data/{args.bench_name}/question.jsonl"
    questions = load_questions(question_file, args.question_begin, args.question_end)

    if args.answer_file:
        answer_file = args.answer_file
    else:
        answer_file = (
            f"data/{args.bench_name}/model_answer/{args.model.replace('/','_')}.jsonl"
        )
    print(f"Output to {answer_file}")

    # Resume support: skip questions whose answers are already on disk.
    questions = drop_questions_already_processed_by_question_id(questions, answer_file)

    # Fan out one API job per question; answers are appended to answer_file
    # by each worker inside get_answer().
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = []
        for question in questions:
            future = executor.submit(
                get_answer,
                question,
                args.model,
                args.num_choices,
                args.max_tokens,
                answer_file,
                disable_strict_injection_check=args.disable_strict_injection_check,
            )
            futures.append(future)
        # result() re-raises worker exceptions; tqdm shows progress as
        # futures complete in arbitrary order.
        for future in tqdm.tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            future.result()

    # Reorganize the append-only JSONL once all workers have finished.
    reorg_answer_file(answer_file)