forked from bespokelabsai/curator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: ungrounded_qa.py
104 lines (67 loc) · 3.38 KB
/
ungrounded_qa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Generate diverse set of questions and answers by generating diverse subjects and subsubjects.
This is similar to how data is generated for the Camel dataset.
See section F (appendix) of https://arxiv.org/pdf/2303.17760.
"""
from typing import List
from pydantic import BaseModel, Field
from bespokelabs import curator
class Subject(BaseModel):
    """A single subject (e.g. a high-level field of study)."""

    # Free-form subject name produced by the model.
    subject: str = Field(description="A subject")
class Subjects(BaseModel):
    """Container for a list of ``Subject`` entries."""

    # The structured-output wrapper the LLM fills in.
    subjects: List[Subject] = Field(description="A list of subjects")
class QA(BaseModel):
    """One question/answer pair."""

    # Question text generated for a subsubject.
    question: str = Field(description="A question")
    # Model-generated answer to the question.
    answer: str = Field(description="An answer")
class QAs(BaseModel):
    """Container for a list of ``QA`` pairs."""

    # The structured-output wrapper the LLM fills in.
    qas: List[QA] = Field(description="A list of QAs")
class SubjectGenerator(curator.LLM):
    """A subject generator that generates diverse subjects.

    The seed stage of the pipeline: it takes no meaningful input and
    produces the initial subject rows that ``SubsubjectGenerator`` fans
    out from.
    """

    # Tell curator to request structured output matching ``Subjects``.
    response_format = Subjects

    def prompt(self, input: dict) -> str:
        """Generate a prompt for the subject generator.

        ``input`` is ignored — the prompt is constant for the seed stage.
        """
        return "Generate a diverse list of 3 subjects. Keep it high-level (e.g. Math, Science)."

    # Fixed: annotation was `-> dict` but the method returns a list.
    def parse(self, input: dict, response: Subjects) -> List[Subject]:
        """Parse the model response into the desired output format.

        Returns the list of ``Subject`` models; curator expands each into
        a dataset row with a ``subject`` column consumed downstream.
        """
        return response.subjects
class SubsubjectGenerator(curator.LLM):
    """A subsubject generator that generates diverse subsubjects for a given subject.

    Middle stage of the pipeline: consumes rows with a ``subject`` column
    and fans each out into several ``(subject, subsubject)`` rows.
    """

    # Reuses the ``Subjects`` schema; each returned "subject" is treated
    # as a subsubject of the input row's subject.
    response_format = Subjects

    def prompt(self, input: dict) -> str:
        """Generate a prompt for the subsubject generator."""
        return f"For the given subject {input['subject']}. Generate 3 diverse subsubjects. No explanation."

    # Fixed: annotation was `-> dict` but the method returns a list of dicts.
    def parse(self, input: dict, response: Subjects) -> List[dict]:
        """Parse the model response into the desired output format.

        Carries the parent ``subject`` through so each output row pairs it
        with one generated ``subsubject``.
        """
        return [{"subject": input["subject"], "subsubject": subsubject.subject} for subsubject in response.subjects]
class QAGenerator(curator.LLM):
    """A QA generator that generates diverse questions and answers for a given subsubject.

    Final stage of the pipeline: consumes ``(subject, subsubject)`` rows
    and fans each out into several question/answer rows.
    """

    # Tell curator to request structured output matching ``QAs``.
    response_format = QAs

    def prompt(self, input: dict) -> str:
        """Generate a prompt for the QA generator."""
        return f"For the given subsubject {input['subsubject']}. Generate 3 diverse questions and answers. No explanation."

    # Fixed: annotation was `-> dict` but the method returns a list of dicts.
    def parse(self, input: dict, response: QAs) -> List[dict]:
        """Parse the model response into the desired output format.

        Carries both lineage columns (``subject``, ``subsubject``) through
        so every QA row is traceable to its originating topic.
        """
        return [
            {
                "subject": input["subject"],
                "subsubject": input["subsubject"],
                "question": qa.question,
                "answer": qa.answer,
            }
            for qa in response.qas
        ]
def main():
    """Run the three-stage pipeline and print the resulting QA dataset.

    Stages: seed subjects -> subsubjects per subject -> QA pairs per
    subsubject, all using the same model.
    """
    model = "gpt-4o-mini"

    # Each generator's output dataset feeds the next stage.
    subjects = SubjectGenerator(model_name=model)()
    subsubjects = SubsubjectGenerator(model_name=model)(subjects)
    qa_dataset = QAGenerator(model_name=model)(subsubjects)

    # Normalize answers by trimming surrounding whitespace.
    qa_dataset = qa_dataset.map(lambda row: {"answer": row["answer"].strip()}, num_proc=2)
    print(qa_dataset.to_pandas())


if __name__ == "__main__":
    main()