wildchat.py
"""Example of reannotating the WildChat dataset using curator."""
import logging
from datasets import load_dataset
from bespokelabs import curator
dataset = load_dataset("allenai/WildChat", split="train")
dataset = dataset.select(range(3_000))
# To see more detail about how batches are being processed
logger = logging.getLogger("bespokelabs.curator")
logger.setLevel(logging.INFO)
class WildChatReannotator(curator.LLM):
    """A reannotator for the WildChat dataset."""

    def prompt(self, input: dict) -> str:
        """Extract the first message from a conversation to use as the prompt."""
        return input["conversation"][0]["content"]

    def parse(self, input: dict, response: str) -> dict:
        """Parse the model response along with the input into the desired output format."""
        instruction = input["conversation"][0]["content"]
        return {"instruction": instruction, "new_response": response}
distiller = WildChatReannotator(model_name="gpt-4o-mini", batch=True, backend_params={"batch_size": 1_000})

distilled_dataset = distiller(dataset)
print(distilled_dataset)
print(distilled_dataset[0])
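
# Optional follow-up (a sketch, assuming the returned object behaves like a
# Hugging Face `datasets.Dataset`): persist the reannotated data for later use.
# distilled_dataset.save_to_disk("wildchat_reannotated")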