Skip to content

Commit

Permalink
Merge pull request #140 from bespokelabsai/dev
Browse files Browse the repository at this point in the history
0.1.9post1
  • Loading branch information
CharlieJCJ authored Nov 19, 2024
2 parents 4b00175 + 1db0dc8 commit c23405b
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 11 deletions.
31 changes: 25 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
<p align="center">
<a href="https://bespokelabs.ai/" target="_blank">
<picture>
<source media="(prefers-color-scheme: light)" width="80" srcset="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="80" src="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red-on-Black.png">
<source media="(prefers-color-scheme: light)" width="80px" srcset="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="80px" src="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red-on-Black.png">
</picture>
</a>
</p>

<h1 align="center">Bespoke Labs Curator</h1>
<h1 align="center">Bespoke Curator</h1>
<h3 align="center" style="font-size: 20px; margin-bottom: 4px">Data Curation for Post-Training & Structured Data Extraction</h3>
<br/>
<p align="center">
Expand All @@ -26,6 +26,22 @@
</a>
</p>

### Overview

Bespoke Curator makes it very easy to create high-quality synthetic data at scale, which you can use to finetune models or for structured data extraction at scale.

Bespoke Curator is an open-source project that includes:
* A rich Python-based library for generating and curating synthetic data.
* A Curator Viewer that makes it easy to inspect datasets, aiding in dataset creation.
* Forthcoming high-quality datasets that should move the needle on post-training.

### Key Features

1. **Programmability and Structured Outputs**: Synthetic data generation is a lot more than just using a single prompt -- it involves calling LLMs multiple times and orchestrating control flow. Curator treats structured outputs as first-class citizens and helps you design complex pipelines.
2. **Built-in Performance Optimization**: We often see LLMs called in loops, or multi-threading implemented inefficiently. We have baked in performance optimizations so that you don't need to worry about those!
3. **Intelligent Caching and Fault Recovery**: Given LLM calls can add up in cost and time, failures are undesirable but sometimes unavoidable. We cache the LLM requests and responses so that it is easy to recover from a failure. Moreover, when working on a multi-stage pipeline, caching of stages makes it easy to iterate.
4. **Native HuggingFace Dataset Integration**: Work directly on HuggingFace Dataset objects throughout your pipeline. Your synthetic data is immediately ready for fine-tuning!
5. **Interactive Curator Viewer**: Improve and iterate on your prompts using our built-in viewer. Inspect LLM requests and responses in real-time, allowing you to iterate and refine your data generation strategy with immediate feedback.

### Installation

Expand Down Expand Up @@ -72,9 +88,9 @@ poet = curator.Prompter(
poem = poet(topics)
print(poem.to_pandas())
# Example output:
# topic poem
# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA...
# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor...
# topic poem
# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA...
# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor...
# 2 Beauty of Bespoke Labs's Curator library In whispers of design and crafted grace,\nBesp...
# 3 Beauty of Bespoke Labs's Curator library In the hushed breath of parchment and ink,\nBe...
```
Expand Down Expand Up @@ -134,3 +150,6 @@ node -v # should print `v22.11.0`
# verifies the right npm version is in the environment
npm -v # should print `10.9.0`
```

## Contributing
Contributions are welcome!
Binary file modified bespoke-dataset-viewer/app/favicon.ico
Binary file not shown.
2 changes: 1 addition & 1 deletion bespoke-dataset-viewer/app/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import "./globals.css";


export const metadata: Metadata = {
title: "Bella Dataset Viewer",
title: "Curator Viewer",
description: "A powerful dataset viewer and analysis tool",
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,33 @@ export function DetailsSidebar({ item, onClose }: DetailsSidebarProps) {
<p className="text-sm text-muted-foreground">{item.generic_request.model}</p>
</div>
<Separator />
{item.generic_request.messages.some(m => m.role === "system") && (
<>
<div className="space-y-2">
<h3 className="text-lg font-semibold">System Prompt</h3>
<p className="text-sm text-muted-foreground whitespace-pre-wrap">
{item.generic_request.messages.find(m => m.role === "system")?.content}
</p>
<Button
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "system")?.content || "")}
variant="outline"
size="sm"
className="mt-2"
>
<Copy className="h-4 w-4 mr-2" />
Copy
</Button>
</div>
<Separator />
</>
)}
<div className="space-y-2">
<h3 className="text-lg font-semibold">User Message</h3>
<p className="text-sm text-muted-foreground whitespace-pre-wrap">
{item.generic_request.messages.find(m => m.role === "user")?.content}
</p>
<Button
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "user")?.content || "")}
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "user")?.content || "")}
variant="outline"
size="sm"
className="mt-2"
Expand Down
Binary file added docs/Bespoke-Labs-Logomark-Red-Small.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion examples/poem.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ class Poems(BaseModel):
# 0 Dreams vs. reality In the realm where dreams take flight,\nWhere ...
# 1 Dreams vs. reality Reality stands with open eyes,\nA weighty thro...
# 2 Urban loneliness in a bustling city In the city's heart where shadows blend,\nAmon...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bespokelabs-curator"
version = "0.1.7"
version = "0.1.9post1"
description = "Bespoke Labs Curator"
authors = ["Bespoke Labs <[email protected]>"]
readme = "README.md"
Expand Down
14 changes: 13 additions & 1 deletion src/bespokelabs/curator/prompter/prompter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def __init__(
batch_size: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
):
"""Initialize a Prompter.
Expand All @@ -64,6 +66,10 @@ def __init__(
response format from the LLM.
batch (bool): Whether to use batch processing
batch_size (Optional[int]): The size of the batch to use, only used if batch is True
temperature (Optional[float]): The temperature to use for the LLM, only used if batch is False
top_p (Optional[float]): The top_p to use for the LLM, only used if batch is False
presence_penalty (Optional[float]): The presence_penalty to use for the LLM, only used if batch is False
frequency_penalty (Optional[float]): The frequency_penalty to use for the LLM, only used if batch is False
"""
prompt_sig = inspect.signature(prompt_func)
if len(prompt_sig.parameters) > 1:
Expand Down Expand Up @@ -93,14 +99,20 @@ def __init__(
batch_size=batch_size,
temperature=temperature,
top_p=top_p,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
)
else:
if batch_size is not None:
logger.warning(
f"Prompter argument `batch_size` {batch_size} is ignored because `batch` is False"
)
self._request_processor = OpenAIOnlineRequestProcessor(
model=model_name, temperature=temperature, top_p=top_p
model=model_name,
temperature=temperature,
top_p=top_p,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
)

def __call__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def __init__(
check_interval: int = 10,
api_key: str = os.getenv("OPENAI_API_KEY"),
url: str = "https://api.openai.com/v1/chat/completions",
presence_penalty: float | None = None,
frequency_penalty: float | None = None,
):
if batch_size > MAX_REQUESTS_PER_BATCH:
raise ValueError(
Expand All @@ -48,6 +50,8 @@ def __init__(
self.check_interval: int = check_interval
self.temperature: float | None = temperature
self.top_p: float | None = top_p
self.presence_penalty: float | None = presence_penalty
self.frequency_penalty: float | None = frequency_penalty

def get_rate_limits(self) -> dict:
"""
Expand Down Expand Up @@ -132,6 +136,12 @@ def create_api_specific_request(
if self.top_p is not None:
body["top_p"] = self.top_p

if self.presence_penalty is not None:
body["presence_penalty"] = self.presence_penalty

if self.frequency_penalty is not None:
body["frequency_penalty"] = self.frequency_penalty

request = {
"custom_id": str(generic_request.original_row_idx),
"method": "POST",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@ def __init__(
url: str = "https://api.openai.com/v1/chat/completions",
temperature: Optional[float] = None,
top_p: Optional[float] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
):
super().__init__(batch_size=None)
self.model: str = model
self.url: str = url
self.api_key: str = api_key
self.temperature: float = temperature
self.top_p: float = top_p
self.presence_penalty: float = presence_penalty
self.frequency_penalty: float = frequency_penalty

def get_rate_limits(self) -> dict:
"""
Expand Down Expand Up @@ -117,6 +121,12 @@ def create_api_specific_request(
if self.top_p is not None:
request["top_p"] = self.top_p

if self.presence_penalty is not None:
request["presence_penalty"] = self.presence_penalty

if self.frequency_penalty is not None:
request["frequency_penalty"] = self.frequency_penalty

return request

def run(
Expand Down

0 comments on commit c23405b

Please sign in to comment.