diff --git a/README.md b/README.md index 92c080df..e129be08 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@

- - Bespoke Labs Logo + + Bespoke Labs Logo

-

Bespoke Labs Curator

+

Bespoke Curator

Data Curation for Post-Training & Structured Data Extraction


@@ -26,6 +26,22 @@

+### Overview + +Bespoke Curator makes it very easy to create high-quality synthetic data at scale, which you can use to fine-tune models or for structured data extraction. + +Bespoke Curator is an open-source project that includes: +* A rich Python-based library for generating and curating synthetic data. +* A Curator Viewer that makes it easy to view the datasets, thus aiding in dataset creation. +* Upcoming releases of high-quality datasets that should move the needle on post-training. + +### Key Features + +1. **Programmability and Structured Outputs**: Synthetic data generation is a lot more than just using a single prompt -- it involves calling LLMs multiple times and orchestrating control flow. Curator treats structured outputs as first-class citizens and helps you design complex pipelines. +2. **Built-in Performance Optimization**: We often see LLMs being called in loops, or inefficient implementations of multi-threading. We have baked in performance optimizations so that you don't need to worry about those! +3. **Intelligent Caching and Fault Recovery**: Given that LLM calls can add up in cost and time, failures are undesirable but sometimes unavoidable. We cache the LLM requests and responses so that it is easy to recover from a failure. Moreover, when working on a multi-stage pipeline, caching of stages makes it easy to iterate. +4. **Native HuggingFace Dataset Integration**: Work directly on HuggingFace Dataset objects throughout your pipeline. Your synthetic data is immediately ready for fine-tuning! +5. **Interactive Curator Viewer**: Improve and iterate on your prompts using our built-in viewer. Inspect LLM requests and responses in real time, allowing you to iterate and refine your data generation strategy with immediate feedback. 
### Installation @@ -72,9 +88,9 @@ poet = curator.Prompter( poem = poet(topics) print(poem.to_pandas()) # Example output: -# topic poem -# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA... -# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor... +# topic poem +# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA... +# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor... # 2 Beauty of Bespoke Labs's Curator library In whispers of design and crafted grace,\nBesp... # 3 Beauty of Bespoke Labs's Curator library In the hushed breath of parchment and ink,\nBe... ``` @@ -134,3 +150,6 @@ node -v # should print `v22.11.0` # verifies the right npm version is in the environment npm -v # should print `10.9.0` ``` + +## Contributing +Contributions are welcome! \ No newline at end of file diff --git a/bespoke-dataset-viewer/app/favicon.ico b/bespoke-dataset-viewer/app/favicon.ico index 718d6fea..861c19a4 100644 Binary files a/bespoke-dataset-viewer/app/favicon.ico and b/bespoke-dataset-viewer/app/favicon.ico differ diff --git a/bespoke-dataset-viewer/app/layout.tsx b/bespoke-dataset-viewer/app/layout.tsx index 13090471..03590f7a 100644 --- a/bespoke-dataset-viewer/app/layout.tsx +++ b/bespoke-dataset-viewer/app/layout.tsx @@ -3,7 +3,7 @@ import "./globals.css"; export const metadata: Metadata = { - title: "Bella Dataset Viewer", + title: "Curator Viewer", description: "A powerful dataset viewer and analysis tool", }; diff --git a/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx b/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx index 8a3865f7..02d5fcd5 100644 --- a/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx +++ b/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx @@ -45,13 +45,33 @@ export function DetailsSidebar({ item, onClose }: DetailsSidebarProps) {

{item.generic_request.model}

+ {item.generic_request.messages.some(m => m.role === "system") && ( + <> +
+

System Prompt

+

+ {item.generic_request.messages.find(m => m.role === "system")?.content} +

+ +
+ + + )}

User Message

{item.generic_request.messages.find(m => m.role === "user")?.content}