Skip to content

Commit

Permalink
Merge pull request #140 from bespokelabsai/dev
Browse files Browse the repository at this point in the history
0.1.9post1
  • Loading branch information
CharlieJCJ authored Nov 19, 2024
2 parents 4b00175 + 1db0dc8 commit c23405b
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 11 deletions.
31 changes: 25 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
<p align="center">
<a href="https://bespokelabs.ai/" target="_blank">
<picture>
<source media="(prefers-color-scheme: light)" width="80" srcset="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="80" src="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red-on-Black.png">
<source media="(prefers-color-scheme: light)" width="80px" srcset="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="80px" src="https://raw.githubusercontent.com/bespokelabsai/curator/main/docs/Bespoke-Labs-Logomark-Red-on-Black.png">
</picture>
</a>
</p>

<h1 align="center">Bespoke Labs Curator</h1>
<h1 align="center">Bespoke Curator</h1>
<h3 align="center" style="font-size: 20px; margin-bottom: 4px">Data Curation for Post-Training & Structured Data Extraction</h3>
<br/>
<p align="center">
Expand All @@ -26,6 +26,22 @@
</a>
</p>

### Overview

Bespoke Curator makes it very easy to create high-quality synthetic data at scale, which you can use to finetune models or for structured data extraction at scale.

Bespoke Curator is an open-source project that includes:
* A rich Python-based library for generating and curating synthetic data.
* A Curator Viewer that makes it easy to inspect datasets, aiding in dataset creation.
* Forthcoming high-quality datasets that should move the needle on post-training.

### Key Features

1. **Programmability and Structured Outputs**: Synthetic data generation is a lot more than just using a single prompt -- it involves calling LLMs multiple times and orchestrating control flow. Curator treats structured outputs as first-class citizens and helps you design complex pipelines.
2. **Built-in Performance Optimization**: We often see LLMs called in loops, or multi-threading implemented inefficiently. We have baked in performance optimizations so that you don't need to worry about those!
3. **Intelligent Caching and Fault Recovery**: Given LLM calls can add up in cost and time, failures are undesirable but sometimes unavoidable. We cache the LLM requests and responses so that it is easy to recover from a failure. Moreover, when working on a multi-stage pipeline, caching of stages makes it easy to iterate.
4. **Native HuggingFace Dataset Integration**: Work directly on HuggingFace Dataset objects throughout your pipeline. Your synthetic data is immediately ready for fine-tuning!
5. **Interactive Curator Viewer**: Improve and iterate on your prompts using our built-in viewer. Inspect LLM requests and responses in real-time, allowing you to iterate and refine your data generation strategy with immediate feedback.

### Installation

Expand Down Expand Up @@ -72,9 +88,9 @@ poet = curator.Prompter(
poem = poet(topics)
print(poem.to_pandas())
# Example output:
# topic poem
# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA...
# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor...
# topic poem
# 0 Urban loneliness in a bustling city In the city's heart, where the sirens wail,\nA...
# 1 Urban loneliness in a bustling city City streets hum with a bittersweet song,\nHor...
# 2 Beauty of Bespoke Labs's Curator library In whispers of design and crafted grace,\nBesp...
# 3 Beauty of Bespoke Labs's Curator library In the hushed breath of parchment and ink,\nBe...
```
Expand Down Expand Up @@ -134,3 +150,6 @@ node -v # should print `v22.11.0`
# verifies the right npm version is in the environment
npm -v # should print `10.9.0`
```

## Contributing
Contributions are welcome!
Binary file modified bespoke-dataset-viewer/app/favicon.ico
Binary file not shown.
2 changes: 1 addition & 1 deletion bespoke-dataset-viewer/app/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import "./globals.css";


export const metadata: Metadata = {
title: "Bella Dataset Viewer",
title: "Curator Viewer",
description: "A powerful dataset viewer and analysis tool",
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,33 @@ export function DetailsSidebar({ item, onClose }: DetailsSidebarProps) {
<p className="text-sm text-muted-foreground">{item.generic_request.model}</p>
</div>
<Separator />
{item.generic_request.messages.some(m => m.role === "system") && (
<>
<div className="space-y-2">
<h3 className="text-lg font-semibold">System Prompt</h3>
<p className="text-sm text-muted-foreground whitespace-pre-wrap">
{item.generic_request.messages.find(m => m.role === "system")?.content}
</p>
<Button
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "system")?.content || "")}
variant="outline"
size="sm"
className="mt-2"
>
<Copy className="h-4 w-4 mr-2" />
Copy
</Button>
</div>
<Separator />
</>
)}
<div className="space-y-2">
<h3 className="text-lg font-semibold">User Message</h3>
<p className="text-sm text-muted-foreground whitespace-pre-wrap">
{item.generic_request.messages.find(m => m.role === "user")?.content}
</p>
<Button
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "user")?.content || "")}
onClick={() => copyToClipboard(item.generic_request.messages.find(m => m.role === "user")?.content || "")}
variant="outline"
size="sm"
className="mt-2"
Expand Down
Binary file added docs/Bespoke-Labs-Logomark-Red-Small.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion examples/poem.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ class Poems(BaseModel):
# 0 Dreams vs. reality In the realm where dreams take flight,\nWhere ...
# 1 Dreams vs. reality Reality stands with open eyes,\nA weighty thro...
# 2 Urban loneliness in a bustling city In the city's heart where shadows blend,\nAmon...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bespokelabs-curator"
version = "0.1.7"
version = "0.1.9post1"
description = "Bespoke Labs Curator"
authors = ["Bespoke Labs <[email protected]>"]
readme = "README.md"
Expand Down
14 changes: 13 additions & 1 deletion src/bespokelabs/curator/prompter/prompter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def __init__(
batch_size: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
):
"""Initialize a Prompter.
Expand All @@ -64,6 +66,10 @@ def __init__(
response format from the LLM.
batch (bool): Whether to use batch processing
batch_size (Optional[int]): The size of the batch to use, only used if batch is True
temperature (Optional[float]): The temperature to use for the LLM, only used if batch is False
top_p (Optional[float]): The top_p to use for the LLM, only used if batch is False
presence_penalty (Optional[float]): The presence_penalty to use for the LLM, only used if batch is False
frequency_penalty (Optional[float]): The frequency_penalty to use for the LLM, only used if batch is False
"""
prompt_sig = inspect.signature(prompt_func)
if len(prompt_sig.parameters) > 1:
Expand Down Expand Up @@ -93,14 +99,20 @@ def __init__(
batch_size=batch_size,
temperature=temperature,
top_p=top_p,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
)
else:
if batch_size is not None:
logger.warning(
f"Prompter argument `batch_size` {batch_size} is ignored because `batch` is False"
)
self._request_processor = OpenAIOnlineRequestProcessor(
model=model_name, temperature=temperature, top_p=top_p
model=model_name,
temperature=temperature,
top_p=top_p,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
)

def __call__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def __init__(
check_interval: int = 10,
api_key: str = os.getenv("OPENAI_API_KEY"),
url: str = "https://api.openai.com/v1/chat/completions",
presence_penalty: float | None = None,
frequency_penalty: float | None = None,
):
if batch_size > MAX_REQUESTS_PER_BATCH:
raise ValueError(
Expand All @@ -48,6 +50,8 @@ def __init__(
self.check_interval: int = check_interval
self.temperature: float | None = temperature
self.top_p: float | None = top_p
self.presence_penalty: float | None = presence_penalty
self.frequency_penalty: float | None = frequency_penalty

def get_rate_limits(self) -> dict:
"""
Expand Down Expand Up @@ -132,6 +136,12 @@ def create_api_specific_request(
if self.top_p is not None:
body["top_p"] = self.top_p

if self.presence_penalty is not None:
body["presence_penalty"] = self.presence_penalty

if self.frequency_penalty is not None:
body["frequency_penalty"] = self.frequency_penalty

request = {
"custom_id": str(generic_request.original_row_idx),
"method": "POST",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@ def __init__(
url: str = "https://api.openai.com/v1/chat/completions",
temperature: Optional[float] = None,
top_p: Optional[float] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
):
super().__init__(batch_size=None)
self.model: str = model
self.url: str = url
self.api_key: str = api_key
self.temperature: float = temperature
self.top_p: float = top_p
self.presence_penalty: float = presence_penalty
self.frequency_penalty: float = frequency_penalty

def get_rate_limits(self) -> dict:
"""
Expand Down Expand Up @@ -117,6 +121,12 @@ def create_api_specific_request(
if self.top_p is not None:
request["top_p"] = self.top_p

if self.presence_penalty is not None:
request["presence_penalty"] = self.presence_penalty

if self.frequency_penalty is not None:
request["frequency_penalty"] = self.frequency_penalty

return request

def run(
Expand Down

0 comments on commit c23405b

Please sign in to comment.