Skip to content

Commit

Permalink
Merge pull request #81 from bespokelabsai/dev
Browse files Browse the repository at this point in the history
0.1.7
  • Loading branch information
vutrung96 authored Nov 13, 2024
2 parents bbbee20 + 7991ae5 commit a6536c9
Show file tree
Hide file tree
Showing 34 changed files with 855 additions and 535 deletions.
35 changes: 29 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
# Bespoke Curator
<p align="center">
<a href="https://bespokelabs.ai/" target="_blank">
<picture>
<source media="(prefers-color-scheme: light)" width="10%" srcset="./docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="10%" src="./docs/Bespoke-Labs-Logomark-Red-on-Black.png">
</picture>
</a>
</p>

<h1 align="center">Bespoke Labs Curator</h1>
<h3 align="center" style="font-size: 20px; margin-bottom: 4px">Data Curation for Post-Training & Structured Data Extraction</h3>
<br/>
<p align="center">
<a href="https://docs.bespokelabs.ai/">
<img alt="Static Badge" src="https://img.shields.io/badge/Docs-docs.bespokelabs.ai-blue?style=flat&link=https%3A%2F%2Fdocs.bespokelabs.ai">
</a>
<a href="https://bespokelabs.ai/">
<img alt="Site" src="https://img.shields.io/badge/Site-bespokelabs.ai-blue?link=https%3A%2F%2Fbespokelabs.ai"/>
</a>
<img alt="PyPI - Version" src="https://img.shields.io/pypi/v/bespokelabs-curator">
<a href="https://twitter.com/bespokelabsai">
<img src="https://img.shields.io/twitter/follow/bespokelabsai" alt="Follow on X" />
</a>
<a href="https://discord.gg/KqpXvpzVBS">
<img alt="Discord" src="https://img.shields.io/discord/1230990265867698186">
</a>
</p>

Bespoke Labs Synthetic Data Curation Library

### Installation

Expand All @@ -17,14 +42,12 @@ import os
os.environ['OPENAI_API_KEY'] = 'sk-...' # Set your OpenAI API key here

poet = curator.Prompter(
prompt_func=lambda: {
"user_prompt": "Write a poem about the beauty of computer science"
},
prompt_func=lambda: "Write a poem about the beauty of computer science",
model_name="gpt-4o-mini",
)

poem = poet()
print(poem.to_list()[0])
print(poem["response"][0])
```

You can see more examples in the [examples](examples) directory.
Expand Down
71 changes: 57 additions & 14 deletions bespoke-dataset-viewer/app/api/responses/[runHash]/route.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,75 @@
import { DataItem } from '@/types/dataset'
import { existsSync, promises as fs } from 'fs'
import { NextRequest, NextResponse } from 'next/server'
import { promises as fs } from 'fs'
import { homedir } from 'os'
import { join } from 'path'

export const dynamic = 'force-dynamic'
export const runtime = 'nodejs'

/**
 * Read a JSONL file (one JSON document per line) and parse every
 * non-blank line into a DataItem.
 *
 * @param filePath - absolute path of the .jsonl file to read
 * @returns the parsed items, in file order
 */
async function readJsonlFile(filePath: string): Promise<DataItem[]> {
  const raw = await fs.readFile(filePath, 'utf-8')
  const items: DataItem[] = []
  for (const line of raw.split('\n')) {
    // Skip blank/whitespace-only lines (trailing newline produces one).
    if (line.trim() === '') continue
    items.push(JSON.parse(line))
  }
  return items
}

export async function GET(
request: NextRequest,
{ params }: { params: Promise<{ runHash: string }> }
): Promise<Response> { // This is the key change
{ params }: { params: Promise<{ runHash: string }> }
): Promise<Response> {
try {
const { runHash } = await params
const responsesPath = join(homedir(), '.cache', 'curator', runHash, 'responses.jsonl')
const runDir = join(homedir(), '.cache', 'curator', runHash)

const searchParams = request.nextUrl.searchParams
const lastLineNumber = parseInt(searchParams.get('lastLine') || '0')
const processedFiles = (searchParams.get('processedFiles') || '').split(',').filter(Boolean)
const isBatchMode = searchParams.get('batchMode') === 'true'

if (!existsSync(runDir)) {
return NextResponse.json(
{ error: "Run directory not found" },
{ status: 404 }
)
}

if (isBatchMode) {
// Batch streaming mode: Read all response files that haven't been processed
const files = await fs.readdir(runDir)
const responseFiles = files
.filter(f => f.startsWith('responses_') && f.endsWith('.jsonl'))
.filter(f => !processedFiles.includes(f))

const allData: DataItem[] = []
for (const file of responseFiles) {
const filePath = join(runDir, file)
const fileData = await readJsonlFile(filePath)
allData.push(...fileData)
}

return NextResponse.json({
data: allData,
processedFiles: responseFiles,
isBatchMode: true,
totalLines: null
})
} else {
// Online streaming mode
const responsesPath = join(runDir, 'responses_0.jsonl')
const content = await fs.readFile(responsesPath, 'utf-8')
const lines = content.split('\n').filter(line => line.trim() !== '')

const content = await fs.readFile(responsesPath, 'utf-8')
const lines = content.split('\n').filter(line => line.trim() !== '')

const newLines = lines.slice(lastLineNumber)
const jsonData = newLines.map(line => JSON.parse(line))
const newLines = lines.slice(lastLineNumber)
const jsonData = newLines.map(line => JSON.parse(line))

return NextResponse.json({
data: jsonData,
totalLines: lines.length
})
return NextResponse.json({
data: jsonData,
totalLines: lines.length,
isBatchMode: false,
processedFiles: null
})
}
} catch (error) {
console.error("Error reading responses file:", error)
return NextResponse.json(
Expand Down
10 changes: 7 additions & 3 deletions bespoke-dataset-viewer/app/dataset/[runHash]/page.tsx
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import { DatasetViewer } from "@/components/dataset-viewer/DatasetViewer"

export default async function DatasetPage({
params
params,
searchParams
}: {
params: Promise<{ runHash: string }>
params: Promise<{ runHash: string }>,
searchParams: Promise<{ [key: string]: string | string[] | undefined }>
}) {
const { runHash } = await params
const { batchMode } = await searchParams
const isBatchMode = batchMode === '1'
return (
<html lang="en" suppressHydrationWarning>
<body>
<DatasetViewer runHash={runHash} />
<DatasetViewer runHash={runHash} batchMode={isBatchMode} />
</body>
</html>
)
Expand Down
Loading

0 comments on commit a6536c9

Please sign in to comment.