Skip to content

Commit

Permalink
Merge pull request #81 from bespokelabsai/dev
Browse files Browse the repository at this point in the history
0.1.7
  • Loading branch information
vutrung96 authored Nov 13, 2024
2 parents bbbee20 + 7991ae5 commit a6536c9
Show file tree
Hide file tree
Showing 34 changed files with 855 additions and 535 deletions.
35 changes: 29 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
# Bespoke Curator
<p align="center">
<a href="https://bespokelabs.ai/" target="_blank">
<picture>
<source media="(prefers-color-scheme: light)" width="10%" srcset="./docs/Bespoke-Labs-Logomark-Red.png">
<img alt="Bespoke Labs Logo" width="10%" src="./docs/Bespoke-Labs-Logomark-Red-on-Black.png">
</picture>
</a>
</p>

<h1 align="center">Bespoke Labs Curator</h1>
<h3 align="center" style="font-size: 20px; margin-bottom: 4px">Data Curation for Post-Training & Structured Data Extraction</h3>
<br/>
<p align="center">
<a href="https://docs.bespokelabs.ai/">
<img alt="Static Badge" src="https://img.shields.io/badge/Docs-docs.bespokelabs.ai-blue?style=flat&link=https%3A%2F%2Fdocs.bespokelabs.ai">
</a>
<a href="https://bespokelabs.ai/">
<img alt="Site" src="https://img.shields.io/badge/Site-bespokelabs.ai-blue?link=https%3A%2F%2Fbespokelabs.ai"/>
</a>
<img alt="PyPI - Version" src="https://img.shields.io/pypi/v/bespokelabs-curator">
<a href="https://twitter.com/bespokelabsai">
<img src="https://img.shields.io/twitter/follow/bespokelabsai" alt="Follow on X" />
</a>
<a href="https://discord.gg/KqpXvpzVBS">
<img alt="Discord" src="https://img.shields.io/discord/1230990265867698186">
</a>
</p>

Bespoke Labs Synthetic Data Curation Library

### Installation

Expand All @@ -17,14 +42,12 @@ import os
os.environ['OPENAI_API_KEY'] = 'sk-...' # Set your OpenAI API key here

poet = curator.Prompter(
prompt_func=lambda: {
"user_prompt": "Write a poem about the beauty of computer science"
},
prompt_func=lambda: "Write a poem about the beauty of computer science",
model_name="gpt-4o-mini",
)

poem = poet()
print(poem.to_list()[0])
print(poem["response"][0])
```

You can see more examples in the [examples](examples) directory.
Expand Down
71 changes: 57 additions & 14 deletions bespoke-dataset-viewer/app/api/responses/[runHash]/route.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,75 @@
import { DataItem } from '@/types/dataset'
import { existsSync, promises as fs } from 'fs'
import { NextRequest, NextResponse } from 'next/server'
import { promises as fs } from 'fs'
import { homedir } from 'os'
import { join } from 'path'

export const dynamic = 'force-dynamic'
export const runtime = 'nodejs'

/**
 * Read a JSONL file (one JSON document per line) and parse every
 * non-blank line into a DataItem.
 *
 * @param filePath - absolute path of the .jsonl file to read
 * @returns the parsed items, in file order
 */
async function readJsonlFile(filePath: string): Promise<DataItem[]> {
  const raw = await fs.readFile(filePath, 'utf-8')
  const items: DataItem[] = []
  for (const line of raw.split('\n')) {
    // Skip blank/whitespace-only lines (trailing newline produces one).
    if (line.trim() === '') continue
    items.push(JSON.parse(line))
  }
  return items
}

export async function GET(
request: NextRequest,
{ params }: { params: Promise<{ runHash: string }> }
): Promise<Response> { // This is the key change
{ params }: { params: Promise<{ runHash: string }> }
): Promise<Response> {
try {
const { runHash } = await params
const responsesPath = join(homedir(), '.cache', 'curator', runHash, 'responses.jsonl')
const runDir = join(homedir(), '.cache', 'curator', runHash)

const searchParams = request.nextUrl.searchParams
const lastLineNumber = parseInt(searchParams.get('lastLine') || '0')
const processedFiles = (searchParams.get('processedFiles') || '').split(',').filter(Boolean)
const isBatchMode = searchParams.get('batchMode') === 'true'

if (!existsSync(runDir)) {
return NextResponse.json(
{ error: "Run directory not found" },
{ status: 404 }
)
}

if (isBatchMode) {
// Batch streaming mode: Read all response files that haven't been processed
const files = await fs.readdir(runDir)
const responseFiles = files
.filter(f => f.startsWith('responses_') && f.endsWith('.jsonl'))
.filter(f => !processedFiles.includes(f))

const allData: DataItem[] = []
for (const file of responseFiles) {
const filePath = join(runDir, file)
const fileData = await readJsonlFile(filePath)
allData.push(...fileData)
}

return NextResponse.json({
data: allData,
processedFiles: responseFiles,
isBatchMode: true,
totalLines: null
})
} else {
// Online streaming mode
const responsesPath = join(runDir, 'responses_0.jsonl')
const content = await fs.readFile(responsesPath, 'utf-8')
const lines = content.split('\n').filter(line => line.trim() !== '')

const content = await fs.readFile(responsesPath, 'utf-8')
const lines = content.split('\n').filter(line => line.trim() !== '')

const newLines = lines.slice(lastLineNumber)
const jsonData = newLines.map(line => JSON.parse(line))
const newLines = lines.slice(lastLineNumber)
const jsonData = newLines.map(line => JSON.parse(line))

return NextResponse.json({
data: jsonData,
totalLines: lines.length
})
return NextResponse.json({
data: jsonData,
totalLines: lines.length,
isBatchMode: false,
processedFiles: null
})
}
} catch (error) {
console.error("Error reading responses file:", error)
return NextResponse.json(
Expand Down
10 changes: 7 additions & 3 deletions bespoke-dataset-viewer/app/dataset/[runHash]/page.tsx
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import { DatasetViewer } from "@/components/dataset-viewer/DatasetViewer"

export default async function DatasetPage({
params
params,
searchParams
}: {
params: Promise<{ runHash: string }>
params: Promise<{ runHash: string }>,
searchParams: Promise<{ [key: string]: string | string[] | undefined }>
}) {
const { runHash } = await params
const { batchMode } = await searchParams
const isBatchMode = batchMode === '1'
return (
<html lang="en" suppressHydrationWarning>
<body>
<DatasetViewer runHash={runHash} />
<DatasetViewer runHash={runHash} batchMode={isBatchMode} />
</body>
</html>
)
Expand Down
Loading

0 comments on commit a6536c9

Please sign in to comment.