Add Firecrawl integration for website content crawling #1849

Open · wants to merge 1 commit into base: main
5 changes: 4 additions & 1 deletion apps/web/client/.env.example
@@ -9,4 +9,7 @@ SUPABASE_DATABASE_URL=postgresql://postgres:[email protected]:54322/postgres
CSB_API_KEY=<Your api key from https://codesandbox.io/api >

# Anthropic
ANTHROPIC_API_KEY=<Your api key from https://console.anthropic.com/settings/keys >

# Firecrawl
VITE_FIRECRAWL_API_KEY=<Your api key from https://firecrawl.dev >
42 changes: 41 additions & 1 deletion apps/web/client/src/app/api/chat/route.ts
@@ -1,12 +1,52 @@
import { chatToolSet, initModel, PromptProvider } from '@onlook/ai';
import { chatToolSet, CRAWL_URL_TOOL_NAME, initModel, PromptProvider } from '@onlook/ai';
import { extractUrls } from '@onlook/ai/src/tools/helpers';
import { CLAUDE_MODELS, LLMProvider } from '@onlook/models';
import { generateObject, NoSuchToolError, streamText } from 'ai';

const model = await initModel(LLMProvider.ANTHROPIC, CLAUDE_MODELS.SONNET);
const promptProvider = new PromptProvider();

async function processUrls(content: string): Promise<string> {
const urls = extractUrls(content);

if (urls.length === 0) {
return content;
}

try {
const result = await streamText({
model,
system: promptProvider.getSystemPrompt(),
messages: [
{
role: 'user',
content: content,
},
],
tools: { [CRAWL_URL_TOOL_NAME]: chatToolSet[CRAWL_URL_TOOL_NAME] as any },
maxTokens: 4000,
});

return `
Original request:
${content}

Referenced content from URLs:
${JSON.stringify(result, null, 2)}
`;
} catch (error) {
console.error('Error processing URLs:', error);
return content;
}
}

export async function POST(req: Request) {
const { messages, maxSteps } = await req.json();

const lastUserMessage = messages.findLast((m: any) => m.role === 'user');
if (lastUserMessage && typeof lastUserMessage.content === 'string') {
lastUserMessage.content = await processUrls(lastUserMessage.content);
}

const result = streamText({
model,
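
For reference, a minimal sketch of exercising this route from a client. The `/api/chat` path is inferred from the file's app-router location, and the body shape follows the `{ messages, maxSteps }` destructuring above; both are assumptions rather than confirmed usage.

```ts
// Hedged client-side sketch, not part of this PR. A plain-string user message lets
// processUrls() pick up the URL before the chat is streamed back.
const res = await fetch('/api/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
        maxSteps: 5,
        messages: [
            { role: 'user', content: 'Summarize https://docs.firecrawl.dev for me' },
        ],
    }),
});
// The response carries the streamed model output produced by streamText().
```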
4 changes: 4 additions & 0 deletions apps/web/client/src/app/project/[id]/_hooks/use-chat.tsx
@@ -10,6 +10,8 @@ import {
ONLOOK_INSTRUCTIONS_TOOL_NAME,
READ_FILES_TOOL_NAME,
READ_FILES_TOOL_PARAMETERS,
CRAWL_URL_TOOL_NAME,
CRAWL_URL_TOOL_PARAMETERS,
} from '@onlook/ai';
import type { ToolCall } from 'ai';
import { createContext, useContext } from 'react';
@@ -61,6 +63,8 @@ async function handleToolCall(toolCall: ToolCall<string, unknown>, editorEngine:
} else if (toolName === ONLOOK_INSTRUCTIONS_TOOL_NAME) {
const result = ONLOOK_INSTRUCTIONS;
return result;
} else if (toolName === CRAWL_URL_TOOL_NAME) {
return 'Web content crawled successfully.';
} else {
throw new Error(`Unknown tool call: ${toolCall.toolName}`);
}
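
This client-side handler acknowledges the crawl tool call with a fixed string. A hedged sketch of a helper that would resolve the call with real crawled pages instead; `CRAWL_URL_TOOL_PARAMETERS` is a confirmed export of `@onlook/ai` in this diff, while the deep import path for `CrawlerService` is an assumption.

```ts
import { CRAWL_URL_TOOL_PARAMETERS } from '@onlook/ai';
import { CrawlerService } from '@onlook/ai/src/tools/crawler';
import type { ToolCall } from 'ai';

// Hypothetical helper, not part of this PR: validate the tool arguments with the
// zod schema added in packages/ai/src/tools/index.ts, then crawl each URL.
async function resolveCrawlUrlToolCall(toolCall: ToolCall<string, unknown>) {
    const { urls, options } = CRAWL_URL_TOOL_PARAMETERS.parse(toolCall.args);
    const crawler = CrawlerService.getInstance();
    return Promise.all(urls.map((url) => crawler.crawlUrl(url, options)));
}
```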
Binary file added bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions packages/ai/package.json
@@ -33,6 +33,7 @@
"dependencies": {
"@ai-sdk/anthropic": "^1.2.10",
"ai": "^4.3.10",
"@mendable/firecrawl-js": "^1.24.0",
"diff-match-patch": "^1.0.5",
"fg": "^0.0.3",
"marked": "^15.0.7"
105 changes: 105 additions & 0 deletions packages/ai/src/tools/crawler.ts
@@ -0,0 +1,105 @@
import FirecrawlApp from '@mendable/firecrawl-js';

export interface CrawlOptions {
limit?: number;
scrapeOptions?: {
formats?: (
| 'markdown'
| 'html'
| 'rawHtml'
| 'content'
| 'links'
| 'screenshot'
| 'screenshot@fullPage'
| 'extract'
| 'json'
| 'changeTracking'
)[];
};
}

export interface CrawlerResponse {
success: boolean;
error?: string;
data: Array<{
html?: string;
markdown?: string;
}>;
}

export interface CrawledContent {
markdown?: string;
html?: string;
}

export function validateCrawlerResponse(response: unknown): response is CrawlerResponse {
if (!response || typeof response !== 'object') {
return false;
}

if (!('success' in response) || typeof response.success !== 'boolean') {
return false;
}

if (!('data' in response) || !Array.isArray(response.data)) {
return false;
}

if (response.data.length === 0) {
return false;
}

const firstItem = response.data[0];
return (
typeof firstItem === 'object' &&
firstItem !== null &&
('html' in firstItem || 'markdown' in firstItem) &&
(firstItem.html === undefined || typeof firstItem.html === 'string') &&
(firstItem.markdown === undefined || typeof firstItem.markdown === 'string')
);
}

export class CrawlerService {
private static instance: CrawlerService;

private app: FirecrawlApp;

private constructor() {
const apiKey = import.meta.env.VITE_FIRECRAWL_API_KEY;
if (!apiKey) {
throw new Error(
'VITE_FIRECRAWL_API_KEY is not defined. Please provide a valid API key.',
);
}
this.app = new FirecrawlApp({ apiKey });
}

static getInstance(): CrawlerService {
if (!this.instance) {
this.instance = new CrawlerService();
}
return this.instance;
}

async crawlUrl(
url: string,
options: CrawlOptions = {
limit: 100,
scrapeOptions: {
formats: ['markdown', 'html'],
},
},
) {
try {
const response = await this.app.crawlUrl(url, options);

if (!response.success) {
throw new Error(`Failed to crawl: ${response.error}`);
}
return response;
} catch (error) {
console.error('Error during crawling:', error);
throw error;
}
}
}
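
A rough usage sketch for the service. The deep import path mirrors the way `route.ts` imports the helpers and is an assumption; `VITE_FIRECRAWL_API_KEY` must be set as described in `.env.example`, otherwise `getInstance()` throws.

```ts
import { CrawlerService, validateCrawlerResponse } from '@onlook/ai/src/tools/crawler';

// Hedged sketch, not part of this PR: crawl a site and read the first page's markdown.
const crawler = CrawlerService.getInstance();
const response = await crawler.crawlUrl('https://docs.firecrawl.dev', {
    limit: 10,
    scrapeOptions: { formats: ['markdown'] },
});

if (validateCrawlerResponse(response)) {
    console.log(response.data[0]?.markdown?.slice(0, 200));
}
```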
37 changes: 37 additions & 0 deletions packages/ai/src/tools/helpers.ts
@@ -40,3 +40,40 @@ export async function getAllFiles(
return { success: false, error: error instanceof Error ? error.message : 'Unknown error' };
}
}

export function extractUrls(text: string): string[] {
const httpPattern =
/https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;

const wwwPattern =
/(?:^|\s)www\.[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;

const markdownPattern = /\[([^\]]+)\]\(([^)]+)\)/g;

const httpUrls: string[] = Array.from(text.matchAll(httpPattern), (match) => match[0]);
const wwwUrls: string[] = Array.from(
text.matchAll(wwwPattern),
(match) => 'https://' + match[0].trim(),
);
const markdownUrls: string[] = Array.from(
text.matchAll(markdownPattern),
(match) => match[2] || '',
);

const allUrls: string[] = [...httpUrls, ...wwwUrls, ...markdownUrls];

return Array.from(
new Set(
allUrls.filter((url) => {
if (!url) return false;
try {
const fullUrl = url.startsWith('http') ? url : `https://${url}`;
new URL(fullUrl);
return true;
} catch {
return false;
}
}),
),
);
}
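
Illustrative calls for `extractUrls`; the outputs in the comments are what the three patterns above should produce, not captured test results.

```ts
import { extractUrls } from '@onlook/ai/src/tools/helpers';

// Bare http(s) URLs and www-prefixed hosts are both picked up; www hosts get an
// https:// prefix before validation.
extractUrls('See https://onlook.com and www.firecrawl.dev for details');
// -> ['https://onlook.com', 'https://www.firecrawl.dev']

// Markdown links yield their target URL; the Set removes duplicates when the same
// URL also matches the bare-URL pattern.
extractUrls('Docs: [Firecrawl](https://docs.firecrawl.dev/introduction)');
// -> ['https://docs.firecrawl.dev/introduction']
```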
37 changes: 37 additions & 0 deletions packages/ai/src/tools/index.ts
@@ -1,5 +1,6 @@
import { tool, type ToolSet } from 'ai';
import { z } from 'zod';
import { CrawlerService } from './crawler';

export const LIST_FILES_TOOL_NAME = 'list_files';
export const LIST_FILES_TOOL_PARAMETERS = z.object({
@@ -30,8 +31,44 @@ export const onlookInstructionsTool = tool({
parameters: z.object({}),
});

export const CRAWL_URL_TOOL_NAME = 'crawl_url';
export const CRAWL_URL_TOOL_PARAMETERS = z.object({
urls: z.array(z.string()).describe('Array of URLs to crawl'),
options: z
.object({
limit: z.number().optional(),
scrapeOptions: z
.object({
formats: z
.array(
z.enum([
'markdown',
'html',
'rawHtml',
'content',
'links',
'screenshot',
'screenshot@fullPage',
'extract',
'json',
'changeTracking',
]),
)
.optional(),
})
.optional(),
})
.optional(),
});

export const crawlUrlTool = tool({
description: 'Crawl webpage content from provided URL',
parameters: CRAWL_URL_TOOL_PARAMETERS,
});

export const chatToolSet: ToolSet = {
[LIST_FILES_TOOL_NAME]: listFilesTool,
[READ_FILES_TOOL_NAME]: readFilesTool,
[ONLOOK_INSTRUCTIONS_TOOL_NAME]: onlookInstructionsTool,
[CRAWL_URL_TOOL_NAME]: crawlUrlTool,
};
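
`crawlUrlTool` is declared without an `execute` handler, so calls to it are surfaced to the client (see the `use-chat.tsx` change above). A hedged sketch of a server-side variant that wires the same schema to `CrawlerService`; this is not part of the PR, and the deep import path for `CrawlerService` is an assumption.

```ts
import { tool } from 'ai';
import { CRAWL_URL_TOOL_PARAMETERS } from '@onlook/ai';
import { CrawlerService } from '@onlook/ai/src/tools/crawler';

// Hypothetical server-side variant: resolve crawls during streamText instead of
// deferring to the client-side tool-call handler.
export const crawlUrlToolWithExecute = tool({
    description: 'Crawl webpage content from provided URL',
    parameters: CRAWL_URL_TOOL_PARAMETERS,
    execute: async ({ urls, options }) => {
        const crawler = CrawlerService.getInstance();
        return Promise.all(urls.map((url) => crawler.crawlUrl(url, options)));
    },
});
```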
9 changes: 8 additions & 1 deletion packages/models/src/chat/message/context.ts
@@ -4,6 +4,7 @@ export enum MessageContextType {
IMAGE = 'image',
ERROR = 'error',
PROJECT = 'project',
LINK = 'link',
}

type BaseMessageContext = {
@@ -38,9 +39,15 @@ export type ProjectMessageContext = BaseMessageContext & {
path: string;
};

export type LinkMessageContext = BaseMessageContext & {
type: MessageContextType.LINK;
url: string;
};

export type ChatMessageContext =
| FileMessageContext
| HighlightMessageContext
| ImageMessageContext
| ErrorMessageContext
| ProjectMessageContext;
| ProjectMessageContext
| LinkMessageContext;
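
A minimal sketch of constructing the new link context. Only the two fields this PR adds are shown, since `BaseMessageContext`'s own fields sit outside this diff; whether these types are re-exported from the `@onlook/models` package root is an assumption.

```ts
import { MessageContextType, type LinkMessageContext } from '@onlook/models';

// Hedged example: Pick<> avoids guessing at BaseMessageContext's fields.
const linkContext: Pick<LinkMessageContext, 'type' | 'url'> = {
    type: MessageContextType.LINK,
    url: 'https://docs.firecrawl.dev',
};
```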