diff --git a/agentstack/_tools/agentql/__init__.py b/agentstack/_tools/agentql/__init__.py
index 46364974..469fcfcf 100644
--- a/agentstack/_tools/agentql/__init__.py
+++ b/agentstack/_tools/agentql/__init__.py
@@ -17,32 +17,32 @@ def query_data(url: str, query: Optional[str], prompt: Optional[str]) -> dict:
 
     AgentQL query to scrape the url.
 
-Here is a guide on AgentQL query syntax:
+    Here is a guide on AgentQL query syntax:
 
-Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social\_media\_links" is wrongly enclosed within parenthesis `()`.
+    Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social_media_links" is wrongly enclosed within parentheses `()`.
 
-```
-( # Should be {
-    social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
-) # Should be }
-```
+    ```
+    ( # Should be {
+        social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
+    ) # Should be }
+    ```
 
-The following query is also invalid since its missing the curly braces `{}`
+    The following query is also invalid since it's missing the curly braces `{}`.
 
-```
-# should include {
-social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
-# should include }
-```
+    ```
+    # should include {
+    social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
+    # should include }
+    ```
 
-You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.
+    You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.
 
-```
-{
-    social_media_links(The icons that lead
-        to Facebook, Snapchat, etc.)[]
-}
-```
+    ```
+    {
+        social_media_links(The icons that lead
+            to Facebook, Snapchat, etc.)[]
+    }
+    ```
     """
     payload = {
         "url": url,
diff --git a/agentstack/_tools/firecrawl/__init__.py b/agentstack/_tools/firecrawl/__init__.py
index 1f912b31..bbda5381 100644
--- a/agentstack/_tools/firecrawl/__init__.py
+++ b/agentstack/_tools/firecrawl/__init__.py
@@ -1,6 +1,8 @@
 import os
+from typing import Any, Dict, List, Optional
+
 from firecrawl import FirecrawlApp
 
 app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))
 
 
@@ -38,3 +40,106 @@ def retrieve_web_crawl(crawl_id: str):
     will tell you if the crawl is finished. If it is not, wait some more time then try again.
     """
     return app.check_crawl_status(crawl_id)
+
+
+def batch_scrape(urls: List[str], formats: Optional[List[str]] = None):
+    """
+    Batch scrape multiple URLs simultaneously.
+
+    Args:
+        urls: List of URLs to scrape
+        formats: List of desired output formats; defaults to ['markdown', 'html']
+
+    Returns:
+        Dictionary containing the batch scrape results
+    """
+    formats = ['markdown', 'html'] if formats is None else formats
+    return app.batch_scrape_urls(urls, {'formats': formats})
+
+
+def async_batch_scrape(urls: List[str], formats: Optional[List[str]] = None):
+    """
+    Asynchronously batch scrape multiple URLs.
+
+    Args:
+        urls: List of URLs to scrape
+        formats: List of desired output formats; defaults to ['markdown', 'html']
+
+    Returns:
+        Dictionary containing the job ID and status URL
+    """
+    formats = ['markdown', 'html'] if formats is None else formats
+    return app.async_batch_scrape_urls(urls, {'formats': formats})
+
+
+def check_batch_status(job_id: str):
+    """
+    Check the status of an asynchronous batch scrape job.
+
+    Args:
+        job_id: The ID of the batch scrape job
+
+    Returns:
+        Dictionary containing the current status and results if completed
+    """
+    return app.check_batch_scrape_status(job_id)
+
+
+def extract_data(urls: List[str], schema: Optional[Dict[str, Any]] = None,
+                 prompt: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Extract structured data from URLs using LLMs.
+
+    Args:
+        urls: List of URLs to extract data from
+        schema: Optional JSON schema defining the structure of data to extract
+        prompt: Optional natural language prompt describing the data to extract
+
+    Returns:
+        Dictionary containing the extracted structured data
+    """
+    params: Dict[str, Any] = {}
+    # Forward prompt and schema independently so neither is dropped when both are given.
+    if prompt is not None:
+        params['prompt'] = prompt
+    if schema is not None:
+        params['schema'] = schema
+
+    data = app.extract(urls, params)
+    return data
+
+
+def map_website(url: str, search: Optional[str] = None):
+    """
+    Map a website to get all URLs, with optional search functionality.
+
+    Args:
+        url: The base URL to map
+        search: Optional search term to filter URLs
+
+    Returns:
+        Dictionary containing the list of discovered URLs
+    """
+    params = {'search': search} if search else {}
+    map_result = app.map_url(url, params)
+    return map_result
+
+
+def batch_extract(urls: List[str], extract_params: Dict[str, Any]):
+    """
+    Batch extract structured data from multiple URLs.
+
+    Args:
+        urls: List of URLs to extract data from
+        extract_params: Dictionary containing extraction parameters including prompt or schema
+
+    Returns:
+        Dictionary containing the extracted data from all URLs
+    """
+    params = {
+        'formats': ['extract'],
+        'extract': extract_params,
+    }
+
+    batch_result = app.batch_scrape_urls(urls, params)
+    return batch_result
diff --git a/agentstack/_tools/firecrawl/config.json b/agentstack/_tools/firecrawl/config.json
index 5dcf2748..42c45756 100644
--- a/agentstack/_tools/firecrawl/config.json
+++ b/agentstack/_tools/firecrawl/config.json
@@ -8,6 +8,16 @@
     "dependencies": [
         "firecrawl-py>=1.6.4"
     ],
-    "tools": ["web_scrape", "web_crawl", "retrieve_web_crawl"],
+    "tools": [
+        "web_scrape",
+        "web_crawl",
+        "retrieve_web_crawl",
+        "batch_scrape",
+        "async_batch_scrape",
+        "check_batch_status",
+        "extract_data",
+        "map_website",
+        "batch_extract"
+    ],
     "cta": "Create an API key at https://www.firecrawl.dev/"
 }
\ No newline at end of file