diff --git a/README.md b/README.md
index 50a5b995be..569b649211 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@

-Build multi-modal Agents with memory, knowledge, tools and reasoning
+Build multi-modal Agents with memory, knowledge, tools and reasoning.

-# What is phidata?
+## What is phidata?
 
-**Phidata is a framework for building multi-modal agents with memory, knowledge, tools and reasoning.**
+**Phidata is a framework for building multi-modal agents.** Use phidata to:
+
+- **Build multi-modal agents with memory, knowledge, tools and reasoning.**
+- **Build teams of agents that can work together to solve problems.**
+- **Chat with your agents using a beautiful Agent UI.**
 
 ## Install
 
@@ -27,7 +31,7 @@ Build multi-modal Agents with memory, knowledge, tools and reasoning
 pip install -U phidata
 ```
 
-# Key Features
+## Key Features
 
 - [Simple & Elegant](#simple--elegant)
 - [Powerful & Flexible](#powerful--flexible)
@@ -42,7 +46,9 @@ pip install -U phidata
 
 ## Simple & Elegant
 
-Phidata Agents are simple and elegant, resulting in minimal, beautiful code. For example, you can create a web search agent using 10 lines of code, create a file `web_search.py`
+Phidata Agents are simple and elegant, resulting in minimal, beautiful code.
+
+For example, you can create a web search agent in 10 lines of code. Create a file `web_search.py`:
 
 ```python
 from phi.agent import Agent
@@ -71,7 +77,9 @@ python web_search.py
 
 ## Powerful & Flexible
 
-Phidata agents can use multiple tools and follow instructions to achieve complex tasks. For example, you can create a finance agent that can query financial data, create a file `finance_agent.py`
+Phidata agents can use multiple tools and follow instructions to achieve complex tasks.
+
+For example, you can create a finance agent with tools to query financial data. Create a file `finance_agent.py`:
 
 ```python
 from phi.agent import Agent
@@ -99,7 +107,9 @@ python finance_agent.py
 
 ## Multi-Modal by default
 
-Phidata agents support text, images, audio and video. For example, you can create an image agent that can understand images and make tool calls as needed, create a file `image_agent.py`
+Phidata agents support text, images, audio and video.
+
+For example, you can create an image agent that can understand images and make tool calls as needed. Create a file `image_agent.py`:
 
 ```python
 from phi.agent import Agent
@@ -215,14 +225,18 @@ if __name__ == "__main__":
     serve_playground_app("playground:app", reload=True)
 ```
 
-Authenticate with phidata:
-```
+Authenticate with phidata by running the following command:
+
+```shell
 phi auth
 ```
 
-> [!NOTE]
-> If `phi auth` fails, you can set the `PHI_API_KEY` environment variable by copying it from [phidata.app](https://www.phidata.app)
+or by exporting the `PHI_API_KEY` for your workspace from [phidata.app](https://www.phidata.app)
+
+```bash
+export PHI_API_KEY=phi-***
+```
 
 Install dependencies and run the Agent Playground:
 
@@ -373,9 +387,6 @@ python reasoning_agent.py
 >
 > It is an experiment fueled by curiosity, combining COT and tool use. Set your expectations very low for this initial release. For example: It will not be able to count ‘r’s in ‘strawberry’.
 
-> [!TIP]
-> If using tools with `reasoning=True`, set `structured_outputs=False` because gpt-4o doesnt support tools with structured outputs.
-
 ## Demo Agents
 
 The Agent Playground includes a few demo agents that you can test with. If you have recommendations for other demo agents, please let us know in our [community forum](https://community.phidata.com/).
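The new "Build teams of agents" bullet has no accompanying snippet in the hunks above. A minimal sketch of that pattern, assuming the `team` parameter on `Agent` and the `DuckDuckGo`/`YFinanceTools` toolkits from the phidata cookbook (agent names and instructions here are illustrative):

```python
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.duckduckgo import DuckDuckGo
from phi.tools.yfinance import YFinanceTools

# One agent per responsibility: web search and financial data.
web_agent = Agent(
    name="Web Agent",
    model=OpenAIChat(id="gpt-4o"),
    tools=[DuckDuckGo()],
    instructions=["Always include sources"],
    show_tool_calls=True,
    markdown=True,
)

finance_agent = Agent(
    name="Finance Agent",
    model=OpenAIChat(id="gpt-4o"),
    tools=[YFinanceTools(stock_price=True, analyst_recommendations=True)],
    instructions=["Use tables to display data"],
    show_tool_calls=True,
    markdown=True,
)

# The team agent delegates to its members and merges their answers.
agent_team = Agent(
    team=[web_agent, finance_agent],
    instructions=["Always include sources", "Use tables to display data"],
    show_tool_calls=True,
    markdown=True,
)

agent_team.print_response("Summarize analyst recommendations and share the latest news for NVDA", stream=True)
```

Each member agent stays small and single-purpose; the team agent only routes the request and aggregates the results.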
diff --git a/cookbook/agents/30_pre_and_post_hooks.py b/cookbook/agents/30_pre_and_post_hooks.py index 900387b8d1..97a8bcab0b 100644 --- a/cookbook/agents/30_pre_and_post_hooks.py +++ b/cookbook/agents/30_pre_and_post_hooks.py @@ -6,16 +6,16 @@ from phi.tools import tool, FunctionCall -def pre_hook(function_call: FunctionCall): - print(f"Pre-hook: {function_call.function.name}") - print(f"Arguments: {function_call.arguments}") - print(f"Result: {function_call.result}") +def pre_hook(fc: FunctionCall): + print(f"Pre-hook: {fc.function.name}") + print(f"Arguments: {fc.arguments}") + print(f"Result: {fc.result}") -def post_hook(function_call: FunctionCall): - print(f"Post-hook: {function_call.function.name}") - print(f"Arguments: {function_call.arguments}") - print(f"Result: {function_call.result}") +def post_hook(fc: FunctionCall): + print(f"Post-hook: {fc.function.name}") + print(f"Arguments: {fc.arguments}") + print(f"Result: {fc.result}") @tool(pre_hook=pre_hook, post_hook=post_hook) diff --git a/cookbook/agents/43_research_agent_exa.py b/cookbook/agents/43_research_agent_exa.py new file mode 100644 index 0000000000..4184f5276c --- /dev/null +++ b/cookbook/agents/43_research_agent_exa.py @@ -0,0 +1,55 @@ +"""Please install dependencies using: +pip install openai exa-py phidata +""" + +from textwrap import dedent +from datetime import datetime + +from phi.agent import Agent +from phi.model.openai import OpenAIChat +from phi.tools.exa import ExaTools + +agent = Agent( + model=OpenAIChat(id="gpt-4o"), + tools=[ExaTools(start_published_date=datetime.now().strftime("%Y-%m-%d"), type="keyword")], + description="You are an advanced AI researcher writing a report on a topic.", + instructions=[ + "For the provided topic, run 3 different searches.", + "Read the results carefully and prepare a NYT worthy report.", + "Focus on facts and make sure to provide references.", + ], + expected_output=dedent("""\ + An engaging, informative, and well-structured report in markdown format: + + ## Engaging Report Title + + ### Overview + {give a brief introduction of the report and why the user should read this report} + {make this section engaging and create a hook for the reader} + + ### Section 1 + {break the report into sections} + {provide details/facts/processes in this section} + + ... more sections as necessary... 
+ + ### Takeaways + {provide key takeaways from the article} + + ### References + - [Reference 1](link) + - [Reference 2](link) + - [Reference 3](link) + + ### About the Author + {write a made up for yourself, give yourself a cyberpunk name and a title} + + - published on {date} in dd/mm/yyyy + """), + markdown=True, + show_tool_calls=True, + add_datetime_to_instructions=True, + save_response_to_file="tmp/{message}.md", + # debug_mode=True, +) +agent.print_response("Simulation theory", stream=True) diff --git a/cookbook/playground/multimodal_agent.py b/cookbook/playground/multimodal_agent.py index 104177972e..25f0405684 100644 --- a/cookbook/playground/multimodal_agent.py +++ b/cookbook/playground/multimodal_agent.py @@ -14,7 +14,6 @@ from phi.playground import Playground, serve_playground_app from phi.storage.agent.sqlite import SqlAgentStorage from phi.tools.fal_tools import FalTools -from pydantic import BaseModel, Field image_agent_storage_file: str = "tmp/image_agent.db" @@ -26,7 +25,7 @@ description="You are an AI agent that can generate images using DALL-E.", instructions=[ "When the user asks you to create an image, use the `create_image` tool to create the image.", - "Don't provide the URL of the image in the response. Only describe what image was generated." + "Don't provide the URL of the image in the response. Only describe what image was generated.", ], markdown=True, debug_mode=True, @@ -43,7 +42,7 @@ description="You are an AI agent that can generate gifs using the ModelsLabs API.", instructions=[ "When the user asks you to create an image, use the `generate_media` tool to create the image.", - "Don't provide the URL of the image in the response. Only describe what image was generated." + "Don't provide the URL of the image in the response. Only describe what image was generated.", ], markdown=True, debug_mode=True, @@ -60,7 +59,7 @@ description="You are an AI agent that can generate videos using the ModelsLabs API.", instructions=[ "When the user asks you to create a video, use the `generate_media` tool to create the video.", - "Don't provide the URL of the video in the response. Only describe what video was generated." + "Don't provide the URL of the video in the response. Only describe what video was generated.", ], markdown=True, debug_mode=True, @@ -77,7 +76,7 @@ description="You are an AI agent that can generate videos using the Fal API.", instructions=[ "When the user asks you to create a video, use the `generate_media` tool to create the video.", - "Don't provide the URL of the video in the response. Only describe what video was generated." + "Don't provide the URL of the video in the response. Only describe what video was generated.", ], markdown=True, debug_mode=True, diff --git a/cookbook/tools/lumalabs_tool.py b/cookbook/tools/lumalabs_tool.py new file mode 100644 index 0000000000..8d87d31f13 --- /dev/null +++ b/cookbook/tools/lumalabs_tool.py @@ -0,0 +1,45 @@ +from phi.agent import Agent +from phi.llm.openai import OpenAIChat +from phi.tools.lumalab import LumaLabTools + +"""Create an agent specialized for Luma AI video generation""" + +luma_agent = Agent( + name="Luma Video Agent", + agent_id="luma-video-agent", + llm=OpenAIChat(model="gpt-4o"), + tools=[LumaLabTools()], # Using the LumaLab tool we created + markdown=True, + debug_mode=True, + show_tool_calls=True, + instructions=[ + "You are an agent designed to generate videos using the Luma AI API.", + "You can generate videos in two ways:", + "1. 
Text-to-Video Generation:", + " - Use the generate_video function for creating videos from text prompts", + " - Default parameters: loop=False, aspect_ratio='16:9', keyframes=None", + "2. Image-to-Video Generation:", + " - Use the image_to_video function when starting from one or two images", + " - Required parameters: prompt, start_image_url", + " - Optional parameters: end_image_url, loop=False, aspect_ratio='16:9'", + " - The image URLs must be publicly accessible", + "Choose the appropriate function based on whether the user provides image URLs or just a text prompt.", + "The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.", + "Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.", + "After generating any video, if generation is async (wait_for_completion=False), inform about the generation ID", + ], + system_message=( + "Use generate_video for text-to-video requests and image_to_video for image-based " + "generation. Don't modify default parameters unless specifically requested. " + "Always provide clear feedback about the video generation status." + ), +) + +luma_agent.run("Generate a video of a car in a sky") +# luma_agent.run("Transform this image into a video of a tiger walking: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Walking_tiger_female.jpg/1920px-Walking_tiger_female.jpg") +# luma_agent.run(""" +# Create a transition video between these two images: +# Start: https://img.freepik.com/premium-photo/car-driving-dark-forest-generative-ai_634053-6661.jpg?w=1380 +# End: https://img.freepik.com/free-photo/front-view-black-luxury-sedan-road_114579-5030.jpg?t=st=1733821884~exp=1733825484~hmac=735ca584a9b985c53875fc1ad343c3fd394e1de4db49e5ab1a9ab37ac5f91a36&w=1380 +# Make it a smooth, natural movement +# """) diff --git a/phi/agent/agent.py b/phi/agent/agent.py index 0acae861bc..16715a79b6 100644 --- a/phi/agent/agent.py +++ b/phi/agent/agent.py @@ -32,7 +32,7 @@ from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction from phi.run.response import RunEvent, RunResponse, RunResponseExtraData from phi.knowledge.agent import AgentKnowledge -from phi.model import Model +from phi.model.base import Model from phi.model.message import Message, MessageReferences from phi.model.response import ModelResponse, ModelResponseEvent from phi.memory.agent import AgentMemory, MemoryRetrieval, Memory, AgentRun, SessionSummary # noqa: F401 diff --git a/phi/llm/openai/chat.py b/phi/llm/openai/chat.py index 60b3fe2e3c..666313522d 100644 --- a/phi/llm/openai/chat.py +++ b/phi/llm/openai/chat.py @@ -181,7 +181,9 @@ def to_dict(self) -> Dict[str, Any]: if self.presence_penalty: _dict["presence_penalty"] = self.presence_penalty if self.response_format: - _dict["response_format"] = self.response_format if isinstance(self.response_format, dict) else str(self.response_format) + _dict["response_format"] = ( + self.response_format if isinstance(self.response_format, dict) else str(self.response_format) + ) if self.seed is not None: _dict["seed"] = self.seed if self.stop: diff --git a/phi/model/__init__.py b/phi/model/__init__.py index 00c37db694..e69de29bb2 100644 --- a/phi/model/__init__.py +++ b/phi/model/__init__.py @@ -1 +0,0 @@ -from phi.model.base import Model diff --git a/phi/model/openai/chat.py b/phi/model/openai/chat.py index 66dbf6242f..ef177512c8 100644 --- a/phi/model/openai/chat.py +++ b/phi/model/openai/chat.py @@ 
-255,7 +255,9 @@ def to_dict(self) -> Dict[str, Any]: if self.presence_penalty is not None: model_dict["presence_penalty"] = self.presence_penalty if self.response_format is not None: - model_dict["response_format"] = self.response_format if isinstance(self.response_format, dict) else str(self.response_format) + model_dict["response_format"] = ( + self.response_format if isinstance(self.response_format, dict) else str(self.response_format) + ) if self.seed is not None: model_dict["seed"] = self.seed if self.stop is not None: diff --git a/phi/tools/lumalab.py b/phi/tools/lumalab.py new file mode 100644 index 0000000000..bebb2b652d --- /dev/null +++ b/phi/tools/lumalab.py @@ -0,0 +1,168 @@ +import time +import uuid +from os import getenv +from typing import Optional, Dict, Any, Literal, TypedDict + +from phi.agent import Agent +from phi.tools import Toolkit +from phi.utils.log import logger +from phi.model.content import Video + +try: + from lumaai import LumaAI # type: ignore +except ImportError: + raise ImportError("`lumaai` not installed. Please install using `pip install lumaai`") + + +# Define types for keyframe structure +class KeyframeImage(TypedDict): + type: Literal["image"] + url: str + + +Keyframes = Dict[str, KeyframeImage] + + +class LumaLabTools(Toolkit): + def __init__( + self, + api_key: Optional[str] = None, + wait_for_completion: bool = True, + poll_interval: int = 3, + max_wait_time: int = 300, # 5 minutes + ): + super().__init__(name="luma_lab") + + self.wait_for_completion = wait_for_completion + self.poll_interval = poll_interval + self.max_wait_time = max_wait_time + self.api_key = api_key or getenv("LUMAAI_API_KEY") + + if not self.api_key: + logger.error("LUMAAI_API_KEY not set. Please set the LUMAAI_API_KEY environment variable.") + + self.client = LumaAI(auth_token=self.api_key) + self.register(self.generate_video) + self.register(self.image_to_video) + + def image_to_video( + self, + agent: Agent, + prompt: str, + start_image_url: str, + end_image_url: Optional[str] = None, + loop: bool = False, + aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9", + ) -> str: + """Generate a video from one or two images with a prompt. 
+ + Args: + agent: The agent instance + prompt: Text description of the desired video + start_image_url: URL of the starting image + end_image_url: Optional URL of the ending image + loop: Whether the video should loop + aspect_ratio: Aspect ratio of the output video + + Returns: + str: Status message or error + """ + + try: + # Construct keyframes + keyframes: Dict[str, Dict[str, str]] = {"frame0": {"type": "image", "url": start_image_url}} + + # Add end image if provided + if end_image_url: + keyframes["frame1"] = {"type": "image", "url": end_image_url} + + # Create generation with keyframes + generation = self.client.generations.create( + prompt=prompt, + loop=loop, + aspect_ratio=aspect_ratio, + keyframes=keyframes, # type: ignore + ) + + video_id = str(uuid.uuid4()) + + if not self.wait_for_completion: + return "Async generation unsupported" + + # Poll for completion + seconds_waited = 0 + while seconds_waited < self.max_wait_time: + if not generation or not generation.id: + return "Failed to get generation ID" + + generation = self.client.generations.get(generation.id) + + if generation.state == "completed" and generation.assets: + video_url = generation.assets.video + if video_url: + agent.add_video(Video(id=video_id, url=video_url, eta="completed")) + return f"Video generated successfully: {video_url}" + elif generation.state == "failed": + return f"Generation failed: {generation.failure_reason}" + + logger.info(f"Generation in progress... State: {generation.state}") + time.sleep(self.poll_interval) + seconds_waited += self.poll_interval + + return f"Video generation timed out after {self.max_wait_time} seconds" + + except Exception as e: + logger.error(f"Failed to generate video: {e}") + return f"Error: {e}" + + def generate_video( + self, + agent: Agent, + prompt: str, + loop: bool = False, + aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9", + keyframes: Optional[Dict[str, Dict[str, str]]] = None, + ) -> str: + """Use this function to generate a video given a prompt.""" + + try: + generation_params: Dict[str, Any] = { + "prompt": prompt, + "loop": loop, + "aspect_ratio": aspect_ratio, + } + + if keyframes is not None: + generation_params["keyframes"] = keyframes + + generation = self.client.generations.create(**generation_params) # type: ignore + + video_id = str(uuid.uuid4()) + if not self.wait_for_completion: + return "Async generation unsupported" + + # Poll for completion + seconds_waited = 0 + while seconds_waited < self.max_wait_time: + if not generation or not generation.id: + return "Failed to get generation ID" + + generation = self.client.generations.get(generation.id) + + if generation.state == "completed" and generation.assets: + video_url = generation.assets.video + if video_url: + agent.add_video(Video(id=video_id, url=video_url, state="completed")) + return f"Video generated successfully: {video_url}" + elif generation.state == "failed": + return f"Generation failed: {generation.failure_reason}" + + logger.info(f"Generation in progress... State: {generation.state}") + time.sleep(self.poll_interval) + seconds_waited += self.poll_interval + + return f"Video generation timed out after {self.max_wait_time} seconds" + + except Exception as e: + logger.error(f"Failed to generate video: {e}") + return f"Error: {e}"
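For the new `LumaLabTools` toolkit, the keyframe payload that `image_to_video` assembles follows the `Keyframes`/`KeyframeImage` types declared at the top of `phi/tools/lumalab.py`. A minimal sketch of that structure and of the toolkit's polling knobs, assuming the `lumaai` package is installed and `LUMAAI_API_KEY` is set (the image URLs below are placeholders):

```python
from phi.tools.lumalab import Keyframes, LumaLabTools

# Keyframes as image_to_video() builds them internally:
# "frame0" is the required start image, "frame1" is the optional end image.
# Both URLs are placeholders and must point at publicly accessible images.
keyframes: Keyframes = {
    "frame0": {"type": "image", "url": "https://example.com/start.jpg"},
    "frame1": {"type": "image", "url": "https://example.com/end.jpg"},
}

# Reads LUMAAI_API_KEY from the environment unless api_key is passed explicitly;
# with wait_for_completion=True it polls every poll_interval seconds,
# giving up after max_wait_time seconds.
luma_tools = LumaLabTools(wait_for_completion=True, poll_interval=3, max_wait_time=300)
```

The same dict shape can also be passed to `generate_video` through its optional `keyframes` parameter.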