diff --git a/scripts/run_evolution.py b/scripts/run_evolution.py new file mode 100644 index 0000000000..4b8a648948 --- /dev/null +++ b/scripts/run_evolution.py @@ -0,0 +1,52 @@ +import asyncio +import structlog + +from skyvern.evolution.evolve import Evolve +from skyvern.evolution.prompt_manager import PromptManager + +LOG = structlog.get_logger() + +async def main(): + """ + Main function to run the prompt evolution loop. + """ + LOG.info("Initializing prompt evolution process...") + + prompt_manager = PromptManager() + evolver = Evolve(prompt_manager) + + # Check if the baseline prompt was loaded correctly + if not prompt_manager.get_prompt("baseline"): + LOG.error("Failed to load baseline prompt. Aborting evolution process.") + return + + LOG.info("Starting evolution loop...") + + # Run the evolution loop for a few generations as a demonstration + num_generations = 5 + for i in range(num_generations): + LOG.info(f"--- Generation {i+1}/{num_generations} ---") + + # Evolve the prompts to create new variations + await evolver.evolve_prompts() + + # Evaluate the performance of the new prompts + evolver.evaluate_and_score_prompts() + + # Log the best prompt of the current generation + best_prompt = prompt_manager.get_best_prompt() + if best_prompt: + LOG.info(f"Best prompt of generation {i+1}: '{best_prompt.name}' with score {best_prompt.score}") + else: + LOG.warning("No prompts in manager after evolution and evaluation.") + + # In a real application, you might add a delay or run this as a continuous background process + await asyncio.sleep(5) + + LOG.info("Evolution loop finished.") + +if __name__ == "__main__": + # This script needs to be run in an environment where the skyvern package is installed + # and the necessary configurations (like .env for LLM providers) are set up. 
+ # Example: poetry run python scripts/run_evolution.py + asyncio.run(main()) \ No newline at end of file diff --git a/skyvern/evolution/__init__.py b/skyvern/evolution/__init__.py new file mode 100644 index 0000000000..7abdb6f494 --- /dev/null +++ b/skyvern/evolution/__init__.py @@ -0,0 +1 @@ +# This file is intentionally left blank to mark the directory as a Python package. \ No newline at end of file diff --git a/skyvern/evolution/evolve.py b/skyvern/evolution/evolve.py new file mode 100644 index 0000000000..064bab8bd9 --- /dev/null +++ b/skyvern/evolution/evolve.py @@ -0,0 +1,74 @@ +import structlog +import random + +from skyvern.forge.prompts import prompt_engine +from skyvern.forge.sdk.llm import LLM_API_HANDLER + +LOG = structlog.get_logger() + +class Evolve: + def __init__(self, prompt_manager): + self.prompt_manager = prompt_manager + self.evolution_count = 0 + + async def evolve_prompts(self): + """ + Takes the top-performing prompts and uses an LLM to generate new variations. + """ + best_prompt = self.prompt_manager.get_best_prompt() + if not best_prompt: + LOG.warning("No prompts found to evolve.") + return + + LOG.info(f"Evolving prompt '{best_prompt.name}' with score {best_prompt.score}") + + # Use an LLM to generate a new variation of the prompt. + evolution_prompt = prompt_engine.load_prompt( + "evolve-prompt", + prompt_to_evolve=best_prompt.template, + ) + + # In a real implementation, a 'step' object would be passed here. + # This is a placeholder for demonstration purposes. 
+ response = await LLM_API_HANDLER(prompt=evolution_prompt, step=None) + + # Assuming the response is the raw string of the new prompt + evolved_prompt_str = response if isinstance(response, str) else str(response) + + # Add the new prompt to the population + self.evolution_count += 1 + new_prompt_name = f"evolved_v{self.evolution_count}" + self.prompt_manager.add_prompt(new_prompt_name, evolved_prompt_str, score=0) + + LOG.info(f"Evolved new prompt '{new_prompt_name}': {evolved_prompt_str[:100]}...") + + def evaluate_and_score_prompts(self): + """ + Simulates the evaluation of prompts and updates their scores based on deterministic criteria. + In a real-world scenario, this would involve running benchmarks. + """ + LOG.info("Evaluating and scoring prompts...") + for name, prompt in self.prompt_manager.prompts.items(): + # Skip the baseline prompt as its score is fixed. + if name == "baseline": + continue + + score = 0 + # Score based on length (ideal length between 500 and 1500 characters) + length = len(prompt.template) + if 500 <= length <= 1500: + score += 0.5 + else: + score -= 0.2 + + # Score based on presence of keywords (lowercase, since we match against the lowercased template) + keywords = ["action", "reasoning", "complete", "terminate", "element", "goal"] + for keyword in keywords: + if keyword in prompt.template.lower(): + score += 0.2 + + # Normalize score to be between 0 and 2 for this simulation + normalized_score = max(0, min(2, score)) + + self.prompt_manager.update_score(name, normalized_score) + LOG.info(f"Evaluated '{name}', assigned score: {normalized_score}") \ No newline at end of file diff --git a/skyvern/evolution/prompt_manager.py b/skyvern/evolution/prompt_manager.py new file mode 100644 index 0000000000..c499f05057 --- /dev/null +++ b/skyvern/evolution/prompt_manager.py @@ -0,0 +1,68 @@ +import structlog + +from skyvern.forge.prompts import prompt_engine + +LOG = structlog.get_logger() + +class Prompt: + def __init__(self, name, template, score=0): + self.name = name + self.template = template + 
self.score = score + +class PromptManager: + def __init__(self): + self.prompts = {} + self._load_baseline_prompt() + + def _load_baseline_prompt(self): + """ + Loads the original 'extract-action.j2' prompt as the baseline. + """ + try: + # Access the Jinja2 environment from the prompt_engine + env = prompt_engine.env + # Construct the path to the template within the Jinja2 environment + template_path = "skyvern/extract-action.j2" + # Get the template source from the loader + baseline_template = env.loader.get_source(env, template_path)[0] + + self.add_prompt("baseline", baseline_template, score=1.0) # Assuming baseline is good. + LOG.info("Loaded baseline prompt 'extract-action.j2'.") + except Exception as e: + LOG.error(f"Failed to load baseline prompt: {e}", exc_info=True) + + def add_prompt(self, name, template, score=0): + """ + Adds a new prompt to the population. + """ + if name in self.prompts: + LOG.warning(f"Prompt with name '{name}' already exists. Overwriting.") + + self.prompts[name] = Prompt(name, template, score) + LOG.info(f"Added prompt '{name}' with score {score}.") + + def get_prompt(self, name): + """ + Retrieves a prompt object by its name. + """ + return self.prompts.get(name) + + def get_best_prompt(self): + """ + Returns the prompt with the highest score. + """ + if not self.prompts: + return None + + return max(self.prompts.values(), key=lambda p: p.score) + + def update_score(self, name, score): + """ + Updates the score of a prompt after evaluation. 
+ """ + if name in self.prompts: + self.prompts[name].score = score + LOG.info(f"Updated score for prompt '{name}' to {score}.") + else: + LOG.warning(f"Prompt '{name}' not found for score update.") \ No newline at end of file diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index b7df768747..51c54aaf70 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1316,11 +1316,31 @@ async def _build_extract_action_prompt( ) task_type = task.task_type if task.task_type else TaskType.general - template = "" + + # Determine which template to use. Evolved prompts are handled as raw strings, + # while standard prompts are handled by name. + template_name: str | None = None + template_str: str | None = None + if task_type == TaskType.general: - template = "extract-action" + # For general tasks, try to use the best prompt from our evolution manager. + best_prompt = app.PROMPT_MANAGER.get_best_prompt() + if best_prompt: + LOG.info(f"Using evolved prompt: {best_prompt.name} with score {best_prompt.score}") + template_str = best_prompt.template + else: + # If no evolved prompts, fall back to the baseline prompt. + LOG.warning("PromptManager has no prompts. Falling back to baseline 'extract-action'.") + baseline_prompt = app.PROMPT_MANAGER.get_prompt("baseline") + if baseline_prompt: + template_str = baseline_prompt.template + else: + # If even the baseline is missing, this is a critical error. + LOG.error("Baseline prompt could not be loaded from PromptManager.") + # As a last resort, use the template name. 
+ template_name = "extract-action" elif task_type == TaskType.validation: - template = "decisive-criterion-validate" + template_name = "decisive-criterion-validate" elif task_type == TaskType.action: prompt = prompt_engine.load_prompt("infer-action-type", navigation_goal=navigation_goal) json_response = await app.LLM_API_HANDLER(prompt=prompt, step=step) @@ -1329,26 +1349,22 @@ async def _build_extract_action_prompt( reason=json_response.get("thought"), error_type=json_response.get("error") ) - action_type: str = json_response.get("action_type") or "" - action_type = ActionType[action_type.upper()] + action_type_str: str = json_response.get("action_type") or "" + action_type = ActionType[action_type_str.upper()] if action_type == ActionType.CLICK: - template = "single-click-action" + template_name = "single-click-action" elif action_type == ActionType.INPUT_TEXT: - template = "single-input-action" + template_name = "single-input-action" elif action_type == ActionType.UPLOAD_FILE: - template = "single-upload-action" + template_name = "single-upload-action" elif action_type == ActionType.SELECT_OPTION: - template = "single-select-action" + template_name = "single-select-action" else: raise UnsupportedActionType(action_type=action_type) - if not template: - raise UnsupportedTaskType(task_type=task_type) - context = skyvern_context.ensure_context() - return prompt_engine.load_prompt( - template=template, + render_kwargs = dict( navigation_goal=navigation_goal, navigation_payload_str=json.dumps(final_navigation_payload), starting_url=starting_url, @@ -1363,6 +1379,22 @@ async def _build_extract_action_prompt( terminate_criterion=task.terminate_criterion, ) + if template_str is not None: + # Render the prompt from a raw string (used for evolved prompts) + return prompt_engine.load_prompt_from_string( + template=template_str, + **render_kwargs, + ) + + if template_name is not None: + # Render the prompt from a template file by name (standard behavior) + return 
prompt_engine.load_prompt( + template=template_name, + **render_kwargs, + ) + + raise UnsupportedTaskType(task_type=task_type) + def _build_navigation_payload( self, task: Task, diff --git a/skyvern/forge/app.py b/skyvern/forge/app.py index 926e63498c..41505f8ede 100644 --- a/skyvern/forge/app.py +++ b/skyvern/forge/app.py @@ -2,6 +2,7 @@ from fastapi import FastAPI +from skyvern.evolution.prompt_manager import PromptManager from skyvern.forge.agent import ForgeAgent from skyvern.forge.agent_functions import AgentFunction from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory @@ -43,4 +44,5 @@ authentication_function: Callable[[str], Awaitable[Organization]] | None = None setup_api_app: Callable[[FastAPI], None] | None = None +PROMPT_MANAGER = PromptManager() agent = ForgeAgent() diff --git a/skyvern/forge/prompts/skyvern/evolve-prompt.j2 b/skyvern/forge/prompts/skyvern/evolve-prompt.j2 new file mode 100644 index 0000000000..1c12284c90 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/evolve-prompt.j2 @@ -0,0 +1,17 @@ +You are an expert in prompt engineering for large language models that control web automation agents. +Your task is to evolve the following prompt to make it more effective. The goal is to improve the agent's ability to understand a webpage and decide on the next best action to achieve a user's goal. + +Here are some principles for a good prompt: +- **Clarity and Conciseness:** The prompt should be easy for the LLM to understand. Avoid ambiguity. +- **Role-setting:** Clearly define the role and capabilities of the agent. +- **Comprehensive Context:** Ensure all necessary information (like page elements, user goal, history) is presented logically. +- **Action-oriented:** The prompt should guide the LLM towards producing a concrete, executable action. +- **Robustness:** The prompt should encourage the model to handle unexpected situations gracefully (e.g., by providing fallback actions or reasoning about errors). 
+ +Here is the prompt to evolve: +--- +{{ prompt_to_evolve }} +--- + +Based on the principles above, please provide a new, improved version of this prompt. +Only output the new prompt template. Do not include any other text or explanation. \ No newline at end of file