Merge pull request #10 from RealShocky/main

suitedaces · web-flow · commit 5d360ac21d70 · 2025-01-27T23:26:00.000-06:00
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,7 @@ python-dotenv
 pillow
 numpy
 qtawesome
+SpeechRecognition
+pyttsx3
+keyboard
+pyaudio
diff --git a/src/anthropic.py b/src/anthropic.py
@@ -3,6 +3,7 @@
 import os
 from dotenv import load_dotenv
 import logging
+from .prompt_manager import PromptManager
 
 class AnthropicClient:
     def __init__(self):
@@ -13,6 +14,7 @@ def __init__(self):
         
         try:
             self.client = anthropic.Anthropic(api_key=self.api_key)
+            self.prompt_manager = PromptManager()
         except Exception as e:
             raise ValueError(f"Failed to initialize Anthropic client: {str(e)}")
         
@@ -62,7 +64,7 @@ def get_next_action(self, run_history) -> BetaMessage:
                     }
                 ],
                 messages=cleaned_history,
-                system="The user will ask you to perform a task and you should use their computer to do so. After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. Explicitly show your thinking: 'I have evaluated step X...' If not correct, try again. Only when you confirm a step was executed correctly should you move on to the next one. Note that you have to click into the browser address bar before typing a URL. You should always call a tool! Always return a tool call. Remember call the finish_run tool when you have achieved the goal of the task. Do not explain you have finished the task, just call the tool. Use keyboard shortcuts to navigate whenever possible. Please remember to take a screenshot after EVERY step to confirm you have achieved the right outcome.",
+                system=self.prompt_manager.get_current_prompt(),
                 betas=["computer-use-2024-10-22"],
             )
 
diff --git a/src/computer.py b/src/computer.py
@@ -8,41 +8,75 @@ class ComputerControl:
     def __init__(self):
         self.screen_width, self.screen_height = pyautogui.size()
         pyautogui.PAUSE = 0.5  # Add a small delay between actions for stability
+        self.last_click_position = None
         
     def perform_action(self, action):
         action_type = action['type']
         
-        if action_type == 'mouse_move':
-            x, y = self.map_from_ai_space(action['x'], action['y'])
-            pyautogui.moveTo(x, y)
-        elif action_type == 'left_click':
-            pyautogui.click()
-            time.sleep(0.1)  # Add a small delay after clicking
-        elif action_type == 'right_click':
-            pyautogui.rightClick()
-            time.sleep(0.1)
-        elif action_type == 'middle_click':
-            pyautogui.middleClick()
-            time.sleep(0.1)
-        elif action_type == 'double_click':
-            pyautogui.doubleClick()
-            time.sleep(0.1)
-        elif action_type == 'left_click_drag':
-            start_x, start_y = pyautogui.position()
-            end_x, end_y = self.map_from_ai_space(action['x'], action['y'])
-            pyautogui.dragTo(end_x, end_y, button='left', duration=0.5)
-        elif action_type == 'type':
-            pyautogui.write(action['text'], interval=0.1)  # Add a small delay between keystrokes
-        elif action_type == 'key':
-            pyautogui.press(action['text'])
-            time.sleep(0.1)
-        elif action_type == 'screenshot':
-            return self.take_screenshot()
-        elif action_type == 'cursor_position':
-            x, y = pyautogui.position()
-            return self.map_to_ai_space(x, y)
-        else:
-            raise ValueError(f"Unsupported action: {action_type}")
+        # Run one action dict: 'type' picks the branch, 'x'/'y' or 'text' feed it.
+        # 'screenshot' and 'cursor_position' return their own result; every other
+        # action returns a screenshot taken afterwards. Failures re-raise as Exception.
+        try:
+            if action_type == 'mouse_move':
+                x, y = self.map_from_ai_space(action['x'], action['y'])
+                pyautogui.moveTo(x, y)
+                time.sleep(0.2)  # Wait for move to complete
+                
+            elif action_type == 'left_click':
+                pyautogui.click()
+                time.sleep(0.2)  # Wait for click to register
+                self.last_click_position = pyautogui.position()
+                
+            elif action_type == 'right_click':
+                pyautogui.rightClick()
+                time.sleep(0.2)
+                
+            elif action_type == 'middle_click':
+                pyautogui.middleClick()
+                time.sleep(0.2)
+                
+            elif action_type == 'double_click':
+                pyautogui.doubleClick()
+                time.sleep(0.2)
+                self.last_click_position = pyautogui.position()
+                
+            elif action_type == 'left_click_drag':
+                # The drag starts from wherever the cursor currently is.
+                end_x, end_y = self.map_from_ai_space(action['x'], action['y'])
+                pyautogui.dragTo(end_x, end_y, button='left', duration=0.5)
+                time.sleep(0.2)
+                
+            elif action_type == 'type':
+                # Re-click the last click position if the cursor has moved since, before typing
+                if self.last_click_position:
+                    current_pos = pyautogui.position()
+                    if current_pos != self.last_click_position:
+                        pyautogui.click(self.last_click_position)
+                        time.sleep(0.2)
+                
+                pyautogui.write(action['text'], interval=0.1)
+                time.sleep(0.2)
+                
+            elif action_type == 'key':
+                pyautogui.press(action['text'])
+                time.sleep(0.2)
+                
+            elif action_type == 'screenshot':
+                return self.take_screenshot()
+                
+            elif action_type == 'cursor_position':
+                x, y = pyautogui.position()
+                return self.map_to_ai_space(x, y)
+                
+            else:
+                raise ValueError(f"Unsupported action: {action_type}")
+            
+            # Capture the screen once, after the action, and hand it back
+            # to the caller as the result of the action.
+            return self.take_screenshot()
+            
+        except Exception as e:
+            raise Exception(f"Action failed: {action_type} - {str(e)}") from e
         
     def take_screenshot(self):
         screenshot = pyautogui.screenshot()
@@ -61,3 +95,8 @@ def map_to_ai_space(self, x, y):
         
     def resize_for_ai(self, screenshot):
         return screenshot.resize((1280, 800), Image.LANCZOS)
+
+    def cleanup(self):
+        """Release resources held by this controller (placeholder, does nothing yet)"""
+        # No teardown logic exists so far; extend this method when it is needed
+        return None
diff --git a/src/prompt_manager.py b/src/prompt_manager.py
@@ -0,0 +1,47 @@
+import json
+import os
+from pathlib import Path
+
+DEFAULT_SYSTEM_PROMPT = """The user will ask you to perform a task and you should use their computer to do so. After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. Explicitly show your thinking: 'I have evaluated step X...' If not correct, try again. Only when you confirm a step was executed correctly should you move on to the next one. Note that you have to click into the browser address bar before typing a URL. You should always call a tool! Always return a tool call. Remember call the finish_run tool when you have achieved the goal of the task. Do not explain you have finished the task, just call the tool. Use keyboard shortcuts to navigate whenever possible. Please remember to take a screenshot after EVERY step to confirm you have achieved the right outcome."""
+
+class PromptManager:
+    """Load, cache and persist the system prompt stored in ~/.grunty/prompts.json"""
+    def __init__(self):
+        self.config_dir = Path.home() / ".grunty"
+        self.config_file = self.config_dir / "prompts.json"
+        self.current_prompt = self.load_prompt()
+
+    def load_prompt(self) -> str:
+        """Return the saved prompt; write and return the default if no file exists, default on any error"""
+        try:
+            if not self.config_file.exists():
+                self.save_prompt(DEFAULT_SYSTEM_PROMPT)
+                return DEFAULT_SYSTEM_PROMPT
+
+            with open(self.config_file, 'r', encoding='utf-8') as f:
+                prompt = json.load(f).get('system_prompt', DEFAULT_SYSTEM_PROMPT)
+            # A hand-edited file may hold a non-string value; fall back rather than return it
+            return prompt if isinstance(prompt, str) else DEFAULT_SYSTEM_PROMPT
+        except Exception as e:
+            print(f"Error loading prompt: {e}")
+            return DEFAULT_SYSTEM_PROMPT
+
+    def save_prompt(self, prompt: str) -> bool:
+        """Write the prompt to the config file (creating its directory) and cache it; False on failure"""
+        try:
+            self.config_dir.mkdir(parents=True, exist_ok=True)
+            with open(self.config_file, 'w', encoding='utf-8') as f:
+                json.dump({'system_prompt': prompt}, f, indent=2)
+            self.current_prompt = prompt
+            return True
+        except Exception as e:
+            print(f"Error saving prompt: {e}")
+            return False
+
+    def reset_to_default(self) -> bool:
+        """Reset the system prompt to the default value"""
+        return self.save_prompt(DEFAULT_SYSTEM_PROMPT)
+
+    def get_current_prompt(self) -> str:
+        """Get the current system prompt"""
+        return self.current_prompt
diff --git a/src/store.py b/src/store.py
@@ -63,25 +63,35 @@ def run_agent(self, update_callback):
                     self.running = False
                     break
                 
-                self.computer_control.perform_action(action)
-
-                logger.info(f"Performed action: {action['type']}")
-                
-                screenshot = self.computer_control.take_screenshot()
-                self.run_history.append({
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "tool_result",
-                            "tool_use_id": self.last_tool_use_id,
+                try:
+                    # Perform the action and get the screenshot
+                    screenshot = self.computer_control.perform_action(action)
+                    
+                    if screenshot:  # Only add screenshot if one was returned
+                        self.run_history.append({
+                            "role": "user",
                             "content": [
-                                {"type": "text", "text": "Here is a screenshot after the action was executed"},
-                                {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": screenshot}}
+                                {
+                                    "type": "tool_result",
+                                    "tool_use_id": self.last_tool_use_id,
+                                    "content": [
+                                        {"type": "text", "text": "Here is a screenshot after the action was executed"},
+                                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": screenshot}}
+                                    ]
+                                }
                             ]
-                        }
-                    ]
-                })
-                logger.debug("Screenshot added to run history")
+                        })
+                        logger.debug("Screenshot added to run history")
+                    
+                except Exception as action_error:
+                    error_msg = f"Action failed: {str(action_error)}"
+                    update_callback(f"Error: {error_msg}")
+                    logger.error(error_msg)
+                    # Don't stop running, let the AI handle the error
+                    self.run_history.append({
+                        "role": "user",
+                        "content": [{"type": "text", "text": error_msg}]
+                    })
                 
             except Exception as e:
                 self.error = str(e)
@@ -91,8 +101,16 @@ def run_agent(self, update_callback):
                 break
         
     def stop_run(self):
+        """Clear the running flag, clean up the computer controller, and record the stop in run_history"""
         self.running = False
+        if hasattr(self, 'computer_control'):
+            self.computer_control.cleanup()
         logger.info("Agent run stopped")
+        # Append a user-role text message so the run history shows the run was stopped
+        self.run_history.append({
+            "role": "user",
+            "content": [{"type": "text", "text": "Agent run stopped by user."}]
+        })
         
     def extract_action(self, message):
         logger.debug(f"Extracting action from message: {message}")
@@ -161,6 +179,10 @@ def display_assistant_message(self, message, update_callback):
                         }
                         update_callback(f"Performed action: {json.dumps(action)}")
                     elif tool_name == 'finish_run':
-                        update_callback("Assistant: Task completed! ✨")
+                        update_callback("Assistant: Task completed! ")
                     else:
                         update_callback(f"Assistant action: {tool_name} - {json.dumps(tool_input)}")
+
+    def cleanup(self):  # Delegate cleanup to the computer controller, if one exists
+        if hasattr(self, 'computer_control'):  # presumably absent when setup failed; TODO confirm
+            self.computer_control.cleanup()
diff --git a/src/voice_control.py b/src/voice_control.py
diff --git a/src/window.py b/src/window.py