Skip to content

Commit 5d360ac

Browse files
authored
Merge pull request #10 from RealShocky/main
2 parents 82c0d9f + 120eb0d commit 5d360ac

File tree

7 files changed

+580
-230
lines changed

7 files changed

+580
-230
lines changed

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ python-dotenv
66
pillow
77
numpy
88
qtawesome
9+
SpeechRecognition
10+
pyttsx3
11+
keyboard
12+
pyaudio

src/anthropic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
from dotenv import load_dotenv
55
import logging
6+
from .prompt_manager import PromptManager
67

78
class AnthropicClient:
89
def __init__(self):
@@ -13,6 +14,7 @@ def __init__(self):
1314

1415
try:
1516
self.client = anthropic.Anthropic(api_key=self.api_key)
17+
self.prompt_manager = PromptManager()
1618
except Exception as e:
1719
raise ValueError(f"Failed to initialize Anthropic client: {str(e)}")
1820

@@ -62,7 +64,7 @@ def get_next_action(self, run_history) -> BetaMessage:
6264
}
6365
],
6466
messages=cleaned_history,
65-
system="The user will ask you to perform a task and you should use their computer to do so. After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. Explicitly show your thinking: 'I have evaluated step X...' If not correct, try again. Only when you confirm a step was executed correctly should you move on to the next one. Note that you have to click into the browser address bar before typing a URL. You should always call a tool! Always return a tool call. Remember call the finish_run tool when you have achieved the goal of the task. Do not explain you have finished the task, just call the tool. Use keyboard shortcuts to navigate whenever possible. Please remember to take a screenshot after EVERY step to confirm you have achieved the right outcome.",
67+
system=self.prompt_manager.get_current_prompt(),
6668
betas=["computer-use-2024-10-22"],
6769
)
6870

src/computer.py

Lines changed: 70 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,41 +8,75 @@ class ComputerControl:
88
def __init__(self):
99
self.screen_width, self.screen_height = pyautogui.size()
1010
pyautogui.PAUSE = 0.5 # Add a small delay between actions for stability
11+
self.last_click_position = None
1112

1213
def perform_action(self, action):
1314
action_type = action['type']
1415

15-
if action_type == 'mouse_move':
16-
x, y = self.map_from_ai_space(action['x'], action['y'])
17-
pyautogui.moveTo(x, y)
18-
elif action_type == 'left_click':
19-
pyautogui.click()
20-
time.sleep(0.1) # Add a small delay after clicking
21-
elif action_type == 'right_click':
22-
pyautogui.rightClick()
23-
time.sleep(0.1)
24-
elif action_type == 'middle_click':
25-
pyautogui.middleClick()
26-
time.sleep(0.1)
27-
elif action_type == 'double_click':
28-
pyautogui.doubleClick()
29-
time.sleep(0.1)
30-
elif action_type == 'left_click_drag':
31-
start_x, start_y = pyautogui.position()
32-
end_x, end_y = self.map_from_ai_space(action['x'], action['y'])
33-
pyautogui.dragTo(end_x, end_y, button='left', duration=0.5)
34-
elif action_type == 'type':
35-
pyautogui.write(action['text'], interval=0.1) # Add a small delay between keystrokes
36-
elif action_type == 'key':
37-
pyautogui.press(action['text'])
38-
time.sleep(0.1)
39-
elif action_type == 'screenshot':
40-
return self.take_screenshot()
41-
elif action_type == 'cursor_position':
42-
x, y = pyautogui.position()
43-
return self.map_to_ai_space(x, y)
44-
else:
45-
raise ValueError(f"Unsupported action: {action_type}")
16+
# Take a screenshot before the action
17+
before_screenshot = self.take_screenshot()
18+
19+
try:
20+
if action_type == 'mouse_move':
21+
x, y = self.map_from_ai_space(action['x'], action['y'])
22+
pyautogui.moveTo(x, y)
23+
time.sleep(0.2) # Wait for move to complete
24+
25+
elif action_type == 'left_click':
26+
pyautogui.click()
27+
time.sleep(0.2) # Wait for click to register
28+
self.last_click_position = pyautogui.position()
29+
30+
elif action_type == 'right_click':
31+
pyautogui.rightClick()
32+
time.sleep(0.2)
33+
34+
elif action_type == 'middle_click':
35+
pyautogui.middleClick()
36+
time.sleep(0.2)
37+
38+
elif action_type == 'double_click':
39+
pyautogui.doubleClick()
40+
time.sleep(0.2)
41+
self.last_click_position = pyautogui.position()
42+
43+
elif action_type == 'left_click_drag':
44+
start_x, start_y = pyautogui.position()
45+
end_x, end_y = self.map_from_ai_space(action['x'], action['y'])
46+
pyautogui.dragTo(end_x, end_y, button='left', duration=0.5)
47+
time.sleep(0.2)
48+
49+
elif action_type == 'type':
50+
# If we have a last click position, ensure we're still there
51+
if self.last_click_position:
52+
current_pos = pyautogui.position()
53+
if current_pos != self.last_click_position:
54+
pyautogui.click(self.last_click_position)
55+
time.sleep(0.2)
56+
57+
pyautogui.write(action['text'], interval=0.1)
58+
time.sleep(0.2)
59+
60+
elif action_type == 'key':
61+
pyautogui.press(action['text'])
62+
time.sleep(0.2)
63+
64+
elif action_type == 'screenshot':
65+
return self.take_screenshot()
66+
67+
elif action_type == 'cursor_position':
68+
x, y = pyautogui.position()
69+
return self.map_to_ai_space(x, y)
70+
71+
else:
72+
raise ValueError(f"Unsupported action: {action_type}")
73+
74+
# Take a screenshot after the action
75+
after_screenshot = self.take_screenshot()
76+
return after_screenshot
77+
78+
except Exception as e:
79+
raise Exception(f"Action failed: {action_type} - {str(e)}")
4680

4781
def take_screenshot(self):
4882
screenshot = pyautogui.screenshot()
@@ -61,3 +95,8 @@ def map_to_ai_space(self, x, y):
6195

6296
def resize_for_ai(self, screenshot):
6397
return screenshot.resize((1280, 800), Image.LANCZOS)
98+
99+
def cleanup(self):
100+
"""Clean up any resources or running processes"""
101+
# Add cleanup code here if needed
102+
pass

src/prompt_manager.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import json
2+
import os
3+
from pathlib import Path
4+
5+
DEFAULT_SYSTEM_PROMPT = """The user will ask you to perform a task and you should use their computer to do so. After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. Explicitly show your thinking: 'I have evaluated step X...' If not correct, try again. Only when you confirm a step was executed correctly should you move on to the next one. Note that you have to click into the browser address bar before typing a URL. You should always call a tool! Always return a tool call. Remember call the finish_run tool when you have achieved the goal of the task. Do not explain you have finished the task, just call the tool. Use keyboard shortcuts to navigate whenever possible. Please remember to take a screenshot after EVERY step to confirm you have achieved the right outcome."""
6+
7+
class PromptManager:
    """Load, persist and serve the system prompt sent to the model.

    The prompt is stored as JSON (``{"system_prompt": "..."}``) in
    ``~/.grunty/prompts.json``. Every failure to read or write that file is
    reported on stdout and degrades to the built-in default prompt instead of
    raising, so a broken config never prevents the client from starting.
    """

    def __init__(self):
        self.config_dir = Path.home() / ".grunty"
        self.config_file = self.config_dir / "prompts.json"
        self.current_prompt = self.load_prompt()

    def load_prompt(self) -> str:
        """Load the system prompt from the config file or return the default.

        On first use (no config file yet) the default prompt is written to
        disk so the user has a file to edit.

        Returns:
            The stored prompt, or ``DEFAULT_SYSTEM_PROMPT`` when the file is
            missing, unreadable, not valid JSON, or does not hold a string
            under the ``system_prompt`` key.
        """
        try:
            if not self.config_file.exists():
                # save_prompt creates the config directory when needed.
                self.save_prompt(DEFAULT_SYSTEM_PROMPT)
                return DEFAULT_SYSTEM_PROMPT

            with open(self.config_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading prompt: {e}")
            return DEFAULT_SYSTEM_PROMPT

        # A hand-edited file may hold any JSON value; only a string is usable
        # as a system prompt, anything else falls back to the default.
        if isinstance(data, dict):
            stored = data.get('system_prompt', DEFAULT_SYSTEM_PROMPT)
            if isinstance(stored, str):
                return stored
        return DEFAULT_SYSTEM_PROMPT

    def save_prompt(self, prompt: str) -> bool:
        """Save the system prompt to the config file.

        Args:
            prompt: The prompt text to persist and make current.

        Returns:
            True when the prompt was written (and is now the current prompt),
            False when serialization or the write failed; in that case the
            current prompt is left unchanged.
        """
        try:
            # Serialize before touching the file so a non-serializable value
            # cannot leave a truncated config behind.
            payload = json.dumps({'system_prompt': prompt}, indent=2)
            # The directory may be missing on first run or after the user
            # deleted it; exist_ok avoids a race with a concurrent creator.
            self.config_dir.mkdir(parents=True, exist_ok=True)
            self.config_file.write_text(payload, encoding='utf-8')
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving prompt: {e}")
            return False

        self.current_prompt = prompt
        return True

    def reset_to_default(self) -> bool:
        """Reset the system prompt to the default value.

        Returns:
            True when the default prompt was written to disk, False otherwise.
        """
        return self.save_prompt(DEFAULT_SYSTEM_PROMPT)

    def get_current_prompt(self) -> str:
        """Get the current system prompt."""
        return self.current_prompt

src/store.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,35 @@ def run_agent(self, update_callback):
6363
self.running = False
6464
break
6565

66-
self.computer_control.perform_action(action)
67-
68-
logger.info(f"Performed action: {action['type']}")
69-
70-
screenshot = self.computer_control.take_screenshot()
71-
self.run_history.append({
72-
"role": "user",
73-
"content": [
74-
{
75-
"type": "tool_result",
76-
"tool_use_id": self.last_tool_use_id,
66+
try:
67+
# Perform the action and get the screenshot
68+
screenshot = self.computer_control.perform_action(action)
69+
70+
if screenshot: # Only add screenshot if one was returned
71+
self.run_history.append({
72+
"role": "user",
7773
"content": [
78-
{"type": "text", "text": "Here is a screenshot after the action was executed"},
79-
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": screenshot}}
74+
{
75+
"type": "tool_result",
76+
"tool_use_id": self.last_tool_use_id,
77+
"content": [
78+
{"type": "text", "text": "Here is a screenshot after the action was executed"},
79+
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": screenshot}}
80+
]
81+
}
8082
]
81-
}
82-
]
83-
})
84-
logger.debug("Screenshot added to run history")
83+
})
84+
logger.debug("Screenshot added to run history")
85+
86+
except Exception as action_error:
87+
error_msg = f"Action failed: {str(action_error)}"
88+
update_callback(f"Error: {error_msg}")
89+
logger.error(error_msg)
90+
# Don't stop running, let the AI handle the error
91+
self.run_history.append({
92+
"role": "user",
93+
"content": [{"type": "text", "text": error_msg}]
94+
})
8595

8696
except Exception as e:
8797
self.error = str(e)
@@ -91,8 +101,16 @@ def run_agent(self, update_callback):
91101
break
92102

93103
def stop_run(self):
104+
"""Stop the current agent run and clean up resources"""
94105
self.running = False
106+
if hasattr(self, 'computer_control'):
107+
self.computer_control.cleanup()
95108
logger.info("Agent run stopped")
109+
# Add a message to the run history to indicate stopping
110+
self.run_history.append({
111+
"role": "user",
112+
"content": [{"type": "text", "text": "Agent run stopped by user."}]
113+
})
96114

97115
def extract_action(self, message):
98116
logger.debug(f"Extracting action from message: {message}")
@@ -161,6 +179,10 @@ def display_assistant_message(self, message, update_callback):
161179
}
162180
update_callback(f"Performed action: {json.dumps(action)}")
163181
elif tool_name == 'finish_run':
164-
update_callback("Assistant: Task completed! ")
182+
update_callback("Assistant: Task completed! ")
165183
else:
166184
update_callback(f"Assistant action: {tool_name} - {json.dumps(tool_input)}")
185+
186+
def cleanup(self):
187+
if hasattr(self, 'computer_control'):
188+
self.computer_control.cleanup()

0 commit comments

Comments
 (0)