Skip to content

Commit b3920b5

Browse files
raiden-staging and raidendotai authored
cua samples (ts,python) added, browser-use resize glitch fixed, updated to latest kernel sdk version in packages (#33)
* ts-cua sample added * python/browser-use : fixed viewport & window resize * added python-cua + updated readme & cli --------- Co-authored-by: raidendotai <[email protected]>
1 parent 3fe0b0d commit b3920b5

39 files changed

+1748
-8
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ create-kernel-app [app-name] [options]
4747
- `stagehand`: Template with Stagehand SDK (Typescript only)
4848
- `advanced-sample`: Implements sample apps using advanced Kernel configs
4949
- `computer-use`: Implements a prompt loop using Anthropic Computer Use
50+
- `cua`: Implements a Computer Use Agent (OpenAI CUA) sample
5051

5152
### Examples
5253

@@ -121,6 +122,12 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google
121122

122123
# Python + Browser Use
123124
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'
125+
126+
# Typescript + CUA Sample
127+
kernel invoke ts-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'
128+
129+
# Python + CUA Sample
130+
kernel invoke python-cua cua-task --payload '{"task": "Get current market price range for an unboxed Dreamcast"}'
124131
```
125132

126133
## Sample apps reference
@@ -134,6 +141,7 @@ These are the sample apps currently available when you run `npx @onkernel/create
134141
| **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` |
135142
| **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a |
136143
| **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` |
144+
| **cua** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ task }` |
137145

138146
## Documentation
139147

index.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ type TemplateKey =
1818
| "browser-use"
1919
| "stagehand"
2020
| "advanced-sample"
21-
| "computer-use";
21+
| "computer-use"
22+
| "cua";
2223
type LanguageInfo = { name: string; shorthand: string };
2324
type TemplateInfo = {
2425
name: string;
@@ -34,6 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use";
3435
const TEMPLATE_STAGEHAND = "stagehand";
3536
const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample";
3637
const TEMPLATE_COMPUTER_USE = "computer-use";
38+
const TEMPLATE_CUA = "cua";
3739
const LANGUAGE_SHORTHAND_TS = "ts";
3840
const LANGUAGE_SHORTHAND_PY = "py";
3941

@@ -73,6 +75,11 @@ const TEMPLATES: Record<TemplateKey, TemplateInfo> = {
7375
description: "Implements the Anthropic Computer Use SDK",
7476
languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON],
7577
},
78+
[TEMPLATE_CUA]: {
79+
name: "CUA Sample",
80+
description: "Implements a Computer Use Agent (OpenAI CUA) sample",
81+
languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON],
82+
},
7683
};
7784

7885
const INVOKE_SAMPLES: Record<
@@ -88,6 +95,8 @@ const INVOKE_SAMPLES: Record<
8895
'kernel invoke ts-advanced test-captcha-solver',
8996
[TEMPLATE_COMPUTER_USE]:
9097
'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'',
98+
[TEMPLATE_CUA]:
99+
'kernel invoke ts-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'',
91100
},
92101
[LANGUAGE_PYTHON]: {
93102
[TEMPLATE_SAMPLE_APP]:
@@ -98,6 +107,8 @@ const INVOKE_SAMPLES: Record<
98107
'kernel invoke python-advanced test-captcha-solver',
99108
[TEMPLATE_COMPUTER_USE]:
100109
'kernel invoke python-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'',
110+
[TEMPLATE_CUA]:
111+
'kernel invoke python-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'',
101112
},
102113
};
103114

@@ -114,6 +125,8 @@ const REGISTERED_APP_NAMES: Record<
114125
'ts-advanced',
115126
[TEMPLATE_COMPUTER_USE]:
116127
'ts-cu',
128+
[TEMPLATE_CUA]:
129+
'ts-cua',
117130
},
118131
[LANGUAGE_PYTHON]: {
119132
[TEMPLATE_SAMPLE_APP]:
@@ -124,6 +137,8 @@ const REGISTERED_APP_NAMES: Record<
124137
'python-advanced',
125138
[TEMPLATE_COMPUTER_USE]:
126139
'python-cu',
140+
[TEMPLATE_CUA]:
141+
'python-cua',
127142
},
128143
};
129144

@@ -354,12 +369,16 @@ function printNextSteps(
354369
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
355370
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE
356371
? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX"
372+
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA
373+
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
357374
: language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE)
358375
? "kernel deploy main.py"
359376
: language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE
360377
? "kernel deploy main.py --env OPENAI_API_KEY=XXX"
361378
: language === LANGUAGE_PYTHON && template === TEMPLATE_COMPUTER_USE
362379
? "kernel deploy main.py --env ANTHROPIC_API_KEY=XXX"
380+
: language === LANGUAGE_PYTHON && template === TEMPLATE_CUA
381+
? "kernel deploy main.py --env OPENAI_API_KEY=XXX"
363382
: "";
364383

365384
console.log(

templates/python/browser-use/main.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from langchain_openai import ChatOpenAI
2-
from browser_use import Agent, BrowserSession
2+
from browser_use import Agent
33
import kernel
44
from kernel import Kernel
55
from typing import TypedDict
6+
from session import BrowserSessionCustomResize
67

78
client = Kernel()
89

@@ -13,7 +14,7 @@ class TaskInput(TypedDict):
1314

1415
# LLM API Keys are set in the environment during `kernel deploy <filename> -e OPENAI_API_KEY=XXX`
1516
# See https://docs.onkernel.com/launch/deploy#environment-variables
16-
llm = ChatOpenAI(model="gpt-4o")
17+
llm = ChatOpenAI(model="gpt-4o-mini")
1718

1819
@app.action("bu-task")
1920
async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
@@ -37,7 +38,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
3738
#task="Compare the price of gpt-4o and DeepSeek-V3",
3839
task=input_data["task"],
3940
llm=llm,
40-
browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url)
41+
browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url)
4142
)
4243
result = await agent.run()
4344
if result.final_result() is not None:
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
from browser_use import BrowserSession
2+
3+
# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP)
4+
class BrowserSessionCustomResize(BrowserSession):
5+
async def _setup_viewports(self) -> None:
6+
"""Resize any existing page viewports to match the configured size, set up storage_state, permissions, geolocation, etc."""
7+
8+
assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'
9+
10+
self.browser_profile.window_size = {"width": 1024, "height": 786}
11+
self.browser_profile.viewport = {"width": 1024, "height": 786}
12+
self.browser_profile.screen = {"width": 1024, "height": 786}
13+
self.browser_profile.device_scale_factor = 1.0
14+
15+
# keep a local reference to the viewport settings for use below
16+
viewport = self.browser_profile.viewport
17+
# apply any profile-level settings (permissions, timeouts, extra HTTP headers, geolocation) to the entire browser_context as defaults
18+
if self.browser_profile.permissions:
19+
try:
20+
await self.browser_context.grant_permissions(self.browser_profile.permissions)
21+
except Exception as e:
22+
print(e)
23+
try:
24+
if self.browser_profile.default_timeout:
25+
self.browser_context.set_default_timeout(self.browser_profile.default_timeout)
26+
if self.browser_profile.default_navigation_timeout:
27+
self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout)
28+
except Exception as e:
29+
print(e)
30+
try:
31+
if self.browser_profile.extra_http_headers:
32+
self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers)
33+
except Exception as e:
34+
print(e)
35+
36+
try:
37+
if self.browser_profile.geolocation:
38+
await self.browser_context.set_geolocation(self.browser_profile.geolocation)
39+
except Exception as e:
40+
print(e)
41+
42+
await self.load_storage_state()
43+
44+
page = None
45+
46+
for page in self.browser_context.pages:
47+
# apply viewport size settings to any existing pages
48+
if viewport:
49+
await page.set_viewport_size(viewport)
50+
51+
# show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
52+
if page.url == 'about:blank':
53+
await self._show_dvd_screensaver_loading_animation(page)
54+
55+
page = page or (await self.browser_context.new_page())
56+
57+
if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless:
58+
# attempt to resize the actual browser window
59+
60+
# cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
61+
try:
62+
cdp_session = await page.context.new_cdp_session(page)
63+
window_id_result = await cdp_session.send('Browser.getWindowForTarget')
64+
await cdp_session.send(
65+
'Browser.setWindowBounds',
66+
{
67+
'windowId': window_id_result['windowId'],
68+
'bounds': {
69+
**self.browser_profile.window_size,
70+
'windowState': 'normal', # Ensure window is not minimized/maximized
71+
},
72+
},
73+
)
74+
await cdp_session.detach()
75+
except Exception as e:
76+
_log_size = lambda size: f'{size["width"]}x{size["height"]}px'
77+
try:
78+
# fallback to javascript resize if cdp setWindowBounds fails
79+
await page.evaluate(
80+
"""(width, height) => {window.resizeTo(width, height)}""",
81+
**self.browser_profile.window_size,
82+
)
83+
return
84+
except Exception as e:
85+
pass

templates/python/cua/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Kernel Python Sample App - CUA
2+
3+
This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI.
4+
5+
It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation.
6+
7+
See the [docs](https://docs.onkernel.com/quickstart) for more information.

templates/python/cua/__init__.py

Whitespace-only changes.

templates/python/cua/_gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
__pycache__/
2+
.env
3+
.venv/
4+
env/
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .agent import Agent

0 commit comments

Comments
 (0)