Skip to content

cua samples (ts,python) added, browser-use resize glitch fixed, updated to latest kernel sdk version in packages #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ create-kernel-app [app-name] [options]
- `stagehand`: Template with Stagehand SDK (Typescript only)
- `advanced-sample`: Implements sample apps using advanced Kernel configs
- `computer-use`: Implements a prompt loop using Anthropic Computer Use
- `cua`: Implements a Computer Use Agent (OpenAI CUA) sample

### Examples

Expand Down Expand Up @@ -121,6 +122,12 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google

# Python + Browser Use
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'

# Typescript + CUA Sample
kernel invoke ts-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'

# Python + CUA Sample
kernel invoke python-cua cua-task --payload '{"task": "Get current market price range for an unboxed Dreamcast"}'
```

## Sample apps reference
Expand All @@ -134,6 +141,7 @@ These are the sample apps currently available when you run `npx @onkernel/create
| **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` |
| **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a | n/a |
| **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` |
| **cua** | Implements the OpenAI Computer-Using Agent (CUA) | OpenAI CUA | `{ task }` |

## Documentation

Expand Down
21 changes: 20 additions & 1 deletion index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ type TemplateKey =
| "browser-use"
| "stagehand"
| "advanced-sample"
| "computer-use";
| "computer-use"
| "cua";
type LanguageInfo = { name: string; shorthand: string };
type TemplateInfo = {
name: string;
Expand All @@ -34,6 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use";
const TEMPLATE_STAGEHAND = "stagehand";
const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample";
const TEMPLATE_COMPUTER_USE = "computer-use";
const TEMPLATE_CUA = "cua";
const LANGUAGE_SHORTHAND_TS = "ts";
const LANGUAGE_SHORTHAND_PY = "py";

Expand Down Expand Up @@ -73,6 +75,11 @@ const TEMPLATES: Record<TemplateKey, TemplateInfo> = {
description: "Implements the Anthropic Computer Use SDK",
languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON],
},
[TEMPLATE_CUA]: {
name: "CUA Sample",
description: "Implements a Computer Use Agent (OpenAI CUA) sample",
languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON],
},
};

const INVOKE_SAMPLES: Record<
Expand All @@ -88,6 +95,8 @@ const INVOKE_SAMPLES: Record<
'kernel invoke ts-advanced test-captcha-solver',
[TEMPLATE_COMPUTER_USE]:
'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'',
// The cua-task action reads its prompt from the `task` payload key (see the README invoke examples), not `query`.
[TEMPLATE_CUA]:
'kernel invoke ts-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}\'',
},
[LANGUAGE_PYTHON]: {
[TEMPLATE_SAMPLE_APP]:
Expand All @@ -98,6 +107,8 @@ const INVOKE_SAMPLES: Record<
'kernel invoke python-advanced test-captcha-solver',
[TEMPLATE_COMPUTER_USE]:
'kernel invoke python-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'',
// The cua-task action reads its prompt from the `task` payload key (see the README invoke examples), not `query`.
[TEMPLATE_CUA]:
'kernel invoke python-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}\'',
},
};

Expand All @@ -114,6 +125,8 @@ const REGISTERED_APP_NAMES: Record<
'ts-advanced',
[TEMPLATE_COMPUTER_USE]:
'ts-cu',
[TEMPLATE_CUA]:
'ts-cua',
},
[LANGUAGE_PYTHON]: {
[TEMPLATE_SAMPLE_APP]:
Expand All @@ -124,6 +137,8 @@ const REGISTERED_APP_NAMES: Record<
'python-advanced',
[TEMPLATE_COMPUTER_USE]:
'python-cu',
[TEMPLATE_CUA]:
'python-cua',
},
};

Expand Down Expand Up @@ -354,12 +369,16 @@ function printNextSteps(
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE
? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX"
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
: language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE)
? "kernel deploy main.py"
: language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE
? "kernel deploy main.py --env OPENAI_API_KEY=XXX"
: language === LANGUAGE_PYTHON && template === TEMPLATE_COMPUTER_USE
? "kernel deploy main.py --env ANTHROPIC_API_KEY=XXX"
: language === LANGUAGE_PYTHON && template === TEMPLATE_CUA
? "kernel deploy main.py --env OPENAI_API_KEY=XXX"
: "";

console.log(
Expand Down
7 changes: 4 additions & 3 deletions templates/python/browser-use/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from langchain_openai import ChatOpenAI
from browser_use import Agent, BrowserSession
from browser_use import Agent
import kernel
from kernel import Kernel
from typing import TypedDict
from session import BrowserSessionCustomResize

client = Kernel()

Expand All @@ -13,7 +14,7 @@ class TaskInput(TypedDict):

# LLM API Keys are set in the environment during `kernel deploy <filename> -e OPENAI_API_KEY=XXX`
# See https://docs.onkernel.com/launch/deploy#environment-variables
llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o-mini")

@app.action("bu-task")
async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
Expand All @@ -37,7 +38,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
#task="Compare the price of gpt-4o and DeepSeek-V3",
task=input_data["task"],
llm=llm,
browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url)
browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url)
)
result = await agent.run()
if result.final_result() is not None:
Expand Down
85 changes: 85 additions & 0 deletions templates/python/browser-use/session.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from browser_use import BrowserSession

# Subclass of BrowserSession that overrides _setup_viewports, which mishandles resizing when connecting over CDP.
class BrowserSessionCustomResize(BrowserSession):
    """BrowserSession that forces a fixed viewport size on every existing page.

    Overrides ``_setup_viewports`` so that the profile's window size, viewport
    and screen are pinned to a fixed size before any page is touched.
    """

    async def _setup_viewports(self) -> None:
        """Apply profile settings to the browser context and resize existing pages.

        Pins the profile's window/viewport/screen dimensions, then applies (on a
        best-effort basis) permissions, timeouts, extra HTTP headers and
        geolocation from the profile, loads the storage state, and finally sets
        the viewport size on every page already open in the context.

        Raises:
            AssertionError: if ``self.browser_context`` has not been set up yet.
        """
        assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'

        # Force a fixed geometry regardless of what the profile was created with.
        # NOTE(review): 786 looks like a transposition of the common 768 height — confirm before changing.
        self.browser_profile.window_size = {"width": 1024, "height": 786}
        self.browser_profile.viewport = {"width": 1024, "height": 786}
        self.browser_profile.screen = {"width": 1024, "height": 786}
        self.browser_profile.device_scale_factor = 1.0

        # Read the viewport back from the profile; it is applied to each existing page below.
        viewport = self.browser_profile.viewport

        # Each context-level setting is best-effort: a failure is reported and setup continues.
        if self.browser_profile.permissions:
            try:
                await self.browser_context.grant_permissions(self.browser_profile.permissions)
            except Exception as e:
                print(f'Failed to grant browser permissions: {e}')

        try:
            if self.browser_profile.default_timeout:
                self.browser_context.set_default_timeout(self.browser_profile.default_timeout)
            if self.browser_profile.default_navigation_timeout:
                self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout)
        except Exception as e:
            print(f'Failed to set default timeouts: {e}')

        try:
            if self.browser_profile.extra_http_headers:
                # set_extra_http_headers is a coroutine in Playwright's async API; without
                # the await the headers are never actually sent to the browser.
                await self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers)
        except Exception as e:
            print(f'Failed to set extra HTTP headers: {e}')

        try:
            if self.browser_profile.geolocation:
                await self.browser_context.set_geolocation(self.browser_profile.geolocation)
        except Exception as e:
            print(f'Failed to set geolocation: {e}')

        await self.load_storage_state()

        # After the loop `page` holds the last existing page, or None if there were none.
        page = None

        for page in self.browser_context.pages:
            # apply viewport size settings to any existing pages
            if viewport:
                await page.set_viewport_size(viewport)

            # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
            if page.url == 'about:blank':
                await self._show_dvd_screensaver_loading_animation(page)

        page = page or (await self.browser_context.new_page())

        # NOTE(review): the viewport is always forced above, so `not viewport` is
        # expected to be false and this window-resize path is effectively unreachable.
        if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless:
            # attempt to resize the actual browser window

            # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
            try:
                cdp_session = await page.context.new_cdp_session(page)
                window_id_result = await cdp_session.send('Browser.getWindowForTarget')
                await cdp_session.send(
                    'Browser.setWindowBounds',
                    {
                        'windowId': window_id_result['windowId'],
                        'bounds': {
                            **self.browser_profile.window_size,
                            'windowState': 'normal',  # Ensure window is not minimized/maximized
                        },
                    },
                )
                await cdp_session.detach()
            except Exception as cdp_error:
                print(f'Browser.setWindowBounds failed, falling back to window.resizeTo: {cdp_error}')
                try:
                    # fallback to javascript resize if cdp setWindowBounds fails.
                    # page.evaluate accepts a single argument, so pass the size as one
                    # object and destructure it on the JavaScript side.
                    size = self.browser_profile.window_size
                    await page.evaluate(
                        """({width, height}) => {window.resizeTo(width, height)}""",
                        {'width': size['width'], 'height': size['height']},
                    )
                    return
                except Exception as js_error:
                    # Resizing is cosmetic; report the failure instead of aborting session setup.
                    print(f'window.resizeTo fallback failed: {js_error}')
7 changes: 7 additions & 0 deletions templates/python/cua/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Kernel Python Sample App - CUA

This is a Kernel application that demonstrates using the Computer-Using Agent (CUA) from OpenAI.

It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation.

See the [docs](https://docs.onkernel.com/quickstart) for more information.
Empty file.
4 changes: 4 additions & 0 deletions templates/python/cua/_gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
__pycache__/
.env
.venv/
env/
1 change: 1 addition & 0 deletions templates/python/cua/agent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .agent import Agent
Loading