Skip to content

Commit

Permalink
feat: add a real-time progress indicator for playground (#177)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyutaotao authored Dec 16, 2024
1 parent 863e5a5 commit 537a5c4
Show file tree
Hide file tree
Showing 103 changed files with 5,841 additions and 4,007 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
MIDSCENE_MODEL_NAME: gpt-4o-2024-08-06
MIDSCENE_DEBUG_AI_PROFILE: 1
CI: 1
# MIDSCENE_DEBUG_AI_PROFILE: 1

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
},
"editor.defaultFormatter": "biomejs.biome",
"editor.formatOnSave": true,
"cSpell.words": ["AITEST", "aweme", "httpbin", "iconfont", "taobao"]
"cSpell.words": ["AITEST", "aweme", "douyin", "httpbin", "iconfont", "taobao"]
}
10 changes: 6 additions & 4 deletions packages/cli/src/printer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ export const flowItemBrief = (flowItem?: MidsceneYamlFlowItem) => {
}

const sliceText = (text?: string) => {
if (text && text.length > 12) {
return `${text.slice(0, 12)}...`;
const lengthLimit = 60;
if (text && text.length > lengthLimit) {
return `${text.slice(0, lengthLimit)}...`;
}

return text || '';
Expand All @@ -42,7 +43,8 @@ export const flowItemBrief = (flowItem?: MidsceneYamlFlowItem) => {
(flowItem as MidsceneYamlFlowItemAIAction).ai
) {
return `aiAction: ${sliceText(
(flowItem as MidsceneYamlFlowItemAIAction).aiAction ||
(flowItem as MidsceneYamlFlowItemAIAction).aiActionProgressTip ||
(flowItem as MidsceneYamlFlowItemAIAction).aiAction ||
(flowItem as MidsceneYamlFlowItemAIAction).ai,
)}`;
}
Expand Down Expand Up @@ -104,7 +106,7 @@ export const contextInfo = (context: MidsceneYamlFileContext) => {
const reportFile = context.player.reportFile;
const reportFileToShow = relative(process.cwd(), reportFile || '');
const reportText = reportFile
? `\n${indent}${chalk.gray(`report: ${reportFileToShow}`)}`
? `\n${indent}${chalk.gray(`report: ./${reportFileToShow}`)}`
: '';

const mergedText =
Expand Down
1 change: 1 addition & 0 deletions packages/cli/src/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export interface MidsceneYamlScriptEnv {
export interface MidsceneYamlFlowItemAIAction {
ai?: string; // this is the shortcut for aiAction
aiAction?: string;
aiActionProgressTip?: string;
}

export interface MidsceneYamlFlowItemAIAssert {
Expand Down
16 changes: 12 additions & 4 deletions packages/cli/src/yaml-player.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import assert from 'node:assert';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { basename, dirname, extname, join } from 'node:path';
import { PuppeteerAgent } from '@midscene/web/puppeteer';
import { paramStr, typeStr } from '@midscene/web/ui-utils';

import {
contextInfo,
contextTaskListSummary,
Expand Down Expand Up @@ -116,14 +118,14 @@ export async function playYamlFiles(

ttyRenderer.start();
for (const context of fileContextList) {
await context.player.play();
await context.player.run();
}
ttyRenderer.stop();
} else {
for (const context of fileContextList) {
const { mergedText } = contextInfo(context);
console.log(mergedText);
await context.player.play();
await context.player.run();
console.log(contextTaskListSummary(context.player.taskStatus, context));
}
}
Expand Down Expand Up @@ -220,7 +222,13 @@ export class ScriptPlayer {
typeof prompt === 'string',
'prompt for aiAction must be a string',
);
await agent.aiAction(prompt);
await agent.aiAction(prompt, {
onTaskStart(task) {
const tip = `${typeStr(task)} - ${paramStr(task)}`;
(flowItem as MidsceneYamlFlowItemAIAction).aiActionProgressTip =
tip;
},
});
} else if ((flowItem as MidsceneYamlFlowItemAIAssert).aiAssert) {
const assertTask = flowItem as MidsceneYamlFlowItemAIAssert;
const prompt = assertTask.aiAssert;
Expand Down Expand Up @@ -273,7 +281,7 @@ export class ScriptPlayer {
this.reportFile = agent.reportFile;
}

async play() {
async run() {
const { target, tasks } = this.script;
this.setPlayerStatus('running');

Expand Down
4 changes: 2 additions & 2 deletions packages/cli/tests/__snapshots__/printer.test.ts.snap
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html

exports[`printer > action brief text 1`] = `"aiAction: search for w..."`;
exports[`printer > action brief text 1`] = `"aiAction: search for weather"`;

exports[`printer > action brief text 2`] = `"sleep: 1000"`;

exports[`printer > action brief text 3`] = `"aiWaitFor: wait for som..."`;
exports[`printer > action brief text 3`] = `"aiWaitFor: wait for something"`;
2 changes: 1 addition & 1 deletion packages/cli/tests/midscene_scripts/sub/bing.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
target:
url: https://www.baidu.com
url: https://www.bing.com
tasks:
- name: search weather
flow:
Expand Down
2 changes: 1 addition & 1 deletion packages/cli/tests/yaml.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { assert, describe, expect, test, vi } from 'vitest';
const runYaml = async (yamlString: string) => {
const script = loadYamlScript(yamlString);
const player = new ScriptPlayer(script);
await player.play();
await player.run();
assert(
player.status === 'done',
player.errorInSetup?.message || 'unknown error',
Expand Down
25 changes: 12 additions & 13 deletions packages/midscene/src/action/executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import type {
ExecutionTask,
ExecutionTaskApply,
ExecutionTaskInsightLocateOutput,
ExecutionTaskProgressOptions,
ExecutionTaskReturn,
ExecutorContext,
} from '@/types';
Expand All @@ -20,19 +21,19 @@ export class Executor {
// status of executor
status: 'init' | 'pending' | 'running' | 'completed' | 'error';

onFlushUpdate?: () => void;
onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];

constructor(
name: string,
description?: string,
tasks?: ExecutionTaskApply[],
onFlushUpdate?: () => void,
options?: ExecutionTaskProgressOptions,
) {
this.status = tasks && tasks.length > 0 ? 'pending' : 'init';
this.name = name;
this.description = description;
this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
this.onFlushUpdate = onFlushUpdate;
this.onTaskStart = options?.onTaskStart;
}

private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {
Expand Down Expand Up @@ -84,13 +85,6 @@ export class Executor {

while (taskIndex < this.tasks.length) {
const task = this.tasks[taskIndex];
try {
if (this.onFlushUpdate) {
this.onFlushUpdate();
}
} catch (e) {
// console.error('error in onFlushUpdate', e);
}
assert(
task.status === 'pending',
`task status should be pending, but got: ${task.status}`,
Expand All @@ -100,6 +94,13 @@ export class Executor {
};
try {
task.status = 'running';
try {
if (this.onTaskStart) {
await this.onTaskStart(task);
}
} catch (e) {
// console.error('error in onTaskStart', e);
}
assert(
['Insight', 'Action', 'Planning'].indexOf(task.type) >= 0,
`unsupported task type: ${task.type}`,
Expand Down Expand Up @@ -162,9 +163,7 @@ export class Executor {
} else {
this.status = 'error';
}
if (this.onFlushUpdate) {
await this.onFlushUpdate();
}

if (this.tasks.length) {
// return the last output
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
Expand Down
9 changes: 2 additions & 7 deletions packages/midscene/src/ai-model/automation/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ export async function plan(
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { description: pageDescription, elementByPosition } =
await describeUserPage(context);
let planFromAI: PlanningAIResponse | undefined;

const systemPrompt = systemPromptToTaskPlanning();

Expand Down Expand Up @@ -76,18 +75,14 @@ ${taskBackgroundContext}
useModel,
});

planFromAI = content;
const planFromAI = content;

const actions = planFromAI?.actions || [];
assert(planFromAI, "can't get plans from AI");
assert(
actions.length > 0,
`no actions in ai plan with context: ${planFromAI}`,
`Failed to plan actions with context: ${planFromAI.error}`,
);

if (planFromAI.error) {
throw new Error(planFromAI.error);
}

return planFromAI;
}
2 changes: 1 addition & 1 deletion packages/midscene/src/ai-model/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ export type AIArgs = [

const liteContextConfig = {
filterNonTextContent: true,
truncateTextLength: 100,
truncateTextLength: 200,
};

export function transformElementPositionToId(
Expand Down
29 changes: 15 additions & 14 deletions packages/midscene/src/ai-model/prompt/planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ const quickAnswerFormat = () => {

const sample = matchByPosition
? '{"position": { x: 100, y: 200 }}'
: '{"id": "14562"}';
: '{"id": "c81c4e9a33"}';

return {
description,
Expand All @@ -35,23 +35,24 @@ You are a versatile professional in software UI automation. Your outstanding con
## Objective
- Decompose the task user asked into a series of actions
- Precisely locate the target element if needed
- Locate the target element if possible
- If the task cannot be accomplished, give a further plan.
## Workflow
1. Receive the user's element description, screenshot, and instruction.
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). Please refer to the "About the action" section below.
3. Precisely locate the target element if needed, put the location info in the \`locate\` field.
4. Consider whether a task will be accomplished after all the actions
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
4. If some target elements are not shown in the screenshot, consider the user's instruction not feasible on this page. Follow the next steps.
5. Consider whether the user's instruction will be accomplished after all the actions
- If yes, set \`taskWillBeAccomplished\` to true
- If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. Refer to the "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section for more details.
- If no, stop planning further actions and close the array. Get ready to reevaluate the task. Some talented people like you will handle this. Give them a clear description of what has been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.
## Constraints
- All the actions you composed MUST be based on the page context information you get.
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
- If the page content is irrelevant to the task, put the error message in the \`error\` field.
- If you cannot plan any actions at all, consider the page content irrelevant to the task and put the error message in the \`error\` field.
## About the \`actions\` field
Expand All @@ -60,9 +61,9 @@ You are a versatile professional in software UI automation. Your outstanding con
The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it follows the following scheme:
type LocateParam = {
"id": string, // the id of the element found. If its not on the page, locate should be null
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
prompt?: string // the description of the element to find. It can only be omitted when locate is null.
} | null
} | null // If it's not on the page, the LocateParam should be null
### Supported actions
Expand Down Expand Up @@ -101,7 +102,7 @@ Please return the result in JSON format as follows:
"type": "Tap",
"param": null,
"locate": {
{"id": "14562"},
{"id": "c81c4e9a33"},
prompt: "the search bar"
} | null,
},
Expand All @@ -123,8 +124,9 @@ ${samplePageDescription}
By viewing the page screenshot and description, you should consider this and output the JSON:
* The main steps should be: tap the switch button, sleep, and tap the 'English' option
* The "English" option button is not shown in the page context now, the last action will have a \`null\` value in the \`locate\` field.
* The task cannot be accomplished (because we cannot find the "English" option), so a \`furtherPlan\` field is needed.
* The language switch button is shown in the screenshot, but it's not marked with a rectangle. So we have to use the page description to find the element. By carefully checking the context information (coordinates, attributes, content, etc.), you can find the element.
* The "English" option button is not shown in the screenshot yet; it may only appear after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field.
* The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed.
\`\`\`json
{
Expand Down Expand Up @@ -156,6 +158,7 @@ By viewing the page screenshot and description, you should consider this and out
"whatHaveDone": "Click the language switch button and wait 1s"
}
}
\`\`\`
## Example #2 : When task is accomplished, don't plan more actions
Expand Down Expand Up @@ -201,8 +204,6 @@ Wrong output:
Reason:
* The \`prompt\` is missing in the first 'Locate' action
* Since the option button is not shown in the screenshot, the task cannot be accomplished, so a \`furtherPlan\` field is needed.
\`\`\`
`;
}

Expand Down
Loading

0 comments on commit 537a5c4

Please sign in to comment.