feat: add a real-time progress indicator for playground (#177)

web-infra-dev · Dec 16, 2024 · 537a5c4 · 537a5c4
1 parent 863e5a5
commit 537a5c4
Show file tree

Hide file tree

Showing 103 changed files with 5,841 additions and 4,007 deletions.
diff --git a/.github/workflows/ai.yml b/.github/workflows/ai.yml
@@ -22,7 +22,8 @@ jobs:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
       MIDSCENE_MODEL_NAME: gpt-4o-2024-08-06
-      MIDSCENE_DEBUG_AI_PROFILE: 1
+      CI: 1
+      # MIDSCENE_DEBUG_AI_PROFILE: 1
 
     steps:
     - uses: actions/checkout@v4

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -4,5 +4,5 @@
   },
   "editor.defaultFormatter": "biomejs.biome",
   "editor.formatOnSave": true,
-  "cSpell.words": ["AITEST", "aweme", "httpbin", "iconfont", "taobao"]
+  "cSpell.words": ["AITEST", "aweme", "douyin", "httpbin", "iconfont", "taobao"]
 }
diff --git a/packages/cli/src/printer.ts b/packages/cli/src/printer.ts
@@ -30,8 +30,9 @@ export const flowItemBrief = (flowItem?: MidsceneYamlFlowItem) => {
   }
 
   const sliceText = (text?: string) => {
-    if (text && text.length > 12) {
-      return `${text.slice(0, 12)}...`;
+    const lengthLimit = 60;
+    if (text && text.length > lengthLimit) {
+      return `${text.slice(0, lengthLimit)}...`;
     }
 
     return text || '';
@@ -42,7 +43,8 @@ export const flowItemBrief = (flowItem?: MidsceneYamlFlowItem) => {
     (flowItem as MidsceneYamlFlowItemAIAction).ai
   ) {
     return `aiAction: ${sliceText(
-      (flowItem as MidsceneYamlFlowItemAIAction).aiAction ||
+      (flowItem as MidsceneYamlFlowItemAIAction).aiActionProgressTip ||
+        (flowItem as MidsceneYamlFlowItemAIAction).aiAction ||
         (flowItem as MidsceneYamlFlowItemAIAction).ai,
     )}`;
   }
@@ -104,7 +106,7 @@ export const contextInfo = (context: MidsceneYamlFileContext) => {
   const reportFile = context.player.reportFile;
   const reportFileToShow = relative(process.cwd(), reportFile || '');
   const reportText = reportFile
-    ? `\n${indent}${chalk.gray(`report: ${reportFileToShow}`)}`
+    ? `\n${indent}${chalk.gray(`report: ./${reportFileToShow}`)}`
     : '';
 
   const mergedText =

diff --git a/packages/cli/src/types.d.ts b/packages/cli/src/types.d.ts
@@ -16,6 +16,7 @@ export interface MidsceneYamlScriptEnv {
 export interface MidsceneYamlFlowItemAIAction {
   ai?: string; // this is the shortcut for aiAction
   aiAction?: string;
+  aiActionProgressTip?: string;
 }
 
 export interface MidsceneYamlFlowItemAIAssert {

diff --git a/packages/cli/src/yaml-player.ts b/packages/cli/src/yaml-player.ts
@@ -7,6 +7,8 @@ import assert from 'node:assert';
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
 import { basename, dirname, extname, join } from 'node:path';
 import { PuppeteerAgent } from '@midscene/web/puppeteer';
+import { paramStr, typeStr } from '@midscene/web/ui-utils';
+
 import {
   contextInfo,
   contextTaskListSummary,
@@ -116,14 +118,14 @@ export async function playYamlFiles(
 
     ttyRenderer.start();
     for (const context of fileContextList) {
-      await context.player.play();
+      await context.player.run();
     }
     ttyRenderer.stop();
   } else {
     for (const context of fileContextList) {
       const { mergedText } = contextInfo(context);
       console.log(mergedText);
-      await context.player.play();
+      await context.player.run();
       console.log(contextTaskListSummary(context.player.taskStatus, context));
     }
   }
@@ -220,7 +222,13 @@ export class ScriptPlayer {
           typeof prompt === 'string',
           'prompt for aiAction must be a string',
         );
-        await agent.aiAction(prompt);
+        await agent.aiAction(prompt, {
+          onTaskStart(task) {
+            const tip = `${typeStr(task)} - ${paramStr(task)}`;
+            (flowItem as MidsceneYamlFlowItemAIAction).aiActionProgressTip =
+              tip;
+          },
+        });
       } else if ((flowItem as MidsceneYamlFlowItemAIAssert).aiAssert) {
         const assertTask = flowItem as MidsceneYamlFlowItemAIAssert;
         const prompt = assertTask.aiAssert;
@@ -273,7 +281,7 @@ export class ScriptPlayer {
     this.reportFile = agent.reportFile;
   }
 
-  async play() {
+  async run() {
     const { target, tasks } = this.script;
     this.setPlayerStatus('running');
 

diff --git a/packages/cli/tests/__snapshots__/printer.test.ts.snap b/packages/cli/tests/__snapshots__/printer.test.ts.snap
@@ -1,7 +1,7 @@
 // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
 
-exports[`printer > action brief text 1`] = `"aiAction: search for w..."`;
+exports[`printer > action brief text 1`] = `"aiAction: search for weather"`;
 
 exports[`printer > action brief text 2`] = `"sleep: 1000"`;
 
-exports[`printer > action brief text 3`] = `"aiWaitFor: wait for som..."`;
+exports[`printer > action brief text 3`] = `"aiWaitFor: wait for something"`;
diff --git a/packages/cli/tests/midscene_scripts/sub/bing.yaml b/packages/cli/tests/midscene_scripts/sub/bing.yaml
@@ -1,5 +1,5 @@
 target:
-  url: https://www.baidu.com
+  url: https://www.bing.com
 tasks:
   - name: search weather
     flow:

diff --git a/packages/cli/tests/yaml.test.ts b/packages/cli/tests/yaml.test.ts
@@ -7,7 +7,7 @@ import { assert, describe, expect, test, vi } from 'vitest';
 const runYaml = async (yamlString: string) => {
   const script = loadYamlScript(yamlString);
   const player = new ScriptPlayer(script);
-  await player.play();
+  await player.run();
   assert(
     player.status === 'done',
     player.errorInSetup?.message || 'unknown error',

diff --git a/packages/midscene/src/action/executor.ts b/packages/midscene/src/action/executor.ts
@@ -5,6 +5,7 @@ import type {
   ExecutionTask,
   ExecutionTaskApply,
   ExecutionTaskInsightLocateOutput,
+  ExecutionTaskProgressOptions,
   ExecutionTaskReturn,
   ExecutorContext,
 } from '@/types';
@@ -20,19 +21,19 @@ export class Executor {
   // status of executor
   status: 'init' | 'pending' | 'running' | 'completed' | 'error';
 
-  onFlushUpdate?: () => void;
+  onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
 
   constructor(
     name: string,
     description?: string,
     tasks?: ExecutionTaskApply[],
-    onFlushUpdate?: () => void,
+    options?: ExecutionTaskProgressOptions,
   ) {
     this.status = tasks && tasks.length > 0 ? 'pending' : 'init';
     this.name = name;
     this.description = description;
     this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
-    this.onFlushUpdate = onFlushUpdate;
+    this.onTaskStart = options?.onTaskStart;
   }
 
   private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {
@@ -84,13 +85,6 @@ export class Executor {
 
     while (taskIndex < this.tasks.length) {
       const task = this.tasks[taskIndex];
-      try {
-        if (this.onFlushUpdate) {
-          this.onFlushUpdate();
-        }
-      } catch (e) {
-        // console.error('error in onFlushUpdate', e);
-      }
       assert(
         task.status === 'pending',
         `task status should be pending, but got: ${task.status}`,
@@ -100,6 +94,13 @@ export class Executor {
       };
       try {
         task.status = 'running';
+        try {
+          if (this.onTaskStart) {
+            await this.onTaskStart(task);
+          }
+        } catch (e) {
+          // console.error('error in onTaskStart', e);
+        }
         assert(
           ['Insight', 'Action', 'Planning'].indexOf(task.type) >= 0,
           `unsupported task type: ${task.type}`,
@@ -162,9 +163,7 @@ export class Executor {
     } else {
       this.status = 'error';
     }
-    if (this.onFlushUpdate) {
-      await this.onFlushUpdate();
-    }
+
     if (this.tasks.length) {
       // return the last output
       const outputIndex = Math.min(taskIndex, this.tasks.length - 1);

diff --git a/packages/midscene/src/ai-model/automation/index.ts b/packages/midscene/src/ai-model/automation/index.ts
@@ -23,7 +23,6 @@ export async function plan(
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
   const { description: pageDescription, elementByPosition } =
     await describeUserPage(context);
-  let planFromAI: PlanningAIResponse | undefined;
 
   const systemPrompt = systemPromptToTaskPlanning();
 
@@ -76,18 +75,14 @@ ${taskBackgroundContext}
     useModel,
   });
 
-  planFromAI = content;
+  const planFromAI = content;
 
   const actions = planFromAI?.actions || [];
   assert(planFromAI, "can't get plans from AI");
   assert(
     actions.length > 0,
-    `no actions in ai plan with context: ${planFromAI}`,
+    `Failed to plan actions with context: ${planFromAI.error}`,
   );
 
-  if (planFromAI.error) {
-    throw new Error(planFromAI.error);
-  }
-
   return planFromAI;
 }
diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -30,7 +30,7 @@ export type AIArgs = [
 
 const liteContextConfig = {
   filterNonTextContent: true,
-  truncateTextLength: 100,
+  truncateTextLength: 200,
 };
 
 export function transformElementPositionToId(

diff --git a/packages/midscene/src/ai-model/prompt/planning.ts b/packages/midscene/src/ai-model/prompt/planning.ts
@@ -18,7 +18,7 @@ const quickAnswerFormat = () => {
 
   const sample = matchByPosition
     ? '{"position": { x: 100, y: 200 }}'
-    : '{"id": "14562"}';
+    : '{"id": "c81c4e9a33"}';
 
   return {
     description,
@@ -35,23 +35,24 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Objective
 
 - Decompose the task user asked into a series of actions
-- Precisely locate the target element if needed
+- Locate the target element if possible
 - If the task cannot be accomplished, give a further plan.
 
 ## Workflow
 
 1. Receive the user's element description, screenshot, and instruction.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). Please refer to the "About the action" section below.
-3. Precisely locate the target element if needed, put the location info in the \`locate\` field.
-4. Consider whether a task will be accomplished after all the actions
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
+3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
+4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
+5. Consider whether the user's instruction will be accomplished after all the actions
  - If yes, set \`taskWillBeAccomplished\` to true
- - If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. Refer to the "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section for more details.
+ - If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.
 
 ## Constraints
 
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- If the page content is irrelevant to the task, put the error message in the \`error\` field.
+- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
 
 ## About the \`actions\` field
 
@@ -60,9 +61,9 @@ You are a versatile professional in software UI automation. Your outstanding con
 The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it follows the following scheme:
 
 type LocateParam = {
-  "id": string, // the id of the element found. If its not on the page, locate should be null
+  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
   prompt?: string // the description of the element to find. It can only be omitted when locate is null.
-} | null
+} | null // If it's not on the page, the LocateParam should be null
 
 ### Supported actions
 
@@ -101,7 +102,7 @@ Please return the result in JSON format as follows:
       "type": "Tap",
       "param": null,
       "locate": {
-        {"id": "14562"},
+        {"id": "c81c4e9a33"},
         prompt: "the search bar"
       } | null,
     },
@@ -123,8 +124,9 @@ ${samplePageDescription}
 By viewing the page screenshot and description, you should consider this and output the JSON:
 
 * The main steps should be: tap the switch button, sleep, and tap the 'English' option 
-* The "English" option button is not shown in the page context now, the last action will have a \`null\` value in the \`locate\` field. 
-* The task cannot be accomplished (because we cannot find the "English" option), so a \`furtherPlan\` field is needed.
+* The language switch button is shown in the screenshot, but it's not marked with a rectangle. So we have to use the page description to find the element. By carefully checking the context information (coordinates, attributes, content, etc.), you can find the element.
+* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field. 
+* The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed.
 
 \`\`\`json
 {
@@ -156,6 +158,7 @@ By viewing the page screenshot and description, you should consider this and out
     "whatHaveDone": "Click the language switch button and wait 1s" 
   }
 }
+\`\`\`
 
 ## Example #2 : When task is accomplished, don't plan more actions
 
@@ -201,8 +204,6 @@ Wrong output:
 Reason: 
 * The \`prompt\` is missing in the first 'Locate' action
 * Since the option button is not shown in the screenshot, the task cannot be accomplished, so a \`furtherPlan\` field is needed.
-
-\`\`\`
 `;
 }