feat: implement .aiAssert, update some docs (#38)

* feat: implement .aiAssert, update some docs * fix: lint * fix: ci * feat: update quick-start
web-infra-dev · Aug 6, 2024 · 7edc2be · 7edc2be
1 parent df12c7e
commit 7edc2be
Show file tree

Hide file tree

Showing 35 changed files with 485 additions and 195 deletions.
diff --git a/apps/site/docs/en/docs/getting-started/quick-start.mdx b/apps/site/docs/en/docs/getting-started/quick-start.mdx
@@ -33,7 +33,6 @@ export default defineConfig({
   testDir: './e2e',
 + timeout: 90 * 1000,
 + reporter: [["list"], ["@midscene/web/playwright-report"]],
-
 });
 ```
 
@@ -58,12 +57,12 @@ import { expect } from "@playwright/test";
 import { test } from "./fixture";
 
 test.beforeEach(async ({ page }) => {
-  page.setViewportSize({ width: 400, height: 905 });
+  page.setViewportSize({ width: 1280, height: 800 });
   await page.goto("https://www.ebay.com");
   await page.waitForLoadState("networkidle");
 });
 
-test("search headphone on ebay", async ({ ai, aiQuery }) => {
+test("search headphone on ebay", async ({ ai, aiQuery, aiAssert }) => {
   // 👀 type keywords, perform a search
   await ai('type "Headphones" in search box, hit Enter');
 
@@ -73,7 +72,10 @@ test("search headphone on ebay", async ({ ai, aiQuery }) => {
   );
 
   console.log("headphones in stock", items);
-  expect(items?.length).toBeGreaterThan(1);
+  expect(items?.length).toBeGreaterThan(0);
+
+  // 👀 assert by AI
+  await aiAssert("There is a category filter on the left");
 });
 
 ```
@@ -139,19 +141,23 @@ Promise.resolve(
     );
     console.log("headphones in stock", items);
 
+    // 👀 assert by AI
+    await mid.aiAssert("There is a category filter on the left");
+
     await browser.close();
   })()
 );
 ```
 
 :::tip
-You may have noticed that the key lines of code for this only consist of two lines. They are all written in plain language.
+You may have noticed that the key code here consists of only three lines, and all of them are written in plain language.
 
 ```typescript
 await mid.aiAction('type "Headphones" in search box, hit Enter');
 await mid.aiQuery(
   '{itemTitle: string, price: Number}[], find item in list and corresponding price',
 );
+await mid.aiAssert("There is a category filter on the left");
 ```
 :::
 

diff --git a/apps/site/docs/en/docs/usage/API.md b/apps/site/docs/en/docs/usage/API.md
@@ -105,14 +105,12 @@ const dataB = await mid.aiQuery('string[], task names in the list');
 const dataC = await mid.aiQuery('{name: string, age: string}[], Data Record in the table');
 ```
 
-### `.aiAssert(conditionPrompt: string, errorMsg?: string)` - do an assertion
+### `.aiAssert(assertion: string, errorMsg?: string)` - do an assertion
 
-This method will soon be available in Midscene.
-
-`.aiAssert` works just like the normal `assert` method, except that the condition is a prompt string written in natural language. Midscene will call AI to determine if the `conditionPrompt` is true. If not, a detailed reason will be concatenated to the `errorMsg`.
+`.aiAssert` works just like the normal `assert` method, except that the condition is a prompt string written in natural language. Midscene will call the AI to determine whether the `assertion` is true. If the condition is not met, an error is thrown containing `errorMsg` and a detailed reason generated by the AI.
 
 ```typescript
-// coming soon
+await mid.aiAssert('There should be a searchbox on the page');
 ```
 
 ## Use LangSmith (Optional)

diff --git a/apps/site/docs/public/midscene_with_text_dark.png b/apps/site/docs/public/midscene_with_text_dark.png
diff --git a/apps/site/docs/public/rspress-icon.png b/apps/site/docs/public/rspress-icon.png
diff --git a/apps/site/docs/public/south_logo.png b/apps/site/docs/public/south_logo.png
diff --git a/apps/site/docs/public/step-by-step-r.png b/apps/site/docs/public/step-by-step-r.png
diff --git a/apps/site/docs/public/view-demo-visualization.gif b/apps/site/docs/public/view-demo-visualization.gif
diff --git a/apps/site/docs/public/visualizer.jpg b/apps/site/docs/public/visualizer.jpg
diff --git a/apps/site/docs/zh/docs/getting-started/quick-start.mdx b/apps/site/docs/zh/docs/getting-started/quick-start.mdx
@@ -34,7 +34,6 @@ export default defineConfig({
   testDir: './e2e',
 + timeout: 90 * 1000,
 + reporter: [["list"], ["@midscene/web/playwright-report"]],
-
 });
 ```
 
@@ -64,7 +63,7 @@ test.beforeEach(async ({ page }) => {
   await page.waitForLoadState("networkidle");
 });
 
-test("search headphone on ebay", async ({ ai, aiQuery }) => {
+test("search headphone on ebay", async ({ ai, aiQuery, aiAssert }) => {
   // 👀 输入关键字，执行搜索
   // 注：尽管这是一个英文页面，你也可以用中文指令控制它
   await ai('在搜索框输入 "Headphones" ，敲回车');
@@ -76,6 +75,9 @@ test("search headphone on ebay", async ({ ai, aiQuery }) => {
 
   console.log("headphones in stock", items);
   expect(items?.length).toBeGreaterThan(0);
+
+  // 👀 用 AI 断言
+  await aiAssert("界面左侧有类目筛选功能");
 });
 
 ```
@@ -145,20 +147,24 @@ Promise.resolve(
     );
     console.log("耳机商品信息", items);
 
+    // 👀 用 AI 断言
+    await mid.aiAssert("界面左侧有类目筛选功能");
+
     await browser.close();
   })()
 );
 ```
 
 :::tip
 
-你可能已经注意到了，上述文件中的关键代码只有两行，且都是用自然语言编写的
+你可能已经注意到了，上述文件中的关键代码只有三行，且都是用自然语言编写的
 
 ```typescript
 await mid.aiAction('在搜索框输入 "Headphones" ，敲回车');
 await mid.aiQuery(
   '{itemTitle: string, price: Number}[], 找到列表里的商品标题和价格',
 );
+await mid.aiAssert("界面左侧有类目筛选功能");
 ```
 :::
 

diff --git a/apps/site/docs/zh/docs/usage/API.md b/apps/site/docs/zh/docs/usage/API.md
@@ -103,11 +103,13 @@ const dataB = await mid.aiQuery('string[]，列表中的任务名称');
 const dataC = await mid.aiQuery('{name: string, age: string}[], 表格中的数据记录');
 ```
 
-### `.aiAssert(conditionPrompt: string, errorMsg?: string)` - 进行断言
+### `.aiAssert(assertion: string, errorMsg?: string)` - 进行断言
 
-这个方法即将上线。
+`.aiAssert` 的功能类似于一般的断言（assert）方法，但可以用自然语言编写条件参数 `assertion`。Midscene 会调用 AI 来判断条件是否为真。若条件不满足，SDK 会抛出一个错误并在 `errorMsg` 后附上 AI 生成的错误原因。
 
-`.aiAssert` 的功能类似于一般的 `assert` 方法，但可以用自然语言编写条件参数 `conditionPrompt`。Midscene 会调用 AI 来判断条件是否为真。若满足条件，详细原因会附加到 `errorMsg` 中。
+```typescript
+await mid.aiAssert('界面中应该有个搜索框');
+```
 
 ## 使用 LangSmith （可选）
 

diff --git a/apps/site/rspress.config.ts b/apps/site/rspress.config.ts
@@ -4,7 +4,8 @@ import { defineConfig } from 'rspress/config';
 export default defineConfig({
   root: path.join(__dirname, 'docs'),
   title: 'Midscene.js',
-  description: 'Your AI-Driven UI Compass',
+  description:
+    'An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language.',
   icon: '/midscene-icon.png',
   logo: {
     light: '/midscene_with_text_light.png',

diff --git a/packages/midscene/src/action/executor.ts b/packages/midscene/src/action/executor.ts
@@ -56,7 +56,7 @@ export class Executor {
     }
   }
 
-  async flush(): Promise<void> {
+  async flush(): Promise<any> {
     if (this.status === 'init' && this.tasks.length > 0) {
       console.warn(
         'illegal state for executor, status is init but tasks are not empty',
@@ -108,7 +108,9 @@ export class Executor {
         };
         if (task.type === 'Insight') {
           assert(
-            task.subType === 'Locate' || task.subType === 'Query',
+            task.subType === 'Locate' ||
+              task.subType === 'Query' ||
+              task.subType === 'Assert',
             `unsupported insight subType: ${task.subType}`,
           );
           returnValue = await task.executor(param, executorContext);
@@ -151,6 +153,10 @@ export class Executor {
 
     if (successfullyCompleted) {
       this.status = 'completed';
+      if (this.tasks.length) {
+        // return the last output
+        return this.tasks[this.tasks.length - 1].output;
+      }
     } else {
       this.status = 'error';
       throw new Error(`executor failed: ${errorMsg}`);

diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -1,4 +1,6 @@
+import assert from 'node:assert';
 import type {
+  AIAssertionResponse,
   AIElementParseResponse,
   AISectionParseResponse,
   BaseElement,
@@ -7,7 +9,11 @@ import type {
 import type { ChatCompletionMessageParam } from 'openai/resources';
 import { callToGetJSONObject } from './openai';
 import { systemPromptToFindElement } from './prompt/element_inspector';
-import { describeUserPage, systemPromptToExtract } from './prompt/util';
+import {
+  describeUserPage,
+  systemPromptToAssert,
+  systemPromptToExtract,
+} from './prompt/util';
 
 export async function AiInspectElement<
   ElementType extends BaseElement = BaseElement,
@@ -51,7 +57,6 @@ export async function AiInspectElement<
   return {
     parseResult,
     elementById,
-    systemPrompt,
   };
 }
 
@@ -101,6 +106,43 @@ export async function AiExtractElementInfo<
   return {
     parseResult,
     elementById,
-    systemPrompt,
   };
 }
+
+/**
+ * Ask the AI model whether a natural-language assertion holds for the page.
+ *
+ * Builds one request made of the system prompt from `systemPromptToAssert`,
+ * the page screenshot, and the text description from `describeUserPage`,
+ * then returns the model's JSON answer unchanged. A failed assertion is NOT
+ * thrown here; the caller decides what to do with the returned result.
+ *
+ * @param options.assertion - statement to verify; must be truthy (non-empty).
+ * @param options.context - UI context supplying `screenshotBase64` and the
+ *   input for `describeUserPage`.
+ * @param options.callAI - optional replacement for `callToGetJSONObject`.
+ * @returns whatever `callAI<AIAssertionResponse>` resolves to.
+ * @throws AssertionError when `assertion` is falsy.
+ */
+export async function AiAssert<
+  ElementType extends BaseElement = BaseElement,
+>(options: {
+  assertion: string;
+  context: UIContext<ElementType>;
+  callAI?: typeof callToGetJSONObject;
+}) {
+  const { assertion, context, callAI = callToGetJSONObject } = options;
+
+  assert(assertion, 'assertion should be a string');
+  const systemPrompt = systemPromptToAssert(assertion);
+
+  const { screenshotBase64 } = context;
+  // Only the textual description is needed; the element lookup table that
+  // `describeUserPage` also returns is not used by an assertion.
+  const { description } = await describeUserPage(context);
+
+  const msgs: ChatCompletionMessageParam[] = [
+    { role: 'system', content: systemPrompt },
+    {
+      role: 'user',
+      content: [
+        {
+          type: 'image_url',
+          image_url: {
+            url: screenshotBase64,
+          },
+        },
+        {
+          type: 'text',
+          text: description,
+        },
+      ],
+    },
+  ];
+
+  return callAI<AIAssertionResponse>(msgs);
+}
diff --git a/packages/midscene/src/ai-model/prompt/util.ts b/packages/midscene/src/ai-model/prompt/util.ts
@@ -136,6 +136,22 @@ Return in the following JSON format:
 `;
 }
 
+/**
+ * Build the system prompt used for AI assertions.
+ *
+ * The prompt is the shared `characteristic` and `contextFormatIntro`
+ * preambles, followed by the assertion text and the JSON shape the model
+ * is asked to answer with (`thought` and `pass`).
+ *
+ * @param assertion - natural-language statement, inserted verbatim.
+ * @returns the complete system prompt text.
+ */
+export function systemPromptToAssert(assertion: string) {
+  const prompt = `
+${characteristic}
+${contextFormatIntro}
+
+Based on the information you get, assert the following:
+${assertion}
+
+Return in the following JSON format:
+{
+  thought: string, // string, the thought of the assertion
+  pass: true, // true or false, whether the assertion is passed
+}
+`;
+  return prompt;
+}
+
 /*
 To modify the response format:
   1. update the function `describeSectionResponseFormat` here