feat(ai-model): support coze platform's model (#50)

* chore: optimize prompt structure
* feat(ai-model): add coze platform
* feat(ai-model): add coze platform
* chore: add unit test
* feat(ai-model): add extract unit test
* chore: add unit test
* chore: add unit test
* chore: optimize plan prompt
* chore: optimize plan prompt
* chore: fix lint error
* chore: fix lint error
* chore: fix unit test
* chore: ignore cache file
web-infra-dev · Aug 9, 2024 · 959473f · 959473f
1 parent 8365497
commit 959473f
Show file tree

Hide file tree

Showing 41 changed files with 949 additions and 1,236 deletions.
diff --git a/.gitignore b/.gitignore
@@ -98,7 +98,7 @@ playwright/.cache/
 
 # Midscene.js dump files
 __ai_responses__/
-midscene_run
+midscene_run/
 
 .nx/cache
 .nx/workspace-data

diff --git a/biome.json b/biome.json
@@ -7,6 +7,7 @@
       "**/midscene_run",
       ".nx",
       "**/dist",
+      "dist",
       "**/doc_build",
       "*-dump.json",
       "script_get_all_texts.tmp.js",

diff --git a/packages/cli/package.json b/packages/cli/package.json
@@ -7,10 +7,7 @@
   "bin": {
     "midscene": "./bin/midscene"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "modern dev",
     "build": "modern build",

diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -6,10 +6,7 @@
   "main": "./dist/lib/index.js",
   "module": "./dist/es/index.js",
   "types": "./dist/types/index.d.ts",
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "exports": {
     ".": {
       "types": "./dist/types/index.d.ts",
@@ -44,18 +41,10 @@
   },
   "typesVersions": {
     "*": {
-      ".": [
-        "./dist/types/index.d.ts"
-      ],
-      "utils": [
-        "./dist/types/utils.d.ts"
-      ],
-      "ai-model": [
-        "./dist/types/ai-model.d.ts"
-      ],
-      "image": [
-        "./dist/types/image.d.ts"
-      ]
+      ".": ["./dist/types/index.d.ts"],
+      "utils": ["./dist/types/utils.d.ts"],
+      "ai-model": ["./dist/types/ai-model.d.ts"],
+      "image": ["./dist/types/image.d.ts"]
     }
   },
   "scripts": {
@@ -70,9 +59,11 @@
   },
   "dependencies": {
     "openai": "4.47.1",
-    "sharp": "0.33.3"
+    "sharp": "0.33.3",
+    "node-fetch": "2.6.7"
   },
   "devDependencies": {
+    "@types/node-fetch": "2.6.11",
     "@modern-js/module-tools": "^2.56.1",
     "@types/node": "^18.0.0",
     "langsmith": "0.1.36",

diff --git a/packages/midscene/src/ai-model/automation/index.ts b/packages/midscene/src/ai-model/automation/index.ts
@@ -0,0 +1,82 @@
+import assert from 'node:assert';
+import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
+import { AIActionType, type AIArgs, callAiFn } from '../common';
+import { describeUserPage } from '../prompt/util';
+import { systemPromptToTaskPlanning } from './planning';
+
+export async function plan(
+  userPrompt: string,
+  opts: {
+    context: UIContext;
+    callAI?: typeof callAiFn<PlanningAIResponse>;
+  },
+  useModel?: 'coze' | 'openAI',
+): Promise<{ plans: PlanningAction[] }> {
+  const { callAI, context } = opts || {};
+  const { screenshotBase64 } = context;
+  const { description: pageDescription } = await describeUserPage(context);
+  let planFromAI: PlanningAIResponse | undefined;
+
+  const systemPrompt = systemPromptToTaskPlanning();
+  const msgs: AIArgs = [
+    { role: 'system', content: systemPrompt },
+    {
+      role: 'user',
+      content: [
+        {
+          type: 'image_url',
+          image_url: {
+            url: screenshotBase64,
+            detail: 'high',
+          },
+        },
+        {
+          type: 'text',
+          text: `
+            pageDescription: ${pageDescription}
+          `,
+        },
+        {
+          type: 'text',
+          text: `
+                Here is the description of the task. Just go ahead:
+                =====================================
+                ${userPrompt}
+                =====================================
+            `,
+        },
+      ],
+    },
+  ];
+
+  if (callAI) {
+    planFromAI = await callAI({
+      msgs,
+      AIActionType: AIActionType.PLAN,
+      useModel,
+    });
+  } else {
+    planFromAI = await callAiFn({
+      msgs,
+      AIActionType: AIActionType.PLAN,
+      useModel,
+    });
+  }
+
+  const actions = planFromAI?.actions || [];
+
+  assert(planFromAI, "can't get planFromAI");
+  assert(actions && actions.length > 0, 'no actions in ai plan');
+
+  if (planFromAI.error) {
+    throw new Error(planFromAI.error);
+  }
+
+  actions.forEach((task) => {
+    if (task.type === 'Error') {
+      throw new Error(task.thought);
+    }
+  });
+
+  return { plans: actions };
+}
diff --git a/packages/midscene/src/automation/planning.ts → ...scene/src/ai-model/automation/planning.ts b/packages/midscene/src/automation/planning.ts → ...scene/src/ai-model/automation/planning.ts
@@ -1,14 +1,6 @@
-import { describeUserPage } from '@/ai-model';
-import { callToGetJSONObject } from '@/ai-model/openai';
-import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
-import type { ChatCompletionMessageParam } from 'openai/resources';
-
-const characteristic =
-  'You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.';
-
-export function systemPromptToTaskPlanning(query: string) {
+export function systemPromptToTaskPlanning() {
   return `
-  ${characteristic}
+  You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
   
   Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
   Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
@@ -40,7 +32,7 @@ export function systemPromptToTaskPlanning(query: string) {
   1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
   2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
 
-  If any error occurs during the task planning (like the page content and task are irrelevant, or the element mentioned does not exist at all), please return the error message with explanation in the errors field. The thoughts、prompts、error messages should all in the same language as the user query.
+  If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
   
   Return in the following JSON format:
   {
@@ -55,57 +47,5 @@ export function systemPromptToTaskPlanning(query: string) {
     ],
     error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here
   }
-
-  Here is the description of the task. Just go ahead:
-  =====================================
-  ${query}
-  =====================================
   `;
 }
-
-export async function plan(
-  userPrompt: string,
-  opts: {
-    context: UIContext;
-    callAI?: typeof callToGetJSONObject<PlanningAIResponse>;
-  },
-): Promise<{ plans: PlanningAction[] }> {
-  const { callAI = callToGetJSONObject<PlanningAIResponse>, context } =
-    opts || {};
-  const { screenshotBase64 } = context;
-  const { description } = await describeUserPage(context);
-  const systemPrompt = systemPromptToTaskPlanning(userPrompt);
-  const msgs: ChatCompletionMessageParam[] = [
-    { role: 'system', content: systemPrompt },
-    {
-      role: 'user',
-      content: [
-        {
-          type: 'image_url',
-          image_url: {
-            url: screenshotBase64,
-            detail: 'high',
-          },
-        },
-        {
-          type: 'text',
-          text: description,
-        },
-      ],
-    },
-  ];
-
-  const planFromAI = await callAI(msgs);
-  if (planFromAI.error) {
-    throw new Error(planFromAI.error);
-  }
-
-  const { actions } = planFromAI;
-  actions.forEach((task) => {
-    if (task.type === 'Error') {
-      throw new Error(task.thought);
-    }
-  });
-
-  return { plans: actions };
-}
diff --git a/packages/midscene/src/ai-model/common.ts b/packages/midscene/src/ai-model/common.ts
@@ -0,0 +1,63 @@
+import type {
+  ChatCompletionSystemMessageParam,
+  ChatCompletionUserMessageParam,
+} from 'openai/resources';
+import {
+  COZE_AI_ACTION_BOT_ID,
+  COZE_AI_ASSERT_BOT_ID,
+  COZE_EXTRACT_INFO_BOT_ID,
+  COZE_INSPECT_ELEMENT_BOT_ID,
+  callCozeAi,
+  transfromOpenAiArgsToCoze,
+  useCozeModel,
+} from './coze';
+import { callToGetJSONObject, useOpenAIModel } from './openai';
+
+export type AIArgs = [
+  ChatCompletionSystemMessageParam,
+  ChatCompletionUserMessageParam,
+];
+
+export enum AIActionType {
+  ASSERT = 0,
+  INSPECT_ELEMENT = 1,
+  EXTRACT_DATA = 2,
+  PLAN = 3,
+}
+
+export async function callAiFn<T>(options: {
+  msgs: AIArgs;
+  AIActionType: AIActionType;
+  useModel?: 'openAI' | 'coze';
+}) {
+  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
+  if (useOpenAIModel(useModel)) {
+    const parseResult = await callToGetJSONObject<T>(msgs);
+    return parseResult;
+  }
+
+  if (useCozeModel(useModel)) {
+    let botId = '';
+    switch (AIActionTypeValue) {
+      case AIActionType.ASSERT:
+        botId = COZE_AI_ASSERT_BOT_ID;
+        break;
+      case AIActionType.EXTRACT_DATA:
+        botId = COZE_EXTRACT_INFO_BOT_ID;
+        break;
+      case AIActionType.INSPECT_ELEMENT:
+        botId = COZE_INSPECT_ELEMENT_BOT_ID;
+        break;
+      default:
+        botId = COZE_AI_ACTION_BOT_ID;
+    }
+    const cozeMsg = transfromOpenAiArgsToCoze(msgs[1]);
+    const parseResult = await callCozeAi<T>({
+      ...cozeMsg,
+      botId,
+    });
+    return parseResult;
+  }
+
+  throw Error('Does not contain coze and openai environment variables');
+}