web-infra-dev · zhoushaw · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/.cursorignore b/.cursorignore
@@ -0,0 +1,2 @@
+.env/**
+.env
diff --git a/.npmrc b/.npmrc
@@ -4,4 +4,4 @@ save-prefix=''
 save-workspace-protocol=rolling
 ignore-compatibility-db=true
 use-lockfile-v6=true
-puppeteer_download_base_url=https://cdn.npmmirror.com/binaries/chrome-for-testing
+puppeteer_download_base_url=https://cdn.npmmirror.com/binaries/chrome-for-testing
diff --git a/packages/midscene/modern.config.ts b/packages/midscene/modern.config.ts
@@ -11,6 +11,7 @@ export default defineConfig({
       env: 'src/env.ts',
       utils: 'src/utils.ts',
       'ai-model': 'src/ai-model/index.ts',
+      'plan-prompt': 'src/ai-model/prompt/planning.ts',
     },
     outDir: 'dist/lib',
     externals: ['langsmith'],

diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -17,10 +17,10 @@
   },
   "typesVersions": {
     "*": {
-      ".": ["./dist/lib/types/index.d.ts"],
-      "env": ["./dist/lib/types/env.d.ts"],
-      "utils": ["./dist/lib/types/utils.d.ts"],
-      "ai-model": ["./dist/lib/types/ai-model.d.ts"]
+      ".": ["./src/index.ts"],
+      "env": ["./src/env.ts"],
+      "utils": ["./src/utils.ts"],
+      "ai-model": ["./src/ai-model/index.ts"]
     }
   },
   "scripts": {
@@ -31,8 +31,15 @@
     "upgrade": "modern upgrade",
     "test": "vitest --run -u",
     "test:ai": "AITEST=true npm run test",
+    "test:ai:open-ai": "AITEST=true npm run test -- tests/ai/openai.test.ts",
+    "test:ai:img-point": "AITEST=true npm run test -- tests/ai/evaluate/locate/img-point.test.ts",
+    "test:ai:img-box": "AITEST=true npm run test -- tests/ai/evaluate/locate/img-box.test.ts",
+    "test:ai:img-box-two": "AITEST=true npm run test -- tests/ai/evaluate/locate/img-box.test.ts",
+    "test:ai:img-box-with-text": "AITEST=true npm run test -- tests/ai/evaluate/locate/img-box-with-text.test.ts",
     "computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
+    "ai:target": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/plan/plan-target.test.ts",
     "evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
+    "evaluate:plan": "npm run test:ai -- tests/ai/evaluate/planning.test.ts",
     "evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
     "prepublishOnly": "npm run build"
   },
@@ -50,16 +57,25 @@
     "@types/node": "^18.0.0",
     "@types/node-fetch": "2.6.11",
     "dotenv": "16.4.5",
+    "image-size": "1.1.1",
     "langsmith": "0.1.36",
     "typescript": "~5.0.4",
-    "vitest": "^1.6.0"
+    "vitest": "2.1.8"
   },
   "engines": {
     "node": ">=18.0.0"
   },
   "publishConfig": {
     "access": "public",
-    "registry": "https://registry.npmjs.org"
+    "registry": "https://registry.npmjs.org",
+    "typesVersions": {
+      "*": {
+        ".": ["./dist/lib/types/index.d.ts"],
+        "env": ["./dist/lib/types/env.d.ts"],
+        "utils": ["./dist/lib/types/utils.d.ts"],
+        "ai-model": ["./dist/lib/types/ai-model.d.ts"]
+      }
+    }
   },
   "license": "MIT"
 }
diff --git a/packages/midscene/src/ai-model/automation/index.ts b/packages/midscene/src/ai-model/automation/index.ts
@@ -1,11 +1,19 @@
 import assert from 'node:assert';
+import {
+  MIDSCENE_MODEL_NAME,
+  OPENAI_BASE_URL,
+  getAIConfig,
+  matchByTagNumber,
+} from '@/env';
 import type { AIUsageInfo, PlanningAIResponse, UIContext } from '@/types';
+import { parseNonStrictJSON } from '@/utils';
 import {
   AIActionType,
   type AIArgs,
   callAiFn,
   transformUserMessages,
 } from '../common';
+import { extractJSONFromCodeBlock } from '../openai';
 import { systemPromptToTaskPlanning } from '../prompt/planning';
 import { describeUserPage } from '../prompt/util';
 
@@ -20,22 +28,25 @@ export async function plan(
 ): Promise<PlanningAIResponse> {
   const { callAI, context } = opts || {};
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
-  const { description: pageDescription, elementByPosition } =
-    await describeUserPage(context);
+  const {
+    description: pageDescription,
+    elementByPosition,
+    elementByIndexId,
+  } = await describeUserPage(context);
 
   const systemPrompt = systemPromptToTaskPlanning();
 
   let taskBackgroundContext = '';
   if (opts.originalPrompt && opts.whatHaveDone) {
     taskBackgroundContext = `For your information, this is a task that some important person handed to you. Here is the original task description and what have been done after the previous actions:
-=====================================
-Original task description:
-${opts.originalPrompt}
-=====================================
-What have been done:
-${opts.whatHaveDone}
-=====================================
-`;
+  =====================================
+  Original task description:
+  ${opts.originalPrompt}
+  =====================================
+  What have been done:
+  ${opts.whatHaveDone}
+  =====================================
+  `;
   }
   const msgs: AIArgs = [
     { role: 'system', content: systemPrompt },
@@ -45,8 +56,8 @@ ${opts.whatHaveDone}
         {
           type: 'image_url',
           image_url: {
-            url: screenshotBase64WithElementMarker || screenshotBase64,
-            detail: 'high',
+            url: screenshotBase64,
+            // detail: 'high',
           },
         },
         {
@@ -67,11 +78,57 @@ ${taskBackgroundContext}
     },
   ];
 
+  if (matchByTagNumber) {
+    const response = await fetch(
+      `${getAIConfig(OPENAI_BASE_URL)}/chat/completions`,
+      {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          Cookie: process.env.MIDSCENE_COOKIE || '',
+        },
+        body: JSON.stringify({
+          model: getAIConfig(MIDSCENE_MODEL_NAME),
+          messages: msgs,
+          temperature: 0.1,
+        }),
+      },
+    );
+    const data = await response.json();
+
+    const message = data.choices[0].message.content;
+    const jsonData = parseNonStrictJSON(message);
+    console.log('AiPlan jsonData', JSON.stringify(jsonData, null, 2));
+    const actions = jsonData.actions || [];
+
+    actions.forEach((action: any) => {
+      if (action.locate) {
+        if ('position' in action.locate) {
+          action.locate = {
+            ...action.locate,
+            id: elementByPosition(action.locate.position)?.id!,
+          };
+        }
+        if ('boxTagNumber' in action.locate) {
+          action.locate = {
+            ...action.locate,
+            id: elementByIndexId(action.locate.boxTagNumber)?.id!,
+          };
+        }
+      }
+    });
+    return jsonData;
+  }
+
   const call = callAI || callAiFn;
+
+  const startTime = Date.now();
   const { content, usage } = await call({
     msgs,
     AIActionType: AIActionType.PLAN,
   });
+  const endTime = Date.now();
+  console.log(`AI planning took ${endTime - startTime}ms`);
 
   const planFromAI = content;
 

diff --git a/packages/midscene/src/ai-model/index.ts b/packages/midscene/src/ai-model/index.ts
@@ -11,5 +11,7 @@ export {
   transformElementPositionToId,
 } from './inspect';
 
+export { findElementPoin } from './prompt/find_element_point';
 export { plan } from './automation';
+export { planTargetAction } from './prompt/plan-action';
 export { callAiFn } from './common';