Feat/rename env key (#9)

* feat: rename insight.find -> insight.locate
* feat: rename the env key for model name
* fix: ci
web-infra-dev · Jul 25, 2024 · b16b14f · b16b14f
1 parent de1129f
commit b16b14f
Show file tree

Hide file tree

Showing 22 changed files with 97 additions and 94 deletions.
diff --git a/apps/site/docs/doc/getting-started/introduction.mdx b/apps/site/docs/doc/getting-started/introduction.mdx
@@ -26,7 +26,7 @@ Here is a flowchart illustrating the main process of MidScene.
 Using GPT-4o, you can now locate the elements by natural language. Just like someone is viewing your page. DOM selectors should no longer be necessary.
 
 ```typescript
-const downloadBtns = await insight.find('download buttons on the page', {multi: true});
+const downloadBtns = await insight.locate('download buttons on the page', {multi: true});
 console.log(downloadBtns);
 ```
 
@@ -48,7 +48,7 @@ Use `query` to  achieve this.
 For example, if you want to understand some properties while locating elements:
 
 ```typescript
-const downloadBtns = await insight.find(query('download buttons on the page', {
+const downloadBtns = await insight.locate(query('download buttons on the page', {
   textsOnButton: 'string',
   backgroundColor: 'string, color of text, one of blue / red / yellow / green / white / black / others',
   type: '`major` or `minor`. The Bigger one is major and the others are minor',

diff --git a/apps/site/docs/doc/getting-started/quick-start.md b/apps/site/docs/doc/getting-started/quick-start.md
@@ -38,7 +38,7 @@ Promise.resolve(
 
     // ⭐ find the main download button and its backgroundColor ⭐
     const insight = await Insight.fromPuppeteerBrowser(browser);
-    const downloadBtn = await insight.find(
+    const downloadBtn = await insight.locate(
       query('main download button on the page', {
         textsOnButton: 'string',
         backgroundColor: 'string, color of text, one of blue / red / yellow / green / white / black / others',

diff --git a/apps/site/docs/doc/integration/puppeteer.md b/apps/site/docs/doc/integration/puppeteer.md
@@ -11,7 +11,7 @@ const insightA = await Insight.fromPuppeteerPage(page);
 const insightB = await Insight.fromPuppeteerBrowser(browser);
 
 // continue your code here
-const button = await insightA.find(/* ... */);
+const button = await insightA.locate(/* ... */);
 
 // perform a click action by coordinates
 await page.mouse.click(...button.center);

diff --git a/apps/site/docs/doc/usage/API.md b/apps/site/docs/doc/usage/API.md
@@ -3,7 +3,7 @@
 A typical process to understand a UI is like:
 * Build an `Insight` instance
 * Locate something
-  * Use `insight.find` to locate an easily identifiable element
+  * Use `insight.locate` to locate an easily identifiable element
   * Use `insight.segment` to find an area consisting of multiple elements (i.e. a section).
   * By passing a `query`, both `locate` and `segment` can be used to extract data from the user interface. For example, extract a username from an input element or extract well-structured table data from a section.
 * Make an interaction or an assertion
@@ -65,10 +65,10 @@ Usually you need to find an `element` to interact with the page.
 Use `await insight.locate(queryPrompt, opt?: {multi: boolean})` to find one or more basic elements. For example:
 ```typescript
 // find one text element
-const result = await insight.find('the biggest Download button');
+const result = await insight.locate('the biggest Download button');
 
 // find all the buttons on nav bar
-const result = await insight.find('the nav bar buttons', {multi: true});
+const result = await insight.locate('the nav bar buttons', {multi: true});
 ```
 
 ### extract data from element(s)
@@ -77,7 +77,7 @@ Pass `query(queryPrompt, dataShape)` as `queryPrompt`, you can extract customize
 
 ```typescript
 // find one text element
-const result = await insight.find(query('the biggest Download button', {{
+const result = await insight.locate(query('the biggest Download button', {{
   textColor: 'string, color of text, one of blue / red / yellow / green / white / black / others',
   backgroundColor: 'string, color of background, one of blue / red / yellow / green / white / black / others',
 }}));
@@ -223,7 +223,7 @@ query<RichUI>(
 On the other hand, if you do not need to extract any data from the UI, you can use a plain string as a shortcut instead of a query.
 
 ```typescript
-const result1 = await insight.find('the biggest Download button');
+const result1 = await insight.locate('the biggest Download button');
 
 const result2 = await insight.segment({
   'data-record': 'Data Record title and table',

diff --git a/packages/midscene/src/action/executor.ts b/packages/midscene/src/action/executor.ts
@@ -3,7 +3,7 @@ import {
   ExecutionTask,
   ExecutionTaskApply,
   ExecutionDump,
-  ExecutionTaskInsightFindOutput,
+  ExecutionTaskInsightLocateOutput,
   ExecutionTaskReturn,
   ExecutorContext,
 } from '@/types';
@@ -69,7 +69,7 @@ export class Executor {
     let successfullyCompleted = true;
     let errorMsg = '';
 
-    let previousFindOutput: ExecutionTaskInsightFindOutput | undefined;
+    let previousFindOutput: ExecutionTaskInsightLocateOutput | undefined;
 
     while (taskIndex < this.tasks.length) {
       const task = this.tasks[taskIndex];
@@ -94,11 +94,14 @@ export class Executor {
         };
         if (task.type === 'Insight') {
           assert(
-            task.subType === 'find' || task.subType === 'query',
+            task.subType === 'Locate' || task.subType === 'Query',
             `unsupported insight subType: ${task.subType}`,
           );
           returnValue = await task.executor(param, executorContext);
-          previousFindOutput = (returnValue as ExecutionTaskReturn<ExecutionTaskInsightFindOutput>)?.output;
+          if (task.subType === 'Locate') {
+            previousFindOutput = (returnValue as ExecutionTaskReturn<ExecutionTaskInsightLocateOutput>)
+              ?.output;
+          }
         } else if (task.type === 'Action' || task.type === 'Planning') {
           returnValue = await task.executor(param, executorContext);
         } else {

diff --git a/packages/midscene/src/ai-model/openai.ts b/packages/midscene/src/ai-model/openai.ts
@@ -5,19 +5,22 @@ import wrapper from 'langsmith/wrappers';
 import { AIResponseFormat } from '@/types';
 
 const envConfigKey = 'MIDSCENE_OPENAI_INIT_CONFIG_JSON';
-const envModelKey = 'MIDSCENE_OPENAI_MODEL';
+const envModelKey = 'MIDSCENE_MODEL_NAME';
 const envSmithDebug = 'MIDSCENE_LANGSMITH_DEBUG';
 
-async function createOpenAI() {
-  let extraConfig: ClientOptions = {};
+let extraConfig: ClientOptions = {};
+if (typeof process.env[envConfigKey] === 'string') {
+  console.log('will use env config for openai');
+  extraConfig = JSON.parse(process.env[envConfigKey]);
+}
 
-  if (typeof process.env[envConfigKey] === 'string') {
-    console.log('will use env config for openai');
-    extraConfig = JSON.parse(process.env[envConfigKey]);
-  } else if (!process.env.OPENAI_API_KEY) {
-    console.warn('OPENAI_API is missing');
-  }
+let model = 'gpt-4o';
+if (typeof process.env[envModelKey] === 'string') {
+  console.log(`will use model: ${process.env[envModelKey]}`);
+  model = process.env[envModelKey];
+}
 
+async function createOpenAI() {
   const openai = new OpenAI(extraConfig);
 
   if (process.env[envSmithDebug]) {
@@ -35,7 +38,7 @@ export async function call(
 ): Promise<string> {
   const openai = await createOpenAI();
   const completion = await openai.chat.completions.create({
-    model: process.env[envModelKey] || 'gpt-4o',
+    model,
     messages,
     response_format: { type: responseFormat },
   });

diff --git a/packages/midscene/src/automation/planning.ts b/packages/midscene/src/automation/planning.ts
@@ -14,8 +14,8 @@ export function systemPromptToTaskPlanning(query: string) {
   Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
 
   Each action has a type and corresponding param. To be detailed:
-  * type: 'Find', it means to locate one element
-    * param: { prompt: string }, the prompt describes 'which element to find on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
+  * type: 'Locate', it means to locate one element
+    * param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
   * type: 'Tap', tap the previous element found 
     * param: null
   * type: 'Hover', hover the previous element found
@@ -35,7 +35,7 @@ export function systemPromptToTaskPlanning(query: string) {
   
   Remember: 
   1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
-  2. In most cases, you should Find one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+  2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
 
   If any error occurs during the task planning (like the page content and task are irrelevant, or the element mentioned does not exist at all), please return the error message with explanation in the errors field. The thoughts、prompts、error messages should all in the same language as the user query.
   

diff --git a/packages/midscene/src/insight/index.ts b/packages/midscene/src/insight/index.ts
@@ -60,14 +60,14 @@ export default class Insight<ElementType extends BaseElement = BaseElement> {
       this.aiVendorFn = opt.aiVendorFn;
     }
     if (typeof opt?.taskInfo !== 'undefined') {
-      this.taskInfo = opt.taskInfo; // TODO: remove `name` field
+      this.taskInfo = opt.taskInfo;
     }
   }
 
-  async find(queryPrompt: string): Promise<ElementType | null>;
-  async find(queryPrompt: string, opt: { multi: true }): Promise<ElementType[]>;
-  async find(queryPrompt: string, opt?: FindElementOptions) {
-    assert(queryPrompt, 'query is required for find');
+  async locate(queryPrompt: string): Promise<ElementType | null>;
+  async locate(queryPrompt: string, opt: { multi: true }): Promise<ElementType[]>;
+  async locate(queryPrompt: string, opt?: FindElementOptions) {
+    assert(queryPrompt, 'query is required for located');
     const dumpSubscriber = this.onceDumpUpdatedFn;
     this.onceDumpUpdatedFn = undefined;
     const context = await this.contextRetrieverFn();
@@ -90,11 +90,11 @@ export default class Insight<ElementType extends BaseElement = BaseElement> {
 
     let errorLog: string | undefined;
     if (parseResult.errors?.length) {
-      errorLog = `find - AI response error: \n${parseResult.errors.join('\n')}`;
+      errorLog = `locate - AI response error: \n${parseResult.errors.join('\n')}`;
     }
 
     const dumpData: PartialInsightDumpFromSDK = {
-      type: 'find',
+      type: 'locate',
       context,
       userQuery: {
         element: queryPrompt,
@@ -118,7 +118,7 @@ export default class Insight<ElementType extends BaseElement = BaseElement> {
       const element = elementById(item.id);
 
       if (!element) {
-        console.warn(`find: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
+        console.warn(`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
         return;
       }
       elements.push(element);
@@ -136,7 +136,7 @@ export default class Insight<ElementType extends BaseElement = BaseElement> {
     if (opt?.multi) {
       return elements;
     } else if (elements.length >= 2) {
-      console.warn(`find: multiple elements found, return the first one. (query: ${queryPrompt})`);
+      console.warn(`locate: multiple elements found, return the first one. (query: ${queryPrompt})`);
       return elements[0];
     } else if (elements.length === 1) {
       return elements[0];

diff --git a/packages/midscene/src/types.ts b/packages/midscene/src/types.ts
@@ -107,9 +107,6 @@ export interface BasicSectionQuery {
 export type InsightExtractParam = string | Record<string, string>;
 
 export interface InsightTaskInfo {
-  // TODO: remove name / url
-  name?: string;
-  url?: string;
   durationMs: number;
   systemPrompt?: string;
   rawResponse?: string;
@@ -121,7 +118,7 @@ export interface DumpMeta {
 }
 
 export interface InsightDump extends DumpMeta {
-  type: 'find' | 'extract';
+  type: 'locate' | 'extract';
   logId: string;
   context: UIContext;
   userQuery: {
@@ -157,7 +154,7 @@ export type ElementById = (id: string) => BaseElement | null;
 
 export interface PlanningAction<ParamType = any> {
   thought: string;
-  type: 'Find' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error';
+  type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error';
   param: ParamType;
 }
 
@@ -257,26 +254,26 @@ export interface ExecutionDump extends DumpMeta {
 /*
 task - insight-locate
 */
-export interface ExecutionTaskInsightFindParam {
-  query: string;
+export interface ExecutionTaskInsightLocateParam {
+  prompt: string;
 }
 
-export interface ExecutionTaskInsightFindOutput {
+export interface ExecutionTaskInsightLocateOutput {
   element: BaseElement | null;
 }
 
-export interface ExecutionTaskInsightFindLog {
+export interface ExecutionTaskInsightLocateLog {
   dump?: InsightDump;
 }
 
-export type ExecutionTaskInsightFindApply = ExecutionTaskApply<
+export type ExecutionTaskInsightLocateApply = ExecutionTaskApply<
   'Insight',
-  ExecutionTaskInsightFindParam,
-  ExecutionTaskInsightFindOutput,
-  ExecutionTaskInsightFindLog
+  ExecutionTaskInsightLocateParam,
+  ExecutionTaskInsightLocateOutput,
+  ExecutionTaskInsightLocateLog
 >;
 
-export type ExecutionTaskInsightFind = ExecutionTask<ExecutionTaskInsightFindApply>;
+export type ExecutionTaskInsightLocate = ExecutionTask<ExecutionTaskInsightLocateApply>;
 
 /*
 task - insight-extract
@@ -293,7 +290,7 @@ export type ExecutionTaskInsightQueryApply = ExecutionTaskApply<'Insight', Execu
 
 export type ExecutionTaskInsightQuery = ExecutionTask<ExecutionTaskInsightQueryApply>;
 
-// export type ExecutionTaskInsight = ExecutionTaskInsightFind; // | ExecutionTaskInsightExtract;
+// export type ExecutionTaskInsight = ExecutionTaskInsightLocate; // | ExecutionTaskInsightExtract;
 
 /*
 task - action (i.e. interact) 

diff --git a/packages/midscene/src/utils.ts b/packages/midscene/src/utils.ts
@@ -38,7 +38,7 @@ export function getPkgInfo(): PkgInfo {
 let logDir = join(process.cwd(), './midscene_run/');
 let logEnvReady = false;
 export const insightDumpFileExt = 'insight-dump.json';
-export const groupedActionDumpFileExt = 'all-logs.json';
+export const groupedActionDumpFileExt = 'web-dump.json';
 
 export function getDumpDir() {
   return logDir;

diff --git a/packages/midscene/tests/ai-model/inspector/todo_inspector.test.ts b/packages/midscene/tests/ai-model/inspector/todo_inspector.test.ts
@@ -29,7 +29,7 @@ const testTodoCases = [
 
 
 repeat(2, (repeatIndex) => {
-  it('todo: inspector element', async () => {
+  it('todo: inspect element', async () => {
     const { context } = await getPageTestData(path.join(__dirname, './test-data/todo'));
 
     const { aiResponse, filterUnStableinf } = await runTestCases(testTodoCases, async (testCase)=>{

diff --git a/packages/midscene/tests/ai-model/inspector/xicha_inspector.test.ts b/packages/midscene/tests/ai-model/inspector/xicha_inspector.test.ts
@@ -31,7 +31,7 @@ const testCases = [
 ];
 
 repeat(5, (repeatIndex) => {
-  test('xicha: inspector element', async () => {
+  test('xicha: inspect element', async () => {
     const { context } = await getPageTestData(path.join(__dirname, './test-data/xicha'));
 
     const { aiResponse, filterUnStableinf } = await runTestCases(testCases, async (testCase)=>{

diff --git a/packages/midscene/tests/ai-model/showcase.test.ts b/packages/midscene/tests/ai-model/showcase.test.ts
@@ -1,7 +1,7 @@
 /* eslint-disable max-len */
 /* eslint-disable max-lines-per-function */
 import { it, describe, expect, vi, beforeEach, afterAll } from 'vitest';
-import Insight, { getSection , ExecutionTaskActionApply, ExecutionTaskInsightFindApply, Executor, BaseElement } from '@/index';
+import Insight, { getSection , ExecutionTaskActionApply, ExecutionTaskInsightLocateApply, Executor, BaseElement } from '@/index';
 
 // import { launch } from 'tests/utils';
 import { copyFileSync, existsSync, readFileSync } from 'fs';
@@ -25,7 +25,7 @@ describe('Show case - vscode site, write demo data', () => {
 
   it('download buttons of vscode', async (context) => {
     const insight = new Insight(generateUIContext(path.join(__dirname, './inspector/test-data/visualstudio')));
-    const downloadBtns = await insight.find('download buttons on the page');
+    const downloadBtns = await insight.locate('download buttons on the page');
     assert(downloadBtns, 'donwload buttons not found');
     expect(downloadBtns.content).toBe('Download for Windows');
   });
@@ -59,7 +59,7 @@ describe('Show case - vscode site, write demo data', () => {
 //     browser = await launch(todomvc);
 //     const insight = await Insight.fromPuppeteerBrowser(browser);
 
-//     const insightTask: ExecutionTaskInsightFindApply = {
+//     const insightTask: ExecutionTaskInsightLocateApply = {
 //       type: 'Insight',
 //       param: {
 //         query: 'input box of the page',
@@ -144,7 +144,7 @@ describe('Show case - vscode site, write demo data', () => {
 //     browser = await launch(vscodeSite);
 //     const insight = await Insight.fromPuppeteerBrowser(browser);
 
-//     const downloadBtns = await insight.find('download buttons on the page', {multi: true});
+//     const downloadBtns = await insight.locate('download buttons on the page', {multi: true});
 //     expect(downloadBtns.length).toBe(2);
 //   });
 
@@ -163,7 +163,7 @@ describe('Show case - vscode site, write demo data', () => {
 //     browser = await launch(vscodeSite);
 //     const insight = await Insight.fromPuppeteerBrowser(browser);
 
-//     const downloadBtns = await insight.find(query('download buttons on the page', {
+//     const downloadBtns = await insight.locate(query('download buttons on the page', {
 //       textsOnButton: 'string',
 //       backgroundColor: 'string, color of text, one of blue / red / yellow / green / white / black / others',
 //       type: '`major` or `minor`. The Bigger one is major and the others are minor',