Skip to content

Commit

Permalink
feat(ai-model): support coze platform's model (#50)
Browse files Browse the repository at this point in the history
* chore: optimize prompt structure

* feat(ai-model): add coze platform

* feat(ai-model): add coze platform

* chore: add unit test

* feat(ai-model): add extract unit test

* chore: add unit test

* chore: add unit test

* chore: optimize plan prompt

* chore: optimize plan prompt

* chore: fix lint error

* chore: fix lint error

* chore: fix unit test

* chore: ignore cache file
  • Loading branch information
zhoushaw authored Aug 9, 2024
1 parent 8365497 commit 959473f
Show file tree
Hide file tree
Showing 41 changed files with 949 additions and 1,236 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ playwright/.cache/

# Midscene.js dump files
__ai_responses__/
midscene_run
midscene_run/

.nx/cache
.nx/workspace-data
Expand Down
1 change: 1 addition & 0 deletions biome.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"**/midscene_run",
".nx",
"**/dist",
"dist",
"**/doc_build",
"*-dump.json",
"script_get_all_texts.tmp.js",
Expand Down
5 changes: 1 addition & 4 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@
"bin": {
"midscene": "./bin/midscene"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "modern dev",
"build": "modern build",
Expand Down
25 changes: 8 additions & 17 deletions packages/midscene/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
"main": "./dist/lib/index.js",
"module": "./dist/es/index.js",
"types": "./dist/types/index.d.ts",
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"exports": {
".": {
"types": "./dist/types/index.d.ts",
Expand Down Expand Up @@ -44,18 +41,10 @@
},
"typesVersions": {
"*": {
".": [
"./dist/types/index.d.ts"
],
"utils": [
"./dist/types/utils.d.ts"
],
"ai-model": [
"./dist/types/ai-model.d.ts"
],
"image": [
"./dist/types/image.d.ts"
]
".": ["./dist/types/index.d.ts"],
"utils": ["./dist/types/utils.d.ts"],
"ai-model": ["./dist/types/ai-model.d.ts"],
"image": ["./dist/types/image.d.ts"]
}
},
"scripts": {
Expand All @@ -70,9 +59,11 @@
},
"dependencies": {
"openai": "4.47.1",
"sharp": "0.33.3"
"sharp": "0.33.3",
"node-fetch": "2.6.7"
},
"devDependencies": {
"@types/node-fetch": "2.6.11",
"@modern-js/module-tools": "^2.56.1",
"@types/node": "^18.0.0",
"langsmith": "0.1.36",
Expand Down
82 changes: 82 additions & 0 deletions packages/midscene/src/ai-model/automation/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import assert from 'node:assert';
import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
import { AIActionType, type AIArgs, callAiFn } from '../common';
import { describeUserPage } from '../prompt/util';
import { systemPromptToTaskPlanning } from './planning';

/**
 * Ask the AI model to decompose a user task into an ordered list of actions.
 *
 * The request consists of the task-planning system prompt and one user
 * message carrying the page screenshot, the textual page description and the
 * task text.
 *
 * @param userPrompt - Natural-language description of the task to plan.
 * @param opts - `context` is the UI context to plan against; `callAI`
 *   optionally replaces the default `callAiFn` transport.
 * @param useModel - Optional model selector, forwarded unchanged to the AI call.
 * @returns The planned actions, under `plans`.
 * @throws Error carrying the model's own message when the response has an
 *   `error` field or contains an action of type `'Error'`.
 * @throws AssertionError when `context` is missing, when no response is
 *   returned, or when the response contains no actions.
 */
export async function plan(
  userPrompt: string,
  opts: {
    context: UIContext;
    callAI?: typeof callAiFn<PlanningAIResponse>;
  },
  useModel?: 'coze' | 'openAI',
): Promise<{ plans: PlanningAction[] }> {
  const { callAI, context } = opts || {};
  // Fail with a readable message instead of an opaque destructuring TypeError.
  assert(context, 'context is required to plan actions');

  const { screenshotBase64 } = context;
  const { description: pageDescription } = await describeUserPage(context);

  const systemPrompt = systemPromptToTaskPlanning();
  const msgs: AIArgs = [
    { role: 'system', content: systemPrompt },
    {
      role: 'user',
      content: [
        {
          type: 'image_url',
          image_url: {
            url: screenshotBase64,
            detail: 'high',
          },
        },
        {
          type: 'text',
          text: `
pageDescription: ${pageDescription}
`,
        },
        {
          type: 'text',
          text: `
Here is the description of the task. Just go ahead:
=====================================
${userPrompt}
=====================================
`,
        },
      ],
    },
  ];

  // A caller-supplied transport takes precedence; both share one call shape.
  const requestPlan = callAI ?? callAiFn<PlanningAIResponse>;
  const planFromAI: PlanningAIResponse | undefined = await requestPlan({
    msgs,
    AIActionType: AIActionType.PLAN,
    useModel,
  });

  assert(planFromAI, "can't get planFromAI");

  // Report the model's own explanation before checking for an empty plan:
  // an error response typically comes with no actions, and the generic
  // "no actions" assertion would otherwise hide the real reason.
  if (planFromAI.error) {
    throw new Error(planFromAI.error);
  }

  const actions = planFromAI.actions || [];
  assert(actions.length > 0, 'no actions in ai plan');

  // The first action flagged as an error aborts the whole plan.
  const failedAction = actions.find((task) => task.type === 'Error');
  if (failedAction) {
    throw new Error(failedAction.thought);
  }

  return { plans: actions };
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
import { describeUserPage } from '@/ai-model';
import { callToGetJSONObject } from '@/ai-model/openai';
import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
import type { ChatCompletionMessageParam } from 'openai/resources';

const characteristic =
'You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.';

export function systemPromptToTaskPlanning(query: string) {
export function systemPromptToTaskPlanning() {
return `
${characteristic}
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
Expand Down Expand Up @@ -40,7 +32,7 @@ export function systemPromptToTaskPlanning(query: string) {
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
If any error occurs during the task planning (like the page content and task are irrelevant, or the element mentioned does not exist at all), please return the error message with explanation in the errors field. The thoughts、promptserror messages should all in the same language as the user query.
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
Return in the following JSON format:
{
Expand All @@ -55,57 +47,5 @@ export function systemPromptToTaskPlanning(query: string) {
],
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here
}
Here is the description of the task. Just go ahead:
=====================================
${query}
=====================================
`;
}

/**
 * Plan the actions needed to carry out `userPrompt` on the page given by
 * `opts.context`.
 *
 * The task text is embedded in the system prompt; the user message carries
 * the page screenshot and the page description.
 *
 * @param userPrompt - Natural-language description of the task.
 * @param opts - `context` is the UI context; `callAI` optionally replaces the
 *   default `callToGetJSONObject` transport.
 * @returns The planned actions, under `plans`.
 * @throws Error when the response has an `error` field or contains an action
 *   of type `'Error'`.
 */
export async function plan(
  userPrompt: string,
  opts: {
    context: UIContext;
    callAI?: typeof callToGetJSONObject<PlanningAIResponse>;
  },
): Promise<{ plans: PlanningAction[] }> {
  const {
    context,
    callAI: requestPlan = callToGetJSONObject<PlanningAIResponse>,
  } = opts || {};
  const { screenshotBase64 } = context;
  const pageInfo = await describeUserPage(context);

  const systemMessage: ChatCompletionMessageParam = {
    role: 'system',
    content: systemPromptToTaskPlanning(userPrompt),
  };
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: [
      {
        type: 'image_url',
        image_url: { url: screenshotBase64, detail: 'high' },
      },
      { type: 'text', text: pageInfo.description },
    ],
  };
  const msgs: ChatCompletionMessageParam[] = [systemMessage, userMessage];

  const response = await requestPlan(msgs);
  if (response.error) {
    throw new Error(response.error);
  }

  // Any action flagged as an error aborts the whole plan.
  const plannedActions = response.actions;
  for (const step of plannedActions) {
    if (step.type === 'Error') {
      throw new Error(step.thought);
    }
  }

  return { plans: plannedActions };
}
63 changes: 63 additions & 0 deletions packages/midscene/src/ai-model/common.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import type {
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
} from 'openai/resources';
import {
COZE_AI_ACTION_BOT_ID,
COZE_AI_ASSERT_BOT_ID,
COZE_EXTRACT_INFO_BOT_ID,
COZE_INSPECT_ELEMENT_BOT_ID,
callCozeAi,
transfromOpenAiArgsToCoze,
useCozeModel,
} from './coze';
import { callToGetJSONObject, useOpenAIModel } from './openai';

/**
 * Message pair passed to the AI helpers: exactly one system message
 * followed by exactly one user message.
 */
export type AIArgs = [
  system: ChatCompletionSystemMessageParam,
  user: ChatCompletionUserMessageParam,
];

/**
 * Kind of AI request being made. `callAiFn` uses it to choose the Coze bot.
 * Members are numbered consecutively from 0.
 */
export enum AIActionType {
  ASSERT = 0,
  INSPECT_ELEMENT,
  EXTRACT_DATA,
  PLAN,
}

/**
 * Send a system/user message pair to whichever AI backend is selected.
 *
 * The OpenAI check runs first and receives both messages. Otherwise, if the
 * Coze check passes, only the user message (`msgs[1]`) is converted and sent
 * to the bot chosen by the action type.
 *
 * @param options - `msgs` is the message pair, `AIActionType` selects the
 *   Coze bot, and `useModel` is passed to both backend checks.
 * @returns The result produced by the selected backend helper.
 * @throws Error when neither backend check passes.
 */
export async function callAiFn<T>(options: {
  msgs: AIArgs;
  AIActionType: AIActionType;
  useModel?: 'openAI' | 'coze';
}) {
  const { msgs, useModel } = options;
  const actionType = options.AIActionType;

  if (useOpenAIModel(useModel)) {
    return await callToGetJSONObject<T>(msgs);
  }

  if (!useCozeModel(useModel)) {
    throw Error('Does not contain coze and openai environment variables');
  }

  // Every action type without a dedicated bot uses the generic action bot.
  const botId =
    actionType === AIActionType.ASSERT
      ? COZE_AI_ASSERT_BOT_ID
      : actionType === AIActionType.EXTRACT_DATA
        ? COZE_EXTRACT_INFO_BOT_ID
        : actionType === AIActionType.INSPECT_ELEMENT
          ? COZE_INSPECT_ELEMENT_BOT_ID
          : COZE_AI_ACTION_BOT_ID;

  const cozePayload = transfromOpenAiArgsToCoze(msgs[1]);
  return await callCozeAi<T>({
    ...cozePayload,
    botId,
  });
}
Loading

0 comments on commit 959473f

Please sign in to comment.