feat: use dirty-json to parse json response

web-infra-dev · Dec 19, 2024 · 9481f6d
1 parent 6c14610
commit 9481f6d
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 17 deletions.
diff --git a/apps/site/docs/en/faq.md b/apps/site/docs/en/faq.md
@@ -2,9 +2,7 @@
 
 ## Can Midscene smartly plan the actions according to my one-line goal? Like executing "Tweet 'hello world'"
 
-Midscene is an automation assistance SDK with a key feature of action stability — ensuring the same actions are performed in each run. To maintain this stability, we encourage you to provide detailed instructions to help the AI understand each step of your task.
-
-If you require a 'goal-to-task' AI planning tool, you can develop one based on Midscene.
+No. Midscene is an automation assistance SDK with a key feature of action stability — ensuring the same actions are performed in each run. To maintain this stability, we encourage you to provide detailed instructions to help the AI understand each step of your task.
 
 Related Docs: [Prompting Tips](./prompting-tips.html)
 

diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -39,6 +39,7 @@
   "dependencies": {
     "@azure/identity": "4.5.0",
     "@midscene/shared": "workspace:*",
+    "dirty-json": "0.9.2",
     "openai": "4.57.1",
     "optional": "0.1.4",
     "socks-proxy-agent": "8.0.4"

diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts
@@ -5,6 +5,7 @@ import {
   getBearerTokenProvider,
 } from '@azure/identity';
 import { ifInBrowser } from '@midscene/shared/utils';
+import dJSON from 'dirty-json';
 import OpenAI, { AzureOpenAI } from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources';
 import { SocksProxyAgent } from 'socks-proxy-agent';
@@ -188,12 +189,18 @@ export async function callToGetJSONObject<T>(
   let jsonContent = safeJsonParse(response.content);
   if (jsonContent) return { content: jsonContent, usage: response.usage };
 
-  jsonContent = extractJSONFromCodeBlock(response.content);
+  const cleanJsonString = extractJSONFromCodeBlock(response.content);
   try {
-    return { content: JSON.parse(jsonContent), usage: response.usage };
-  } catch {
-    throw Error(`failed to parse json response: ${response.content}`);
-  }
+    jsonContent = JSON.parse(cleanJsonString);
+  } catch {}
+  if (jsonContent) return { content: jsonContent, usage: response.usage };
+
+  try {
+    jsonContent = dJSON.parse(cleanJsonString);
+  } catch {}
+  if (jsonContent) return { content: jsonContent, usage: response.usage };
+
+  throw Error(`failed to parse json response: ${response.content}`);
 }
 
 export function extractJSONFromCodeBlock(response: string) {

diff --git a/packages/midscene/src/ai-model/openai/types.d.ts b/packages/midscene/src/ai-model/openai/types.d.ts
@@ -0,0 +1 @@
+declare module 'dirty-json';
diff --git a/packages/midscene/src/ai-model/prompt/planning.ts b/packages/midscene/src/ai-model/prompt/planning.ts
@@ -52,7 +52,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- Respond only with valid JSON. Do not write an introduction or summary.
+- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
 - If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
 
 ## About the \`actions\` field
@@ -140,7 +140,6 @@ By viewing the page screenshot and description, you should consider this and out
 * The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field. 
 * The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed.
 
-\`\`\`json
 {
   "actions":[
     {
@@ -171,8 +170,6 @@ By viewing the page screenshot and description, you should consider this and out
     "whatHaveDone": "Click the language switch button and wait 1s" 
   }
 }
-\`\`\`
-
 
 ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
 
@@ -181,7 +178,6 @@ If the user says "If there is a popup, close it", you should consider this and o
 * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
 * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
 
-\`\`\`json
 {
   "actions": [{
       "thought": "There is no popup on the page",
@@ -192,18 +188,15 @@ If the user says "If there is a popup, close it", you should consider this and o
   "taskWillBeAccomplished": true,
   "furtherPlan": null
 }
-\`\`\`
 
 For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
 
-\`\`\`json
 {
   "actions": [],
   "error": "The instruction and page context are irrelevant, there is no popup on the page",
   "taskWillBeAccomplished": true,
   "furtherPlan": null
 }
-\`\`\`
 
 ## Example #3 : When task is accomplished, don't plan more actions
 
@@ -224,6 +217,7 @@ When the user ask to "Wait 4s", you should consider this:
 ## Bad case #1 : Missing \`prompt\` in the 'Locate' field; Missing \`furtherPlan\` field when the task won't be accomplished
 
 Wrong output:
+
 {
   "actions":[
     {

diff --git a/packages/midscene/tests/ai/openai.test.ts b/packages/midscene/tests/ai/connectivity.test.ts
@@ -35,7 +35,7 @@ describe('openai sdk connectivity', () => {
       ],
       AIActionType.EXTRACT_DATA,
     );
-    expect(result.content.answer).toBe(15);
+    expect(result.content).toEqual({ answer: 15 });
   });
 
   it('image input', async () => {

diff --git a/packages/midscene/tests/unit-test/utils.test.ts b/packages/midscene/tests/unit-test/utils.test.ts
@@ -98,6 +98,10 @@ describe('extractJSONFromCodeBlock', () => {
     const input = '```json\n{ "key": "value" }\n```';
     const result = extractJSONFromCodeBlock(input);
     expect(result).toBe('{ "key": "value" }');
+
+    const input2 = '  ```JSON\n{ "key": "value" }\n```';
+    const result2 = extractJSONFromCodeBlock(input2);
+    expect(result2).toBe('{ "key": "value" }');
   });
 
   it('should extract JSON from a code block without language specifier', () => {

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml