From 6c1461049ceb9e417caf33d8bbfbfc9cbb834b46 Mon Sep 17 00:00:00 2001 From: yutao Date: Thu, 19 Dec 2024 16:45:32 +0800 Subject: [PATCH 1/3] feat: update the Azure OpenAI integration #191 --- apps/site/docs/en/model-provider.md | 8 + apps/site/docs/zh/model-provider.md | 11 +- packages/midscene/package.json | 1 + .../midscene/src/ai-model/openai/index.ts | 31 +++ packages/midscene/src/env.ts | 17 +- pnpm-lock.yaml | 231 ++++++++++++++++++ 6 files changed, 295 insertions(+), 4 deletions(-) diff --git a/apps/site/docs/en/model-provider.md b/apps/site/docs/en/model-provider.md index 2dcb71c6..af094c5f 100644 --- a/apps/site/docs/en/model-provider.md +++ b/apps/site/docs/en/model-provider.md @@ -30,6 +30,14 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080" ``` +Using Azure OpenAI Service: + +```bash +export MIDSCENE_USE_AZURE_OPENAI=1 +export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default" +export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}' +``` + Note: - Always choose a model that supports vision input. diff --git a/apps/site/docs/zh/model-provider.md b/apps/site/docs/zh/model-provider.md index 444898b1..bdb94922 100644 --- a/apps/site/docs/zh/model-provider.md +++ b/apps/site/docs/zh/model-provider.md @@ -17,9 +17,6 @@ export OPENAI_API_KEY="sk-abcdefghijklmnopqrstuvwxyz" # 可选, 如果你想更换 base URL export OPENAI_BASE_URL="https://..." 
-# 可选, 如果你想使用 Azure OpenAI 服务 -export OPENAI_USE_AZURE="true" - # 可选, 如果你想指定模型名称 export MIDSCENE_MODEL_NAME='qwen-vl-max-lates'; @@ -30,6 +27,14 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080" ``` +使用 Azure OpenAI 服务时的配置: + +```bash +export MIDSCENE_USE_AZURE_OPENAI=1 +export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default" +export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}' +``` + 说明: - 务必选择一个支持视觉输入的模型。目前我们已知支持的模型有:`gpt-4o`, `qwen-vl-max-latest` (千问), `gemini-1.5-pro` diff --git a/packages/midscene/package.json b/packages/midscene/package.json index 11575189..78b8f805 100644 --- a/packages/midscene/package.json +++ b/packages/midscene/package.json @@ -37,6 +37,7 @@ "prepublishOnly": "npm run build" }, "dependencies": { + "@azure/identity": "4.5.0", "@midscene/shared": "workspace:*", "openai": "4.57.1", "optional": "0.1.4", diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts index 1c6d70e4..b838e311 100644 --- a/packages/midscene/src/ai-model/openai/index.ts +++ b/packages/midscene/src/ai-model/openai/index.ts @@ -1,16 +1,23 @@ import assert from 'node:assert'; import { AIResponseFormat, type AIUsageInfo } from '@/types'; +import { + DefaultAzureCredential, + getBearerTokenProvider, +} from '@azure/identity'; import { ifInBrowser } from '@midscene/shared/utils'; import OpenAI, { AzureOpenAI } from 'openai'; import type { ChatCompletionMessageParam } from 'openai/resources'; import { SocksProxyAgent } from 'socks-proxy-agent'; import { + MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, + MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, + MIDSCENE_USE_AZURE_OPENAI, 
OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_USE_AZURE, @@ -26,6 +33,7 @@ import { assertSchema } from '../prompt/util'; export function preferOpenAIModel(preferVendor?: 'coze' | 'openAI') { if (preferVendor && preferVendor !== 'openAI') return false; if (getAIConfig(OPENAI_API_KEY)) return true; + if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) return true; return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON)); } @@ -47,7 +55,9 @@ async function createOpenAI() { const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY); const socksAgent = socksProxy ? new SocksProxyAgent(socksProxy) : undefined; + if (getAIConfig(OPENAI_USE_AZURE)) { + // this is deprecated openai = new AzureOpenAI({ baseURL: getAIConfig(OPENAI_BASE_URL), apiKey: getAIConfig(OPENAI_API_KEY), @@ -55,6 +65,27 @@ async function createOpenAI() { ...extraConfig, dangerouslyAllowBrowser: true, }); + } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) { + // sample code: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/openai/openai/samples/cookbook/simpleCompletionsPage/app.js + const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE); + + assert( + !ifInBrowser, + 'Azure OpenAI is not supported in browser with Midscene.', + ); + const credential = new DefaultAzureCredential(); + + assert(scope, 'MIDSCENE_AZURE_OPENAI_SCOPE is required'); + const tokenProvider = getBearerTokenProvider(credential, scope); + + const extraAzureConfig = getAIConfigInJson( + MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, + ); + openai = new AzureOpenAI({ + azureADTokenProvider: tokenProvider, + ...extraConfig, + ...extraAzureConfig, + }); } else { openai = new OpenAI({ baseURL: getAIConfig(OPENAI_BASE_URL), diff --git a/packages/midscene/src/env.ts b/packages/midscene/src/env.ts index e2a233c0..b906e218 100644 --- a/packages/midscene/src/env.ts +++ b/packages/midscene/src/env.ts @@ -11,11 +11,19 @@ export const MIDSCENE_OPENAI_SOCKS_PROXY = 'MIDSCENE_OPENAI_SOCKS_PROXY'; export const OPENAI_API_KEY = 'OPENAI_API_KEY'; export 
const OPENAI_BASE_URL = 'OPENAI_BASE_URL'; export const MIDSCENE_MODEL_TEXT_ONLY = 'MIDSCENE_MODEL_TEXT_ONLY'; -export const OPENAI_USE_AZURE = 'OPENAI_USE_AZURE'; + export const MIDSCENE_CACHE = 'MIDSCENE_CACHE'; export const MATCH_BY_POSITION = 'MATCH_BY_POSITION'; export const MIDSCENE_REPORT_TAG_NAME = 'MIDSCENE_REPORT_TAG_NAME'; +export const MIDSCENE_USE_AZURE_OPENAI = 'MIDSCENE_USE_AZURE_OPENAI'; +export const MIDSCENE_AZURE_OPENAI_SCOPE = 'MIDSCENE_AZURE_OPENAI_SCOPE'; +export const MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON = + 'MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON'; + +// @deprecated +export const OPENAI_USE_AZURE = 'OPENAI_USE_AZURE'; + const allConfigFromEnv = () => { return { [MIDSCENE_OPENAI_INIT_CONFIG_JSON]: @@ -39,6 +47,13 @@ const allConfigFromEnv = () => { process.env[MIDSCENE_REPORT_TAG_NAME] || undefined, [MIDSCENE_OPENAI_SOCKS_PROXY]: process.env[MIDSCENE_OPENAI_SOCKS_PROXY] || undefined, + [MIDSCENE_USE_AZURE_OPENAI]: + process.env[MIDSCENE_USE_AZURE_OPENAI] || undefined, + [MIDSCENE_AZURE_OPENAI_SCOPE]: + process.env[MIDSCENE_AZURE_OPENAI_SCOPE] || + 'https://cognitiveservices.azure.com/.default', + [MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON]: + process.env[MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON] || undefined, }; }; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 34772fbc..82fe79df 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -142,6 +142,9 @@ importers: packages/midscene: dependencies: + '@azure/identity': + specifier: 4.5.0 + version: 4.5.0 '@midscene/shared': specifier: workspace:* version: link:../shared @@ -459,6 +462,50 @@ packages: resolution: {integrity: sha512-qOqQG9o97Q4tIZXZyWI7JuDZGJi3yibTN7LiGLmnzNLaIhmpv26BWj5OYJibUyQLVH/aTjdZSNx4spa7EihUzg==} engines: {node: '>= 10'} + '@azure/abort-controller@2.1.2': + resolution: {integrity: sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==} + engines: {node: '>=18.0.0'} + + '@azure/core-auth@1.9.0': + resolution: {integrity: 
sha512-FPwHpZywuyasDSLMqJ6fhbOK3TqUdviZNF8OqRGA4W5Ewib2lEEZ+pBsYcBa88B2NGO/SEnYPGhyBqNlE8ilSw==} + engines: {node: '>=18.0.0'} + + '@azure/core-client@1.9.2': + resolution: {integrity: sha512-kRdry/rav3fUKHl/aDLd/pDLcB+4pOFwPPTVEExuMyaI5r+JBbMWqRbCY1pn5BniDaU3lRxO9eaQ1AmSMehl/w==} + engines: {node: '>=18.0.0'} + + '@azure/core-rest-pipeline@1.18.1': + resolution: {integrity: sha512-/wS73UEDrxroUEVywEm7J0p2c+IIiVxyfigCGfsKvCxxCET4V/Hef2aURqltrXMRjNmdmt5IuOgIpl8f6xdO5A==} + engines: {node: '>=18.0.0'} + + '@azure/core-tracing@1.2.0': + resolution: {integrity: sha512-UKTiEJPkWcESPYJz3X5uKRYyOcJD+4nYph+KpfdPRnQJVrZfk0KJgdnaAWKfhsBBtAf/D58Az4AvCJEmWgIBAg==} + engines: {node: '>=18.0.0'} + + '@azure/core-util@1.11.0': + resolution: {integrity: sha512-DxOSLua+NdpWoSqULhjDyAZTXFdP/LKkqtYuxxz1SCN289zk3OG8UOpnCQAz/tygyACBtWp/BoO72ptK7msY8g==} + engines: {node: '>=18.0.0'} + + '@azure/identity@4.5.0': + resolution: {integrity: sha512-EknvVmtBuSIic47xkOqyNabAme0RYTw52BTMz8eBgU1ysTyMrD1uOoM+JdS0J/4Yfp98IBT3osqq3BfwSaNaGQ==} + engines: {node: '>=18.0.0'} + + '@azure/logger@1.1.4': + resolution: {integrity: sha512-4IXXzcCdLdlXuCG+8UKEwLA1T1NHqUfanhXYHiQTn+6sfWCZXduqbtXDGceg3Ce5QxTGo7EqmbV6Bi+aqKuClQ==} + engines: {node: '>=18.0.0'} + + '@azure/msal-browser@3.28.0': + resolution: {integrity: sha512-1c1qUF6vB52mWlyoMem4xR1gdwiQWYEQB2uhDkbAL4wVJr8WmAcXybc1Qs33y19N4BdPI8/DHI7rPE8L5jMtWw==} + engines: {node: '>=0.8.0'} + + '@azure/msal-common@14.16.0': + resolution: {integrity: sha512-1KOZj9IpcDSwpNiQNjt0jDYZpQvNZay7QAEi/5DLubay40iGYtLzya/jbjRPLyOTZhEKyL1MzPuw2HqBCjceYA==} + engines: {node: '>=0.8.0'} + + '@azure/msal-node@2.16.2': + resolution: {integrity: sha512-An7l1hEr0w1HMMh1LU+rtDtqL7/jw74ORlc9Wnh06v7TU/xpG39/Zdr1ZJu3QpjUfKJ+E0/OXMW8DRSWTlh7qQ==} + engines: {node: '>=16'} + '@babel/code-frame@7.26.2': resolution: {integrity: sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==} engines: {node: '>=6.9.0'} @@ -4150,6 +4197,9 @@ packages: 
resolution: {integrity: sha512-Db1SbgBS/fg/392AblrMJk97KggmvYhr4pB5ZIMTWtaivCPMWLkmb7m21cJvpvgK+J3nsU2CmmixNBZx4vFj/w==} engines: {node: '>=8.0.0'} + buffer-equal-constant-time@1.0.1: + resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} + buffer-equal@0.0.1: resolution: {integrity: sha512-RgSV6InVQ9ODPdLWJ5UAqBqJBOg370Nz6ZQtRzpt6nUjc8v0St97uJ4PYC6NztqIScrAXafKM3mZPMygSe1ggA==} engines: {node: '>=0.4.0'} @@ -4966,6 +5016,9 @@ packages: eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} + ecdsa-sig-formatter@1.0.11: + resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==} + edge-paths@3.0.5: resolution: {integrity: sha512-sB7vSrDnFa4ezWQk9nZ/n0FdpdUuC6R1EOrlU3DL+bovcNFK28rqu2emmAUjujYEJTWIgQGqgVVWUZXMnc8iWg==} engines: {node: '>=14.0.0'} @@ -6343,9 +6396,25 @@ packages: resolution: {integrity: sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==} engines: {'0': node >= 0.2.0} + jsonwebtoken@9.0.2: + resolution: {integrity: sha512-PRp66vJ865SSqOlgqS8hujT5U4AOgMfhrwYIuIhfKaoSCZcirrmASQr8CX7cUg+RMih+hgznrjp99o+W4pJLHQ==} + engines: {node: '>=12', npm: '>=6'} + jszip@3.10.1: resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + jwa@1.4.1: + resolution: {integrity: sha512-qiLX/xhEEFKUAJ6FiBMbes3w9ATzyk5W7Hvzpa/SLYdxNtng+gcurvrI7TbACjIXlsJyr05/S1oUhZrc63evQA==} + + jwa@2.0.0: + resolution: {integrity: sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==} + + jws@3.2.2: + resolution: {integrity: sha512-YHlZCB6lMTllWDtSPHz/ZXTsi8S00usEV6v1tjq8tOUZzw7DpSDWVXjXDre6ed1w/pd495ODpHZYSdkRTsa0HA==} + + jws@4.0.0: + resolution: {integrity: 
sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==} + kind-of@2.0.1: resolution: {integrity: sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==} engines: {node: '>=0.10.0'} @@ -6471,12 +6540,27 @@ packages: lodash.debounce@4.0.8: resolution: {integrity: sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==} + lodash.includes@4.3.0: + resolution: {integrity: sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==} + + lodash.isboolean@3.0.3: + resolution: {integrity: sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==} + lodash.isfunction@3.0.9: resolution: {integrity: sha512-AirXNj15uRIMMPihnkInB4i3NHeb4iBtNg9WRWuK2o31S+ePwwNmDPaTL3o7dTJ+VXNZim7rFs4rxN4YU1oUJw==} + lodash.isinteger@4.0.4: + resolution: {integrity: sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==} + + lodash.isnumber@3.0.3: + resolution: {integrity: sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==} + lodash.isplainobject@4.0.6: resolution: {integrity: sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==} + lodash.isstring@4.0.1: + resolution: {integrity: sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==} + lodash.kebabcase@4.1.1: resolution: {integrity: sha512-N8XRTIMMqqDgSy4VLKPnJ/+hpGZN+PHQiJnSenYqPaVV/NCqEogTnAdZLQiGKhxX+JCs8waWq2t1XHWKOmlY8g==} @@ -6492,6 +6576,9 @@ packages: lodash.mergewith@4.6.2: resolution: {integrity: sha512-GK3g5RPZWTRSeLSpgP8Xhra+pnjBC56q9FZYe1d5RN3TJ35dbkGy3YqBSMbyCrlbi+CM9Z3Jk5yTL7RCsqboyQ==} + lodash.once@4.1.1: + resolution: {integrity: sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==} + lodash.snakecase@4.1.1: resolution: 
{integrity: sha512-QZ1d4xoBHYUeuouhEq3lk3Uq7ldgyFXGBhg04+oRLnIz8o9T65Eh+8YdroUwn846zchkA9yDsDl5CVVaV2nqYw==} @@ -8872,6 +8959,10 @@ packages: std-env@3.7.0: resolution: {integrity: sha512-JPbdCEQLj1w5GilpiHAx3qJvFndqybBysA3qUOnznweH4QbNYUsW/ea8QzSrnh0vNsezMMw5bcVool8lM0gwzg==} + stoppable@1.1.0: + resolution: {integrity: sha512-KXDYZ9dszj6bzvnEMRYvxgeTHU74QBFL54XKtP3nyMuJ81CFYtABZ3bAzL2EdFUaEwJOBOgENyFj3R7oTzDyyw==} + engines: {node: '>=4', npm: '>=6'} + stream-browserify@3.0.0: resolution: {integrity: sha512-H73RAHsVBapbim0tU2JwwOiXUj+fikfiaoYAKHF3VJfA0pe2BCzkhAHBlLG6REzE+2WNZcxOXjK7lkso+9euLA==} @@ -9468,6 +9559,10 @@ packages: resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==} engines: {node: '>= 0.4.0'} + uuid@8.3.2: + resolution: {integrity: sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==} + hasBin: true + uuid@9.0.1: resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==} hasBin: true @@ -9976,6 +10071,85 @@ snapshots: '@ast-grep/napi-win32-ia32-msvc': 0.16.0 '@ast-grep/napi-win32-x64-msvc': 0.16.0 + '@azure/abort-controller@2.1.2': + dependencies: + tslib: 2.8.1 + + '@azure/core-auth@1.9.0': + dependencies: + '@azure/abort-controller': 2.1.2 + '@azure/core-util': 1.11.0 + tslib: 2.8.1 + + '@azure/core-client@1.9.2': + dependencies: + '@azure/abort-controller': 2.1.2 + '@azure/core-auth': 1.9.0 + '@azure/core-rest-pipeline': 1.18.1 + '@azure/core-tracing': 1.2.0 + '@azure/core-util': 1.11.0 + '@azure/logger': 1.1.4 + tslib: 2.8.1 + transitivePeerDependencies: + - supports-color + + '@azure/core-rest-pipeline@1.18.1': + dependencies: + '@azure/abort-controller': 2.1.2 + '@azure/core-auth': 1.9.0 + '@azure/core-tracing': 1.2.0 + '@azure/core-util': 1.11.0 + '@azure/logger': 1.1.4 + http-proxy-agent: 7.0.2 + https-proxy-agent: 7.0.5 + tslib: 2.8.1 + transitivePeerDependencies: + - 
supports-color + + '@azure/core-tracing@1.2.0': + dependencies: + tslib: 2.8.1 + + '@azure/core-util@1.11.0': + dependencies: + '@azure/abort-controller': 2.1.2 + tslib: 2.8.1 + + '@azure/identity@4.5.0': + dependencies: + '@azure/abort-controller': 2.1.2 + '@azure/core-auth': 1.9.0 + '@azure/core-client': 1.9.2 + '@azure/core-rest-pipeline': 1.18.1 + '@azure/core-tracing': 1.2.0 + '@azure/core-util': 1.11.0 + '@azure/logger': 1.1.4 + '@azure/msal-browser': 3.28.0 + '@azure/msal-node': 2.16.2 + events: 3.3.0 + jws: 4.0.0 + open: 8.4.2 + stoppable: 1.1.0 + tslib: 2.8.1 + transitivePeerDependencies: + - supports-color + + '@azure/logger@1.1.4': + dependencies: + tslib: 2.8.1 + + '@azure/msal-browser@3.28.0': + dependencies: + '@azure/msal-common': 14.16.0 + + '@azure/msal-common@14.16.0': {} + + '@azure/msal-node@2.16.2': + dependencies: + '@azure/msal-common': 14.16.0 + jsonwebtoken: 9.0.2 + uuid: 8.3.2 + '@babel/code-frame@7.26.2': dependencies: '@babel/helper-validator-identifier': 7.25.9 @@ -15022,6 +15196,8 @@ snapshots: buffer-crc32@1.0.0: {} + buffer-equal-constant-time@1.0.1: {} + buffer-equal@0.0.1: {} buffer-from@1.1.2: {} @@ -15961,6 +16137,10 @@ snapshots: eastasianwidth@0.2.0: {} + ecdsa-sig-formatter@1.0.11: + dependencies: + safe-buffer: 5.2.1 + edge-paths@3.0.5: dependencies: '@types/which': 2.0.2 @@ -17750,6 +17930,19 @@ snapshots: jsonparse@1.3.1: {} + jsonwebtoken@9.0.2: + dependencies: + jws: 3.2.2 + lodash.includes: 4.3.0 + lodash.isboolean: 3.0.3 + lodash.isinteger: 4.0.4 + lodash.isnumber: 3.0.3 + lodash.isplainobject: 4.0.6 + lodash.isstring: 4.0.1 + lodash.once: 4.1.1 + ms: 2.1.3 + semver: 7.6.3 + jszip@3.10.1: dependencies: lie: 3.3.0 @@ -17757,6 +17950,28 @@ snapshots: readable-stream: 2.3.8 setimmediate: 1.0.5 + jwa@1.4.1: + dependencies: + buffer-equal-constant-time: 1.0.1 + ecdsa-sig-formatter: 1.0.11 + safe-buffer: 5.2.1 + + jwa@2.0.0: + dependencies: + buffer-equal-constant-time: 1.0.1 + ecdsa-sig-formatter: 1.0.11 + safe-buffer: 5.2.1 
+ + jws@3.2.2: + dependencies: + jwa: 1.4.1 + safe-buffer: 5.2.1 + + jws@4.0.0: + dependencies: + jwa: 2.0.0 + safe-buffer: 5.2.1 + kind-of@2.0.1: dependencies: is-buffer: 1.1.6 @@ -17880,10 +18095,20 @@ snapshots: lodash.debounce@4.0.8: {} + lodash.includes@4.3.0: {} + + lodash.isboolean@3.0.3: {} + lodash.isfunction@3.0.9: {} + lodash.isinteger@4.0.4: {} + + lodash.isnumber@3.0.3: {} + lodash.isplainobject@4.0.6: {} + lodash.isstring@4.0.1: {} + lodash.kebabcase@4.1.1: {} lodash.map@4.6.0: {} @@ -17894,6 +18119,8 @@ snapshots: lodash.mergewith@4.6.2: {} + lodash.once@4.1.1: {} + lodash.snakecase@4.1.1: {} lodash.startcase@4.4.0: {} @@ -20781,6 +21008,8 @@ snapshots: std-env@3.7.0: {} + stoppable@1.1.0: {} + stream-browserify@3.0.0: dependencies: inherits: 2.0.4 @@ -21434,6 +21663,8 @@ snapshots: utils-merge@1.0.1: {} + uuid@8.3.2: {} + uuid@9.0.1: {} uvu@0.5.6: From 9481f6dfd8be44b4e1877037dab72cb0e137c6ac Mon Sep 17 00:00:00 2001 From: yutao Date: Thu, 19 Dec 2024 21:12:08 +0800 Subject: [PATCH 2/3] feat: use dirty-json to parse json response --- apps/site/docs/en/faq.md | 4 +-- packages/midscene/package.json | 1 + .../midscene/src/ai-model/openai/index.ts | 17 ++++++--- .../midscene/src/ai-model/openai/types.d.ts | 1 + .../midscene/src/ai-model/prompt/planning.ts | 10 ++---- .../{openai.test.ts => connectivity.test.ts} | 2 +- .../midscene/tests/unit-test/utils.test.ts | 4 +++ pnpm-lock.yaml | 36 +++++++++++++++++++ 8 files changed, 58 insertions(+), 17 deletions(-) create mode 100644 packages/midscene/src/ai-model/openai/types.d.ts rename packages/midscene/tests/ai/{openai.test.ts => connectivity.test.ts} (96%) diff --git a/apps/site/docs/en/faq.md b/apps/site/docs/en/faq.md index 7863fb12..d583c925 100644 --- a/apps/site/docs/en/faq.md +++ b/apps/site/docs/en/faq.md @@ -2,9 +2,7 @@ ## Can Midscene smartly plan the actions according to my one-line goal? 
Like executing "Tweet 'hello world'" -Midscene is an automation assistance SDK with a key feature of action stability — ensuring the same actions are performed in each run. To maintain this stability, we encourage you to provide detailed instructions to help the AI understand each step of your task. - -If you require a 'goal-to-task' AI planning tool, you can develop one based on Midscene. +No. Midscene is an automation assistance SDK with a key feature of action stability — ensuring the same actions are performed in each run. To maintain this stability, we encourage you to provide detailed instructions to help the AI understand each step of your task. Related Docs: [Prompting Tips](./prompting-tips.html) diff --git a/packages/midscene/package.json b/packages/midscene/package.json index 78b8f805..fb1da5dc 100644 --- a/packages/midscene/package.json +++ b/packages/midscene/package.json @@ -39,6 +39,7 @@ "dependencies": { "@azure/identity": "4.5.0", "@midscene/shared": "workspace:*", + "dirty-json": "0.9.2", "openai": "4.57.1", "optional": "0.1.4", "socks-proxy-agent": "8.0.4" diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts index b838e311..44bbfd5f 100644 --- a/packages/midscene/src/ai-model/openai/index.ts +++ b/packages/midscene/src/ai-model/openai/index.ts @@ -5,6 +5,7 @@ import { getBearerTokenProvider, } from '@azure/identity'; import { ifInBrowser } from '@midscene/shared/utils'; +import dJSON from 'dirty-json'; import OpenAI, { AzureOpenAI } from 'openai'; import type { ChatCompletionMessageParam } from 'openai/resources'; import { SocksProxyAgent } from 'socks-proxy-agent'; @@ -188,12 +189,18 @@ export async function callToGetJSONObject( let jsonContent = safeJsonParse(response.content); if (jsonContent) return { content: jsonContent, usage: response.usage }; - jsonContent = extractJSONFromCodeBlock(response.content); + const cleanJsonString = extractJSONFromCodeBlock(response.content); try { - 
return { content: JSON.parse(jsonContent), usage: response.usage }; - } catch { - throw Error(`failed to parse json response: ${response.content}`); - } + jsonContent = JSON.parse(cleanJsonString); + } catch {} + if (jsonContent) return { content: jsonContent, usage: response.usage }; + + try { + jsonContent = dJSON.parse(cleanJsonString); + } catch {} + if (jsonContent) return { content: jsonContent, usage: response.usage }; + + throw Error(`failed to parse json response: ${response.content}`); } export function extractJSONFromCodeBlock(response: string) { diff --git a/packages/midscene/src/ai-model/openai/types.d.ts b/packages/midscene/src/ai-model/openai/types.d.ts new file mode 100644 index 00000000..a0e87e6c --- /dev/null +++ b/packages/midscene/src/ai-model/openai/types.d.ts @@ -0,0 +1 @@ +declare module 'dirty-json'; diff --git a/packages/midscene/src/ai-model/prompt/planning.ts b/packages/midscene/src/ai-model/prompt/planning.ts index 5fb90c8f..def1a97d 100644 --- a/packages/midscene/src/ai-model/prompt/planning.ts +++ b/packages/midscene/src/ai-model/prompt/planning.ts @@ -52,7 +52,7 @@ You are a versatile professional in software UI automation. Your outstanding con - All the actions you composed MUST be based on the page context information you get. - Trust the "What have been done" field about the task (if any), don't repeat actions in it. -- Respond only with valid JSON. Do not write an introduction or summary. +- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`. - If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field. ## About the \`actions\` field @@ -140,7 +140,6 @@ By viewing the page screenshot and description, you should consider this and out * The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field. 
* The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed. -\`\`\`json { "actions":[ { @@ -171,8 +170,6 @@ By viewing the page screenshot and description, you should consider this and out "whatHaveDone": "Click the language switch button and wait 1s" } } -\`\`\` - ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement @@ -181,7 +178,6 @@ If the user says "If there is a popup, close it", you should consider this and o * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy. * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action. -\`\`\`json { "actions": [{ "thought": "There is no popup on the page", @@ -192,18 +188,15 @@ If the user says "If there is a popup, close it", you should consider this and o "taskWillBeAccomplished": true, "furtherPlan": null } -\`\`\` For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON: -\`\`\`json { "actions": [], "error": "The instruction and page context are irrelevant, there is no popup on the page", "taskWillBeAccomplished": true, "furtherPlan": null } -\`\`\` ## Example #3 : When task is accomplished, don't plan more actions @@ -224,6 +217,7 @@ When the user ask to "Wait 4s", you should consider this: ## Bad case #1 : Missing \`prompt\` in the 'Locate' field; Missing \`furtherPlan\` field when the task won't be accomplished Wrong output: + { "actions":[ { diff --git a/packages/midscene/tests/ai/openai.test.ts b/packages/midscene/tests/ai/connectivity.test.ts similarity index 96% rename from packages/midscene/tests/ai/openai.test.ts rename to packages/midscene/tests/ai/connectivity.test.ts index 1370f74a..a8617ad4 100644 --- a/packages/midscene/tests/ai/openai.test.ts +++ b/packages/midscene/tests/ai/connectivity.test.ts @@ -35,7 +35,7 
@@ describe('openai sdk connectivity', () => { ], AIActionType.EXTRACT_DATA, ); - expect(result.content.answer).toBe(15); + expect(result.content).toEqual({ answer: 15 }); }); it('image input', async () => { diff --git a/packages/midscene/tests/unit-test/utils.test.ts b/packages/midscene/tests/unit-test/utils.test.ts index 2a6c18be..90c98c83 100644 --- a/packages/midscene/tests/unit-test/utils.test.ts +++ b/packages/midscene/tests/unit-test/utils.test.ts @@ -98,6 +98,10 @@ describe('extractJSONFromCodeBlock', () => { const input = '```json\n{ "key": "value" }\n```'; const result = extractJSONFromCodeBlock(input); expect(result).toBe('{ "key": "value" }'); + + const input2 = ' ```JSON\n{ "key": "value" }\n```'; + const result2 = extractJSONFromCodeBlock(input2); + expect(result2).toBe('{ "key": "value" }'); }); it('should extract JSON from a code block without language specifier', () => { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 82fe79df..a05b512e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -148,6 +148,9 @@ importers: '@midscene/shared': specifier: workspace:* version: link:../shared + dirty-json: + specifier: 0.9.2 + version: 0.9.2 openai: specifier: 4.57.1 version: 4.57.1(zod@3.23.8) @@ -4942,6 +4945,10 @@ packages: resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==} engines: {node: '>=8'} + dirty-json@0.9.2: + resolution: {integrity: sha512-7SCDfnQtBObcngVXNPZcnxGxqqPTK4UqeXeKAch+RGH5qpqadWbV9FmN71x9Bb4tTs0TNFb4FT/4Kz4P4Cjqcw==} + engines: {node: '>=6.0.0'} + dlv@1.1.3: resolution: {integrity: sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==} @@ -6466,6 +6473,10 @@ packages: levdist@1.0.0: resolution: {integrity: sha512-YguwC2spb0pqpJM3a5OsBhih/GG2ZHoaSHnmBqhEI7997a36buhqcRTegEjozHxyxByIwLpZHZTVYMThq+Zd3g==} + lex@1.7.9: + resolution: {integrity: 
sha512-vzaalVBmFLnMaedq0QAsBAaXsWahzRpvnIBdBjj7y+7EKTS6lnziU2y/PsU2c6rV5qYj2B5IDw0uNJ9peXD0vw==} + deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. + lie@3.3.0: resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} @@ -8998,6 +9009,9 @@ packages: resolution: {integrity: sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==} engines: {node: '>=12'} + string.fromcodepoint@0.2.1: + resolution: {integrity: sha512-n69H31OnxSGSZyZbgBlvYIXlrMhJQ0dQAX1js1QDhpaUH6zmU3QYlj07bCwCNlPOu3oRXIubGPl2gDGnHsiCqg==} + string.prototype.trim@1.2.9: resolution: {integrity: sha512-klHuCNxiMZ8MlsOihJhJEBJAiMVqU3Z2nEXWfWnIqjN0gEFS9J9+IxKozWWtQGcgoa1WUZzLjKPTr4ZHNFTFxw==} engines: {node: '>= 0.4'} @@ -9440,6 +9454,9 @@ packages: resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==} engines: {node: '>=18.17'} + unescape-js@1.1.4: + resolution: {integrity: sha512-42SD8NOQEhdYntEiUQdYq/1V/YHwr1HLwlHuTJB5InVVdOSbgI6xu8jK5q65yIzuFCfczzyDF/7hbGzVbyCw0g==} + unicode-canonical-property-names-ecmascript@2.0.1: resolution: {integrity: sha512-dA8WbNeb2a6oQzAQ55YlT5vQAWGV9WXOsi3SskE3bcCdM0P4SDd+24zS/OCacdRq5BkdsRj9q3Pg6YyQoxIGqg==} engines: {node: '>=4'} @@ -9546,6 +9563,9 @@ packages: resolution: {integrity: sha512-5cnLm4gseXjAclKowC4IjByaGsjtAoV6PrOQOljplNB54ReUYJP8HdAFq2muHinSDAh09PPX/uXDPfdxRHvuSA==} engines: {node: '>= 0.8.0'} + utf8@3.0.0: + resolution: {integrity: sha512-E8VjFIQ/TyQgp+TZfS6l8yp/xWppSAHzidGiRrqe4bK4XP9pTRyKFgGJpO3SN7zdX4DeomTrwaseCHovfpFcqQ==} + utif2@4.1.0: resolution: {integrity: sha512-+oknB9FHrJ7oW7A2WZYajOcv4FcDR4CfoGB0dPNfxbi4GO05RRnFmt5oa23+9w32EanrYcSJWspUiJkLMs+37w==} @@ -16018,6 +16038,12 @@ snapshots: dependencies: path-type: 4.0.0 + dirty-json@0.9.2: + dependencies: + lex: 1.7.9 + unescape-js: 1.1.4 + utf8: 3.0.0 + dlv@1.1.3: {} 
doctrine-temporary-fork@2.1.0: @@ -18013,6 +18039,8 @@ snapshots: levdist@1.0.0: {} + lex@1.7.9: {} + lie@3.3.0: dependencies: immediate: 3.0.6 @@ -21058,6 +21086,8 @@ snapshots: emoji-regex: 9.2.2 strip-ansi: 7.1.0 + string.fromcodepoint@0.2.1: {} + string.prototype.trim@1.2.9: dependencies: call-bind: 1.0.7 @@ -21534,6 +21564,10 @@ snapshots: undici@6.20.1: {} + unescape-js@1.1.4: + dependencies: + string.fromcodepoint: 0.2.1 + unicode-canonical-property-names-ecmascript@2.0.1: {} unicode-match-property-ecmascript@2.0.0: @@ -21647,6 +21681,8 @@ snapshots: userhome@1.0.1: {} + utf8@3.0.0: {} + utif2@4.1.0: dependencies: pako: 1.0.11 From 10661333e275111a5bf14c06e7acc682b7b24515 Mon Sep 17 00:00:00 2001 From: yutao Date: Fri, 20 Dec 2024 14:51:49 +0800 Subject: [PATCH 3/3] feat: update docs for more model --- .gitignore | 1 + .vscode/settings.json | 4 +- apps/site/docs/en/faq.md | 6 +-- apps/site/docs/en/model-provider.md | 39 ++++++++++++++++--- apps/site/docs/zh/faq.md | 6 +-- apps/site/docs/zh/model-provider.md | 38 +++++++++++++++--- .../midscene/tests/ai/connectivity.test.ts | 11 +++++- .../tests/ai/evaluate/inspect.test.ts | 3 +- 8 files changed, 86 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index c34a6aea..7df6cec2 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ jspm_packages/ # dotenv environment variables file .env +.env.* # next.js build output .next diff --git a/.vscode/settings.json b/.vscode/settings.json index a519cb2d..439ea43f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,10 +8,12 @@ "AITEST", "Aliyun", "aweme", + "doubao", "douyin", "httpbin", "iconfont", "qwen", - "taobao" + "taobao", + "Volcengine" ] } diff --git a/apps/site/docs/en/faq.md b/apps/site/docs/en/faq.md index d583c925..fc39b160 100644 --- a/apps/site/docs/en/faq.md +++ b/apps/site/docs/en/faq.md @@ -14,11 +14,9 @@ There are some limitations with Midscene. We are still working on them. 2. LLM is not 100% stable. 
Even GPT-4o can't return the right answer all the time. Following the [Prompting Tips](./prompting-tips) will help improve stability. 3. Since we use JavaScript to retrieve items from the page, the elements inside the iframe cannot be accessed. -## Which LLM should I choose ? +## Can I use a model other than `gpt-4o`? -Midscene needs a multimodal Large Language Model (LLM) to understand the UI. Currently, we find that OpenAI's GPT-4o performs much better than others. - -You can [customize model and provider](./model-provider.html) if needed. +Yes. You can [customize model and provider](./model-provider.html) if needed. ## About the token cost diff --git a/apps/site/docs/en/model-provider.md b/apps/site/docs/en/model-provider.md index af094c5f..a15adb6f 100644 --- a/apps/site/docs/en/model-provider.md +++ b/apps/site/docs/en/model-provider.md @@ -30,7 +30,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080" ``` -Using Azure OpenAI Service: +## Using Azure OpenAI Service ```bash export MIDSCENE_USE_AZURE_OPENAI=1 @@ -38,13 +38,28 @@ export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}' ``` -Note: +## Choose a model other than `gpt-4o` -- Always choose a model that supports vision input. -- Currently, the known supported models are: `gpt-4o`, `qwen-vl-max-latest`, `gemini-1.5-pro` -- Please follow the terms of use of each model. +We find that `gpt-4o` performs the best for Midscene at this moment. The other known supported models are: `gemini-1.5-pro`, `qwen-vl-max-latest`, `doubao-vision-pro-32k` -## Example: Using `qwen-vl-max-latest` service from Aliyun +If you want to use other models, please follow these steps: + +1. Choose a model that supports image input (a.k.a. multimodal model). +2. 
Find out how to call it with an OpenAI SDK compatible endpoint. Usually you should set the `OPENAI_BASE_URL`, `OPENAI_API_KEY` and `MIDSCENE_MODEL_NAME`.
-Midscene 需要一个能够理解用户界面的多模态大型语言模型。目前,我们发现 OpenAI 的 GPT-4o 表现最好,远超其它模型。 - -你可以根据需要[自定义模型和服务商](./model-provider.html)。 +可以。你可以[自定义模型和服务商](./model-provider.html)。 ## 关于 token 成本 diff --git a/apps/site/docs/zh/model-provider.md b/apps/site/docs/zh/model-provider.md index bdb94922..c70b7de1 100644 --- a/apps/site/docs/zh/model-provider.md +++ b/apps/site/docs/zh/model-provider.md @@ -27,7 +27,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080" ``` -使用 Azure OpenAI 服务时的配置: +## 使用 Azure OpenAI 服务时的配置 ```bash export MIDSCENE_USE_AZURE_OPENAI=1 @@ -35,12 +35,28 @@ export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}' ``` -说明: +## 选用 `gpt-4o` 以外的其他模型 -- 务必选择一个支持视觉输入的模型。目前我们已知支持的模型有:`gpt-4o`, `qwen-vl-max-latest` (千问), `gemini-1.5-pro` -- 请遵守各项模型的使用条款 +我们发现 `gpt-4o` 是目前表现最佳的模型。其他已知支持的模型有:`qwen-vl-max-latest` (千问), `gemini-1.5-pro`, `doubao-vision-pro-32k` (豆包) -## 示例:使用部署在阿里云的 `qwen-vl-max-latest` 模型 +如果你想要使用其他模型,请遵循以下步骤: + +1. 选择一个支持视觉输入的模型(也就是“多模态模型”)。 +2. 找出如何使用 OpenAI SDK 兼容的方式调用它,模型提供商一般都会提供这样的接入点,你需要配置的是 `OPENAI_BASE_URL`, `OPENAI_API_KEY` 和 `MIDSCENE_MODEL_NAME`。 +3. 如果发现使用新模型后效果不佳,可以尝试使用一些简短且清晰的提示词(或回滚到之前的模型)。更多详情请参阅 [Prompting Tips](./prompting-tips.html)。 +4. 请遵守各模型的使用条款。 + +## 示例:使用 Google 的 `gemini-1.5-pro` 模型 + +配置环境变量: + +```bash +export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai" +export OPENAI_API_KEY="....." +export MIDSCENE_MODEL_NAME="gemini-1.5-pro" +``` + +## 示例:使用阿里云的 `qwen-vl-max-latest` 模型 配置环境变量: @@ -49,3 +65,15 @@ export OPENAI_API_KEY="sk-..." 
export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1" export MIDSCENE_MODEL_NAME="qwen-vl-max-latest" ``` + +## 示例:使用火山云的豆包 `doubao-vision-pro-32k` 模型 + +调用前需要配置推理点:https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint + +配置环境变量: + +```bash +export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" +export OPENAI_API_KEY="..." +export MIDSCENE_MODEL_NAME="ep-202....." +``` diff --git a/packages/midscene/tests/ai/connectivity.test.ts b/packages/midscene/tests/ai/connectivity.test.ts index a8617ad4..ec742ead 100644 --- a/packages/midscene/tests/ai/connectivity.test.ts +++ b/packages/midscene/tests/ai/connectivity.test.ts @@ -1,7 +1,15 @@ import { AIActionType } from '@/ai-model/common'; import { call, callToGetJSONObject } from '@/ai-model/openai'; +import { base64Encoded } from '@/image'; +import dotenv from 'dotenv'; +import { getFixture } from 'tests/utils'; import { describe, expect, it, vi } from 'vitest'; +const result = dotenv.config({ debug: true }); +if (result.error) { + throw result.error; +} + vi.setConfig({ testTimeout: 20 * 1000, }); @@ -39,6 +47,7 @@ describe('openai sdk connectivity', () => { }); it('image input', async () => { + const imagePath = getFixture('baidu.png'); const result = await call([ { role: 'user', @@ -50,7 +59,7 @@ describe('openai sdk connectivity', () => { { type: 'image_url', image_url: { - url: 'https://portal.volccdn.com/obj/volcfe/bee_prod/biz_950/tos_38e6e81e1366482ed046045e72b0684d.png', + url: base64Encoded(imagePath), detail: 'high', }, }, diff --git a/packages/midscene/tests/ai/evaluate/inspect.test.ts b/packages/midscene/tests/ai/evaluate/inspect.test.ts index 236940a5..049be393 100644 --- a/packages/midscene/tests/ai/evaluate/inspect.test.ts +++ b/packages/midscene/tests/ai/evaluate/inspect.test.ts @@ -1,4 +1,4 @@ -import { readFileSync, writeFileSync } from 'node:fs'; +import { readFileSync } from 'node:fs'; import path from 'node:path'; import { describe } from 'node:test'; 
import { AiInspectElement, plan } from '@/ai-model'; @@ -13,6 +13,7 @@ import { repeat, runTestCases, } from './test-suite/util'; +import 'dotenv/config'; const repeatTime = 2; const relocateAfterPlanning = false;