packages/core/src/ai-model/llm-planning.ts (55 changes: 17 additions & 38 deletions)
@@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env';
 import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
-import type {
-  ChatCompletionContentPart,
-  ChatCompletionMessageParam,
-} from 'openai/resources/index';
+import type { ChatCompletionMessageParam } from 'openai/resources/index';
 import {
   AIActionType,
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  markupImageForLLM,
   warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
-import { describeUserPage } from './prompt/util';
 import { callAIWithObjectResponse } from './service-caller/index';

 const debug = getDebug('planning');
@@ -43,10 +38,9 @@ export async function plan(

   const { modelName, vlMode } = modelConfig;

-  const { description: pageDescription, elementById } = await describeUserPage(
-    context,
-    { vlMode },
-  );
+  // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
+  assert(vlMode, 'Planning requires vlMode to be configured.');

   const systemPrompt = await systemPromptToTaskPlanning({
     actionSpace: opts.actionSpace,
     vlMode: vlMode,
@@ -57,21 +51,19 @@ export async function plan(
   let imageHeight = size.height;
   const rightLimit = imageWidth;
   const bottomLimit = imageHeight;

+  // Process image based on VL mode requirements
   if (vlMode === 'qwen-vl') {
     const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
     // Reserved for qwen3-vl specific processing
     // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
     // imageWidth = paddedResult.width;
     // imageHeight = paddedResult.height;
     // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
-      width: imageWidth,
-      height: imageHeight,
-    });
   }

   warnGPT4oSizeLimit(size, modelName);
@@ -120,14 +112,7 @@ export async function plan(
             detail: 'high',
           },
         },
-        ...(vlMode
-          ? []
-          : ([
-              {
-                type: 'text',
-                text: pageDescription,
-              },
-            ] as ChatCompletionContentPart[])),
+        // Planning uses pure vision mode, no DOM description needed
       ],
     },
   ];
@@ -173,21 +158,15 @@ export async function plan(
     locateFields.forEach((field) => {
       const locateResult = action.param[field];
       if (locateResult) {
-        if (vlMode) {
-          action.param[field] = fillBboxParam(
-            locateResult,
-            imageWidth,
-            imageHeight,
-            rightLimit,
-            bottomLimit,
-            vlMode,
-          );
-        } else {
-          const element = elementById(locateResult);
-          if (element) {
-            action.param[field].id = element.id;
-          }
-        }
+        // Always use VL mode to fill bbox parameters
+        action.param[field] = fillBboxParam(
+          locateResult,
+          imageWidth,
+          imageHeight,
+          rightLimit,
+          bottomLimit,
+          vlMode,
+        );
       }
     });
   });
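Review note: with the DOM fallback (`elementById`) gone, every locator field now flows through `fillBboxParam`, and `rightLimit`/`bottomLimit` are captured before any qwen-vl padding so model-reported boxes can be pulled back onto the real page. A minimal sketch of that clamping idea, assuming a `[left, top, right, bottom]` bbox; the actual `fillBboxParam` in `./common` is not shown in this diff and may also scale coordinates:

```ts
// Illustrative sketch only; not the real fillBboxParam from './common'.
type Bbox = [number, number, number, number]; // [left, top, right, bottom]

function clampBboxToPage(bbox: Bbox, rightLimit: number, bottomLimit: number): Bbox {
  // Keep coordinates inside the pre-padding page area, so boxes that land in
  // the qwen-vl padding region cannot produce out-of-page targets.
  const clampX = (x: number) => Math.max(0, Math.min(x, rightLimit));
  const clampY = (y: number) => Math.max(0, Math.min(y, bottomLimit));
  const [left, top, right, bottom] = bbox;
  return [clampX(left), clampY(top), clampX(right), clampY(bottom)];
}
```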
packages/core/tests/unit-test/proxy-integration.test.ts (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
-import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest';
 import type { IModelConfig } from '@midscene/shared/env';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

 // Mock the dependencies before importing the module under test
 vi.mock('openai', () => {
packages/core/tests/unit-test/service-caller.test.ts (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 import { AIActionType } from '@/ai-model';
 import { getResponseFormat } from '@/ai-model/service-caller';
-import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest';
 import type { IModelConfig } from '@midscene/shared/env';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

 describe('service-caller', () => {
   describe('getResponseFormat', () => {
packages/shared/src/env/model-config-manager.ts (25 changes: 23 additions & 2 deletions)
@@ -5,6 +5,7 @@ import {
 import type { GlobalConfigManager } from './global-config-manager';

 import type { IModelConfig, TIntent, TModelConfigFn } from './types';
+import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types';

 const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning'];
@@ -101,13 +102,15 @@
    * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined
    */
   getModelConfig(intent: TIntent): IModelConfig {
+    let config: IModelConfig;
+
     if (this.isolatedMode) {
       if (!this.modelConfigMap) {
         throw new Error(
           'modelConfigMap is not initialized in isolated mode, which should not happen',
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     } else {
       if (!this.modelConfigMap) {
         if (!this.globalConfigManager) {
@@ -119,8 +122,26 @@
           this.globalConfigManager.getAllEnvConfig(),
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     }

+    // Validate Planning must use VL mode
+    if (intent === 'planning' && !config.vlMode) {
+      throw new Error(
+        `Planning requires a vision language model (VL model). DOM-based planning is not supported.
+
+Please configure one of the following VL modes:
+  ${VL_MODES.map((mode) => `- ${mode}`).join('\n  ')}
+
+Configuration examples:
+- Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl
+- Or use modelConfig function with planning intent
+
+Learn more: https://midscenejs.com/choose-a-model`,
+      );
+    }
+
+    return config;
   }

   getUploadTestServerUrl(): string | undefined {
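Review note: with this check, a planning call made without a VL mode now fails fast inside getModelConfig. A sketch of the two configuration routes the error message points to; the env-var route is quoted directly from the message, while the shape of the modelConfig callback is an assumption based on the TModelConfigFn type imported above, not an API shown in this diff:

```ts
// Route 1 (from the error message): configure via environment variable
// before planning runs.
process.env.MIDSCENE_PLANNING_VL_MODE = 'qwen-vl';

// Route 2 (assumed callback shape; the real TModelConfigFn signature may
// differ): return a VL-backed config when the 'planning' intent is requested.
const modelConfig = ({ intent }: { intent: 'VQA' | 'default' | 'grounding' | 'planning' }) => {
  if (intent === 'planning') {
    return {
      MIDSCENE_PLANNING_MODEL_NAME: 'qwen-vl-max', // hypothetical model name
      MIDSCENE_PLANNING_VL_MODE: 'qwen-vl',
    };
  }
  return {}; // hypothetical: let other intents fall back to env-derived config
};
```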
packages/shared/src/env/types.ts (15 changes: 15 additions & 0 deletions)
@@ -238,6 +238,21 @@ export interface IModelConfigForVQA {
   [MIDSCENE_VQA_VL_MODE]?: TVlModeValues;
 }

+/**
+ * Model configuration for Planning intent.
+ *
+ * IMPORTANT: Planning MUST use a vision language model (VL mode).
+ * DOM-based planning is not supported.
+ *
+ * Required: MIDSCENE_PLANNING_VL_MODE must be set to one of:
+ * - 'qwen-vl'
+ * - 'qwen3-vl'
+ * - 'gemini'
+ * - 'doubao-vision'
+ * - 'vlm-ui-tars'
+ * - 'vlm-ui-tars-doubao'
+ * - 'vlm-ui-tars-doubao-1.5'
+ */
 export interface IModelConfigForPlanning {
   // model name
   [MIDSCENE_PLANNING_MODEL_NAME]: string;
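Review note: the mode list in the new doc comment mirrors VL_MODE_RAW_VALID_VALUES, which model-config-manager.ts now imports to build its error message. A sketch of the presumed relationship between that runtime list and TVlModeValues; the actual definitions in types.ts are outside this diff:

```ts
// Presumed shape, not shown in this PR: a const tuple gives both a runtime
// list (for validation and error messages) and a derived union type.
const VL_MODE_RAW_VALID_VALUES = [
  'qwen-vl',
  'qwen3-vl',
  'gemini',
  'doubao-vision',
  'vlm-ui-tars',
  'vlm-ui-tars-doubao',
  'vlm-ui-tars-doubao-1.5',
] as const;

type TVlModeValues = (typeof VL_MODE_RAW_VALID_VALUES)[number];
// => 'qwen-vl' | 'qwen3-vl' | 'gemini' | 'doubao-vision' | ...
```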