diff --git a/packages/core/src/ai-model/llm-planning.ts b/packages/core/src/ai-model/llm-planning.ts index 9f46d8ff3..a21ed7f13 100644 --- a/packages/core/src/ai-model/llm-planning.ts +++ b/packages/core/src/ai-model/llm-planning.ts @@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env'; import { paddingToMatchBlockByBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; -import type { - ChatCompletionContentPart, - ChatCompletionMessageParam, -} from 'openai/resources/index'; +import type { ChatCompletionMessageParam } from 'openai/resources/index'; import { AIActionType, buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField, - markupImageForLLM, warnGPT4oSizeLimit, } from './common'; import type { ConversationHistory } from './conversation-history'; import { systemPromptToTaskPlanning } from './prompt/llm-planning'; -import { describeUserPage } from './prompt/util'; import { callAIWithObjectResponse } from './service-caller/index'; const debug = getDebug('planning'); @@ -43,10 +38,9 @@ export async function plan( const { modelName, vlMode } = modelConfig; - const { description: pageDescription, elementById } = await describeUserPage( - context, - { vlMode }, - ); + // Planning requires VL mode (validated by ModelConfigManager.getModelConfig) + assert(vlMode, 'Planning requires vlMode to be configured.'); + const systemPrompt = await systemPromptToTaskPlanning({ actionSpace: opts.actionSpace, vlMode: vlMode, @@ -57,21 +51,19 @@ export async function plan( let imageHeight = size.height; const rightLimit = imageWidth; const bottomLimit = imageHeight; + + // Process image based on VL mode requirements if (vlMode === 'qwen-vl') { const paddedResult = await paddingToMatchBlockByBase64(imagePayload); imageWidth = paddedResult.width; imageHeight = paddedResult.height; imagePayload = paddedResult.imageBase64; } else if (vlMode === 'qwen3-vl') { + // Reserved for qwen3-vl specific processing // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32); // imageWidth = paddedResult.width; // imageHeight = paddedResult.height; // imagePayload = paddedResult.imageBase64; - } else if (!vlMode) { - imagePayload = await markupImageForLLM(screenshotBase64, context.tree, { - width: imageWidth, - height: imageHeight, - }); } warnGPT4oSizeLimit(size, modelName); @@ -120,14 +112,7 @@ export async function plan( detail: 'high', }, }, - ...(vlMode - ? [] - : ([ - { - type: 'text', - text: pageDescription, - }, - ] as ChatCompletionContentPart[])), + // Planning uses pure vision mode, no DOM description needed ], }, ]; @@ -173,21 +158,15 @@ export async function plan( locateFields.forEach((field) => { const locateResult = action.param[field]; if (locateResult) { - if (vlMode) { - action.param[field] = fillBboxParam( - locateResult, - imageWidth, - imageHeight, - rightLimit, - bottomLimit, - vlMode, - ); - } else { - const element = elementById(locateResult); - if (element) { - action.param[field].id = element.id; - } - } + // Always use VL mode to fill bbox parameters + action.param[field] = fillBboxParam( + locateResult, + imageWidth, + imageHeight, + rightLimit, + bottomLimit, + vlMode, + ); } }); }); diff --git a/packages/core/tests/unit-test/proxy-integration.test.ts b/packages/core/tests/unit-test/proxy-integration.test.ts index 5ffe99289..68b6b571e 100644 --- a/packages/core/tests/unit-test/proxy-integration.test.ts +++ b/packages/core/tests/unit-test/proxy-integration.test.ts @@ -1,5 +1,5 @@ -import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest'; import type { IModelConfig } from '@midscene/shared/env'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; // Mock the dependencies before importing the module under test vi.mock('openai', () => { diff --git a/packages/core/tests/unit-test/service-caller.test.ts b/packages/core/tests/unit-test/service-caller.test.ts index 02d607b81..dc90b54cb 100644 --- a/packages/core/tests/unit-test/service-caller.test.ts +++ b/packages/core/tests/unit-test/service-caller.test.ts @@ -1,7 +1,7 @@ import { AIActionType } from '@/ai-model'; import { getResponseFormat } from '@/ai-model/service-caller'; -import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest'; import type { IModelConfig } from '@midscene/shared/env'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; describe('service-caller', () => { describe('getResponseFormat', () => { diff --git a/packages/shared/src/env/model-config-manager.ts b/packages/shared/src/env/model-config-manager.ts index 2e9aa7a0c..edb8f7b57 100644 --- a/packages/shared/src/env/model-config-manager.ts +++ b/packages/shared/src/env/model-config-manager.ts @@ -5,6 +5,7 @@ import { import type { GlobalConfigManager } from './global-config-manager'; import type { IModelConfig, TIntent, TModelConfigFn } from './types'; +import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types'; const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning']; @@ -101,13 +102,15 @@ export class ModelConfigManager { * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined */ getModelConfig(intent: TIntent): IModelConfig { + let config: IModelConfig; + if (this.isolatedMode) { if (!this.modelConfigMap) { throw new Error( 'modelConfigMap is not initialized in isolated mode, which should not happen', ); } - return this.modelConfigMap[intent]; + config = this.modelConfigMap[intent]; } else { if (!this.modelConfigMap) { if (!this.globalConfigManager) { @@ -119,8 +122,26 @@ export class ModelConfigManager { this.globalConfigManager.getAllEnvConfig(), ); } - return this.modelConfigMap[intent]; + config = this.modelConfigMap[intent]; + } + + // Validate Planning must use VL mode + if (intent === 'planning' && !config.vlMode) { + throw new Error( + `Planning requires a vision language model (VL model). DOM-based planning is not supported. + +Please configure one of the following VL modes: + ${VL_MODES.map((mode) => `- ${mode}`).join('\n ')} + +Configuration examples: + - Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl + - Or use modelConfig function with planning intent + +Learn more: https://midscenejs.com/choose-a-model`, + ); } + + return config; } getUploadTestServerUrl(): string | undefined { diff --git a/packages/shared/src/env/types.ts b/packages/shared/src/env/types.ts index 617190426..4399909f4 100644 --- a/packages/shared/src/env/types.ts +++ b/packages/shared/src/env/types.ts @@ -238,6 +238,21 @@ export interface IModelConfigForVQA { [MIDSCENE_VQA_VL_MODE]?: TVlModeValues; } +/** + * Model configuration for Planning intent. + * + * IMPORTANT: Planning MUST use a vision language model (VL mode). + * DOM-based planning is not supported. + * + * Required: MIDSCENE_PLANNING_VL_MODE must be set to one of: + * - 'qwen-vl' + * - 'qwen3-vl' + * - 'gemini' + * - 'doubao-vision' + * - 'vlm-ui-tars' + * - 'vlm-ui-tars-doubao' + * - 'vlm-ui-tars-doubao-1.5' + */ export interface IModelConfigForPlanning { // model name [MIDSCENE_PLANNING_MODEL_NAME]: string; diff --git a/packages/shared/tests/unit-test/env/modle-config-manager.test.ts b/packages/shared/tests/unit-test/env/modle-config-manager.test.ts index 4ba364535..59f953144 100644 --- a/packages/shared/tests/unit-test/env/modle-config-manager.test.ts +++ b/packages/shared/tests/unit-test/env/modle-config-manager.test.ts @@ -13,6 +13,7 @@ import { MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, + MIDSCENE_PLANNING_VL_MODE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, @@ -48,9 +49,10 @@ describe('ModelConfigManager', () => { }; case 'planning': return { - [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', + [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus', [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key', [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const, }; case 'grounding': return { @@ -105,9 +107,10 @@ describe('ModelConfigManager', () => { }; case 'planning': return { - [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', + [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus', [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key', [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl', }; case 'grounding': return { @@ -131,10 +134,11 @@ describe('ModelConfigManager', () => { expect(vqaConfig.from).toBe('modelConfig'); const planningConfig = manager.getModelConfig('planning'); - expect(planningConfig.modelName).toBe('gpt-4'); + expect(planningConfig.modelName).toBe('qwen-vl-plus'); expect(planningConfig.openaiApiKey).toBe('test-planning-key'); expect(planningConfig.intent).toBe('planning'); expect(planningConfig.from).toBe('modelConfig'); + expect(planningConfig.vlMode).toBe('qwen-vl'); const groundingConfig = manager.getModelConfig('grounding'); expect(groundingConfig.modelName).toBe('gpt-4-vision'); @@ -263,4 +267,167 @@ describe('ModelConfigManager', () => { expect(config.openaiBaseURL).toBe('https://isolated.openai.com/v1'); }); }); + + describe('Planning VL mode validation', () => { + it('should throw error when planning has no vlMode in isolated mode', () => { + const modelConfigFn: TModelConfigFn = ({ intent }) => { + if (intent === 'planning') { + // Missing VL mode for planning + return { + [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', + [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + } + return { + [MIDSCENE_MODEL_NAME]: 'gpt-4', + [MIDSCENE_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + }; + + const manager = new ModelConfigManager(modelConfigFn); + + expect(() => manager.getModelConfig('planning')).toThrow( + 'Planning requires a vision language model (VL model). DOM-based planning is not supported.', + ); + }); + + it('should succeed when planning has valid vlMode in isolated mode', () => { + const modelConfigFn: TModelConfigFn = ({ intent }) => { + if (intent === 'planning') { + return { + [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus', + [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const, + }; + } + return { + [MIDSCENE_MODEL_NAME]: 'gpt-4', + [MIDSCENE_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + }; + + const manager = new ModelConfigManager(modelConfigFn); + const config = manager.getModelConfig('planning'); + + expect(config.vlMode).toBe('qwen-vl'); + expect(config.modelName).toBe('qwen-vl-plus'); + }); + + it('should throw error when planning has no vlMode in normal mode', () => { + vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'gpt-4'); + vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key'); + vi.stubEnv( + MIDSCENE_PLANNING_OPENAI_BASE_URL, + 'https://api.openai.com/v1', + ); + // Intentionally not setting MIDSCENE_PLANNING_VL_MODE + + const manager = new ModelConfigManager(); + manager.registerGlobalConfigManager(new GlobalConfigManager()); + + expect(() => manager.getModelConfig('planning')).toThrow( + 'Planning requires a vision language model (VL model). DOM-based planning is not supported.', + ); + }); + + it('should succeed when planning has valid vlMode in normal mode', () => { + vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'qwen-vl-plus'); + vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key'); + vi.stubEnv( + MIDSCENE_PLANNING_OPENAI_BASE_URL, + 'https://api.openai.com/v1', + ); + vi.stubEnv(MIDSCENE_PLANNING_VL_MODE, 'qwen-vl'); + + const manager = new ModelConfigManager(); + manager.registerGlobalConfigManager(new GlobalConfigManager()); + + const config = manager.getModelConfig('planning'); + + expect(config.vlMode).toBe('qwen-vl'); + expect(config.modelName).toBe('qwen-vl-plus'); + expect(config.intent).toBe('planning'); + }); + + it('should not affect other intents when planning validation fails', () => { + const modelConfigFn: TModelConfigFn = ({ intent }) => { + if (intent === 'planning') { + // Missing VL mode for planning - should fail + return { + [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', + [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + } + // Other intents should work fine + return { + [MIDSCENE_MODEL_NAME]: 'gpt-4', + [MIDSCENE_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + }; + + const manager = new ModelConfigManager(modelConfigFn); + + // Planning should fail + expect(() => manager.getModelConfig('planning')).toThrow( + 'Planning requires a vision language model', + ); + + // Other intents should succeed + expect(() => manager.getModelConfig('default')).not.toThrow(); + expect(() => manager.getModelConfig('VQA')).not.toThrow(); + expect(() => manager.getModelConfig('grounding')).not.toThrow(); + }); + + it('should accept all valid VL modes for planning', () => { + const vlModeTestCases: Array<{ + raw: + | 'qwen-vl' + | 'qwen3-vl' + | 'gemini' + | 'doubao-vision' + | 'vlm-ui-tars' + | 'vlm-ui-tars-doubao' + | 'vlm-ui-tars-doubao-1.5'; + expected: string; + }> = [ + { raw: 'qwen-vl', expected: 'qwen-vl' }, + { raw: 'qwen3-vl', expected: 'qwen3-vl' }, + { raw: 'gemini', expected: 'gemini' }, + { raw: 'doubao-vision', expected: 'doubao-vision' }, + // UI-TARS variants all normalize to 'vlm-ui-tars' + { raw: 'vlm-ui-tars', expected: 'vlm-ui-tars' }, + { raw: 'vlm-ui-tars-doubao', expected: 'vlm-ui-tars' }, + { raw: 'vlm-ui-tars-doubao-1.5', expected: 'vlm-ui-tars' }, + ]; + + for (const { raw, expected } of vlModeTestCases) { + const modelConfigFn: TModelConfigFn = ({ intent }) => { + if (intent === 'planning') { + return { + [MIDSCENE_PLANNING_MODEL_NAME]: 'test-model', + [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + [MIDSCENE_PLANNING_VL_MODE]: raw, + }; + } + return { + [MIDSCENE_MODEL_NAME]: 'gpt-4', + [MIDSCENE_OPENAI_API_KEY]: 'test-key', + [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1', + }; + }; + + const manager = new ModelConfigManager(modelConfigFn); + const config = manager.getModelConfig('planning'); + + expect(config.vlMode).toBe(expected); + } + }); + }); });