packages/core/src/ai-model/llm-planning.ts (55 changes: 17 additions & 38 deletions)
@@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env';
 import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
-import type {
-  ChatCompletionContentPart,
-  ChatCompletionMessageParam,
-} from 'openai/resources/index';
+import type { ChatCompletionMessageParam } from 'openai/resources/index';
 import {
   AIActionType,
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  markupImageForLLM,
   warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
-import { describeUserPage } from './prompt/util';
 import { callAIWithObjectResponse } from './service-caller/index';

 const debug = getDebug('planning');
@@ -43,10 +38,9 @@ export async function plan(

   const { modelName, vlMode } = modelConfig;

-  const { description: pageDescription, elementById } = await describeUserPage(
-    context,
-    { vlMode },
-  );
+  // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
+  assert(vlMode, 'Planning requires vlMode to be configured.');

   const systemPrompt = await systemPromptToTaskPlanning({
     actionSpace: opts.actionSpace,
     vlMode: vlMode,
@@ -57,21 +51,19 @@ export async function plan(
   let imageHeight = size.height;
   const rightLimit = imageWidth;
   const bottomLimit = imageHeight;

+  // Process image based on VL mode requirements
   if (vlMode === 'qwen-vl') {
     const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
     // Reserved for qwen3-vl specific processing
     // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
     // imageWidth = paddedResult.width;
     // imageHeight = paddedResult.height;
     // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
-      width: imageWidth,
-      height: imageHeight,
-    });
   }

   warnGPT4oSizeLimit(size, modelName);
@@ -120,14 +112,7 @@ export async function plan(
             detail: 'high',
           },
         },
-        ...(vlMode
-          ? []
-          : ([
-              {
-                type: 'text',
-                text: pageDescription,
-              },
-            ] as ChatCompletionContentPart[])),
+        // Planning uses pure vision mode, no DOM description needed
       ],
     },
   ];
@@ -173,21 +158,15 @@ export async function plan(
     locateFields.forEach((field) => {
       const locateResult = action.param[field];
       if (locateResult) {
-        if (vlMode) {
-          action.param[field] = fillBboxParam(
-            locateResult,
-            imageWidth,
-            imageHeight,
-            rightLimit,
-            bottomLimit,
-            vlMode,
-          );
-        } else {
-          const element = elementById(locateResult);
-          if (element) {
-            action.param[field].id = element.id;
-          }
-        }
+        // Always use VL mode to fill bbox parameters
+        action.param[field] = fillBboxParam(
+          locateResult,
+          imageWidth,
+          imageHeight,
+          rightLimit,
+          bottomLimit,
+          vlMode,
+        );
       }
     });
   });
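Review note: with the DOM fallback (`elementById`) gone, every locator field now flows through `fillBboxParam`, and `rightLimit`/`bottomLimit` are captured before any qwen-vl padding so model-reported boxes can be pulled back onto the real page. A minimal sketch of that clamping idea, assuming a `[left, top, right, bottom]` bbox; the actual `fillBboxParam` in `./common` is not shown in this diff and may also scale coordinates:

```ts
// Illustrative sketch only; not the real fillBboxParam from './common'.
type Bbox = [number, number, number, number]; // [left, top, right, bottom]

function clampBboxToPage(bbox: Bbox, rightLimit: number, bottomLimit: number): Bbox {
  // Keep coordinates inside the pre-padding page area, so boxes that land in
  // the qwen-vl padding region cannot produce out-of-page targets.
  const clampX = (x: number) => Math.max(0, Math.min(x, rightLimit));
  const clampY = (y: number) => Math.max(0, Math.min(y, bottomLimit));
  const [left, top, right, bottom] = bbox;
  return [clampX(left), clampY(top), clampX(right), clampY(bottom)];
}
```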
packages/core/tests/unit-test/proxy-integration.test.ts (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
-import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest';
 import type { IModelConfig } from '@midscene/shared/env';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

 // Mock the dependencies before importing the module under test
 vi.mock('openai', () => {
packages/core/tests/unit-test/service-caller.test.ts (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 import { AIActionType } from '@/ai-model';
 import { getResponseFormat } from '@/ai-model/service-caller';
-import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest';
 import type { IModelConfig } from '@midscene/shared/env';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

 describe('service-caller', () => {
   describe('getResponseFormat', () => {
packages/shared/src/env/model-config-manager.ts (25 changes: 23 additions & 2 deletions)
@@ -5,6 +5,7 @@ import {
 import type { GlobalConfigManager } from './global-config-manager';

 import type { IModelConfig, TIntent, TModelConfigFn } from './types';
+import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types';

 const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning'];
@@ -101,13 +102,15 @@
    * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined
    */
   getModelConfig(intent: TIntent): IModelConfig {
+    let config: IModelConfig;
+
     if (this.isolatedMode) {
       if (!this.modelConfigMap) {
         throw new Error(
           'modelConfigMap is not initialized in isolated mode, which should not happen',
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     } else {
       if (!this.modelConfigMap) {
         if (!this.globalConfigManager) {
@@ -119,8 +122,26 @@
           this.globalConfigManager.getAllEnvConfig(),
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     }

+    // Validate Planning must use VL mode
+    if (intent === 'planning' && !config.vlMode) {
+      throw new Error(
+        `Planning requires a vision language model (VL model). DOM-based planning is not supported.
+
+Please configure one of the following VL modes:
+  ${VL_MODES.map((mode) => `- ${mode}`).join('\n  ')}
+
+Configuration examples:
+- Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl
+- Or use modelConfig function with planning intent
+
+Learn more: https://midscenejs.com/choose-a-model`,
+      );
+    }
+
+    return config;
   }

   getUploadTestServerUrl(): string | undefined {
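Review note: with this check, a planning call made without a VL mode now fails fast inside getModelConfig. A sketch of the two configuration routes the error message points to; the env-var route is quoted directly from the message, while the shape of the modelConfig callback is an assumption based on the TModelConfigFn type imported above, not an API shown in this diff:

```ts
// Route 1 (from the error message): configure via environment variable
// before planning runs.
process.env.MIDSCENE_PLANNING_VL_MODE = 'qwen-vl';

// Route 2 (assumed callback shape; the real TModelConfigFn signature may
// differ): return a VL-backed config when the 'planning' intent is requested.
const modelConfig = ({ intent }: { intent: 'VQA' | 'default' | 'grounding' | 'planning' }) => {
  if (intent === 'planning') {
    return {
      MIDSCENE_PLANNING_MODEL_NAME: 'qwen-vl-max', // hypothetical model name
      MIDSCENE_PLANNING_VL_MODE: 'qwen-vl',
    };
  }
  return {}; // hypothetical: let other intents fall back to env-derived config
};
```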
packages/shared/src/env/types.ts (15 changes: 15 additions & 0 deletions)
@@ -238,6 +238,21 @@ export interface IModelConfigForVQA {
   [MIDSCENE_VQA_VL_MODE]?: TVlModeValues;
 }

+/**
+ * Model configuration for Planning intent.
+ *
+ * IMPORTANT: Planning MUST use a vision language model (VL mode).
+ * DOM-based planning is not supported.
+ *
+ * Required: MIDSCENE_PLANNING_VL_MODE must be set to one of:
+ * - 'qwen-vl'
+ * - 'qwen3-vl'
+ * - 'gemini'
+ * - 'doubao-vision'
+ * - 'vlm-ui-tars'
+ * - 'vlm-ui-tars-doubao'
+ * - 'vlm-ui-tars-doubao-1.5'
+ */
 export interface IModelConfigForPlanning {
   // model name
   [MIDSCENE_PLANNING_MODEL_NAME]: string;
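Review note: the mode list in the new doc comment mirrors VL_MODE_RAW_VALID_VALUES, which model-config-manager.ts now imports to build its error message. A sketch of the presumed relationship between that runtime list and TVlModeValues; the actual definitions in types.ts are outside this diff:

```ts
// Presumed shape, not shown in this PR: a const tuple gives both a runtime
// list (for validation and error messages) and a derived union type.
const VL_MODE_RAW_VALID_VALUES = [
  'qwen-vl',
  'qwen3-vl',
  'gemini',
  'doubao-vision',
  'vlm-ui-tars',
  'vlm-ui-tars-doubao',
  'vlm-ui-tars-doubao-1.5',
] as const;

type TVlModeValues = (typeof VL_MODE_RAW_VALID_VALUES)[number];
// => 'qwen-vl' | 'qwen3-vl' | 'gemini' | 'doubao-vision' | ...
```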