diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts index 3f9e229ef..8650d47da 100644 --- a/packages/core/src/ai-model/common.ts +++ b/packages/core/src/ai-model/common.ts @@ -299,27 +299,6 @@ export function adaptBboxToRect( return rect; } -let warned = false; -export function warnGPT4oSizeLimit(size: Size, modelName: string) { - if (warned) return; - if (modelName.toLowerCase().includes('gpt-4o')) { - const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your interface to a smaller resolution. Otherwise, the result may be inaccurate.`; - - if ( - Math.max(size.width, size.height) > 2000 || - Math.min(size.width, size.height) > 768 - ) { - console.warn(warningMsg); - warned = true; - } - } else if (size.width > 1800 || size.height > 1800) { - console.warn( - `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`, - ); - warned = true; - } -} - export function mergeRects(rects: Rect[]) { const minLeft = Math.min(...rects.map((r) => r.left)); const minTop = Math.min(...rects.map((r) => r.top)); diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts index 9173277f1..921045c85 100644 --- a/packages/core/src/ai-model/inspect.ts +++ b/packages/core/src/ai-model/inspect.ts @@ -28,7 +28,6 @@ import { AIActionType, adaptBboxToRect, expandSearchArea, - markupImageForLLM, mergeRects, } from './common'; import { @@ -177,17 +176,6 @@ export async function AiLocateElement< imageWidth = paddedResult.width; imageHeight = paddedResult.height; imagePayload = paddedResult.imageBase64; - } else if (vlMode === 'qwen3-vl') { - // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32); - // imageWidth = paddedResult.width; - // imageHeight = paddedResult.height; - // imagePayload = paddedResult.imageBase64; - } else if (!vlMode) { - imagePayload = await markupImageForLLM( - screenshotBase64, - context.tree, - context.size, - ); } const msgs: AIArgs = [ diff --git a/packages/core/src/ai-model/llm-planning.ts b/packages/core/src/ai-model/llm-planning.ts index a21ed7f13..4797b3a5e 100644 --- a/packages/core/src/ai-model/llm-planning.ts +++ b/packages/core/src/ai-model/llm-planning.ts @@ -14,7 +14,6 @@ import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField, - warnGPT4oSizeLimit, } from './common'; import type { ConversationHistory } from './conversation-history'; import { systemPromptToTaskPlanning } from './prompt/llm-planning'; @@ -36,7 +35,7 @@ export async function plan( const { context, modelConfig, conversationHistory } = opts; const { screenshotBase64, size } = context; - const { modelName, vlMode } = modelConfig; + const { vlMode } = modelConfig; // Planning requires VL mode (validated by ModelConfigManager.getModelConfig) assert(vlMode, 'Planning requires vlMode to be configured.'); @@ -58,16 +57,8 @@ export async function plan( imageWidth = paddedResult.width; imageHeight = paddedResult.height; imagePayload = paddedResult.imageBase64; - } else if (vlMode === 'qwen3-vl') { - // Reserved for qwen3-vl specific processing - // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32); - // imageWidth = paddedResult.width; - // imageHeight = paddedResult.height; - // imagePayload = paddedResult.imageBase64; } - warnGPT4oSizeLimit(size, modelName); - const historyLog = opts.conversationHistory?.snapshot() || []; // .filter((item) => item.role === 'assistant') || []; diff --git a/packages/core/src/ai-model/prompt/llm-locator.ts b/packages/core/src/ai-model/prompt/llm-locator.ts index 0045e7dd4..aacdfd0aa 100644 --- a/packages/core/src/ai-model/prompt/llm-locator.ts +++ b/packages/core/src/ai-model/prompt/llm-locator.ts @@ -1,11 +1,9 @@ import { PromptTemplate } from '@langchain/core/prompts'; import type { TVlModeTypes } from '@midscene/shared/env'; -import type { ResponseFormatJSONSchema } from 'openai/resources/index'; import { bboxDescription } from './common'; export function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) { - if (vlMode) { - const bboxComment = bboxDescription(vlMode); - return ` + const bboxComment = bboxDescription(vlMode); + return ` ## Role: You are an expert in software testing. @@ -57,203 +55,8 @@ When no element is found and the description is not order-sensitive: } \`\`\` `; - } - - return ` -## Role: -You are an expert in software page image (2D) and page element text analysis. - -## Objective: -- Identify elements in screenshots and text that match the user's description. -- Return JSON data containing the selection reason and element ID. -- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.). - -## Skills: -- Image analysis and recognition -- Multilingual text understanding -- Software UI design and testing - -## Workflow: -1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English. -2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot. -3. Found the required number of elements -4. Return JSON data containing the selection reason and element ID. -5. Judge whether the user's description is order-sensitive (see below for definition and examples). - -## Constraints: -- Strictly adhere to the specified location when describing the required element; do not select elements from other locations. -- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements. -- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image. -- If no elements are found, the "elements" array should be empty. -- The returned data must conform to the specified JSON format. -- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**) - -## Order-Sensitive Definition: -- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true). -- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false). - -## Output Format: - -Please return the result in JSON format as follows: - -\`\`\`json -{ - "elements": [ - // If no matching elements are found, return an empty array [] - { - "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process - "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty - "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo - } - // More elements... - ], - "isOrderSensitive": true, // or false, depending on the user's description - "errors": [] // Array of strings containing any error messages -} -\`\`\` - -## Example: -Example 1: -Input Example: -\`\`\`json -// Description: "Shopping cart icon in the upper right corner" -{ - "description": "PLACEHOLDER", // Description of the target element - "screenshot": "path/screenshot.png", - "text": '{ - "pageSize": { - "width": 400, // Width of the page - "height": 905 // Height of the page - }, - "elementInfos": [ - { - "id": "1231", // ID of the element - "indexId": "0", // Index of the element,The image is labeled to the left of the element - "attributes": { // Attributes of the element - "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node - "src": "https://ap-southeast-3.m", - "class": ".img" - }, - "content": "", // Text content of the element - "rect": { - "left": 280, // Distance from the left side of the page - "top": 8, // Distance from the top of the page - "width": 44, // Width of the element - "height": 44 // Height of the element - } - }, - { - "id": "66551", // ID of the element - "indexId": "1", // Index of the element,The image is labeled to the left of the element - "attributes": { // Attributes of the element - "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node - "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...", - "class": ".icon" - }, - "content": "", // Text content of the element - "rect": { - "left": 350, // Distance from the left side of the page - "top": 16, // Distance from the top of the page - "width": 25, // Width of the element - "height": 25 // Height of the element - } - }, - ... - { - "id": "12344", - "indexId": "2", // Index of the element,The image is labeled to the left of the element - "attributes": { - "nodeType": "TEXT Node", - "class": ".product-name" - }, - "center": [ - 288, - 834 - ], - "content": "Mango Drink", - "rect": { - "left": 188, - "top": 827, - "width": 199, - "height": 13 - } - }, - ... - ] - } - ' -} -\`\`\` -Output Example: -\`\`\`json -{ - "elements": [ - { - // Describe the reason for finding this element, replace with actual value in practice - "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button", - "text": "", - // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId** - "id": "1231" - } - ], - "isOrderSensitive": true, - "errors": [] -} -\`\`\` - - `; } -export const locatorSchema: ResponseFormatJSONSchema = { - type: 'json_schema', - json_schema: { - name: 'find_elements', - strict: true, - schema: { - type: 'object', - properties: { - elements: { - type: 'array', - items: { - type: 'object', - properties: { - reason: { - type: 'string', - description: 'Reason for finding this element', - }, - text: { - type: 'string', - description: 'Text content of the element', - }, - id: { - type: 'string', - description: 'ID of this element', - }, - }, - required: ['reason', 'text', 'id'], - additionalProperties: false, - }, - description: 'List of found elements', - }, - isOrderSensitive: { - type: 'boolean', - description: - 'Whether the targetElementDescription is order-sensitive (true/false)', - }, - errors: { - type: 'array', - items: { - type: 'string', - }, - description: 'List of error messages, if any', - }, - }, - required: ['elements', 'isOrderSensitive', 'errors'], - additionalProperties: false, - }, - }, -}; - export const findElementPrompt = new PromptTemplate({ template: ` Here is the item user want to find: diff --git a/packages/core/src/ai-model/service-caller/index.ts b/packages/core/src/ai-model/service-caller/index.ts index 7a29f9529..84d65e010 100644 --- a/packages/core/src/ai-model/service-caller/index.ts +++ b/packages/core/src/ai-model/service-caller/index.ts @@ -19,7 +19,6 @@ import type { Stream } from 'openai/streaming'; import { SocksProxyAgent } from 'socks-proxy-agent'; import { AIActionType, type AIArgs } from '../common'; import { assertSchema } from '../prompt/assertion'; -import { locatorSchema } from '../prompt/llm-locator'; import { planSchema } from '../prompt/llm-planning'; async function createChatClient({ @@ -289,9 +288,6 @@ export const getResponseFormat = ( case AIActionType.ASSERT: responseFormat = assertSchema; break; - case AIActionType.INSPECT_ELEMENT: - responseFormat = locatorSchema; - break; case AIActionType.PLAN: responseFormat = planSchema; break;