21 changes: 0 additions & 21 deletions packages/core/src/ai-model/common.ts
@@ -299,27 +299,6 @@ export function adaptBboxToRect(
   return rect;
 }
 
-let warned = false;
-export function warnGPT4oSizeLimit(size: Size, modelName: string) {
-  if (warned) return;
-  if (modelName.toLowerCase().includes('gpt-4o')) {
-    const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your interface to a smaller resolution. Otherwise, the result may be inaccurate.`;
-
-    if (
-      Math.max(size.width, size.height) > 2000 ||
-      Math.min(size.width, size.height) > 768
-    ) {
-      console.warn(warningMsg);
-      warned = true;
-    }
-  } else if (size.width > 1800 || size.height > 1800) {
-    console.warn(
-      `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,
-    );
-    warned = true;
-  }
-}
-
 export function mergeRects(rects: Rect[]) {
   const minLeft = Math.min(...rects.map((r) => r.left));
   const minTop = Math.min(...rects.map((r) => r.top));

12 changes: 0 additions & 12 deletions packages/core/src/ai-model/inspect.ts
@@ -28,7 +28,6 @@ import {
   AIActionType,
   adaptBboxToRect,
   expandSearchArea,
-  markupImageForLLM,
   mergeRects,
 } from './common';
 import {
@@ -177,17 +176,6 @@ export async function AiLocateElement<
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
Comment on lines 176 to 178

P1: Non-VL element lookup no longer overlays element IDs

The AiLocateElement flow now skips markupImageForLLM whenever vlMode is undefined, so GPT-4o receives the raw screenshot instead of the annotated version. The locator prompt for non-VL models still asserts that non-text elements “have been highlighted” and expects IDs marked in the image (prompt/llm-locator.ts around lines 83-86). Without those overlays the model cannot map the textual element list to the screenshot, so element searches for default models will degrade or fail outright. Consider restoring the markup step or updating the prompt and downstream logic to align with the unannotated image.

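If the markup step is restored, a minimal sketch could look like the following (assumptions: `markupImageForLLM` keeps the signature used in the deleted branch and resolves to a base64 image string; `resolveLocatorImage` is a hypothetical helper name, not something in this codebase):

```ts
import { markupImageForLLM } from './common';
import type { TVlModeTypes } from '@midscene/shared/env';

// Hypothetical helper: choose the image payload for the locator call.
// VL models read the raw screenshot directly; non-VL models (e.g. GPT-4o)
// can only map the textual element list to pixels when element IDs are
// drawn onto the image, which the deleted `!vlMode` branch guaranteed.
async function resolveLocatorImage(
  screenshotBase64: string,
  tree: Parameters<typeof markupImageForLLM>[1],
  size: Parameters<typeof markupImageForLLM>[2],
  vlMode: TVlModeTypes | undefined,
): Promise<string> {
  if (!vlMode) {
    // Overlay indexed markers so the non-VL prompt's
    // "elements have been highlighted" claim stays true.
    return markupImageForLLM(screenshotBase64, tree, size);
  }
  return screenshotBase64;
}
```

The other direction named above, keeping the unannotated screenshot, would mean rewriting the non-VL locator prompt and the ID-based output handling together, since both assume the overlay exists.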

-  } else if (vlMode === 'qwen3-vl') {
-    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    // imageWidth = paddedResult.width;
-    // imageHeight = paddedResult.height;
-    // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(
-      screenshotBase64,
-      context.tree,
-      context.size,
-    );
   }
 
   const msgs: AIArgs = [

11 changes: 1 addition & 10 deletions packages/core/src/ai-model/llm-planning.ts
@@ -14,7 +14,6 @@ import {
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
@@ -36,7 +35,7 @@ export async function plan(
   const { context, modelConfig, conversationHistory } = opts;
   const { screenshotBase64, size } = context;
 
-  const { modelName, vlMode } = modelConfig;
+  const { vlMode } = modelConfig;
 
   // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
   assert(vlMode, 'Planning requires vlMode to be configured.');
@@ -58,16 +57,8 @@
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
-  } else if (vlMode === 'qwen3-vl') {
-    // Reserved for qwen3-vl specific processing
-    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    // imageWidth = paddedResult.width;
-    // imageHeight = paddedResult.height;
-    // imagePayload = paddedResult.imageBase64;
   }
 
-  warnGPT4oSizeLimit(size, modelName);
-
   const historyLog = opts.conversationHistory?.snapshot() || [];
   // .filter((item) => item.role === 'assistant') || [];
 
201 changes: 2 additions & 199 deletions packages/core/src/ai-model/prompt/llm-locator.ts
@@ -1,11 +1,9 @@
 import { PromptTemplate } from '@langchain/core/prompts';
 import type { TVlModeTypes } from '@midscene/shared/env';
-import type { ResponseFormatJSONSchema } from 'openai/resources/index';
 import { bboxDescription } from './common';
 export function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {
-  if (vlMode) {
-    const bboxComment = bboxDescription(vlMode);
-    return `
+  const bboxComment = bboxDescription(vlMode);
+  return `
 ## Role:
 You are an expert in software testing.
 
@@ -57,203 +55,8 @@ When no element is found and the description is not order-sensitive:
 }
 \`\`\`
 `;
 }
-
-  return `
-## Role:
-You are an expert in software page image (2D) and page element text analysis.
-
-## Objective:
-- Identify elements in screenshots and text that match the user's description.
-- Return JSON data containing the selection reason and element ID.
-- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
-
-## Skills:
-- Image analysis and recognition
-- Multilingual text understanding
-- Software UI design and testing
-
-## Workflow:
-1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
-2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
-3. Found the required number of elements
-4. Return JSON data containing the selection reason and element ID.
-5. Judge whether the user's description is order-sensitive (see below for definition and examples).
-
-## Constraints:
-- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
-- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
-- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
-- If no elements are found, the "elements" array should be empty.
-- The returned data must conform to the specified JSON format.
-- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
-
-## Order-Sensitive Definition:
-- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
-- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
-
-## Output Format:
-
-Please return the result in JSON format as follows:
-
-\`\`\`json
-{
-  "elements": [
-    // If no matching elements are found, return an empty array []
-    {
-      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
-      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
-    }
-    // More elements...
-  ],
-  "isOrderSensitive": true, // or false, depending on the user's description
-  "errors": [] // Array of strings containing any error messages
-}
-\`\`\`
-
-## Example:
-Example 1:
-Input Example:
-\`\`\`json
-// Description: "Shopping cart icon in the upper right corner"
-{
-  "description": "PLACEHOLDER", // Description of the target element
-  "screenshot": "path/screenshot.png",
-  "text": '{
-    "pageSize": {
-      "width": 400, // Width of the page
-      "height": 905 // Height of the page
-    },
-    "elementInfos": [
-      {
-        "id": "1231", // ID of the element
-        "indexId": "0", // Index of the element,The image is labeled to the left of the element
-        "attributes": { // Attributes of the element
-          "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
-          "src": "https://ap-southeast-3.m",
-          "class": ".img"
-        },
-        "content": "", // Text content of the element
-        "rect": {
-          "left": 280, // Distance from the left side of the page
-          "top": 8, // Distance from the top of the page
-          "width": 44, // Width of the element
-          "height": 44 // Height of the element
-        }
-      },
-      {
-        "id": "66551", // ID of the element
-        "indexId": "1", // Index of the element,The image is labeled to the left of the element
-        "attributes": { // Attributes of the element
-          "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
-          "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
-          "class": ".icon"
-        },
-        "content": "", // Text content of the element
-        "rect": {
-          "left": 350, // Distance from the left side of the page
-          "top": 16, // Distance from the top of the page
-          "width": 25, // Width of the element
-          "height": 25 // Height of the element
-        }
-      },
-      ...
-      {
-        "id": "12344",
-        "indexId": "2", // Index of the element,The image is labeled to the left of the element
-        "attributes": {
-          "nodeType": "TEXT Node",
-          "class": ".product-name"
-        },
-        "center": [
-          288,
-          834
-        ],
-        "content": "Mango Drink",
-        "rect": {
-          "left": 188,
-          "top": 827,
-          "width": 199,
-          "height": 13
-        }
-      },
-      ...
-    ]
-  }
-  '
-}
-\`\`\`
-Output Example:
-\`\`\`json
-{
-  "elements": [
-    {
-      // Describe the reason for finding this element, replace with actual value in practice
-      "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
-      "text": "",
-      // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
-      "id": "1231"
-    }
-  ],
-  "isOrderSensitive": true,
-  "errors": []
-}
-\`\`\`
-
-`;
-}
-
-export const locatorSchema: ResponseFormatJSONSchema = {
-  type: 'json_schema',
-  json_schema: {
-    name: 'find_elements',
-    strict: true,
-    schema: {
-      type: 'object',
-      properties: {
-        elements: {
-          type: 'array',
-          items: {
-            type: 'object',
-            properties: {
-              reason: {
-                type: 'string',
-                description: 'Reason for finding this element',
-              },
-              text: {
-                type: 'string',
-                description: 'Text content of the element',
-              },
-              id: {
-                type: 'string',
-                description: 'ID of this element',
-              },
-            },
-            required: ['reason', 'text', 'id'],
-            additionalProperties: false,
-          },
-          description: 'List of found elements',
-        },
-        isOrderSensitive: {
-          type: 'boolean',
-          description:
-            'Whether the targetElementDescription is order-sensitive (true/false)',
-        },
-        errors: {
-          type: 'array',
-          items: {
-            type: 'string',
-          },
-          description: 'List of error messages, if any',
-        },
-      },
-      required: ['elements', 'isOrderSensitive', 'errors'],
-      additionalProperties: false,
-    },
-  },
-};
 
 export const findElementPrompt = new PromptTemplate({
   template: `
 Here is the item user want to find:

4 changes: 0 additions & 4 deletions packages/core/src/ai-model/service-caller/index.ts
@@ -19,7 +19,6 @@ import type { Stream } from 'openai/streaming';
 import { SocksProxyAgent } from 'socks-proxy-agent';
 import { AIActionType, type AIArgs } from '../common';
 import { assertSchema } from '../prompt/assertion';
-import { locatorSchema } from '../prompt/llm-locator';
 import { planSchema } from '../prompt/llm-planning';
 
 async function createChatClient({
@@ -289,9 +288,6 @@ export const getResponseFormat = (
     case AIActionType.ASSERT:
       responseFormat = assertSchema;
       break;
-    case AIActionType.INSPECT_ELEMENT:
-      responseFormat = locatorSchema;
-      break;
     case AIActionType.PLAN:
       responseFormat = planSchema;
       break;