21 changes: 0 additions & 21 deletions packages/core/src/ai-model/common.ts
@@ -299,27 +299,6 @@ export function adaptBboxToRect(
   return rect;
 }
 
-let warned = false;
-export function warnGPT4oSizeLimit(size: Size, modelName: string) {
-  if (warned) return;
-  if (modelName.toLowerCase().includes('gpt-4o')) {
-    const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your interface to a smaller resolution. Otherwise, the result may be inaccurate.`;
-
-    if (
-      Math.max(size.width, size.height) > 2000 ||
-      Math.min(size.width, size.height) > 768
-    ) {
-      console.warn(warningMsg);
-      warned = true;
-    }
-  } else if (size.width > 1800 || size.height > 1800) {
-    console.warn(
-      `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,
-    );
-    warned = true;
-  }
-}
-
 export function mergeRects(rects: Rect[]) {
   const minLeft = Math.min(...rects.map((r) => r.left));
   const minTop = Math.min(...rects.map((r) => r.top));

12 changes: 0 additions & 12 deletions packages/core/src/ai-model/inspect.ts
@@ -28,7 +28,6 @@ import {
   AIActionType,
   adaptBboxToRect,
   expandSearchArea,
-  markupImageForLLM,
   mergeRects,
 } from './common';
 import {
@@ -177,17 +176,6 @@ export async function AiLocateElement<
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
Comment on lines 176 to 178

P1: Non-VL element lookup no longer overlays element IDs

The AiLocateElement flow now skips markupImageForLLM whenever vlMode is undefined, so GPT-4o receives the raw screenshot instead of the annotated version. The locator prompt for non-VL models still asserts that non-text elements “have been highlighted” and expects IDs marked in the image (prompt/llm-locator.ts around lines 83-86). Without those overlays the model cannot map the textual element list to the screenshot, so element searches for default models will degrade or fail outright. Consider restoring the markup step or updating the prompt and downstream logic to align with the unannotated image.

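If the markup step is restored, a minimal sketch could look like the following (assumptions: `markupImageForLLM` keeps the signature used in the deleted branch and resolves to a base64 image string; `resolveLocatorImage` is a hypothetical helper name, not something in this codebase):

```ts
import { markupImageForLLM } from './common';
import type { TVlModeTypes } from '@midscene/shared/env';

// Hypothetical helper: choose the image payload for the locator call.
// VL models read the raw screenshot directly; non-VL models (e.g. GPT-4o)
// can only map the textual element list to pixels when element IDs are
// drawn onto the image, which the deleted `!vlMode` branch guaranteed.
async function resolveLocatorImage(
  screenshotBase64: string,
  tree: Parameters<typeof markupImageForLLM>[1],
  size: Parameters<typeof markupImageForLLM>[2],
  vlMode: TVlModeTypes | undefined,
): Promise<string> {
  if (!vlMode) {
    // Overlay indexed markers so the non-VL prompt's
    // "elements have been highlighted" claim stays true.
    return markupImageForLLM(screenshotBase64, tree, size);
  }
  return screenshotBase64;
}
```

The other direction named above, keeping the unannotated screenshot, would mean rewriting the non-VL locator prompt and the ID-based output handling together, since both assume the overlay exists.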

-  } else if (vlMode === 'qwen3-vl') {
-    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    // imageWidth = paddedResult.width;
-    // imageHeight = paddedResult.height;
-    // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(
-      screenshotBase64,
-      context.tree,
-      context.size,
-    );
   }
 
   const msgs: AIArgs = [

11 changes: 1 addition & 10 deletions packages/core/src/ai-model/llm-planning.ts
@@ -14,7 +14,6 @@ import {
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
@@ -36,7 +35,7 @@ export async function plan(
   const { context, modelConfig, conversationHistory } = opts;
   const { screenshotBase64, size } = context;
 
-  const { modelName, vlMode } = modelConfig;
+  const { vlMode } = modelConfig;
 
   // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
   assert(vlMode, 'Planning requires vlMode to be configured.');
@@ -58,16 +57,8 @@
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
-  } else if (vlMode === 'qwen3-vl') {
-    // Reserved for qwen3-vl specific processing
-    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    // imageWidth = paddedResult.width;
-    // imageHeight = paddedResult.height;
-    // imagePayload = paddedResult.imageBase64;
   }
 
-  warnGPT4oSizeLimit(size, modelName);
-
   const historyLog = opts.conversationHistory?.snapshot() || [];
   // .filter((item) => item.role === 'assistant') || [];
 
201 changes: 2 additions & 199 deletions packages/core/src/ai-model/prompt/llm-locator.ts
@@ -1,11 +1,9 @@
 import { PromptTemplate } from '@langchain/core/prompts';
 import type { TVlModeTypes } from '@midscene/shared/env';
-import type { ResponseFormatJSONSchema } from 'openai/resources/index';
 import { bboxDescription } from './common';
 export function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {
-  if (vlMode) {
-    const bboxComment = bboxDescription(vlMode);
-    return `
+  const bboxComment = bboxDescription(vlMode);
+  return `
 ## Role:
 You are an expert in software testing.
 
@@ -57,203 +55,8 @@ When no element is found and the description is not order-sensitive:
 }
 \`\`\`
 `;
 }
-
-  return `
-## Role:
-You are an expert in software page image (2D) and page element text analysis.
-
-## Objective:
-- Identify elements in screenshots and text that match the user's description.
-- Return JSON data containing the selection reason and element ID.
-- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
-
-## Skills:
-- Image analysis and recognition
-- Multilingual text understanding
-- Software UI design and testing
-
-## Workflow:
-1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
-2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
-3. Found the required number of elements
-4. Return JSON data containing the selection reason and element ID.
-5. Judge whether the user's description is order-sensitive (see below for definition and examples).
-
-## Constraints:
-- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
-- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
-- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
-- If no elements are found, the "elements" array should be empty.
-- The returned data must conform to the specified JSON format.
-- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
-
-## Order-Sensitive Definition:
-- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
-- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
-
-## Output Format:
-
-Please return the result in JSON format as follows:
-
-\`\`\`json
-{
-  "elements": [
-    // If no matching elements are found, return an empty array []
-    {
-      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
-      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
-    }
-    // More elements...
-  ],
-  "isOrderSensitive": true, // or false, depending on the user's description
-  "errors": [] // Array of strings containing any error messages
-}
-\`\`\`
-
-## Example:
-Example 1:
-Input Example:
-\`\`\`json
-// Description: "Shopping cart icon in the upper right corner"
-{
-  "description": "PLACEHOLDER", // Description of the target element
-  "screenshot": "path/screenshot.png",
-  "text": '{
-    "pageSize": {
-      "width": 400, // Width of the page
-      "height": 905 // Height of the page
-    },
-    "elementInfos": [
-      {
-        "id": "1231", // ID of the element
-        "indexId": "0", // Index of the element,The image is labeled to the left of the element
-        "attributes": { // Attributes of the element
-          "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
-          "src": "https://ap-southeast-3.m",
-          "class": ".img"
-        },
-        "content": "", // Text content of the element
-        "rect": {
-          "left": 280, // Distance from the left side of the page
-          "top": 8, // Distance from the top of the page
-          "width": 44, // Width of the element
-          "height": 44 // Height of the element
-        }
-      },
-      {
-        "id": "66551", // ID of the element
-        "indexId": "1", // Index of the element,The image is labeled to the left of the element
-        "attributes": { // Attributes of the element
-          "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
-          "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
-          "class": ".icon"
-        },
-        "content": "", // Text content of the element
-        "rect": {
-          "left": 350, // Distance from the left side of the page
-          "top": 16, // Distance from the top of the page
-          "width": 25, // Width of the element
-          "height": 25 // Height of the element
-        }
-      },
-      ...
-      {
-        "id": "12344",
-        "indexId": "2", // Index of the element,The image is labeled to the left of the element
-        "attributes": {
-          "nodeType": "TEXT Node",
-          "class": ".product-name"
-        },
-        "center": [
-          288,
-          834
-        ],
-        "content": "Mango Drink",
-        "rect": {
-          "left": 188,
-          "top": 827,
-          "width": 199,
-          "height": 13
-        }
-      },
-      ...
-    ]
-  }
-  '
-}
-\`\`\`
-Output Example:
-\`\`\`json
-{
-  "elements": [
-    {
-      // Describe the reason for finding this element, replace with actual value in practice
-      "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
-      "text": "",
-      // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
-      "id": "1231"
-    }
-  ],
-  "isOrderSensitive": true,
-  "errors": []
-}
-\`\`\`
-
-`;
-}
-
-export const locatorSchema: ResponseFormatJSONSchema = {
-  type: 'json_schema',
-  json_schema: {
-    name: 'find_elements',
-    strict: true,
-    schema: {
-      type: 'object',
-      properties: {
-        elements: {
-          type: 'array',
-          items: {
-            type: 'object',
-            properties: {
-              reason: {
-                type: 'string',
-                description: 'Reason for finding this element',
-              },
-              text: {
-                type: 'string',
-                description: 'Text content of the element',
-              },
-              id: {
-                type: 'string',
-                description: 'ID of this element',
-              },
-            },
-            required: ['reason', 'text', 'id'],
-            additionalProperties: false,
-          },
-          description: 'List of found elements',
-        },
-        isOrderSensitive: {
-          type: 'boolean',
-          description:
-            'Whether the targetElementDescription is order-sensitive (true/false)',
-        },
-        errors: {
-          type: 'array',
-          items: {
-            type: 'string',
-          },
-          description: 'List of error messages, if any',
-        },
-      },
-      required: ['elements', 'isOrderSensitive', 'errors'],
-      additionalProperties: false,
-    },
-  },
-};
 
 export const findElementPrompt = new PromptTemplate({
   template: `
 Here is the item user want to find:

4 changes: 0 additions & 4 deletions packages/core/src/ai-model/service-caller/index.ts
@@ -19,7 +19,6 @@ import type { Stream } from 'openai/streaming';
 import { SocksProxyAgent } from 'socks-proxy-agent';
 import { AIActionType, type AIArgs } from '../common';
 import { assertSchema } from '../prompt/assertion';
-import { locatorSchema } from '../prompt/llm-locator';
 import { planSchema } from '../prompt/llm-planning';
 
 async function createChatClient({
@@ -289,9 +288,6 @@ export const getResponseFormat = (
     case AIActionType.ASSERT:
       responseFormat = assertSchema;
       break;
-    case AIActionType.INSPECT_ELEMENT:
-      responseFormat = locatorSchema;
-      break;
     case AIActionType.PLAN:
       responseFormat = planSchema;
       break;