diff --git a/examples/Vstar/README.md b/examples/Vstar/README.md
new file mode 100644
index 00000000..1a1d6ae3
--- /dev/null
+++ b/examples/Vstar/README.md
@@ -0,0 +1,110 @@
+# VStar Example
+
+VStar is a workflow that uses visual contextual information to process high-definition images. It relies on hierarchical image structures and adaptive thresholding to efficiently identify the objects relevant to a user's query.
+
+This example demonstrates how to use the OMAgent framework for visual search and analysis tasks. The example code can be found in the `examples/Vstar` directory.
+
+```bash
+cd examples/Vstar
+```
+
+## Overview
+
+This example implements a comprehensive VStar workflow that consists of the following components:
+
+1. **VStar Input**
+ - Handles user input containing text queries and image uploads
+ - Processes multi-modal inputs to prepare for visual analysis
+
+2. **VStar Workflow**
+   - Determines the visual elements needed to answer the question
+   - Performs confidence-guided searches to localize those elements
+   - Optimizes search efficiency using adaptive thresholding
+
+### This workflow is structured as follows:
+
+![VStar Workflow](./docs/images/vstar_workflow.jpg)
+
+## Prerequisites
+
+- Python 3.11+
+- Required packages installed (see requirements.txt)
+- Access to a deployed VStar (SEAL) model endpoint (see the "Local Deployment of VStar" section below)
+- Redis server running locally or remotely (for pro mode)
+- Conductor server running locally or remotely (for pro mode)
+
+## Configuration
+
+The `container.yaml` file manages dependencies and settings for different components of the system. To set up your configuration:
+
+1. Generate the `container.yaml` file:
+ ```bash
+ python compile_container.py
+ ```
+   This will create a `container.yaml` file with default settings under `examples/Vstar`.
+
+2. Configure your multimodal LLM settings in `configs/llms/*.yml` (the default `configs/llms/vstar.yml` is shown after this list):
+ - Set your model endpoint through environment variables or by directly modifying the yml file
+ ```bash
+ export custom_vstar_endpoint="your_vstar_endpoint"
+ ```
+
+ - Configure other model settings like temperature as needed.
+
+3. Update settings in the generated `container.yaml`:
+ - Modify Redis connection settings (for pro mode):
+ - Set the host, port, and credentials for your Redis instance.
+ - Configure both `redis_stream_client` and `redis_stm_client` sections.
+ - Update the Conductor server URL under the conductor_config section (for pro mode).
+ - Adjust any other component settings as needed.
+
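+For reference, the `configs/llms/vstar.yml` shipped with this example contains only the model name and the endpoint, which is resolved from the `custom_vstar_endpoint` environment variable:
+
+```yml
+name: VStarLLM
+endpoint: ${env| custom_vstar_endpoint, https://vstar.om-ai.com}
+```
+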
+## Running the Example
+
+Run the VStar example:
+
+For terminal/CLI usage:
+```bash
+python run_cli.py
+```
+
+You can run the VStar workflow in `pro` mode or `lite` mode by changing the `OMAGENT_MODE` environment variable. The default mode is `pro`, which uses the Conductor and Redis servers. The `lite` mode runs the workflow in the current Python process without external services.
+
+For pro mode:
+```bash
+export OMAGENT_MODE="pro"
+python run_cli.py
+```
+
+For lite mode:
+```bash
+export OMAGENT_MODE="lite"
+python run_cli.py
+```
+
+## How VStar Works
+
+VStar uses a hierarchical approach to image analysis:
+
+1. The image is first processed to extract relevant features and prepare for analysis.
+2. Visual cues are generated from the user's query to guide the search.
+3. A confidence-guided search algorithm traverses the image data to locate visual elements.
+4. Adaptive thresholding ensures high-quality results while optimizing computation.
+5. Found elements are synthesized into a comprehensive answer.
+
+This approach enables precise localization of visual elements while maintaining computational efficiency.
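+
+The sketch below is a minimal, self-contained illustration of this confidence-guided, best-first patch search. It is not the workflow code: `confidence_guided_search`, `detect`, and `score` are made-up stand-ins for the real detector and cue heatmap, and the default thresholds merely echo the operator's settings.
+
+```python
+import heapq
+from dataclasses import dataclass, field
+from typing import Callable, List, Optional, Tuple
+
+Box = Tuple[int, int, int, int]  # (x, y, width, height)
+
+@dataclass(order=True)
+class _Node:
+    priority: float
+    bbox: Box = field(compare=False)
+
+def confidence_guided_search(image_size: Tuple[int, int],
+                             detect: Callable[[Box], float],
+                             score: Callable[[Box], float],
+                             confidence_high: float = 0.5,
+                             smallest_size: int = 224,
+                             max_steps: int = 50) -> Optional[Box]:
+    """Best-first search over image patches until the detector is confident."""
+    width, height = image_size
+    queue: List[_Node] = [_Node(0.0, (0, 0, width, height))]
+    for _ in range(max_steps):
+        if not queue:
+            break
+        patch = heapq.heappop(queue).bbox
+        if detect(patch) > confidence_high:
+            return patch                      # confident detection: stop here
+        x, y, w, h = patch
+        if min(w, h) <= smallest_size:
+            continue                          # patch too small to split further
+        # Split into 2x2 sub-patches and rank them by their cue score
+        for sx, sy in [(x, y), (x + w // 2, y), (x, y + h // 2), (x + w // 2, y + h // 2)]:
+            sub = (sx, sy, w // 2, h // 2)
+            heapq.heappush(queue, _Node(-score(sub), sub))
+    return None
+
+if __name__ == "__main__":
+    target = (900, 600)                       # toy "object" location in a 2048x1536 image
+
+    def detect(b: Box) -> float:              # pretends to be the detector
+        x, y, w, h = b
+        hit = x <= target[0] <= x + w and y <= target[1] <= y + h
+        return 0.9 if hit and max(w, h) <= 300 else 0.1
+
+    def score(b: Box) -> float:               # pretends to be the cue heatmap score
+        x, y, w, h = b
+        return -abs(x + w / 2 - target[0]) - abs(y + h / 2 - target[1])
+
+    print(confidence_guided_search((2048, 1536), detect, score))
+```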
+
+## Troubleshooting
+
+If you encounter issues:
+- Verify your multimodal LLM endpoint is accessible and working.
+- For pro mode, confirm Redis is running and accessible.
+- Ensure all dependencies are installed correctly.
+- Check for sufficient GPU resources if using local model deployment.
+- Review logs for any error messages.
+- **Open an issue on GitHub if you can't find a solution; we will do our best to help you out!**
+
+## Local Deployment of VStar
+
+Since VStar is not yet supported by inference servers such as vLLM, the model has to be deployed locally:
+
+1. Download the V* [source code](https://github.com/penghao-wu/vstar) from its repository.
+2. Copy the API file `OmAgent/examples/Vstar/docs/files/vstar_api.py` into the V* source folder and change the model paths (`seal_vqa_7b`, `seal_vsm_7b`, and `clip-vit-large-patch14`) to your local copies.
+3. Start the service with `uvicorn vstar_api:app --host 0.0.0.0 --port 8000`.
+4. Point this example at the service with `export custom_vstar_endpoint=http://localhost:8000/`.
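+
+To verify the service is running, you can send a request to the `/vqa_llm` endpoint defined in `vstar_api.py`. The snippet below is a small sketch that assumes the service is listening on `localhost:8000` and that a `test.jpg` image exists in the current directory:
+
+```python
+import base64
+import requests
+
+# Encode a local test image as base64, as expected by the VQAPayload model
+with open("test.jpg", "rb") as f:
+    image_base64 = base64.b64encode(f.read()).decode("utf-8")
+
+resp = requests.post(
+    "http://localhost:8000/vqa_llm",
+    json={"prompt": "What is in this image?", "image_base64": image_base64},
+)
+resp.raise_for_status()
+print(resp.json()["generated_text"])  # free-form answer from the SEAL VQA model
+```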
\ No newline at end of file
diff --git a/examples/Vstar/agent/vstar_input/__init__.py b/examples/Vstar/agent/vstar_input/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/Vstar/agent/vstar_input/vstar_input.py b/examples/Vstar/agent/vstar_input/vstar_input.py
new file mode 100644
index 00000000..f13e52ff
--- /dev/null
+++ b/examples/Vstar/agent/vstar_input/vstar_input.py
@@ -0,0 +1,70 @@
+from pathlib import Path
+
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.utils.logger import logging
+from omagent_core.utils.registry import registry
+
+CURRENT_PATH = Path(__file__).parents[0]
+
+
+@registry.register_worker()
+class VstarInput(BaseWorker):
+ """
+ A worker class for handling VStar input processing.
+ This class is responsible for receiving and processing user input,
+ including both text queries and image data.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for processing VStar input.
+
+ This method handles:
+ 1. Reading user input through the input interface
+ 2. Extracting messages from the input
+ 3. Processing both image and text content
+
+ Returns:
+ dict: A dictionary containing:
+ - 'query': The text query from the user
+ - 'image_path': The path to the uploaded image
+
+ Raises:
+ Exception: If any error occurs during input processing
+ """
+ try:
+ # Request user input through the designated input interface
+ user_input = self.input.read_input(
+ workflow_instance_id=self.workflow_instance_id,
+ input_prompt="Please input your question:",
+ )
+
+ # Extract the message list from user input
+ messages = user_input["messages"]
+ # Get the most recent message (last message in the list)
+ message = messages[-1]
+
+ # Initialize variables to store image and text data
+ image_path = None
+ text = None
+
+ # Iterate through each content item in the message
+ for each_content in message["content"]:
+ if each_content["type"] == "image_url":
+ # If content is an image, store its path
+ image_path = each_content["data"]
+ elif each_content["type"] == "text":
+ # If content is text, store the text query
+ text = each_content["data"]
+
+ # Return a dictionary containing both the text query and image path
+ return {
+ "query": text,
+ "image_path": image_path
+ }
+
+ except Exception as e:
+ # Log any errors that occur during the input processing
+            logging.error(f"Error in VStar input processing: {str(e)}")
+ # Re-raise the exception for proper error handling upstream
+ raise
\ No newline at end of file
diff --git a/examples/Vstar/compile_container.py b/examples/Vstar/compile_container.py
new file mode 100755
index 00000000..9632cc6f
--- /dev/null
+++ b/examples/Vstar/compile_container.py
@@ -0,0 +1,20 @@
+from omagent_core.utils.container import container
+from pathlib import Path
+from omagent_core.utils.registry import registry
+
+
+# Load all registered workflow components
+registry.import_module()
+
+# Configure import path for agent modules
+from pathlib import Path
+CURRENT_PATH = Path(__file__).parents[0]
+
+# Register core workflow components for state management, callbacks and input handling
+container.register_stm(stm='SharedMemSTM')
+container.register_callback(callback='AppCallback')
+container.register_input(input='AppInput')
+
+
+# Compile container config
+container.compile_config(CURRENT_PATH)
\ No newline at end of file
diff --git a/examples/Vstar/configs/llms/vstar.yml b/examples/Vstar/configs/llms/vstar.yml
new file mode 100755
index 00000000..3cc12c11
--- /dev/null
+++ b/examples/Vstar/configs/llms/vstar.yml
@@ -0,0 +1,2 @@
+name: VStarLLM
+endpoint: ${env| custom_vstar_endpoint, https://vstar.om-ai.com}
diff --git a/examples/Vstar/configs/workers/vstar_input.yaml b/examples/Vstar/configs/workers/vstar_input.yaml
new file mode 100644
index 00000000..f5c422cf
--- /dev/null
+++ b/examples/Vstar/configs/workers/vstar_input.yaml
@@ -0,0 +1 @@
+name: VstarInput
\ No newline at end of file
diff --git a/examples/Vstar/configs/workers/vstar_workflow.yaml b/examples/Vstar/configs/workers/vstar_workflow.yaml
new file mode 100755
index 00000000..057ced3b
--- /dev/null
+++ b/examples/Vstar/configs/workers/vstar_workflow.yaml
@@ -0,0 +1,16 @@
+- name: VQA_LLM_Preprocess
+
+- name: VstarLoopCheck
+
+- name: VQA_LLM
+ llm: ${sub|vstar}
+
+- name: VstarSearchPreprocess
+
+- name: VstarSearch
+ llm: ${sub|vstar}
+
+- name: VstarSearchCheck
+
+- name: VQA_LLM_Post
+ llm: ${sub|vstar}
\ No newline at end of file
diff --git a/examples/Vstar/docs/files/vstar_api.py b/examples/Vstar/docs/files/vstar_api.py
new file mode 100644
index 00000000..92da0d65
--- /dev/null
+++ b/examples/Vstar/docs/files/vstar_api.py
@@ -0,0 +1,126 @@
+import base64
+from fastapi import FastAPI
+from pydantic import BaseModel
+from io import BytesIO
+from PIL import Image
+import numpy as np
+from dataclasses import dataclass
+from vstar_bench_eval import VQA_LLM, expand2square, normalize_bbox
+from visual_search import VSM
+import torch
+from typing import List
+
+@dataclass
+class Args:
+ vqa_model_path: str = None
+ version: str = None
+ conv_type: str = None
+ vision_tower: str = None
+vqa_args = Args(
+ vqa_model_path="./seal_vqa_7b", # your path to seal_vqa_7b
+ conv_type="v1"
+)
+print(f"Using model path: {vqa_args.vqa_model_path}")
+vqa_llm = VQA_LLM(vqa_args)
+
+vsm_args = Args(
+ version="./seal_vsm_7b", # your path to seal_vsm_7b
+ vision_tower="./clip-vit-large-patch14" # your path to clip-vit-large-patch14
+)
+vsm = VSM(vsm_args)
+
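+# FastAPI app exposing three endpoints used by the VStar workflow:
+#   /vqa_llm             - free-form visual question answering on a base64-encoded image
+#   /visual_search_model - visual search in 'detection', 'vqa' or 'segmentation' mode
+#   /vqa_llm_post        - final answering with the searched object crops and bounding boxes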
+app = FastAPI()
+
+class VQAOutput(BaseModel):
+ generated_text: str
+
+class VQAPayload(BaseModel):
+ prompt: str
+ image_base64: str
+
+@app.post("/vqa_llm")
+async def generate_from_base64(data: VQAPayload):
+ prompt = data.prompt
+ image_base64 = data.image_base64
+ image_data = base64.b64decode(image_base64)
+ image = Image.open(BytesIO(image_data))
+ response = vqa_llm.free_form_inference(image, prompt)
+
+ return VQAOutput(
+ generated_text=response,
+ )
+
+class VSMOutput(BaseModel):
+ response: object
+
+class VSMPayload(BaseModel):
+ prompt: str
+ image_base64: str
+ mode: str
+
+@app.post("/visual_search_model")
+async def generate_vsm(data: VSMPayload):
+ prompt = data.prompt
+ image_base64 = data.image_base64
+ image_data = base64.b64decode(image_base64)
+ image = Image.open(BytesIO(image_data))
+ mode = data.mode
+ response = vsm.inference(image, prompt, mode)
+ print(response)
+ if mode == 'segmentation':
+ response = response.cpu().tolist()
+ elif mode == 'vqa':
+ pass
+ elif mode == 'detection':
+ response = [r.cpu().tolist() for r in response]
+ return VSMOutput(
+ response=response,
+ )
+
+class VQAPostPayload(BaseModel):
+ prompt: str
+ image_base64: str
+ bboxes: List[List[int]]
+ object_names: List[str]
+
+@app.post("/vqa_llm_post")
+async def generate_vqa_post(data: VQAPostPayload):
+ prompt = data.prompt
+ image_base64 = data.image_base64
+ bboxes = data.bboxes
+ object_names = data.object_names
+ image_data = base64.b64decode(image_base64)
+ image = Image.open(BytesIO(image_data))
+
+ if len(object_names) <= 2:
+ images_long = [False]
+ objects_long = [True]*len(object_names)
+ else:
+ images_long = [False]
+ objects_long = [False]*len(object_names)
+ object_crops = []
+ for bbox in bboxes:
+ object_crop = vqa_llm.get_object_crop(image, bbox, patch_scale=1.2)
+ object_crops.append(object_crop)
+ object_crops = torch.stack(object_crops, 0)
+ image, left, top = expand2square(image, tuple(int(x*255) for x in vqa_llm.image_processor.image_mean))
+ bbox_list = []
+ for bbox in bboxes:
+ bbox[0] += left
+ bbox[1] += top
+ bbox_list.append(bbox)
+ bbox_list = [normalize_bbox(bbox, image.width, image.height) for bbox in bbox_list]
+ focus_msg = "Additional visual information to focus on: "
+ cur_focus_msg = focus_msg
+ for i, (object_name, bbox) in enumerate(zip(object_names, bbox_list)):
+ cur_focus_msg = cur_focus_msg + "{} at location [{:.3f},{:.3f},{:.3f},{:.3f}]".format(object_name, bbox[0], bbox[1], bbox[2], bbox[3])
+ if i != len(bbox_list)-1:
+ cur_focus_msg = cur_focus_msg+"; "
+ else:
+ cur_focus_msg = cur_focus_msg +'.'
+ question_with_focus = cur_focus_msg+"\n"+prompt
+
+ response = vqa_llm.free_form_inference(image, question_with_focus, object_crops=object_crops, objects_long=objects_long, images_long=images_long)
+ return VQAOutput(
+ generated_text=response,
+ )
\ No newline at end of file
diff --git a/examples/Vstar/docs/images/vstar_workflow.jpg b/examples/Vstar/docs/images/vstar_workflow.jpg
new file mode 100644
index 00000000..148774d7
--- /dev/null
+++ b/examples/Vstar/docs/images/vstar_workflow.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e27ce2a910318e0315b6147f726bc1f72eeaa42d316f53a801dc193b1a125b3
+size 1185946
diff --git a/examples/Vstar/run_cli.py b/examples/Vstar/run_cli.py
new file mode 100755
index 00000000..dd9b9474
--- /dev/null
+++ b/examples/Vstar/run_cli.py
@@ -0,0 +1,57 @@
+# Import necessary components from the OmAgent core
+from omagent_core.utils.container import container
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from pathlib import Path
+from omagent_core.utils.registry import registry
+from omagent_core.clients.devices.cli import DefaultClient
+from omagent_core.utils.logger import logging
+from omagent_core.advanced_components.workflow.Vstar.workflow import VstarWorkflow
+from dotenv import load_dotenv
+
+# Load environment variables from a .env file
+load_dotenv()
+
+# Initialize logging for the OmAgent with INFO level
+logging.init_logger("omagent", "omagent", level="INFO")
+
+# Set the current working directory path
+CURRENT_PATH = Path(__file__).parents[0]
+
+# Import registered modules from the agent directory
+registry.import_module(project_path=CURRENT_PATH.joinpath('agent'))
+
+# Register the shared memory state management (STM) for the workflow
+container.register_stm("SharedMemSTM")
+
+# Load container configuration from a YAML file
+container.from_config(CURRENT_PATH.joinpath('container.yaml'))
+
+# Initialize the VStar conductor workflow
+workflow = ConductorWorkflow(name='Vstar')
+
+# Configure workflow tasks:
+# 1. Create an input task for user input
+vstar_input_task = simple_task(task_def_name='VstarInput', task_reference_name='vstar_input_task')
+
+# Initialize the Vstar workflow
+vstar_workflow = VstarWorkflow()
+
+# Set the input for the Vstar workflow using outputs from the input task
+vstar_workflow.set_input(
+ query=vstar_input_task.output("query"),
+ image_path=vstar_input_task.output("image_path")
+)
+
+# Configure the execution flow of the workflow: Input -> Vstar Input Task -> Vstar Workflow
+workflow >> vstar_input_task >> vstar_workflow
+
+# Register the workflow with the option to overwrite any existing workflow with the same name
+workflow.register(overwrite=True)
+
+# Initialize and start the CLI client with the configured workflow
+config_path = CURRENT_PATH.joinpath('configs')
+cli_client = DefaultClient(interactor=workflow, config_path=config_path, workers=[])
+
+# Start the interactor for the CLI client
+cli_client.start_interactor()
diff --git a/examples/ZoomEye/README.md b/examples/ZoomEye/README.md
new file mode 100644
index 00000000..f312f231
--- /dev/null
+++ b/examples/ZoomEye/README.md
@@ -0,0 +1,116 @@
+# ZoomEye Example
+
+ZoomEye is an advanced visual search and analysis framework that combines visual understanding with search capabilities. It pinpoints and identifies visual elements by simulating how humans zoom into an image, using a tree-structured image analysis approach that examines image content in detail.
+
+This example demonstrates how to use the OMAgent framework for visual search and analysis tasks. The example code can be found in the "examples/ZoomEye" directory.
+
+```bash
+cd examples/ZoomEye
+```
+
+## Overview
+
+This example implements a comprehensive ZoomEye workflow that consists of the following components:
+
+1. **ZoomEye Input**
+ - Handles user input containing text queries and image uploads
+ - Processes multi-modal inputs to prepare for visual analysis
+
+2. **ZoomEye Workflow**
+ - Builds a hierarchical image tree for multi-scale analysis
+ - Performs confidence-guided search to locate visual elements
+ - Uses adaptive thresholding to optimize search efficiency
+
+### This workflow is structured as follows:
+
+![ZoomEye Workflow](./doc/images/zoomeye_workflow.jpg)
+
+## Prerequisites
+
+- Python 3.11+
+- Required packages installed (see requirements.txt)
+- Access to a multimodal LLM (e.g., LLaVA, GPT-4V) or compatible endpoint
+- Redis server running locally or remotely (for pro mode)
+- Conductor server running locally or remotely (for pro mode)
+
+## Configuration
+
+The `container.yaml` file manages dependencies and settings for different components of the system. To set up your configuration:
+
+1. Generate the `container.yaml` file:
+ ```bash
+ python compile_container.py
+ ```
+   This will create a `container.yaml` file with default settings under `examples/ZoomEye`.
+
+2. Configure your multimodal LLM settings in `configs/llms/*.yml` (the default `configs/llms/gpt.yml` is shown after this list):
+ - Set your model endpoint through environment variables or by directly modifying the yml file
+ ```bash
+ export custom_openai_model_id="your_model_id"
+ export custom_openai_key="your_openai_api_key"
+ export custom_openai_endpoint="your_openai_endpoint"
+ ```
+
+ - Configure other model settings like temperature as needed
+
+3. Update settings in the generated `container.yaml`:
+ - Modify Redis connection settings (for pro mode):
+ - Set the host, port and credentials for your Redis instance
+ - Configure both `redis_stream_client` and `redis_stm_client` sections
+ - Update the Conductor server URL under conductor_config section (for pro mode)
+ - Adjust any other component settings as needed
+
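+For reference, the default `configs/llms/gpt.yml` shipped with this example looks like the following; its values are resolved from the environment variables above:
+
+```yml
+name: OpenaiGPTLLM
+model_id: ${env| custom_openai_model_id, model_id}
+api_key: ${env| custom_openai_key, openai_api_key}
+endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+max_tokens: 2048
+temperature: 0
+vision: true
+```
+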
+## Running the Example
+
+Run the ZoomEye example:
+
+For terminal/CLI usage:
+```bash
+python run_cli.py
+```
+
+You can run the ZoomEye workflow in `pro` mode or `lite` mode by changing the `OMAGENT_MODE` environment variable. The default mode is `pro`, which uses the Conductor and Redis servers. The `lite` mode runs the workflow in the current Python process without external services.
+
+For pro mode:
+```bash
+export OMAGENT_MODE="pro"
+python run_cli.py
+```
+
+For lite mode:
+```bash
+export OMAGENT_MODE="lite"
+python run_cli.py
+```
+
+## How ZoomEye Works
+
+ZoomEye uses a hierarchical approach to image analysis:
+
+1. The image is first decomposed into a tree structure with multiple scales
+2. Visual cues are extracted from the user's query to guide the search
+3. A confidence-guided search algorithm traverses the image tree to locate visual elements
+4. Adaptive thresholding ensures high-quality results while optimizing computation
+5. Found elements are synthesized into a comprehensive answer
+
+This approach enables precise localization of visual elements while maintaining computational efficiency.
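+
+As a rough illustration of step 1, the sketch below builds a simple multi-scale image tree by recursively splitting an image into quadrants until patches reach a minimum size. It is a toy example using Pillow with a made-up `PatchNode` structure, not the ZoomEye implementation:
+
+```python
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
+from PIL import Image
+
+@dataclass
+class PatchNode:
+    bbox: Tuple[int, int, int, int]                 # (left, top, right, bottom)
+    depth: int
+    children: List["PatchNode"] = field(default_factory=list)
+
+def build_image_tree(image: Image.Image, min_size: int = 512, depth: int = 0,
+                     bbox: Optional[Tuple[int, int, int, int]] = None) -> PatchNode:
+    """Recursively split an image region into 2x2 quadrants until it is small enough."""
+    if bbox is None:
+        bbox = (0, 0, image.width, image.height)
+    left, top, right, bottom = bbox
+    node = PatchNode(bbox=bbox, depth=depth)
+    width, height = right - left, bottom - top
+    if min(width, height) <= min_size:
+        return node                                 # leaf: small enough for the vision model
+    mid_x, mid_y = left + width // 2, top + height // 2
+    for child_bbox in [(left, top, mid_x, mid_y), (mid_x, top, right, mid_y),
+                       (left, mid_y, mid_x, bottom), (mid_x, mid_y, right, bottom)]:
+        node.children.append(build_image_tree(image, min_size, depth + 1, child_bbox))
+    return node
+
+if __name__ == "__main__":
+    img = Image.new("RGB", (2048, 1536))            # stand-in for a high-definition image
+    root = build_image_tree(img)
+
+    def count_per_depth(node: PatchNode, acc: dict) -> dict:
+        acc[node.depth] = acc.get(node.depth, 0) + 1
+        for child in node.children:
+            count_per_depth(child, acc)
+        return acc
+
+    print(count_per_depth(root, {}))                # e.g. {0: 1, 1: 4, 2: 16}
+```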
+
+## Troubleshooting
+
+If you encounter issues:
+- Verify your multimodal LLM endpoint is accessible and working
+- For pro mode, confirm Redis is running and accessible
+- Ensure all dependencies are installed correctly
+- Check for sufficient GPU resources if using local model deployment
+- Review logs for any error messages
+- **Open an issue on GitHub if you can't find a solution; we will do our best to help you out!**
+
+**Note**:
+1. The LLM called by ZoomEye must support returning **logprobs**.
+2. You may occasionally hit an out-of-memory error: the shared-memory STM defaults to 100 MB, which can be exceeded when high-definition images are stored. If this happens, increase the default `size` of the `_get_shm` function in `omagent-core/src/omagent_core/memories/stms/stm_sharedMem.py`:
+```python
+def _get_shm(self, workflow_instance_id, size: int = 1024 * 1024 * 100):
+    ...
+    return shm
+```
+
diff --git a/examples/ZoomEye/agent/zoomeye_input/__init__.py b/examples/ZoomEye/agent/zoomeye_input/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/ZoomEye/agent/zoomeye_input/zoomeye_input.py b/examples/ZoomEye/agent/zoomeye_input/zoomeye_input.py
new file mode 100644
index 00000000..cd51bf4f
--- /dev/null
+++ b/examples/ZoomEye/agent/zoomeye_input/zoomeye_input.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.utils.logger import logging
+from omagent_core.utils.registry import registry
+
+CURRENT_PATH = Path(__file__).parents[0]
+
+
+@registry.register_worker()
+class ZoomEyeInput(BaseWorker):
+ """
+ Worker class for handling user input in the ZoomEye workflow.
+ Processes both text queries and image inputs from users.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for processing user input.
+
+ Returns:
+ dict: Contains the text query and image path (if provided)
+ """
+ try:
+ # Get user input through the input interface
+ user_input = self.input.read_input(
+ workflow_instance_id=self.workflow_instance_id,
+ input_prompt="Please input your question:",
+ )
+
+ # Extract messages from user input
+ messages = user_input["messages"]
+ message = messages[-1] # Get the latest message
+
+ # Initialize variables for image and text
+ image_path = None
+ text = None
+
+ # Process each content item in the message
+ for each_content in message["content"]:
+ if each_content["type"] == "image_url":
+ # Store the image path if an image was provided
+ image_path = each_content["data"]
+ elif each_content["type"] == "text":
+ # Store the text content if text was provided
+ text = each_content["data"]
+
+ # Return both the text query and image path
+ return {"query": text,
+ "image_path": image_path}
+ except Exception as e:
+ # Log any errors during input processing
+            logging.error(f"Error in ZoomEye input processing: {str(e)}")
+ raise
\ No newline at end of file
diff --git a/examples/ZoomEye/compile_container.py b/examples/ZoomEye/compile_container.py
new file mode 100644
index 00000000..9632cc6f
--- /dev/null
+++ b/examples/ZoomEye/compile_container.py
@@ -0,0 +1,20 @@
+from omagent_core.utils.container import container
+from pathlib import Path
+from omagent_core.utils.registry import registry
+
+
+# Load all registered workflow components
+registry.import_module()
+
+# Configure import path for agent modules
+from pathlib import Path
+CURRENT_PATH = Path(__file__).parents[0]
+
+# Register core workflow components for state management, callbacks and input handling
+container.register_stm(stm='SharedMemSTM')
+container.register_callback(callback='AppCallback')
+container.register_input(input='AppInput')
+
+
+# Compile container config
+container.compile_config(CURRENT_PATH)
\ No newline at end of file
diff --git a/examples/ZoomEye/configs/llms/gpt.yml b/examples/ZoomEye/configs/llms/gpt.yml
new file mode 100644
index 00000000..f42ed340
--- /dev/null
+++ b/examples/ZoomEye/configs/llms/gpt.yml
@@ -0,0 +1,7 @@
+name: OpenaiGPTLLM
+model_id: ${env| custom_openai_model_id, model_id}
+api_key: ${env| custom_openai_key, openai_api_key}
+endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+max_tokens: 2048
+temperature: 0
+vision: true
\ No newline at end of file
diff --git a/examples/ZoomEye/configs/workers/zoomeye_input.yaml b/examples/ZoomEye/configs/workers/zoomeye_input.yaml
new file mode 100644
index 00000000..7866fd9e
--- /dev/null
+++ b/examples/ZoomEye/configs/workers/zoomeye_input.yaml
@@ -0,0 +1 @@
+name: ZoomEyeInput
\ No newline at end of file
diff --git a/examples/ZoomEye/configs/workers/zoomeye_workflow.yaml b/examples/ZoomEye/configs/workers/zoomeye_workflow.yaml
new file mode 100644
index 00000000..dd470263
--- /dev/null
+++ b/examples/ZoomEye/configs/workers/zoomeye_workflow.yaml
@@ -0,0 +1,15 @@
+- name: VisualCueGeneration
+ llm: ${sub|gpt}
+
+- name: ZoomEyePreprocess
+
+- name: ZoomEyeSearch
+ llm: ${sub|gpt}
+
+- name: ZoomEyeSearchCheck
+
+- name: ZoomEyeLoopCheck
+
+- name: ZoomEyeOutput
+ llm: ${sub|gpt}
+
diff --git a/examples/ZoomEye/doc/images/zoomeye_workflow.jpg b/examples/ZoomEye/doc/images/zoomeye_workflow.jpg
new file mode 100644
index 00000000..54efc8e5
--- /dev/null
+++ b/examples/ZoomEye/doc/images/zoomeye_workflow.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4547a33f598e7a8069c60f0c7d9b00cb29a654a44da6e3764f85261f006004e0
+size 918999
diff --git a/examples/ZoomEye/run_cli.py b/examples/ZoomEye/run_cli.py
new file mode 100644
index 00000000..ea7a6e80
--- /dev/null
+++ b/examples/ZoomEye/run_cli.py
@@ -0,0 +1,60 @@
+# Import essential components from the OMAgent framework
+from omagent_core.utils.container import container
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from pathlib import Path
+from omagent_core.utils.registry import registry
+from omagent_core.clients.devices.cli import DefaultClient
+from omagent_core.utils.logger import logging
+from omagent_core.advanced_components.workflow.ZoomEye.workflow import ZoomEyeWorkflow
+from dotenv import load_dotenv
+from agent.zoomeye_input.zoomeye_input import ZoomEyeInput
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Initialize logging with INFO level
+logging.init_logger("omagent", "omagent", level="INFO")
+
+# Set current working directory path for resource location
+CURRENT_PATH = Path(__file__).parents[0]
+
+# Import registered modules from the agent directory to register custom components
+registry.import_module(project_path=CURRENT_PATH.joinpath('agent'))
+
+# Configure the Short-Term Memory system
+container.register_stm("SharedMemSTM")
+# Load container configuration from YAML file
+container.from_config(CURRENT_PATH.joinpath('container.yaml'))
+
+# Initialize ZoomEye workflow with the ConductorWorkflow
+workflow = ConductorWorkflow(name='ZoomEye')
+
+# Define the input task to collect query and image from user
+zoom_eye_input_task = simple_task(
+ task_def_name='ZoomEyeInput',
+ task_reference_name='zoom_eye_input'
+)
+
+# Initialize the main ZoomEye processing workflow
+zoom_eye_workflow = ZoomEyeWorkflow()
+
+# Set inputs for the ZoomEye workflow from the input task outputs
+zoom_eye_workflow.set_input(
+    query=zoom_eye_input_task.output("query"),
+    image_path=zoom_eye_input_task.output("image_path")
+)
+
+# Configure workflow execution flow:
+# First collect input, then process with ZoomEye workflow
+workflow >> zoom_eye_input_task >> zoom_eye_workflow
+
+# Register workflow in the system (overwrite if it already exists)
+workflow.register(overwrite=True)
+
+# Initialize CLI client with the workflow configuration
+config_path = CURRENT_PATH.joinpath('configs')
+cli_client = DefaultClient(interactor=workflow, config_path=config_path, workers=[])
+
+# Start the CLI client to begin interaction with the user
+cli_client.start_interactor()
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ToT/agent/search_algorithm/search_algorithm.py b/omagent-core/src/omagent_core/advanced_components/workflow/ToT/agent/search_algorithm/search_algorithm.py
index 977f4ef6..64630387 100644
--- a/omagent-core/src/omagent_core/advanced_components/workflow/ToT/agent/search_algorithm/search_algorithm.py
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ToT/agent/search_algorithm/search_algorithm.py
@@ -164,6 +164,7 @@ def _record_results(self, current_best_thought_chain, thought_tree):
record['completion_tokens'] = self.token_usage['completion_tokens']
record['total_tokens'] = self.token_usage['total_tokens']
record['thought_tree'] = thought_tree
+ self.stm(self.workflow_instance_id)['record'] = record
print('-'*100)
print(record)
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/README.md b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/README.md
new file mode 100644
index 00000000..e8dc3e16
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/README.md
@@ -0,0 +1,115 @@
+# VStar Operator
+
+VStar is a workflow that uses visual contextual information to process high-definition images. It relies on hierarchical image structures and adaptive thresholding to efficiently identify the objects relevant to a user's query.
+
+You can refer to the example in the `examples/Vstar` directory to understand how to use this operator.
+
+## Inputs, Outputs, and Configs
+
+### Inputs:
+The inputs that the VStar operator requires are as follows:
+| Name | Type | Required | Description |
+|-----------|------|----------|-------------|
+| query | str | true | The text question about the image content |
+| image_path| str | true | Path to the image file for analysis |
+| qid | str | false | Optional query identifier for tracking purposes |
+
+### Outputs:
+The outputs that the VStar operator returns are as follows:
+| Name | Type | Description |
+|---------|------|-------------|
+| result  | dict | The result of the VStar workflow. It includes the query id, the original query, the final answer (`last_output`), and the list of objects that had to be searched for (`missing_objects`). |
+
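+For reference, the `result` dictionary assembled by the `VQA_LLM_Post` worker has roughly the following shape (the values below are illustrative):
+
+```python
+result = {
+    "id": "test",                                    # the optional qid passed to the workflow
+    "query": "What color is the cup on the table?",  # the original question
+    "last_output": "The cup on the table is red.",   # the final answer
+    "missing_objects": ["cup"],                      # objects that had to be searched for
+}
+```
+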
+### Configs:
+The config of the VStar operator is as follows. You can simply copy and paste the following config into your project as a `vstar_workflow.yml` file.
+```yml
+- name: VQA_LLM_Preprocess
+
+- name: VstarLoopCheck
+
+- name: VQA_LLM
+ llm: ${sub|vstar}
+
+- name: VstarSearchPreprocess
+
+- name: VstarSearch
+ llm: ${sub|vstar}
+
+- name: VstarSearchCheck
+
+- name: VQA_LLM_Post
+ llm: ${sub|vstar}
+```
+
+The VStar operator settings are as follows:
+| Name | Type | Description |
+|----------------------------------|--------|-------------------------------------------------------------------------------------------------|
+| CONFIDENCE_HIGH | float | The upper confidence threshold for accepting a node during the search. If the confidence score of a detected object exceeds this threshold, it is considered a valid detection. |
+| CONFIDENCE_LOW | float | The lower confidence threshold below which the search stops. If the confidence score falls below this threshold, the search will cease for that particular object. |
+| TARGET_CUE_THRESHOLD             | float  | The initial threshold on the target cue heatmap. If the maximum heatmap score of the current patch exceeds this threshold, the target cue guides the search; otherwise a context cue is computed via auxiliary VQA and segmentation calls. |
+| TARGET_CUE_THRESHOLD_DECAY       | float  | The decay factor applied to the target cue threshold at each scale level, so that deeper (smaller) patches use a lower threshold. |
+| TARGET_CUE_THRESHOLD_MINIMUM     | float  | The lower bound for the decayed target cue threshold; the threshold never drops below this value. |
+| SMALLEST_SIZE                    | int    | The minimum patch size in pixels. Once the shorter side of the current patch is smaller than or equal to this value, the patch is not subdivided further. |
+| MAX_SEARCH_STEP                  | int    | The maximum number of search steps (patches examined) for a single target object; the search stops once this limit is exceeded. |
+
+Set these parameters in `omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/utils.py`.
+
+```python
+CONFIDENCE_HIGH = 0.5
+CONFIDENCE_LOW = 0.3
+TARGET_CUE_THRESHOLD = 6.0
+TARGET_CUE_THRESHOLD_DECAY = 0.7
+TARGET_CUE_THRESHOLD_MINIMUM = 3.0
+SMALLEST_SIZE = 224
+MAX_SEARCH_STEP = 10
+```
+
+## How VStar Works
+
+VStar operates in multiple stages:
+
+1. **VQA LLM Preprocess**: Prepares the input data for the VQA LLM by reading the image and storing it in the state management system (STM).
+2. **VQA LLM**: Executes the Visual Question Answering task using the VStar LLM; it either answers the query directly or reports the target objects whose visual information is missing.
+3. **Vstar Search Preprocess**: Initializes the search parameters based on the missing objects identified in the VQA LLM response.
+4. **Vstar Search**: Performs a confidence-guided search to locate visual elements in the image.
+5. **Vstar Loop Check**: Manages iterations across multiple visual cues to ensure all relevant objects are searched.
+6. **Vstar Search Check**: Determines when the search is complete and collects found candidates.
+7. **VQA LLM Post**: Processes the results and prepares the final output for the user.
+
+The search process uses a priority-based traversal with multiple confidence metrics to efficiently locate visual elements while maintaining high precision.
+
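+The priority-based traversal is driven by a max-priority queue of image patches built on `heapq` and the `Prioritize` wrapper from `agent/vstar/utils.py`. The following minimal sketch shows the pattern with made-up patch dictionaries:
+
+```python
+import functools
+import heapq
+
+@functools.total_ordering
+class Prioritize:
+    """Wrap an item so heapq orders entries by priority only (mirrors utils.py)."""
+    def __init__(self, priority, item):
+        self.priority = priority
+        self.item = item
+    def __eq__(self, other):
+        return self.priority == other.priority
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+# Toy patches with cue scores; real patches also carry scale_level and parent_index
+patches = [
+    {"bbox": [0, 0, 512, 384], "score": 0.12},
+    {"bbox": [512, 0, 512, 384], "score": 0.61},
+    {"bbox": [0, 384, 512, 384], "score": 0.27},
+]
+
+queue = []
+for patch in patches:
+    # Negate the score: heapq is a min-heap, so the highest-scoring patch pops first
+    heapq.heappush(queue, Prioritize(-patch["score"], patch))
+
+while queue:
+    print(heapq.heappop(queue).item["bbox"])  # processed in descending score order
+```
+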
+## Example Usage
+
+Here's a simple example of how to use the VStar operator in a workflow:
+
+```python
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from omagent_core.advanced_components.workflow.Vstar.workflow import VstarWorkflow
+
+# Initialize workflow
+workflow = ConductorWorkflow(name='VStarExample')
+
+# Create input task
+input_task = simple_task(
+ task_def_name='VstarInput',
+ task_reference_name='vstar_input'
+)
+
+# Initialize VStar workflow
+vstar_workflow = VstarWorkflow()
+
+# Connect input to VStar workflow
+vstar_workflow.set_input(
+ query=input_task.output("query"),
+ image_path=input_task.output("image_path")
+)
+
+# Configure workflow execution flow
+workflow >> input_task >> vstar_workflow
+
+# Register workflow
+workflow.register(overwrite=True)
+```
+
+VStar combines the power of visual understanding with efficient search algorithms to provide detailed analysis of image content based on natural language queries.
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/__init__.py b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/utils.py b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/utils.py
new file mode 100755
index 00000000..201544af
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/utils.py
@@ -0,0 +1,122 @@
+import spacy
+nlp = spacy.load("en_core_web_sm")
+MISSING_INFO = "Sorry, I can not answer the question. Some visual information about the following objects is missing or unclear:"
+
+CONFIDENCE_HIGH = 0.5
+CONFIDENCE_LOW = 0.3
+TARGET_CUE_THRESHOLD = 6.0
+TARGET_CUE_THRESHOLD_DECAY = 0.7
+TARGET_CUE_THRESHOLD_MINIMUM = 3.0
+SMALLEST_SIZE = 224
+MAX_SEARCH_STEP = 10
+
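+# Return the leftmost and rightmost token indices covered by the dependency subtree rooted at `token`.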
+def tranverse(token):
+ children = [_ for _ in token.children]
+ if len(children) == 0:
+ return token.i, token.i
+ left_i = token.i
+ right_i = token.i
+ for child in children:
+ child_left_i, child_right_i = tranverse(child)
+ left_i = min(left_i, child_left_i)
+ right_i = max(right_i, child_right_i)
+ return left_i, right_i
+
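+# Expand a noun token into a span by absorbing modifier children (amod/compound/poss on the left, relcl/prep on the right).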
+def get_noun_chunks(token):
+ left_children = []
+ right_children = []
+ for child in token.children:
+ if child.i < token.i:
+ left_children.append(child)
+ else:
+ right_children.append(child)
+
+ start_token_i = token.i
+ for left_child in left_children[::-1]:
+ if left_child.dep_ in ['amod', 'compound', 'poss']:
+ start_token_i, _ = tranverse(left_child)
+ else:
+ break
+ end_token_i = token.i
+ for right_child in right_children:
+ if right_child.dep_ in ['relcl', 'prep']:
+ _, end_token_i = tranverse(right_child)
+ else:
+ break
+ return start_token_i, end_token_i
+
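+# Keep the longest mutually non-overlapping spans and return them in sentence order.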
+def filter_chunk_list(chunks):
+ def overlap(min1, max1, min2, max2):
+ return min(max1, max2) - max(min1, min2)
+ chunks = sorted(chunks, key=lambda chunk: chunk[1]-chunk[0], reverse=True)
+ filtered_chunks = []
+ for chunk in chunks:
+ flag=True
+ for exist_chunk in filtered_chunks:
+ if overlap(exist_chunk[0], exist_chunk[1], chunk[0], chunk[1]) >= 0:
+ flag = False
+ break
+ if flag:
+ filtered_chunks.append(chunk)
+ return sorted(filtered_chunks, key=lambda chunk: chunk[0])
+
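+# Extract noun/pronoun-centred phrases from an expression using spaCy dependency parsing.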
+def extract_noun_chunks(expression):
+ doc = nlp(expression)
+ cur_chunks = []
+ for token in doc:
+ if token.pos_ not in ["NOUN", "PRON"]:
+ continue
+ cur_chunks.append(get_noun_chunks(token))
+ cur_chunks = filter_chunk_list(cur_chunks)
+ cur_chunks = [doc[chunk[0]:chunk[1]+1].text for chunk in cur_chunks]
+ return cur_chunks
+
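+# Split `current_patch_bbox` ([x, y, w, h]) into a grid of sub-patches; edge patches absorb any remainder.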
+def get_sub_patches(current_patch_bbox, num_of_width_patches, num_of_height_patches):
+ width_stride = int(current_patch_bbox[2]//num_of_width_patches)
+ height_stride = int(current_patch_bbox[3]/num_of_height_patches)
+ sub_patches = []
+ for j in range(num_of_height_patches):
+ for i in range(num_of_width_patches):
+ sub_patch_width = current_patch_bbox[2] - i*width_stride if i == num_of_width_patches-1 else width_stride
+ sub_patch_height = current_patch_bbox[3] - j*height_stride if j == num_of_height_patches-1 else height_stride
+ sub_patch = [current_patch_bbox[0]+i*width_stride, current_patch_bbox[1]+j*height_stride, sub_patch_width, sub_patch_height]
+ sub_patches.append(sub_patch)
+ return sub_patches, width_stride, height_stride
+
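+# Choose the 4-way split layout (1x4, 4x1 or 2x2) based on the patch's height/width ratio.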
+def split_4subpatches(current_patch_bbox):
+ hw_ratio = current_patch_bbox[3] / current_patch_bbox[2]
+ if hw_ratio >= 2:
+ return 1, 4
+ elif hw_ratio <= 0.5:
+ return 4, 1
+ else:
+ return 2, 2
+
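+# Score each sub-patch by its share of the (area-normalized) heatmap mass inside the current patch.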
+def get_subpatch_scores(score_heatmap, current_patch_bbox, sub_patches):
+ total_sum = (score_heatmap/(current_patch_bbox[2]*current_patch_bbox[3])).sum()
+ sub_scores = []
+ for sub_patch in sub_patches:
+ bbox = [(sub_patch[0]-current_patch_bbox[0]), sub_patch[1]-current_patch_bbox[1], sub_patch[2], sub_patch[3]]
+ score = (score_heatmap[bbox[1]:bbox[1]+bbox[3], bbox[0]:bbox[0]+bbox[2]]/(current_patch_bbox[2]*current_patch_bbox[3])).sum()
+ if total_sum > 0:
+ score /= total_sum
+ else:
+ score *= 0
+ sub_scores.append(score)
+ return sub_scores
+
+
+import functools
+
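+# Wrapper that orders arbitrary items by an explicit priority, for use with heapq.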
+@functools.total_ordering
+class Prioritize:
+
+ def __init__(self, priority, item):
+ self.priority = priority
+ self.item = item
+
+ def __eq__(self, other):
+ return self.priority == other.priority
+
+ def __lt__(self, other):
+ return self.priority < other.priority
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/vstar_modules.py b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/vstar_modules.py
new file mode 100644
index 00000000..01546edf
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/agent/vstar/vstar_modules.py
@@ -0,0 +1,455 @@
+from pathlib import Path
+from omagent_core.utils.registry import registry
+from omagent_core.utils.general import read_image
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.models.llms.vstar import VStarLLM
+import copy
+import torch
+import heapq
+from .utils import *
+
+# Set the current path to the directory of this file
+CURRENT_PATH = Path(__file__).parents[0]
+
+@registry.register_worker()
+class VQA_LLM_Preprocess(BaseWorker):
+ """
+ Preprocessing worker for Visual Question Answering (VQA) tasks.
+ This class prepares the input data for the VQA LLM by reading the image
+ and storing it in the state management system (STM).
+ """
+
+ def _run(self, qid: str, query: str, image_path: str):
+ """
+ Main execution method for preprocessing VQA input.
+
+ Args:
+ qid (str): The question ID.
+ query (str): The text query from the user.
+ image_path (str): The path to the input image.
+ """
+ # Clear the state management for the current workflow instance
+ self.stm.clear(self.workflow_instance_id)
+
+ # Read the image from the specified path
+ img = read_image(input_source=image_path)
+
+ # Store the image and query in the state management
+ self.stm(self.workflow_instance_id)['image_cache'] = {'': img}
+ self.stm(self.workflow_instance_id)['prompt'] = query
+ self.stm(self.workflow_instance_id)['id'] = qid
+ return
+
+@registry.register_worker()
+class VQA_LLM(BaseWorker, VStarLLM):
+ """
+ Worker for executing the Visual Question Answering (VQA) LLM.
+ This class interacts with the VStar LLM to get answers based on the input image and query.
+ """
+
+ llm: VStarLLM
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for the VQA LLM.
+
+ This method retrieves the image and prompt from the state management,
+ calls the VStar LLM to get a response, and processes the result.
+ """
+ # Retrieve the cached image and prompt from the state management
+ img = self.stm(self.workflow_instance_id)['image_cache']['']
+ prompt = self.stm(self.workflow_instance_id)['prompt']
+
+ # Call the VStar LLM to get a response for the VQA task
+ response = self.llm.vqa(prompt, img)
+ res = response['choices'][0]['message']['content']
+
+ # Log the response for debugging purposes
+ self.callback.info(self.workflow_instance_id, progress='VQA LLM', message=f'response: {response}')
+
+ # Determine if the VQA LLM succeeded based on the response content
+ vqa_llm_succeed = 0 if MISSING_INFO in res else 1
+
+ if vqa_llm_succeed == 1:
+ # Store the answer in the state management if successful
+ self.stm(self.workflow_instance_id)['answer'] = res
+ else:
+ # Process missing objects from the response
+ missing_objects = res.split(MISSING_INFO)[-1]
+ if missing_objects.endswith('.'):
+ missing_objects = missing_objects[:-1]
+ missing_objects = [obj.strip() for obj in missing_objects.split(',')]
+ self.stm(self.workflow_instance_id)['missing_objects'] = missing_objects
+ self.stm(self.workflow_instance_id)['all_missing_objects'] = missing_objects
+
+ return {"vqa_llm_succeed": vqa_llm_succeed}
+
+@registry.register_worker()
+class VstarSearchPreprocess(BaseWorker):
+ """
+ Preprocessing worker for the VStar search task.
+ This class prepares the search parameters based on the missing objects.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for preprocessing the VStar search.
+
+ This method initializes the search parameters and updates the state management.
+ """
+ # Retrieve the cached image and missing objects from the state management
+ image = self.stm(self.workflow_instance_id)['image_cache']['']
+ missing_objects = self.stm(self.workflow_instance_id)['missing_objects']
+
+ print("missing_objects", missing_objects)
+
+ # Get the target object name for the search
+ target_object_name = missing_objects.pop(0)
+
+ # Initialize the search parameters
+ init_patch = dict()
+ init_patch['bbox'] = [0, 0, image.width, image.height]
+ init_patch['scale_level'] = 1
+ init_patch['score'] = None
+ init_patch['parent_index'] = -1
+
+ # Initialize the search path with the initial patch
+ search_path = [init_patch]
+ print("missing_objects_after_pop", missing_objects)
+
+ # Update the state management with the initialized values
+ self.stm(self.workflow_instance_id)['missing_objects'] = missing_objects
+ self.stm(self.workflow_instance_id)['target_object_name'] = target_object_name
+ self.stm(self.workflow_instance_id)['search_path'] = search_path
+ self.stm(self.workflow_instance_id)['all_valid_boxes'] = []
+ self.stm(self.workflow_instance_id)['search_result'] = None
+ self.stm(self.workflow_instance_id)['queue'] = []
+ self.stm(self.workflow_instance_id)['search_step'] = 0
+
+ return
+
+@registry.register_worker()
+class VstarLoopCheck(BaseWorker):
+ """
+ Worker to check if there are any missing objects in the search process.
+ This class determines if the search can be completed based on the missing objects.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for checking the loop condition.
+
+ This method checks if there are any missing objects and returns the status.
+ """
+ missing_objects = self.stm(self.workflow_instance_id)['missing_objects']
+
+ if len(missing_objects) == 0:
+ print("missing_objects is empty")
+ return {"finish": True}
+ else:
+ print("missing_objects is not empty")
+ return {"finish": False}
+
+
+
+@registry.register_worker()
+class VstarSearch(BaseWorker, VStarLLM):
+ """
+ Worker for executing the VStar search task.
+ This class interacts with the VStar LLM to perform visual searches based on the input image and target object.
+ """
+
+ llm: VStarLLM
+
+ def _normalize_score(self, score_heatmap):
+ """
+ Normalize the score heatmap to a range of [0, 1].
+
+ Args:
+ score_heatmap (torch.Tensor): The heatmap scores to normalize.
+
+ Returns:
+ torch.Tensor: The normalized heatmap scores.
+ """
+ max_score = score_heatmap.max()
+ min_score = score_heatmap.min()
+ if max_score != min_score:
+ score_heatmap = (score_heatmap - min_score) / (max_score - min_score)
+ else:
+ score_heatmap = score_heatmap * 0
+ return score_heatmap
+
+
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for the VStar search.
+
+ This method retrieves the current patch, performs the search, and updates the state management with results.
+ """
+ # Retrieve the cached image and search parameters from the state management
+ image = self.stm(self.workflow_instance_id)['image_cache']['']
+ search_path = self.stm(self.workflow_instance_id)['search_path']
+ current_patch = search_path[-1]
+ target_object_name = self.stm(self.workflow_instance_id)['target_object_name']
+ current_patch_bbox = current_patch['bbox']
+ current_patch_scale_level = current_patch['scale_level']
+ queue = self.stm(self.workflow_instance_id)['queue']
+
+ # Crop the image based on the current patch's bounding box
+ image_patch = image.crop((int(current_patch_bbox[0]), int(current_patch_bbox[1]),
+ int(current_patch_bbox[0] + current_patch_bbox[2]),
+ int(current_patch_bbox[1] + current_patch_bbox[3])))
+
+ # Call the VStar LLM to perform a visual search
+ response = self.llm.visual_search(target_object_name, copy.deepcopy(image_patch), mode='detection')
+ res = response['choices'][0]['message']['content']
+
+ # Parse the response for bounding boxes, logits, and heatmaps
+ pred_bboxes, pred_logits, target_cue_heatmap = res
+ pred_bboxes = torch.tensor(pred_bboxes, dtype=torch.float32)
+ self.callback.info(self.workflow_instance_id, progress='VstarSearch', message=f'bbox: {pred_bboxes}')
+ pred_logits = torch.tensor(pred_logits, dtype=torch.float32)
+ target_cue_heatmap = torch.tensor(target_cue_heatmap, dtype=torch.bool)
+
+ # Process the predicted logits to find the best bounding box
+ if len(pred_logits) > 0:
+ top_index = pred_logits.view(-1).argmax()
+ top_logit = pred_logits.view(-1).max()
+ final_bbox = pred_bboxes[top_index].view(4)
+ final_bbox = final_bbox * torch.Tensor([image_patch.width, image_patch.height, image_patch.width, image_patch.height])
+ final_bbox[:2] -= final_bbox[2:] / 2
+
+ if top_logit > CONFIDENCE_HIGH:
+ # Store the detection result if confidence is high
+ search_path[-1]['detection_result'] = final_bbox
+ if len(search_path) == 1:
+ all_valid_boxes = pred_bboxes[pred_logits.view(-1) > 0.5].view(-1, 4)
+ all_valid_boxes = all_valid_boxes * torch.Tensor([[image_patch.width, image_patch.height, image_patch.width, image_patch.height]])
+ all_valid_boxes[:, :2] -= all_valid_boxes[:, 2:] / 2
+ self.stm(self.workflow_instance_id)['search_result'] = (True, search_path, all_valid_boxes)
+ return
+ self.stm(self.workflow_instance_id)['search_result'] = (True, search_path, None)
+ return
+ else:
+ # Store temporary detection result if confidence is low
+ search_path[-1]['temp_detection_result'] = (top_logit, final_bbox)
+
+ # Check if the current patch size is below the minimum size
+ if min(current_patch_bbox[2], current_patch_bbox[3]) <= SMALLEST_SIZE:
+ self.stm(self.workflow_instance_id)['search_result'] = (False, search_path, None)
+ return
+
+ # Process the target cue heatmap
+ target_cue_heatmap = target_cue_heatmap.view(current_patch_bbox[3], current_patch_bbox[2], 1)
+ score_max = target_cue_heatmap.max().item()
+ threshold = max(TARGET_CUE_THRESHOLD_MINIMUM, TARGET_CUE_THRESHOLD * (TARGET_CUE_THRESHOLD_DECAY) ** (current_patch_scale_level - 1))
+
+ if score_max > threshold:
+ target_cue_heatmap = self._normalize_score(target_cue_heatmap)
+ final_heatmap = target_cue_heatmap
+ else:
+ # If the score is below the threshold, perform a VQA search
+ response = self.llm.visual_search(target_object_name, copy.deepcopy(image_patch), mode='vqa')
+ vqa_results = response['choices'][0]['message']['content']
+ self.callback.info(self.workflow_instance_id, progress='VstarSearch-vqa', message=f'response: {vqa_results}')
+ possible_location_phrase = vqa_results.split('most likely to appear')[-1].strip()
+ if possible_location_phrase.endswith('.'):
+ possible_location_phrase = possible_location_phrase[:-1]
+ possible_location_phrase = possible_location_phrase.split(target_object_name)[-1]
+ noun_chunks = extract_noun_chunks(possible_location_phrase)
+ if len(noun_chunks) == 1:
+ possible_location_phrase = noun_chunks[0]
+ else:
+ possible_location_phrase = "region {}".format(possible_location_phrase)
+
+ # Perform segmentation search based on the possible location
+ response = self.llm.visual_search(possible_location_phrase, copy.deepcopy(image_patch), mode='segmentation')
+ res = response['choices'][0]['message']['content']
+ res = torch.tensor(res, dtype=torch.int32)
+ context_cue_heatmap = res.view(current_patch_bbox[3], current_patch_bbox[2], 1)
+ context_cue_heatmap = self._normalize_score(context_cue_heatmap)
+ final_heatmap = context_cue_heatmap
+
+ current_patch_index = len(search_path) - 1
+
+ if score_max <= threshold:
+ # Store context cue if the score is below the threshold
+ search_path[current_patch_index]['context_cue'] = vqa_results + "#" + possible_location_phrase
+ search_path[current_patch_index]['final_heatmap'] = final_heatmap.cpu().numpy()
+
+ # Get sub-patches for further processing
+ basic_sub_patches, sub_patch_width, sub_patch_height = get_sub_patches(current_patch_bbox, *split_4subpatches(current_patch_bbox))
+
+ tmp_patch = current_patch
+ basic_sub_scores = [0] * len(basic_sub_patches)
+
+ # Calculate scores for each sub-patch
+ while True:
+ tmp_score_heatmap = tmp_patch['final_heatmap']
+ tmp_sub_scores = get_subpatch_scores(tmp_score_heatmap, tmp_patch['bbox'], basic_sub_patches)
+ basic_sub_scores = [basic_sub_scores[patch_i] + tmp_sub_scores[patch_i] / (4 ** tmp_patch['scale_level']) for patch_i in range(len(basic_sub_scores))]
+ if tmp_patch['parent_index'] == -1:
+ break
+ else:
+ tmp_patch = search_path[tmp_patch['parent_index']]
+
+ sub_patches = basic_sub_patches
+ sub_scores = basic_sub_scores
+
+ # Log the sub-scores for debugging
+ self.callback.info(self.workflow_instance_id, progress='VstarSearch', message=f"sub_scores: {sub_scores}")
+
+ # Push new patches into the queue for further processing
+ for sub_patch, sub_score in zip(sub_patches, sub_scores):
+ new_patch_info = dict()
+ new_patch_info['bbox'] = sub_patch
+ new_patch_info['scale_level'] = current_patch_scale_level + 1
+ new_patch_info['score'] = sub_score
+ new_patch_info['parent_index'] = current_patch_index
+ heapq.heappush(queue, Prioritize(-new_patch_info['score'], new_patch_info))
+
+ # Update the state management with the new queue and search result
+ self.stm(self.workflow_instance_id)['queue'] = queue
+ self.stm(self.workflow_instance_id)['search_result'] = False, search_path, None
+ search_step = self.stm(self.workflow_instance_id)['search_step']
+ search_step += 1
+ self.stm(self.workflow_instance_id)['search_step'] = search_step
+
+ return
+
+@registry.register_worker()
+class VstarSearchCheck(BaseWorker):
+ """
+ Worker to check the status of the VStar search process.
+ This class determines if the search has completed or if more steps are needed.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for checking the search status.
+
+ This method checks the current search step and updates the state management accordingly.
+ """
+ search_step = self.stm(self.workflow_instance_id)['search_step']
+
+ # Check if the maximum search step has been exceeded
+ if search_step > MAX_SEARCH_STEP:
+ self.stm(self.workflow_instance_id)['search_result'] = False, None, None
+ return {"finish": True}
+
+ # Retrieve the search result from the state management
+ success, search_path, all_valid_boxes = self.stm(self.workflow_instance_id)['search_result']
+ self.callback.info(self.workflow_instance_id, progress='VstarSearchCheck', message=f'\nsuccess: {success}\nall_valid_boxes:{all_valid_boxes}')
+
+ if success:
+ # If the search was successful, update the all_search_result in the state management
+ target_object_name = self.stm(self.workflow_instance_id)['target_object_name']
+ if self.stm(self.workflow_instance_id).get('all_search_result', None) is None:
+ self.stm(self.workflow_instance_id)['all_search_result'] = {}
+ result_this_step = {target_object_name: self.stm(self.workflow_instance_id)['search_result']}
+ result_origin = self.stm(self.workflow_instance_id)['all_search_result']
+ self.stm(self.workflow_instance_id)['all_search_result'] = {**result_origin, **result_this_step}
+
+ # Uncomment the following lines to save the detected crop image
+ # image = self.stm(self.workflow_instance_id)['image_cache']['']
+ # bbox = search_path[-1]['bbox']
+ # x, y, w, h = bbox
+ # os.makedirs(f"./images", exist_ok=True)
+ # image.crop([x, y, x + w, y + h]).save(f"./images/crop_{target_object_name}.jpg")
+ # self.callback.info(self.workflow_instance_id, progress='VstarSearchCheck', message=f'The detected crop image has been saved in "images/crop_{target_object_name}.jpg".')
+ return {"finish": True}
+
+ # Check if the queue is empty
+ queue = self.stm(self.workflow_instance_id)['queue']
+ if len(queue) == 0:
+ self.stm(self.workflow_instance_id)['search_result'] = False, None, None
+ return {"finish": True}
+
+ # Choose the next patch to process from the queue
+ patch_chosen = heapq.heappop(queue).item
+ self.callback.info(self.workflow_instance_id, progress='VstarSearchCheck', message=f'\npatch_chosen: {patch_chosen}')
+
+ # Update the state management with the chosen patch
+ self.stm(self.workflow_instance_id)['search_path'] = search_path + [patch_chosen]
+ self.stm(self.workflow_instance_id)['queue'] = queue
+ self.stm(self.workflow_instance_id)['current_patch'] = patch_chosen
+ current_patch_bbox = patch_chosen['bbox']
+
+ # Check if the current patch size is below the minimum size
+ if min(current_patch_bbox[2], current_patch_bbox[3]) <= SMALLEST_SIZE:
+ self.stm(self.workflow_instance_id)['search_result'] = False, search_path, None
+ return {"finish": True}
+
+ return {"finish": False}
+
+@registry.register_worker()
+class VQA_LLM_Post(BaseWorker, VStarLLM):
+ """
+ Worker for handling the post-processing of the Visual Question Answering (VQA) task.
+ This class processes the results and prepares the final output for the user.
+ """
+
+ llm: VStarLLM
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for post-processing the VQA results.
+
+ This method retrieves the results from the state management and sends the final answer to the user.
+ """
+ # Prepare the result dictionary with relevant information
+ result = {
+ "id": self.stm(self.workflow_instance_id).get('id', None),
+ "query": self.stm(self.workflow_instance_id).get('prompt', None),
+ "last_output": self.stm(self.workflow_instance_id).get('answer', None),
+ "missing_objects": self.stm(self.workflow_instance_id).get('all_missing_objects', None),
+ }
+
+ # Retrieve the cached image and prompt from the state management
+ img = self.stm(self.workflow_instance_id)['image_cache']['']
+ prompt = self.stm(self.workflow_instance_id)['prompt']
+ all_missing_objects = self.stm(self.workflow_instance_id).get('all_missing_objects', None)
+ all_search_result = self.stm(self.workflow_instance_id).get('all_search_result', None)
+
+ # Check if there are no missing objects
+ if not all_missing_objects or len(all_missing_objects) == 0:
+ self.callback.send_answer(self.workflow_instance_id, progress="VQA LLM Post", msg=self.stm(self.workflow_instance_id)['answer'])
+ return {"result": result}
+
+ # Check if there were any search results
+ if all_search_result is None:
+            self.callback.send_answer(self.workflow_instance_id, progress="VQA LLM Post", msg="Sorry, something went wrong during the search process.")
+            result['last_output'] = "Sorry, something went wrong during the search process: no search results were produced."
+ return {"result": result}
+
+ # Process each missing object to find its search result
+ searched_infos = []
+ for object_name in all_missing_objects:
+ if object_name not in all_search_result:
+                self.callback.send_answer(self.workflow_instance_id, progress="VQA LLM Post", msg=f"Sorry, I cannot find the object {object_name} in the image.")
+ continue
+
+ success, search_path, all_valid_boxes = all_search_result[object_name]
+ if success:
+ searched_infos.append({'bbox': search_path[-1]['bbox'], 'name': object_name})
+
+ # Check if no objects were found
+ if not searched_infos:
+            self.callback.send_answer(self.workflow_instance_id, progress="VQA LLM Post", msg="Sorry, I cannot find the requested objects in the image.")
+            result['last_output'] = "Sorry, I cannot find the requested objects in the image. No successful search results."
+ return {"result": result}
+
+ # Prepare the object names and bounding boxes for the final VQA call
+ object_names = [info['name'] for info in searched_infos]
+ bboxes = [info['bbox'] for info in searched_infos]
+
+ # Call the VStar LLM for the final VQA post-processing
+ response = self.llm.vqa_post(prompt, img, object_names=object_names, bboxes=bboxes)
+ response = response['choices'][0]['message']['content']
+
+ # Send the final answer to the user
+ self.callback.send_answer(self.workflow_instance_id, msg=response)
+ result['last_output'] = response
+ return {"result": result}
+
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/workflow.py b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/workflow.py
new file mode 100755
index 00000000..eb3d25b8
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/Vstar/workflow.py
@@ -0,0 +1,84 @@
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.do_while_task import DoWhileTask
+from omagent_core.engine.workflow.task.switch_task import SwitchTask
+
+
+
+class VstarWorkflow(ConductorWorkflow):
+
+ def __init__(self):
+ super().__init__(name='vstar_workflow')
+
+    def set_input(self, query: str, image_path: str, qid: str = 'test'):
+ self.qid = qid
+ self.query = query
+ self.image_path = image_path
+ self._configure_tasks()
+ self._configure_workflow()
+
+ def _configure_tasks(self):
+
+ self.vqa_llm_preprocess = simple_task(
+ task_def_name='VQA_LLM_Preprocess',
+ task_reference_name='vqa_llm_preprocess',
+ inputs={
+ "qid": self.qid,
+ "query": self.query,
+ "image_path": self.image_path
+ }
+ )
+
+ self.vqa_llm = simple_task(
+ task_def_name='VQA_LLM',
+ task_reference_name='vqa_llm'
+ )
+
+ vstar_search_preprocess = simple_task(
+ task_def_name='VstarSearchPreprocess',
+ task_reference_name='vstar_search_preprocess'
+ )
+
+ vstar_loop_check = simple_task(
+ task_def_name='VstarLoopCheck',
+ task_reference_name='vstar_loop_check'
+ )
+
+ vstar_search = simple_task(
+ task_def_name='VstarSearch',
+ task_reference_name='vstar_search'
+ )
+
+ vstar_search_check = simple_task(
+ task_def_name='VstarSearchCheck',
+ task_reference_name='vstar_search_check'
+ )
+
+ vstar_search_loop = DoWhileTask(
+ task_ref_name='vstar_search_loop',
+ tasks=[vstar_search, vstar_search_check],
+ termination_condition='if ($.vstar_search_check["finish"] == true){false;} else {true;}'
+ )
+
+ self.vstar_loop = DoWhileTask(
+ task_ref_name='vstar_loop',
+ tasks=[vstar_search_preprocess, vstar_search_loop, vstar_loop_check],
+ termination_condition='if ($.vstar_loop_check["finish"] == true){false;} else {true;}'
+ )
+
+
+ self.vqa_llm_post = simple_task(
+ task_def_name='VQA_LLM_Post',
+ task_reference_name='vqa_llm_post'
+ )
+
+ self.switch_task = SwitchTask(
+ task_ref_name='switch_task',
+ case_expression=self.vqa_llm.output("vqa_llm_succeed"),
+ )
+ self.switch_task.switch_case(0, [self.vstar_loop])
+ # self.switch_task.switch_case(1, self.vqa_llm_post)
+
+ def _configure_workflow(self):
+ self >> self.vqa_llm_preprocess >> self.vqa_llm >> self.switch_task >> self.vqa_llm_post
+ self.result = self.vqa_llm_post.output("result")
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/README.md b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/README.md
new file mode 100644
index 00000000..43a1519c
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/README.md
@@ -0,0 +1,123 @@
+# ZoomEye Operator
+
+ZoomEye is a workflow operator that combines visual understanding with confidence-guided search to locate and analyze visual elements in images. It builds a hierarchical image tree and uses adaptive thresholding to efficiently identify objects relevant to user queries.
+
+You can refer to the example in the `examples/ZoomEye` directory to understand how to use this operator.
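+
+For intuition about the hierarchical image tree mentioned above: each region is subdivided according to its aspect ratio, with very tall or very wide regions cut into four strips and near-square regions split into a 2×2 grid. Below is a condensed sketch of the splitting rule implemented in `schemas/tree.py`:
+
+```python
+# Condensed sketch of the patch-splitting rule from schemas/tree.py.
+# bbox is [x, y, width, height]; the return value is
+# (divisions along the width, divisions along the height).
+def split_4subpatches(bbox):
+    hw_ratio = bbox[3] / bbox[2]
+    if hw_ratio >= 2:        # very tall region: 4 vertically stacked strips
+        return 1, 4
+    elif hw_ratio <= 0.5:    # very wide region: 4 side-by-side strips
+        return 4, 1
+    else:                    # near-square region: 2x2 grid
+        return 2, 2
+
+print(split_4subpatches([0, 0, 4000, 1000]))  # (4, 1) for a wide, panorama-like region
+```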
+
+# Inputs, Outputs and configs
+
+## Inputs:
+The inputs that the ZoomEye operator requires are as follows:
+| Name | Type | Required | Description |
+| -------- | ----- | ----- | ---- |
+| query | str | true | The text question about the image content |
+| image_path | str | true | Path to the image file for analysis |
+| qid | str | false | Optional query identifier for tracking purposes |
+
+## Outputs:
+The outputs that the ZoomEye operator returns are as follows:
+| Name | Type | Description |
+| -------- | ----- | ---- |
+| result | dict | The result of the ZoomEye workflow. It includes the query id, the original query, the final answer, and the total prompt/completion token counts (see the illustrative example below). |
+
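+For reference, the `result` dict produced by the final `ZoomEyeOutput` step has roughly the following shape (the field values below are illustrative, not real output):
+
+```python
+# Illustrative shape of the ZoomEye `result` output; the values are made up.
+result = {
+    "id": "test",                                    # query identifier (qid)
+    "query": "What is the color of the boy's bag?",  # original user question
+    "last_output": "The bag is red.",                # final answer text
+    "prompt_tokens": 1532,                           # total prompt tokens consumed
+    "completion_tokens": 87,                         # total completion tokens consumed
+}
+```
+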
+## Configs:
+The config of the ZoomEye operator is as follows. You can copy and paste it into your project as a `zoomeye_workflow.yml` file:
+```yml
+- name: VisualCueGeneration
+ llm: ${sub|gpt}
+
+- name: ZoomEyePreprocess
+
+- name: ZoomEyeSearch
+ llm: ${sub|gpt}
+
+- name: ZoomEyeSearchCheck
+
+- name: ZoomEyeLoopCheck
+
+- name: ZoomEyeOutput
+ llm: ${sub|gpt}
+
+```
+
+The ZoomEye operator settings are as follows:
+| Name | Type | Description |
+| -------- | ----- | ---- |
+| answering_confidence_threshold_upper | float | The upper confidence threshold for accepting a node during search |
+| answering_confidence_threshold_lower | float | The lower confidence threshold below which the search stops |
+| depth_limit | int | The maximum depth of the image tree to explore |
+| pop_limit | int | Maximum number of nodes to pop before the confidence threshold is lowered (computed from the tree depth) |
+| num_interval | int | Number of additional nodes allowed after each threshold adjustment |
+| threshold_decrease | list | Sequence of threshold reductions applied during the adaptive search |
+| smallest_size | int | Minimum patch size in pixels; regions whose longer side is already below this are not subdivided further |
+
+Set these parameters in `omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/utils.py`:
+
+```python
+
+# Confidence thresholds for the answering criterion
+ANSWERING_CONFIDENCE_THRESHOLD_UPPER = 0.4
+ANSWERING_CONFIDENCE_THRESHOLD_LOWER = 0
+
+# Constants for image processing
+SMALLEST_SIZE = 384
+DEPTH_LIMIT = 5
+NUM_INTERVEL = 2
+
+# Pop limit calculation
+def pop_limit_func(max_depth):
+ return max_depth * 3
+
+POP_LIMIT = pop_limit_func
+THRESHOLD_DECREASE = [0.1, 0.1, 0.2]
+```
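+
+To see how these constants interact, the sketch below simulates only the threshold schedule (it ignores early termination when a sufficiently confident node is found or the queue empties): once the number of popped nodes reaches the current limit, the acceptance threshold drops by the next value in `THRESHOLD_DECREASE`, the limit is extended by `NUM_INTERVEL`, and the search stops when the threshold falls below the lower bound.
+
+```python
+# Minimal sketch of the adaptive-threshold schedule only (not the worker code).
+def threshold_schedule(max_depth: int = 5):
+    threshold, lower = 0.4, 0.0    # ANSWERING_CONFIDENCE_THRESHOLD_UPPER / _LOWER
+    pop_limit = max_depth * 3      # POP_LIMIT(max_depth)
+    decreases = [0.1, 0.1, 0.2]    # THRESHOLD_DECREASE
+    num_interval = 2               # NUM_INTERVEL
+    schedule, num_pop = [(0, threshold)], 0
+    while threshold >= lower:
+        num_pop += 1
+        if num_pop >= pop_limit:
+            threshold -= decreases[0]      # lower the acceptance bar
+            if len(decreases) > 1:
+                decreases.pop(0)           # advance to the next reduction step
+            pop_limit += num_interval      # allow a few more pops before the next drop
+            schedule.append((num_pop, threshold))
+    return schedule
+
+for pops, thr in threshold_schedule():
+    print(pops, round(thr, 2))  # 0 0.4, 15 0.3, 17 0.2, 19 0.0, 21 -0.2 (search would stop here)
+```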
+
+## How ZoomEye Works
+
+ZoomEye operates in multiple stages:
+
+1. **Visual Cue Generation**: Analyzes the image and query to identify key visual elements to search for
+2. **ZoomEye Preprocess**: Prepares the search environment for each visual cue and initializes the image tree
+3. **ZoomEye Search**: Performs confidence-guided traversal of the image tree to locate visual elements
+4. **ZoomEye Search Check**: Determines when the search is complete and collects found candidates
+5. **ZoomEye Loop Check**: Manages iterations across multiple visual cues
+6. **ZoomEye Output**: Synthesizes the search results into a comprehensive response
+
+The search process uses a priority-based traversal with multiple confidence metrics (existence, latent, and answering) to efficiently locate visual elements while maintaining high precision.
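+
+The priority of a queued node is a depth-weighted blend of its existence and latent confidences, so deeper (more zoomed-in) regions rely more on the direct existence signal. Below is a minimal sketch of the scoring used in `ZoomEyeSearch`; the confidence values themselves come from the LLM's yes/no log-probabilities, mapped to [-1, 1]:
+
+```python
+# Minimal sketch of the node-priority computation in ZoomEyeSearch.
+def confidence_weight(depth: int, max_depth: int, bias: float = 0.6) -> float:
+    """Weight grows quadratically from `bias` at the root to 1.0 at max depth."""
+    coeff = (1 - bias) / (max_depth ** 2)
+    return coeff * depth ** 2 + bias
+
+def node_priority(existence: float, latent: float, depth: int, max_depth: int) -> float:
+    w = confidence_weight(depth, max_depth)
+    return existence * w + latent * (1 - w)
+
+# A shallow node leans on latent confidence ("could the object appear if we zoom in?"),
+# while a deep node leans on existence confidence ("is the object visible here?").
+print(node_priority(existence=0.2, latent=0.8, depth=1, max_depth=5))  # ~0.43
+print(node_priority(existence=0.2, latent=0.8, depth=4, max_depth=5))  # ~0.29
+```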
+
+## Example Usage
+
+Here's a simple example of how to use the ZoomEye operator in a workflow:
+
+```python
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from omagent_core.advanced_components.workflow.ZoomEye.workflow import ZoomEyeWorkflow
+
+# Initialize workflow
+workflow = ConductorWorkflow(name='ZoomEyeExample')
+
+# Create input task
+input_task = simple_task(
+ task_def_name='ZoomEyeInput',
+ task_reference_name='zoomeye_input'
+)
+
+# Initialize ZoomEye workflow
+zoomeye_workflow = ZoomEyeWorkflow()
+
+# Connect input to ZoomEye workflow
+zoomeye_workflow.set_input(
+ query=input_task.output("query"),
+ image_path=input_task.output("image_path")
+)
+
+# Configure workflow execution flow
+workflow >> input_task >> zoomeye_workflow
+
+# Register workflow
+workflow.register(overwrite=True)
+```
+
+ZoomEye combines visual understanding with efficient search algorithms to provide a detailed analysis of image content based on natural-language queries.
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/visual_cue_generation/visual_cue_generation.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/visual_cue_generation/visual_cue_generation.py
new file mode 100644
index 00000000..a587a869
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/visual_cue_generation/visual_cue_generation.py
@@ -0,0 +1,155 @@
+from omagent_core.models.llms.base import BaseLLMBackend
+from omagent_core.utils.registry import registry
+from omagent_core.utils.general import read_image
+from omagent_core.models.llms.openai_gpt import OpenaiGPTLLM
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.tree import ImageTree
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.utils import *
+
+@registry.register_worker()
+class VisualCueGeneration(BaseWorker, BaseLLMBackend):
+ """
+ A worker class that generates visual cues from an image using a language model.
+ Visual cues are key objects or elements in the image that are relevant to the user's query.
+ """
+
+ llm: OpenaiGPTLLM # Language model used for generating visual cues
+
+ def _run(self, qid: str, query: str, image_path: str):
+ """
+ Main execution method for visual cue generation.
+
+ Args:
+ qid (str): Query identifier
+ query (str): User's text query about the image
+ image_path (str): Path to the image file
+
+ Returns:
+ dict: Contains the filtered visual cues identified in the image
+ """
+ # Log the start of visual cue generation process
+ self.callback.info(self.workflow_instance_id, progress='VisualCueGeneration', message=f'\nworkflow_instance_id: {self.workflow_instance_id}')
+
+ # Load the image from the provided path
+ img = read_image(input_source=image_path)
+ if img is None:
+ self.callback.error(self.workflow_instance_id, f"Failed to load image from {image_path}")
+ return None
+
+ # Store the image in the short-term memory for later use
+ self.stm(self.workflow_instance_id)['image_cache'] = {'': img}
+ self.stm(self.workflow_instance_id)['prompt'] = query
+ self.stm(self.workflow_instance_id)['id'] = qid
+ self.stm(self.workflow_instance_id)['final_all_candidates'] = []
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = 0
+ self.stm(self.workflow_instance_id)['completion_tokens'] = 0
+
+ # Get the user's question from short-term memory
+ question = self.stm(self.workflow_instance_id)['prompt']
+
+ # Create a conversation prompt for visual cue generation
+ conversation = get_visual_cues_generation_conversation(question)
+
+ # Generate visual cues using the language model
+ response = self.llm.generate(conversation)
+ if not response or 'choices' not in response or not response['choices']:
+ self.callback.error(self.workflow_instance_id, "Failed to get valid response from LLM")
+ return None
+
+ try:
+ # Extract the content and token usage from the LLM response
+ res = response['choices'][0]['message']['content']
+ prompt_tokens = response['usage']['prompt_tokens']
+ completion_tokens = response['usage']['completion_tokens']
+ except KeyError as e:
+ self.callback.error(self.workflow_instance_id, f"Missing key in LLM response: {e}")
+ return None
+
+ # Track token usage for monitoring purposes
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = prompt_tokens
+ self.stm(self.workflow_instance_id)['completion_tokens'] = completion_tokens
+
+ # Log the raw visual cues generated by the model
+ self.callback.info(self.workflow_instance_id, progress='VisualCueGeneration', message=f'\nVisualCuesGeneration: {res}')
+
+ # Extract target objects from the LLM response
+ targets = extract_targets(res)
+ targets = split_targets_sentence(targets)
+ if targets is None:
+ visual_cues = None
+ else:
+ # Filter out pronouns from visual cues
+ visual_cues = [t for t in targets if not include_pronouns(t)]
+
+ if visual_cues is None:
+ visual_cues = []
+
+ # Store the identified visual cues in short-term memory
+ self.stm(self.workflow_instance_id)['visual_cue'] = visual_cues
+
+ # Create an image tree for hierarchical image analysis
+ image_tree = ImageTree(
+ image_pil=img,
+ patch_size=SMALLEST_SIZE
+ )
+ self.stm(self.workflow_instance_id)['tree'] = image_tree
+
+ # Initialize variables for confidence-based filtering
+ root_node = image_tree.root
+ decomposed_question_template = DECOMPOSED_QUESTION_TEMPLATE
+ answering_confidence_threshold_upper = ANSWERING_CONFIDENCE_THRESHOLD_UPPER
+ filtered_visual_cues = []
+
+ # If multiple visual cues are detected, filter them based on confidence
+ if len(visual_cues) > 1:
+ for target in visual_cues:
+ # Special handling for 'all X' type cues
+ if target.startswith("all "):
+ filtered_visual_cues.append(target)
+ continue
+
+ # Prepare inputs for confidence evaluation
+ inputs = {
+ "node": root_node,
+ "image_pil": img,
+ "confidence_type": "answering",
+ "input_ele": decomposed_question_template.format(target)
+ }
+
+ # Generate a conversation to assess confidence in the visual cue
+ conversation = get_confidence_conversation(**inputs)
+ if not self.llm.logprobs:
+ self.llm.logprobs = True
+ self.llm.top_logprobs = 5
+
+ # Generate response with log probabilities for confidence calculation
+ llm_response = self.llm.generate(conversation)
+
+ # Update token usage statistics
+ prompt_tokens = self.stm(self.workflow_instance_id)['prompt_tokens']
+ completion_tokens = self.stm(self.workflow_instance_id)['completion_tokens']
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = prompt_tokens + llm_response['usage']['prompt_tokens']
+ self.stm(self.workflow_instance_id)['completion_tokens'] = completion_tokens + llm_response['usage']['completion_tokens']
+
+ # Calculate confidence score for this visual cue
+ confidence = calculate_confidence_from_logprobs(llm_response)
+
+ # Only keep visual cues with confidence below the threshold
+ # (indicating they need further search/verification)
+ if confidence < answering_confidence_threshold_upper:
+ filtered_visual_cues.append(target)
+ else:
+ # If only one visual cue, keep it without filtering
+ filtered_visual_cues = visual_cues[:]
+
+ # Store the final filtered visual cues in short-term memory
+ if len(filtered_visual_cues) == 0:
+ self.stm(self.workflow_instance_id)['visual_cues'] = None
+ else:
+ self.stm(self.workflow_instance_id)['visual_cues'] = filtered_visual_cues
+
+ # Log the identified visual cues
+ self.callback.info(self.workflow_instance_id, progress='VisualCueGeneration', message=f'\nVisualCues: {visual_cues}')
+
+ # Return the filtered visual cues for the next stage in the workflow
+ return {"visual_cues": filtered_visual_cues}
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_loop_check/zoomeye_loop_check.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_loop_check/zoomeye_loop_check.py
new file mode 100644
index 00000000..4bcc5d0f
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_loop_check/zoomeye_loop_check.py
@@ -0,0 +1,28 @@
+from omagent_core.utils.registry import registry
+from omagent_core.engine.worker.base import BaseWorker
+
+@registry.register_worker()
+class ZoomEyeLoopCheck(BaseWorker):
+ """
+ Worker class that determines whether the ZoomEye search loop should continue or terminate.
+ This component checks if there are any remaining visual cues to process in the workflow.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method to check if the ZoomEye workflow loop should continue.
+
+ Returns:
+ dict: Contains the 'loop_check_finish' flag indicating whether to terminate the loop
+ - True: No more visual cues to process, workflow should exit the loop
+ - False: Visual cues still exist, workflow should continue processing
+ """
+ # Retrieve the current list of visual cues from short-term memory
+ visual_cues = self.stm(self.workflow_instance_id).get('visual_cues', None)
+
+ # If no visual cues exist or the list is empty, signal to terminate the loop
+ if visual_cues is None or len(visual_cues) == 0:
+ return {"loop_check_finish": True}
+ # Otherwise, continue the loop to process remaining visual cues
+ else:
+ return {"loop_check_finish": False}
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_output/zoomeye_output.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_output/zoomeye_output.py
new file mode 100644
index 00000000..0acb9634
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_output/zoomeye_output.py
@@ -0,0 +1,65 @@
+from omagent_core.models.llms.base import BaseLLMBackend
+from omagent_core.models.llms.openai_gpt import OpenaiGPTLLM
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.utils.registry import registry
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.utils import *
+
+@registry.register_worker()
+class ZoomEyeOutput(BaseWorker, BaseLLMBackend):
+ """
+ Worker class responsible for generating the final response to the user's query.
+ Combines the search results with the original query and image to create a comprehensive answer.
+ """
+
+ llm: OpenaiGPTLLM # Language model used for generating the final output
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method to generate the final response based on search results.
+
+ Args:
+ *args: Variable length argument list (not used)
+ **kwargs: Arbitrary keyword arguments (not used)
+
+ Returns:
+ dict: Contains the final result with the answer, query information, and token usage statistics
+ """
+ # Retrieve the final set of candidates (search results) from short-term memory
+ final_all_candidates = self.stm(self.workflow_instance_id)['final_all_candidates']
+
+ # Get the original image from the cache
+ image_pil = self.stm(self.workflow_instance_id)['image_cache']['']
+
+ # Get the user's original question
+ question = self.stm(self.workflow_instance_id)['prompt']
+
+ # Create a conversation prompt that includes the search results, image, and original question
+ conversation = get_output_conversation(final_all_candidates, image_pil, question)
+
+ # Generate the final answer using the language model
+ response = self.llm.generate(conversation)
+
+ # Track token usage for monitoring and billing purposes
+ prompt_tokens = self.stm(self.workflow_instance_id).get('prompt_tokens', 0)
+ completion_tokens = self.stm(self.workflow_instance_id).get('completion_tokens', 0)
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = prompt_tokens + response['usage']['prompt_tokens']
+ self.stm(self.workflow_instance_id)['completion_tokens'] = completion_tokens + response['usage']['completion_tokens']
+
+ # Extract the content of the LLM's response
+ res = response['choices'][0]['message']['content']
+
+ # Log the final output for monitoring
+ self.callback.info(self.workflow_instance_id, progress='ZoomEyeOutput', message=f'\nZoomEyeOutput: {res}')
+
+ # Compile the complete result package with query information and usage statistics
+ result = {
+ "id": self.stm(self.workflow_instance_id).get('id', None), # Query identifier
+ "query": self.stm(self.workflow_instance_id).get('prompt', None), # Original user query
+ "last_output": res, # Final answer text
+ "prompt_tokens": self.stm(self.workflow_instance_id)['prompt_tokens'], # Total prompt tokens used
+ "completion_tokens": self.stm(self.workflow_instance_id)['completion_tokens'], # Total completion tokens used
+ }
+
+ # Return the final result to be presented to the user
+ return {"result": result}
+
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_preprocess/zoomeye_preprocess.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_preprocess/zoomeye_preprocess.py
new file mode 100644
index 00000000..88cb5653
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_preprocess/zoomeye_preprocess.py
@@ -0,0 +1,124 @@
+import copy
+from omagent_core.models.llms.base import BaseLLMBackend
+from omagent_core.utils.registry import registry
+from omagent_core.utils.general import read_image
+from omagent_core.models.llms.openai_gpt import OpenaiGPTLLM
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.tree import ImageTree
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.utils import *
+
+@registry.register_worker()
+class ZoomEyePreprocess(BaseWorker):
+ """
+ Worker class for preprocessing in ZoomEye workflow.
+ Prepares the search environment before starting a new visual cue search iteration.
+
+ This component:
+ 1. Extracts the next visual cue to search for in the image
+ 2. Initializes the search queue with the root node of the image tree
+ 3. Sets up threshold parameters for the confidence-based search algorithm
+ 4. Prepares tracking variables for the search process
+ """
+
+ def _run(self, *args, **kwargs) -> None:
+ """
+ Main execution method for preprocessing a visual cue search iteration.
+
+ Prepares the search environment by:
+ - Extracting the next visual cue from the queue
+ - Setting up the search parameters and thresholds
+ - Initializing the BFS search queue with the root node
+
+ Args:
+ *args: Variable length argument list (not used)
+ **kwargs: Arbitrary keyword arguments (not used)
+
+ Returns:
+ None: Updates are made directly to the shared memory
+
+ Raises:
+ ValueError: If required data is missing from the workflow state
+ """
+ # Get workflow STM for cleaner access to shared memory
+ workflow_stm = self.stm(self.workflow_instance_id)
+
+ # Validate that the image tree exists in the workflow state
+ if 'tree' not in workflow_stm:
+ raise ValueError("Required 'tree' not found in workflow STM")
+
+ # Extract the image tree and root node
+ image_tree = workflow_stm['tree']
+ root_node = image_tree.root
+
+ # Get the list of visual cues to process
+ visual_cues = workflow_stm.get('visual_cues', None)
+
+ # If there are no visual cues, set the root node as the only candidate and exit
+ if visual_cues is None:
+ workflow_stm['final_all_candidates'] = [root_node]
+ return
+
+ # Extract the next visual cue to process and remove it from the list
+ visual_cue = visual_cues.pop(0)
+
+ # Log the current visual cue being processed and remaining cues
+ self.callback.info(
+ self.workflow_instance_id,
+ progress='ZoomEyePreprocess',
+ message=f'\ncurrent search visual cues: {visual_cue}\nleft visual cues: {visual_cues}'
+ )
+
+ # Initialize the search state with parameters for the current visual cue
+ workflow_stm.update({
+ 'current_visual_cue': visual_cue, # Current object being searched for
+ 'visual_cues': visual_cues, # Remaining objects to search for
+ 'Q': [root_node], # BFS queue starting with root node
+ 'temp_threshold_descrease': copy.deepcopy(THRESHOLD_DECREASE), # Confidence threshold adjustment factor
+ 'answering_confidence_threshold_upper': ANSWERING_CONFIDENCE_THRESHOLD_UPPER, # Upper confidence threshold
+ 'candidates': [], # List to store matching candidates
+ 'num_pop': 0, # Counter for nodes processed
+ 'pop_trace': [], # History of processed nodes
+ 'max_depth': min(image_tree.max_depth, DEPTH_LIMIT), # Maximum tree depth to explore
+ 'finish': False # Flag indicating search completion
+ })
+
+ # Set initial confidence for root node
+ root_node.answering_confidence = -1
+
+ # Calculate the maximum number of nodes to process based on depth
+ pop_num_limit = POP_LIMIT(workflow_stm['max_depth']) if callable(POP_LIMIT) else POP_LIMIT
+ workflow_stm['pop_num_limit'] = pop_num_limit
+
+ # Log the initial search state for debugging and monitoring
+ self._log_state(workflow_stm)
+ return
+
+ def _log_state(self, workflow_stm: dict) -> None:
+ """
+ Helper method to log the current workflow state for monitoring and debugging.
+
+ Args:
+ workflow_stm (dict): The workflow state dictionary containing search parameters
+ and tracking variables
+
+ Returns:
+ None: Output is sent to the logging system
+ """
+ self.callback.info(
+ self.workflow_instance_id,
+ progress='ZoomEyePreprocess',
+ message=f"""\nzoomeye_items:
+ Q: {workflow_stm['Q']}
+ max_depth: {workflow_stm['max_depth']}
+ pop_num_limit: {workflow_stm['pop_num_limit']}
+ current_visual_cue: {workflow_stm['current_visual_cue']}
+ temp_threshold_descrease: {workflow_stm['temp_threshold_descrease']}
+ answering_confidence_threshold_upper: {workflow_stm['answering_confidence_threshold_upper']}
+ candidates: {workflow_stm['candidates']}
+ num_pop: {workflow_stm['num_pop']}
+ pop_trace: {workflow_stm['pop_trace']}
+ finish: {workflow_stm['finish']}
+ image_tree: {workflow_stm['tree']}
+ """
+ )
+
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search/zoomeye_search.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search/zoomeye_search.py
new file mode 100644
index 00000000..0af6a0f5
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search/zoomeye_search.py
@@ -0,0 +1,274 @@
+from pathlib import Path
+from typing import List, Tuple
+from omagent_core.models.llms.base import BaseLLMBackend
+from omagent_core.utils.registry import registry
+from omagent_core.models.llms.openai_gpt import OpenaiGPTLLM
+from omagent_core.engine.worker.base import BaseWorker
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.tree import *
+from omagent_core.advanced_components.workflow.ZoomEye.schemas.utils import *
+
+@registry.register_worker()
+class ZoomEyeSearch(BaseWorker, BaseLLMBackend):
+ """
+ Worker class that implements the core search algorithm of ZoomEye.
+ Performs a confidence-guided tree traversal to locate visual elements in images,
+ using a language model to evaluate confidence scores for image regions.
+ """
+
+ llm: OpenaiGPTLLM
+
+ def get_confidence_weight(self, node: Node, max_depth: int):
+ """
+ Calculate a weighted bias based on node depth in the image tree.
+ Deeper nodes get progressively more weight in confidence calculations.
+
+ Args:
+ node (Node): The image tree node being evaluated
+ max_depth (int): Maximum depth of the image tree
+
+ Returns:
+ float: Weighted confidence value between 0.6 and 1.0
+ """
+ bias_value = 0.6 # Minimum weight value
+ coeff = (1 - bias_value) / (max_depth ** 2)
+ return coeff * (node.depth**2) + bias_value
+
+
+ def stopping_criterion(self, cur_node: Node, question: str, threshold: float) -> bool:
+ """
+ Determine if the current node satisfies the search criteria sufficiently
+ to stop the search process.
+
+ Args:
+ cur_node (Node): Current node being evaluated
+ question (str): The query that guides the search
+ threshold (float): Confidence threshold for accepting a node
+
+ Returns:
+ bool: True if node exceeds confidence threshold, False otherwise
+ """
+ try:
+ # Generate a conversation to evaluate the node's answering confidence
+ conversation = get_confidence_conversation(cur_node, cur_node.state.original_image_pil,
+ confidence_type='answering', input_ele=question)
+
+ # Enable logprobs for confidence calculation
+ if not self.llm.logprobs:
+ self.llm.logprobs = True
+ self.llm.top_logprobs = 5
+
+ # Get LLM response with confidence information
+ llm_response = self.llm.generate(conversation)
+
+ # Safely retrieve token usage data from STM
+ stm_data = self.stm(self.workflow_instance_id) or {}
+ prompt_tokens = stm_data.get('prompt_tokens', 0)
+ completion_tokens = stm_data.get('completion_tokens', 0)
+
+ # Update token usage counts
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = prompt_tokens + llm_response['usage']['prompt_tokens']
+ self.stm(self.workflow_instance_id)['completion_tokens'] = completion_tokens + llm_response['usage']['completion_tokens']
+
+ # Calculate confidence score and store it in the node
+ cur_answering_confidence = calculate_confidence_from_logprobs(llm_response)
+ cur_node.answering_confidence = cur_answering_confidence
+
+ # Return whether the node exceeds the acceptance threshold
+ return cur_answering_confidence >= threshold
+ except Exception as e:
+ print(f"Error in stopping_criterion: {e}")
+ return False
+
+ def get_priority(self, node: Node, image_pil, max_depth: int, visual_cue: str) -> float:
+ """
+ Calculate a priority score for a node to determine the traversal order.
+ Uses a weighted combination of existence confidence and latent confidence.
+
+ Args:
+ node (Node): The node to calculate priority for
+ image_pil: The original PIL image
+ max_depth (int): Maximum depth of the image tree
+ visual_cue (str): The visual element being searched for
+
+ Returns:
+ float: Priority score between 0.0 and 1.0
+ """
+ try:
+ # Only calculate confidence if not already done
+ if node.fast_confidence is None:
+ if not self.llm.logprobs:
+ self.llm.logprobs = True
+ self.llm.top_logprobs = 5
+
+ # Get existence confidence (is the object visible in this region?)
+ existence_confidence = self._get_confidence(node, image_pil, 'existence', visual_cue)
+
+ # Get latent confidence (could the object be in this region?)
+ latent_confidence = self._get_confidence(node, image_pil, 'latent', visual_cue)
+
+ # Calculate weighted combination based on node depth
+ w = self.get_confidence_weight(node, max_depth)
+ node.fast_confidence = existence_confidence * w + latent_confidence * (1 - w)
+
+ # Store detailed confidence information for debugging
+ node.fast_confidence_details = {
+ 'existence': existence_confidence,
+ 'latent': latent_confidence,
+ 'weight': w
+ }
+ return node.fast_confidence
+ except Exception as e:
+ print(f"Error in get_priority: {e}")
+ return 0.0
+
+ def _get_confidence(self, node: Node, image_pil, confidence_type: str, visual_cue: str) -> float:
+ """
+ Helper method to get specific confidence scores from the language model.
+
+ Args:
+ node (Node): The node to evaluate
+ image_pil: The original PIL image
+ confidence_type (str): Type of confidence to evaluate ('existence' or 'latent')
+ visual_cue (str): The visual element being searched for
+
+ Returns:
+ float: Confidence score between 0.0 and 1.0
+ """
+ # Generate appropriate conversation for confidence evaluation
+ conversation = get_confidence_conversation(node, image_pil, confidence_type=confidence_type,
+ input_ele=visual_cue)
+ response = self.llm.generate(conversation)
+
+ # Safely update token usage counts
+ stm_data = self.stm(self.workflow_instance_id) or {}
+ prompt_tokens = stm_data.get('prompt_tokens', 0)
+ completion_tokens = stm_data.get('completion_tokens', 0)
+
+ self.stm(self.workflow_instance_id)['prompt_tokens'] = prompt_tokens + response['usage']['prompt_tokens']
+ self.stm(self.workflow_instance_id)['completion_tokens'] = completion_tokens + response['usage']['completion_tokens']
+
+ # Calculate and return confidence score
+ return calculate_confidence_from_logprobs(response)
+
+ def update_candidates(
+ self,
+ new_answering_threshold: float,
+ pop_trace: List[Tuple[int, Node]],
+ candidates: List[Node],
+ num_candidates = 1,
+ ):
+ """
+ Update the list of candidate nodes based on new threshold.
+
+ Args:
+ new_answering_threshold (float): New confidence threshold for candidates
+ pop_trace (List[Tuple[int, Node]]): History of nodes processed
+ candidates (List[Node]): Current list of candidate nodes
+ num_candidates (int): Maximum number of candidates to keep
+ """
+ # Add nodes that meet the new threshold
+ for _, node in pop_trace:
+ if node.answering_confidence >= new_answering_threshold and node not in candidates:
+ candidates.append(node)
+
+ # Keep only the top candidates if we have too many
+ if len(candidates) > num_candidates:
+ candidates.sort(key=lambda x: x.answering_confidence, reverse=True)
+ while len(candidates) > num_candidates:
+ candidates.pop()
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method for the search process.
+ Performs one step of the search algorithm, processing a single node.
+
+ Args:
+ *args: Variable length argument list (not used)
+ **kwargs: Arbitrary keyword arguments (not used)
+
+ Returns:
+ None: Updates are made directly to shared memory
+ """
+ # Retrieve search state from shared memory
+ visual_cues = self.stm(self.workflow_instance_id).get('visual_cues', None)
+ visual_cue = self.stm(self.workflow_instance_id).get('current_visual_cue', None)
+
+ # If no visual cues to search for, mark as finished
+ if visual_cues is None and visual_cue is None:
+ self.stm(self.workflow_instance_id)['finish'] = True
+ return
+
+ # Retrieve search parameters and state variables
+ candidates = self.stm(self.workflow_instance_id)['candidates']
+ num_pop = self.stm(self.workflow_instance_id)['num_pop']
+ pop_num_limit = self.stm(self.workflow_instance_id)['pop_num_limit']
+ temp_threshold_descrease = self.stm(self.workflow_instance_id)['temp_threshold_descrease']
+ answering_confidence_threshold_upper = self.stm(self.workflow_instance_id)['answering_confidence_threshold_upper']
+ pop_trace = self.stm(self.workflow_instance_id)['pop_trace']
+ max_depth = self.stm(self.workflow_instance_id)['max_depth']
+ image_pil = self.stm(self.workflow_instance_id)['image_cache']['']
+ question = self.stm(self.workflow_instance_id)['prompt']
+ Q = self.stm(self.workflow_instance_id)['Q']
+
+ # If queue is empty, mark search as finished
+ if len(Q) == 0:
+ self.stm(self.workflow_instance_id)['finish'] = True
+ return
+
+ # Get the next node to process
+ cur_node = Q.pop(0)
+ cur_node: Node # Type hint for clarity
+
+ # Update processing counters and history
+ num_pop += 1
+ pop_trace.append((num_pop, cur_node))
+ self.stm(self.workflow_instance_id)['num_pop'] = num_pop
+ self.stm(self.workflow_instance_id)['pop_trace'] = pop_trace
+
+ # Check if current node satisfies search criteria
+ if self.stopping_criterion(cur_node, question, answering_confidence_threshold_upper):
+ candidates.append(cur_node)
+ self.stm(self.workflow_instance_id)['candidates'] = candidates
+ self.stm(self.workflow_instance_id)['finish'] = True
+ return
+
+ # Check if we've reached node processing limit
+ if num_pop >= pop_num_limit:
+ # Lower confidence threshold to find more candidates
+ answering_confidence_threshold_upper -= temp_threshold_descrease[0]
+ self.stm(self.workflow_instance_id)['answering_confidence_threshold_upper'] = answering_confidence_threshold_upper
+
+ # Update threshold decrease steps if there are more
+ if len(temp_threshold_descrease) > 1:
+ _ = temp_threshold_descrease.pop(0)
+ print("temp_threshold_descrease:", temp_threshold_descrease)
+ self.stm(self.workflow_instance_id)['temp_threshold_descrease'] = temp_threshold_descrease
+
+            # Increase processing limit for the next round and persist it to shared memory
+            pop_num_limit += NUM_INTERVEL
+            self.stm(self.workflow_instance_id)['pop_num_limit'] = pop_num_limit
+
+ # Update candidates with new threshold
+ self.update_candidates(answering_confidence_threshold_upper, pop_trace, candidates)
+
+ # If we found candidates with reduced threshold, finish search
+ if len(candidates) > 0:
+ self.stm(self.workflow_instance_id)['candidates'] = candidates
+ self.stm(self.workflow_instance_id)['finish'] = True
+ return
+
+ # If threshold is too low, stop search to avoid low quality results
+ if answering_confidence_threshold_upper < ANSWERING_CONFIDENCE_THRESHOLD_LOWER:
+ self.stm(self.workflow_instance_id)['finish'] = True
+ return
+
+ # Add child nodes to the queue
+ for child in cur_node.children:
+ Q.append(child)
+
+ # Sort queue by priority to process most promising nodes first
+ Q.sort(key=lambda x: self.get_priority(x, image_pil, max_depth, visual_cue), reverse=True)
+
+ # Update queue in shared memory
+ self.stm(self.workflow_instance_id)['Q'] = Q
+
+ return
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search_check/zoomeye_search_check.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search_check/zoomeye_search_check.py
new file mode 100644
index 00000000..8c48d928
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/agent/zoomeye_search_check/zoomeye_search_check.py
@@ -0,0 +1,41 @@
+from omagent_core.utils.registry import registry
+from omagent_core.engine.worker.base import BaseWorker
+
+@registry.register_worker()
+class ZoomEyeSearchCheck(BaseWorker):
+ """
+ Worker class that checks if the current search iteration has completed.
+ Collects successful search candidates and updates the final result list.
+ Acts as a checkpoint between iterations in the search process.
+ """
+
+ def _run(self, *args, **kwargs):
+ """
+ Main execution method to check search completion status and collect results.
+
+ This component:
+ 1. Checks if the current search iteration is complete
+ 2. Collects any found candidates and adds them to the final results
+ 3. Returns a flag indicating whether to continue or finish the search loop
+ """
+ # Get the search completion flag from shared memory
+ finish = self.stm(self.workflow_instance_id).get('finish', False)
+
+ # Get the list of candidate objects found in current search iteration
+ candidates = self.stm(self.workflow_instance_id).get('candidates', [])
+
+ # If current search iteration is complete, collect the results
+ if finish:
+ # Get the master list of all candidates found across iterations
+ final_all_candidates = self.stm(self.workflow_instance_id).get('final_all_candidates', [])
+
+ # If we found candidates in this iteration, add them to master list
+ if len(candidates) > 0:
+ final_all_candidates.extend(candidates)
+
+ # Update the master list in shared memory
+ self.stm(self.workflow_instance_id)['final_all_candidates'] = final_all_candidates
+
+ # Return flag indicating whether the search loop should continue or terminate
+ # When finish=True, the search loop will terminate and move to next visual cue
+ return {"search_check_finish": finish}
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/tree.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/tree.py
new file mode 100644
index 00000000..79309988
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/tree.py
@@ -0,0 +1,209 @@
+from PIL import Image
+from typing import NamedTuple, List, Optional
+import itertools
+
+
+class ZoomState(NamedTuple):
+ """
+ Contains the state information for a node in the image tree.
+
+ Attributes:
+ original_image_pil: Reference to the full original image
+ bbox: Bounding box coordinates [x, y, width, height] defining this image region
+ """
+ original_image_pil: Image.Image
+ bbox: List[int]
+
+class Node:
+ """
+ Represents a node in the hierarchical image tree.
+ Each node corresponds to a specific region in the image.
+
+ The tree structure enables multi-scale analysis and efficient region-based search.
+ """
+ id_iter = itertools.count() # Global counter for generating unique node IDs
+
+ @classmethod
+ def reset_id(cls):
+ """Reset the global ID counter to start fresh."""
+ cls.id_iter = itertools.count()
+
+ def __init__(
+ self,
+ state: Optional[ZoomState],
+ parent: "Optional[Node]" = None,
+ fast_confidence: float = None,
+ fast_confidence_details=None,
+ is_terminal: bool = False
+ ) -> None:
+ """
+ Initialize a new node in the image tree.
+
+ Args:
+ state: Contains image and bbox information for this node
+ parent: Reference to parent node (None for root node)
+ fast_confidence: Pre-computed confidence score for this region
+ fast_confidence_details: Detailed confidence information
+ is_terminal: Flag indicating if this is a leaf node that should not be subdivided
+ """
+
+ self.id = next(Node.id_iter) # Assign unique ID
+ if fast_confidence_details is None:
+ fast_confidence_details = {}
+ self.confidence_details = {} # Stores detailed confidence metrics
+ self.cum_confidences: list[float] = [] # Cumulative confidence scores
+ self.fast_confidence = self.confidence = fast_confidence # Quick access confidence score
+ self.fast_confidence_details = fast_confidence_details # Detailed breakdown of confidence
+ self.answering_confidence = 0 # Confidence that this region answers the query
+
+ self.is_terminal = is_terminal # Flag to stop further subdivision
+ self.state = state # Image region information
+ self.parent = parent # Reference to parent node
+ self.children: 'Optional[list[Node]]' = [] # Child nodes (subdivisions)
+ if parent is None:
+ self.depth = 0 # Root node has depth 0
+ else:
+ self.depth = parent.depth + 1 # Child depth is parent depth + 1
+
+ @property
+ def is_leaf(self):
+ """Check if this node has no children (is a leaf node)."""
+ return len(self.children) == 0
+
+ @property
+ def is_root(self):
+ """Check if this node is the root of the tree."""
+ return self.depth == 0
+
+ def add_child(self, child: 'Node'):
+ """Add a child node to this node's children list."""
+ self.children.append(child)
+
+ def save_crop(self, path):
+ """
+ Save the image region corresponding to this node to a file.
+ Useful for debugging and visualization.
+
+ Args:
+ path: File path where the cropped image should be saved
+ """
+ x, y, w, h = self.state.bbox
+ crop_image = self.state.original_image_pil.crop([x, y, x+w, y+h])
+ crop_image.save(path)
+
+def is_terminal(node: Node, smallest_size: int) -> bool:
+ """
+ Determine if a node should be considered terminal (not further subdivided).
+    A node is terminal if both its width and height are smaller than smallest_size (i.e. its longer side is below the threshold).
+
+ Args:
+ node: The node to check
+ smallest_size: Minimum dimension threshold
+
+ Returns:
+ bool: True if the node should not be subdivided further
+ """
+ now_w, now_h = node.state.bbox[2:]
+ return max(now_w, now_h) < smallest_size
+
+class ImageTree:
+ """
+ Hierarchical representation of an image for multi-scale analysis.
+ Divides the image into progressively smaller regions in a quad-tree like structure.
+ """
+ def __init__(self, image_pil, patch_size):
+ """
+ Initialize the image tree with the original image.
+
+ Args:
+ image_pil: The original PIL image
+ patch_size: Minimum size of image patches (termination condition)
+ """
+ self.image_pil = image_pil
+ self.patch_size = patch_size
+ # Create root node covering the entire image
+ self.root = Node(ZoomState(image_pil, [0, 0, image_pil.width, image_pil.height]))
+ self.max_depth = 0 # Track the maximum depth of the tree
+ self._build() # Build the entire tree structure
+
+
+ def _build(self):
+ """Build the entire image tree starting from the root."""
+ self._build_recursive(self.root)
+
+ def _build_recursive(self, node: Node):
+ """
+ Recursively build the image tree by subdividing nodes.
+
+ Args:
+ node: Current node to potentially subdivide
+ """
+ self.max_depth = max(self.max_depth, node.depth) # Update max depth tracker
+ if is_terminal(node, self.patch_size):
+ return # Stop subdivision if node is too small
+
+ # Get subdivision strategy and create sub-patches
+ sub_patches, _, _ = get_sub_patches(node.state.bbox, *split_4subpatches(node.state.bbox))
+ for sub_patch in sub_patches:
+ # Create a new state for each sub-patch
+ next_state = ZoomState(
+ original_image_pil=node.state.original_image_pil,
+ bbox=sub_patch,
+ )
+ # Add child node with the new state
+ node.add_child(Node(
+ state=next_state,
+ parent=node,
+ ))
+
+ # Recursively process each child node
+ for child in node.children:
+ self._build_recursive(child)
+
+
+def get_sub_patches(current_patch_bbox, num_of_width_patches, num_of_height_patches):
+ """
+ Divide a rectangular region into a grid of sub-patches.
+
+ Args:
+ current_patch_bbox: Current bounding box [x, y, width, height]
+ num_of_width_patches: Number of divisions along width
+ num_of_height_patches: Number of divisions along height
+
+ Returns:
+ tuple: (list of sub-patch bboxes, width stride, height stride)
+ """
+    width_stride = int(current_patch_bbox[2] // num_of_width_patches)
+    height_stride = int(current_patch_bbox[3] // num_of_height_patches)
+ sub_patches = []
+ for j in range(num_of_height_patches):
+ for i in range(num_of_width_patches):
+ # Handle edge case where division isn't even
+ sub_patch_width = current_patch_bbox[2] - i*width_stride if i == num_of_width_patches-1 else width_stride
+ sub_patch_height = current_patch_bbox[3] - j*height_stride if j == num_of_height_patches-1 else height_stride
+ # Calculate coordinates for this sub-patch
+ sub_patch = [current_patch_bbox[0]+i*width_stride, current_patch_bbox[1]+j*height_stride, sub_patch_width, sub_patch_height]
+ sub_patches.append(sub_patch)
+ return sub_patches, width_stride, height_stride
+
+def split_4subpatches(current_patch_bbox):
+ """
+ Determine the optimal way to split a patch based on its aspect ratio.
+ Maintains reasonable aspect ratios in the resulting sub-patches.
+
+ Args:
+ current_patch_bbox: Current bounding box [x, y, width, height]
+
+ Returns:
+ tuple: (width_divisions, height_divisions)
+ """
+ hw_ratio = current_patch_bbox[3] / current_patch_bbox[2]
+ if hw_ratio >= 2:
+        # Very tall region: split along the height into 4 vertically stacked strips
+ return 1, 4
+ elif hw_ratio <= 0.5:
+        # Very wide region: split along the width into 4 side-by-side strips
+ return 4, 1
+ else:
+ # Near-square region: split into 2×2 grid
+ return 2, 2
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/utils.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/utils.py
new file mode 100644
index 00000000..52ed278e
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/schemas/utils.py
@@ -0,0 +1,574 @@
+from typing import List
+from copy import deepcopy
+from PIL import Image, ImageDraw
+from omagent_core.models.llms.schemas import Message
+import re
+from omagent_core.utils.general import encode_image
+
+import numpy as np
+import torch
+from .tree import *
+
+import spacy
+nlp = spacy.load("en_core_web_sm") # Load spaCy for NLP processing
+
+
+# Constants for question templates and confidence thresholds
+DECOMPOSED_QUESTION_TEMPLATE = "What is the appearance of the {}?" # Template for generating object-specific questions
+ANSWERING_CONFIDENCE_THRESHOLD_UPPER = 0.4 # Upper confidence threshold for accepting answers
+ANSWERING_CONFIDENCE_THRESHOLD_LOWER = 0 # Lower bound for confidence before terminating search
+
+# Constants for image processing parameters
+SMALLEST_SIZE = 384 # Minimum size for image patches (px)
+DEPTH_LIMIT = 5 # Maximum depth of image tree
+NUM_INTERVEL = 2 # Number of nodes to add after each threshold adjustment
+
+# Calculate node processing limit based on tree depth
+def pop_limit_func(max_depth):
+ """
+ Determine the maximum number of nodes to process based on tree depth.
+ Scales linearly with depth to balance thoroughness and efficiency.
+
+ Args:
+ max_depth: Maximum depth of the image tree
+
+ Returns:
+ int: Number of nodes to process before adjusting thresholds
+ """
+ return max_depth * 3
+
+POP_LIMIT = pop_limit_func # Function to calculate processing limit
+THRESHOLD_DECREASE = [0.1, 0.1, 0.2] # Sequence of threshold reductions during search
+
+
+def get_visual_cues_generation_conversation(question: str):
+ """
+ Generate a conversation prompt for identifying visual cues in the image.
+ Uses few-shot examples to guide the model in identifying relevant objects.
+
+ Args:
+ question: User's question about the image
+
+ Returns:
+ list: Conversation messages for the LLM
+ """
+ conversation = []
+ # Prepare few-shot examples with question template and example pairs
+ prompt = {
+ "question_template": "Question: {}\nIf you want to answer the question, which objects' information do you need?",
+ "question_list": [
+ "What is the color of the boy's bag?",
+ "Is the yellow car on the left or right side of the white car?",
+ "Tell me the number on the black board above the dog.",
+ "Is the girl with pink hair on the left or right side of the man with backpack?",
+ "What kind of animal is on the red sign?",
+ "How many cars in the image?"
+ ],
+ "response_list": [
+ "To answer the question, I need know the location of the boy with a bag so that I can determine the color of the bag. So I need the information about the following objects: boy with a bag.",
+ "To answer the question, I need know the location of the yellow car and the white car so that I can determine the positional relationship between the two of them. So I need the information about the following objects: white car and yellow car.",
+ "To answer the question, I need know the location of the black board above the dog so that I can determine the number on it. So I need the information about the following objects: black board above the dog.",
+ "To answer the question, I need know the location of the girl with pink hair and the man with backpack so that I can determine the positional relationship between the two of them. So I need the information about the following objects: girl with pink hair and man with backpack.",
+ "To answer the question, I need know the location of the red sign so that I can determine the kind of animal on it. So I need the information about the following objects: red sign.",
+ "To answer the question, I need know the location of all cars so that I can determine the number of cars. So I need the information about the following objects: all cars."
+ ]
+ }
+ # Add example QA pairs to conversation
+ for q, a in zip(prompt["question_list"], prompt["response_list"]):
+ conversation.extend([
+ Message.user(prompt["question_template"].format(q)),
+ Message.assistant(a)
+ ])
+
+ # Add user question
+ conversation.append(Message.user(prompt["question_template"].format(question)))
+
+ return conversation
+
+def get_prompt_tag(image_list):
+ """
+ Determine the prompt type based on the number of images.
+
+ Args:
+ image_list: List of images in the conversation
+
+ Returns:
+ str: 'global' for single image, 'zoom' for image with zoomed region
+
+ Raises:
+ ValueError: If unsupported number of images
+ """
+ if len(image_list) == 1:
+ prompt_tag = "global" # Single image - global view
+ elif len(image_list) == 2:
+ prompt_tag = "zoom" # Two images - main image and zoomed region
+ else:
+        raise ValueError(f"Expected 1 or 2 images, got {len(image_list)}")
+ return prompt_tag
+
+
+def get_confidence_conversation(node: Node, image_pil: Image.Image, confidence_type: str, input_ele, root_anyres=True):
+ """
+ Generate a conversation for evaluating confidence of a node.
+
+ Args:
+ node: The image tree node to evaluate
+ image_pil: Original image
+ confidence_type: Type of confidence to evaluate ('existence', 'latent', or 'answering')
+ input_ele: Visual element or question to evaluate
+ root_anyres: Whether to use any resolution for root node
+
+ Returns:
+ list: Conversation messages for the LLM
+
+ Raises:
+ AssertionError: If invalid confidence type
+ """
+ assert confidence_type in ['existence', 'latent', 'answering']
+ zoom_eye_method = ZoomEyeMethod()
+ # Process node into appropriate image format (original or with highlighted region)
+ image_list = zoom_eye_method.process_nodes_to_image_list([node], image_pil, root_anyres=root_anyres)
+ prompt_tag = get_prompt_tag(image_list)
+
+ # Define prompts for different confidence types and image formats
+ prompts = {
+ "global":{
+ "pre_informations": [""],
+ "latent_prompt": "According to your common sense knowledge and the content of image, is it possible to find a {} by further zooming in the image? Answer Yes or No and tell the reason.",
+ "existence_prompt": "Is there a {} in the image? Answer Yes or No.",
+ "answering_prompt": "Question: {}\nCould you answer the question based on the the available visual information? Answer Yes or No.",
+ },
+ "zoom":{
+ "pre_informations": ["This is the main image, and the section enclosed by the {BOX_COLOR} rectangle is the focus region.\n", "This is the zoomed-in view of the focus region.\n"],
+ "latent_prompt": "According to your common sense knowledge and the content of the zoomed-in view, along with its location in the image, is it possible to find a {} by further zooming in the current view? Answer Yes or No and tell the reason.",
+ "existence_prompt": "Is there a {} in the zoomed-in view? Answer Yes or No.",
+ "answering_prompt": "Question: {}\nCould you answer the question based on the the available visual information? Answer Yes or No.",
+ },
+ }
+ pre_informations = prompts[prompt_tag]["pre_informations"]
+ instruction = prompts[prompt_tag][f"{confidence_type}_prompt"].format(input_ele)
+
+ # Build conversation with images and instructions
+ conversation = []
+ input_content = []
+ for image, pre_information in zip(image_list, pre_informations):
+ if pre_information != "":
+ input_content.append(pre_information)
+ input_content.append(image)
+ input_content.append(instruction)
+ conversation.append(Message.user(input_content))
+
+ return conversation
+
+def calculate_confidence_from_logprobs(response, yes_tokens=["yes", "Yes", "YES"], no_tokens=["no", "No", "NO"]):
+ """
+ Calculate confidence score from model logprobs for Yes/No questions.
+
+ Args:
+ response: LLM response containing logprobs
+ yes_tokens: List of tokens representing "yes"
+ no_tokens: List of tokens representing "no"
+
+ Returns:
+ float: Confidence value in range [-1, 1], positive for "yes", negative for "no"
+ """
+ first_token_logprobs = response['choices'][0]['logprobs']['content'][0]['top_logprobs']
+
+ yes_logprob = float('-inf')
+ no_logprob = float('-inf')
+
+ # Find the highest logprob for yes and no tokens
+ for item in first_token_logprobs:
+ if item['token'] in yes_tokens:
+ yes_logprob = max(yes_logprob, item['logprob'])
+ elif item['token'] in no_tokens:
+ no_logprob = max(no_logprob, item['logprob'])
+
+ # If neither yes nor no tokens found
+ if yes_logprob == float('-inf') and no_logprob == float('-inf'):
+ return 0.0
+
+ # Convert logprobs to normalized confidence score
+ logprob_yesno = [yes_logprob, no_logprob]
+ yes_prob = torch.softmax(torch.tensor(logprob_yesno), dim=-1)[0]
+ confidence = 2 * (yes_prob.item() - 0.5) # Map from [0,1] to [-1,1]
+
+ return confidence
+
+def get_output_conversation(candidates: List[Node], image_pil: Image.Image, question: str):
+ """
+ Generate a conversation for producing the final answer.
+
+ Args:
+ candidates: List of nodes identified as relevant to the question
+ image_pil: Original image
+ question: User's question
+
+ Returns:
+ list: Conversation messages for the LLM
+ """
+ zoom_eye_method = ZoomEyeMethod()
+ # Process candidate nodes into appropriate image format
+ image_list = zoom_eye_method.process_nodes_to_image_list(candidates, image_pil)
+ conversation = []
+ input_content = []
+ for image in image_list:
+ input_content.append(image)
+ input_content.append(question)
+ conversation.append(Message.user(input_content))
+ return conversation
+
+def extract_targets(sentence: str, pattern = r"So I need the information about the following objects: (.+)"):
+ """
+ Extract target objects from the model's response.
+
+ Args:
+ sentence: Model response text
+ pattern: Regex pattern to extract object list
+
+ Returns:
+ str: Extracted target objects as text, or None if not found
+ """
+ match = re.search(pattern, sentence)
+ if match:
+ return match.group(1)
+ return None
+
+def split_targets_sentence(targets_sentence:str, split_tag = r' and |, '):
+ """
+ Split a comma or 'and' separated list of target objects.
+
+ Args:
+ targets_sentence: String containing multiple targets
+ split_tag: Regex pattern for splitting
+
+ Returns:
+ list: Individual target objects, or None if input is None
+ """
+ if targets_sentence is None:
+ return None
+ if targets_sentence.endswith('.'):
+ targets_sentence = targets_sentence[:-1] # Remove trailing period
+ targets = re.split(split_tag, targets_sentence)
+ return targets
+
+def include_pronouns(text):
+ """
+ Check if text includes pronouns (which should be filtered out).
+
+ Args:
+ text: Text to check for pronouns
+
+ Returns:
+ bool: True if pronouns are present, False otherwise
+ """
+ doc = nlp(text)
+ for token in doc:
+ if token.pos_ == 'PRON':
+ return True
+ return False
+
+def visualize_bbox_and_arrow(image: Image.Image, bbox, color="red", thickness=2, xyxy=False):
+ """
+ Draw a bounding box on an image.
+
+ Args:
+ image: PIL Image to draw on
+ bbox: Bounding box coordinates
+ color: Color of the bounding box
+ thickness: Line thickness
+ xyxy: Whether bbox format is [x1,y1,x2,y2] instead of [x,y,w,h]
+
+ Returns:
+ list: Adjusted bounding box coordinates
+ """
+ if not xyxy:
+ x1, y1, w, h = bbox
+ x2 = x1 + w
+ y2 = y1 + h
+ else:
+ x1, y1, x2, y2 = bbox
+ # Ensure bbox is within image bounds
+ x1 = max(0, x1 - thickness)
+ y1 = max(0, y1 - thickness)
+ x2 = min(image.width, x2 + thickness)
+ y2 = min(image.height, y2 + thickness)
+ draw = ImageDraw.Draw(image)
+ new_bbox = [x1, y1, x2, y2]
+ draw.rectangle((x1, y1, x2, y2), outline=color, width=thickness)
+ return new_bbox
+
+def expand2square(pil_img, background_color):
+ """
+ Expand image to square with padding if needed.
+
+ Args:
+ pil_img: Original PIL image
+ background_color: Color for padding
+
+ Returns:
+ tuple: (Square image, x_offset, y_offset)
+ """
+ width, height = pil_img.size
+ if width == height:
+ return deepcopy(pil_img), 0, 0
+ elif width > height:
+ result = Image.new(pil_img.mode, (width, width), background_color)
+ result.paste(pil_img, (0, (width - height) // 2))
+ return result, 0, (width - height) // 2
+ else:
+ result = Image.new(pil_img.mode, (height, height), background_color)
+ result.paste(pil_img, ((height - width) // 2, 0))
+ return result, (height - width) // 2, 0
+
+def bbox_area(bbox):
+ """
+ Calculate the area of a bounding box.
+
+ Args:
+ bbox: Bounding box in [x1,y1,x2,y2] format
+
+ Returns:
+ int: Area of the bounding box
+ """
+ x_min, y_min, x_max, y_max = bbox
+ return (x_max - x_min) * (y_max - y_min)
+
+def intersect_bbox(bboxA, bboxB, distance_buffer=50):
+ """
+ Find the intersection between two bounding boxes with buffer.
+
+ Args:
+ bboxA: First bounding box
+ bboxB: Second bounding box
+ distance_buffer: Buffer distance to expand boxes
+
+ Returns:
+ list: Intersection bounding box, or None if no intersection
+ """
+ bbox1 = [v-distance_buffer if i<2 else v+distance_buffer for i, v in enumerate(bboxA)]
+ bbox2 = [v-distance_buffer if i<2 else v+distance_buffer for i, v in enumerate(bboxB)]
+
+ x_min = max(bbox1[0], bbox2[0])
+ y_min = max(bbox1[1], bbox2[1])
+ x_max = min(bbox1[2], bbox2[2])
+ y_max = min(bbox1[3], bbox2[3])
+
+ if x_max > x_min and y_max > y_min:
+ return (x_min, y_min, x_max, y_max)
+
+ return None
+
+
+def merge_bboxes(bbox1, bbox2):
+ """
+ Merge two bounding boxes into one containing both.
+
+ Args:
+ bbox1: First bounding box
+ bbox2: Second bounding box
+
+ Returns:
+ tuple: Merged bounding box
+ """
+ return (
+ min(bbox1[0], bbox2[0]),
+ min(bbox1[1], bbox2[1]),
+ max(bbox1[2], bbox2[2]),
+ max(bbox1[3], bbox2[3])
+ )
+
+def union_all_bboxes(bboxes):
+ """
+ Find the union of all bounding boxes in a list.
+
+ Args:
+ bboxes: List of bounding boxes
+
+ Returns:
+ tuple: Union bounding box, or None if list is empty
+ """
+ if len(bboxes) == 0:
+ return None
+ ret = bboxes[0]
+ for bbox in bboxes[1:]:
+ ret = merge_bboxes(ret, bbox)
+ return ret
+
+def merge_bbox_list(bboxes, threshold=0):
+ """
+ Merge all intersecting bounding boxes in a list.
+
+ Args:
+ bboxes: List of bounding boxes
+ threshold: Overlap threshold for merging (0-1)
+
+ Returns:
+ list: Merged bounding boxes
+ """
+ changed = True
+ while changed:
+ changed = False
+ new_bboxes = []
+ used = set()
+
+ for i in range(len(bboxes)):
+ if i in used:
+ continue
+ merged = False
+
+ for j in range(len(bboxes)):
+ if j in used or i == j:
+ continue
+ intersection = intersect_bbox(bboxes[i], bboxes[j])
+ if intersection:
+ if threshold == 0 or (threshold > 0 and (bbox_area(intersection) >= threshold * bbox_area(bboxes[i]) or bbox_area(intersection) >= threshold * bbox_area(bboxes[j]))):
+ new_bbox = merge_bboxes(bboxes[i], bboxes[j])
+ new_bboxes.append(new_bbox)
+ used.update([i, j])
+ changed = True
+ merged = True
+ break
+ if not merged and i not in used:
+ new_bboxes.append(bboxes[i])
+
+ bboxes = new_bboxes
+
+ return bboxes
+
+BOX_COLOR = 'red' # Color for highlighting bounding boxes
+
+class ZoomEyeMethod:
+ """
+ Class for processing image regions and creating visual representations.
+ Handles the visual aspects of the ZoomEye system including image cropping,
+ bounding box visualization, and image formatting.
+ """
+ def __init__(self, background_color=(127, 127, 127), patch_scale=1.2, bias_value=0.6):
+ """
+ Initialize the ZoomEye image processing method.
+
+ Args:
+ background_color: Color for padding and background
+ patch_scale: Scale factor for image patches
+ bias_value: Bias value for confidence weighting
+ """
+ self.background_color = background_color
+ self.patch_scale = patch_scale
+ self.input_size = (SMALLEST_SIZE, SMALLEST_SIZE)
+ self.bias_value = bias_value
+
+ def is_root_only(self, nodes: List[Node]):
+ """Check if the list contains only the root node."""
+ return (len(nodes)==1 and nodes[0].is_root)
+
+ def include_root(self, nodes: List[Node]):
+ """Check if the list includes the root node."""
+ return any(node.is_root for node in nodes)
+
+ def get_bbox_in_square_image(self, bbox, left, top):
+ """Adjust bbox coordinates for a square padded image."""
+ x1, y1, x2, y2 = bbox
+ return [x1+left, y1+top, x2+left, y2+top]
+
+ def get_patch(self, bbox, image_width, image_height, patch_size, patch_scale=None):
+ """
+ Calculate patch coordinates based on object bbox.
+ Centers the patch on the object and ensures minimum size.
+
+ Args:
+ bbox: Object bounding box [x,y,w,h]
+ image_width: Width of the original image
+ image_height: Height of the original image
+ patch_size: Minimum patch size
+ patch_scale: Optional scaling factor
+
+ Returns:
+ list: Patch coordinates [left,top,right,bottom]
+ """
+ object_width = int(np.ceil(bbox[2]))
+ object_height = int(np.ceil(bbox[3]))
+
+ object_center_x = int(bbox[0] + bbox[2]/2)
+ object_center_y = int(bbox[1] + bbox[3]/2)
+
+ patch_width = max(object_width, patch_size)
+ patch_height = max(object_height, patch_size)
+ if patch_scale is not None:
+ patch_width = int(patch_width*patch_scale)
+ patch_height = int(patch_height*patch_scale)
+
+ left = max(0, object_center_x-patch_width//2)
+ right = min(left+patch_width, image_width)
+
+ top = max(0, object_center_y-patch_height//2)
+ bottom = min(top+patch_height, image_height)
+
+ return [left, top, right, bottom]
+
+ def draw_bbox_arrow_in_square_image(self, square_image, resized_bbox, color):
+ """Draw bounding box on an image with appropriate thickness."""
+ thickness = square_image.width//120
+ new_bbox = visualize_bbox_and_arrow(square_image, resized_bbox, color, thickness, xyxy=True)
+ return new_bbox
+
+ def process_nodes_to_image_list(self, nodes: List[Node], image_pil, root_anyres=True):
+ """
+ Process nodes into a list of images for visualization.
+
+ For root node: returns just the original image
+ For other nodes: returns original image with highlighted region + zoomed view
+
+ Args:
+ nodes: List of nodes to visualize
+ image_pil: Original image
+ root_anyres: Whether to use original resolution for root node
+
+ Returns:
+ list: List of images for the conversation
+ """
+ square_image, left, top = expand2square(image_pil, self.background_color)
+ if self.is_root_only(nodes):
+ return [deepcopy(image_pil)] if root_anyres else [square_image.resize(self.input_size)]
+ if len(nodes) == 0 or self.include_root(nodes):
+ return [deepcopy(image_pil)]
+
+ # Get and merge bounding boxes for all nodes
+ resized_bboxes = [self.get_patch(node.state.bbox, image_pil.width, image_pil.height, patch_size=self.input_size[0], patch_scale=self.patch_scale) for node in nodes]
+ resized_bboxes = merge_bbox_list(resized_bboxes, threshold=0)
+
+ # Draw bounding boxes on the square image
+ full_color_bboxes = []
+ for i in range(len(resized_bboxes)):
+ resized_bbox = self.get_bbox_in_square_image(resized_bboxes[i], left, top)
+ color_bbox = self.draw_bbox_arrow_in_square_image(square_image, resized_bbox, BOX_COLOR)
+ full_color_bboxes.append(color_bbox)
+
+ # Create zoomed view of the union of all bounding boxes
+ union_color_bboxes = union_all_bboxes(full_color_bboxes)
+ if union_color_bboxes is None:
+ return [square_image]
+ zoomed_view = square_image.crop(union_color_bboxes)
+
+ return [square_image.resize(self.input_size), zoomed_view]
+
+ def get_confidence_weight(self, node: Node, max_depth: int):
+ """
+ Calculate confidence weight based on node depth.
+ Deeper nodes get higher weight to prioritize detailed regions.
+
+ Args:
+ node: Node to calculate weight for
+ max_depth: Maximum depth of the tree
+
+ Returns:
+ float: Confidence weight between bias_value and 1.0
+ """
+ coeff = (1 - self.bias_value) / (max_depth ** 2)
+ return coeff * (node.depth**2) + self.bias_value
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/workflow.py b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/workflow.py
new file mode 100644
index 00000000..3d448bf1
--- /dev/null
+++ b/omagent-core/src/omagent_core/advanced_components/workflow/ZoomEye/workflow.py
@@ -0,0 +1,105 @@
+from omagent_core.engine.workflow.task.simple_task import simple_task
+from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+from omagent_core.engine.workflow.task.do_while_task import DoWhileTask
+from omagent_core.engine.workflow.task.switch_task import SwitchTask
+
+
+class ZoomEyeWorkflow(ConductorWorkflow):
+ """
+ ZoomEye workflow implementation for processing visual queries.
+ Extends the ConductorWorkflow to create a specialized workflow
+ for handling image-based search and analysis.
+ """
+
+ def __init__(self):
+ """
+ Initialize the ZoomEye workflow with a lightweight configuration.
+ """
+ super().__init__(name='zoomeye_workflow', lite_version=True)
+
+
+ def set_input(self, query: str, image_path: str, qid: str="test"):
+ """
+ Set input parameters for the workflow and configure tasks.
+
+ Args:
+ query (str): The text query from the user
+ image_path (str): Path to the image to be analyzed
+ qid (str, optional): Query identifier. Defaults to "test".
+ """
+ self.qid = qid
+ self.query = query
+ self.image_path = image_path
+ self._configure_tasks()
+ self._configure_workflow()
+
+ def _configure_tasks(self):
+ """
+ Configure all tasks in the ZoomEye workflow pipeline.
+ Creates and connects various processing stages for the image analysis workflow.
+ """
+
+ # Task for generating visual cues from the input image
+ self.visual_cue_generation = simple_task(
+ task_def_name='VisualCueGeneration',
+ task_reference_name='visual_cue_generation',
+ inputs={
+ "qid": self.qid,
+ "query": self.query,
+ "image_path": self.image_path
+ }
+ )
+
+ # Task for preprocessing data before search operations
+ zoomeye_preprocess = simple_task(
+ task_def_name='ZoomEyePreprocess',
+ task_reference_name='zoomeye_preprocess',
+ )
+
+ # Task for executing the ZoomEye search with current parameters
+ zoomeye_search = simple_task(
+ task_def_name='ZoomEyeSearch',
+ task_reference_name='zoomeye_search'
+ )
+
+ # Task for checking search results and determining if more searches are needed
+ zoomeye_search_check = simple_task(
+ task_def_name='ZoomEyeSearchCheck',
+ task_reference_name='zoomeye_search_check'
+ )
+
+ # Loop task for repeated search operations until adequate results are found
+ zoomeye_search_loop = DoWhileTask(
+ task_ref_name='zoomeye_search_loop',
+ tasks=[zoomeye_search, zoomeye_search_check],
+ termination_condition='if ($.zoomeye_search_check["search_check_finish"] == true){false;} else {true;}'
+ )
+
+ # Task for checking if the entire search process should continue with new parameters
+ zoomeye_loop_check = simple_task(
+ task_def_name='ZoomEyeLoopCheck',
+ task_reference_name='zoomeye_loop_check'
+ )
+
+ # Main loop task that includes preprocessing, searching, and checking
+        # Continues until satisfactory results are found or timeout conditions are met
+ self.zoomeye_loop = DoWhileTask(
+ task_ref_name='zoomeye_loop',
+ tasks=[zoomeye_preprocess, zoomeye_search_loop, zoomeye_loop_check],
+ termination_condition='if ($.zoomeye_loop_check["loop_check_finish"] == true){false;} else {true;}'
+ )
+
+ # Final task to format and return the search results
+ self.zoomeye_output = simple_task(
+ task_def_name='ZoomEyeOutput',
+ task_reference_name='zoomeye_output'
+ )
+
+
+ def _configure_workflow(self):
+ """
+ Configure the workflow execution sequence.
+ Establishes the flow between different tasks in the pipeline.
+ """
+ # Connect tasks in sequence: visual cue generation -> search loop -> output formatting
+ self >> self.visual_cue_generation >> self.zoomeye_loop >> self.zoomeye_output
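+
+# Example usage (illustrative sketch; query and image path are placeholders, and a
+# configured lite/pro client is assumed to execute the workflow):
+#   workflow = ZoomEyeWorkflow()
+#   workflow.set_input(query="What is the color of the cup on the table?",
+#                      image_path="./example.jpg", qid="demo")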
\ No newline at end of file
diff --git a/omagent-core/src/omagent_core/clients/devices/programmatic/lite_client.py b/omagent-core/src/omagent_core/clients/devices/programmatic/lite_client.py
index 59222478..31df410b 100644
--- a/omagent-core/src/omagent_core/clients/devices/programmatic/lite_client.py
+++ b/omagent-core/src/omagent_core/clients/devices/programmatic/lite_client.py
@@ -1,4 +1,5 @@
-
+from pathlib import Path
+import uuid
from omagent_core.utils.container import container
from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
from omagent_core.utils.build import build_from_file
@@ -28,16 +29,21 @@ def __init__(
self._workers = workers
self._input_prompt = input_prompt
worker_config = build_from_file(self._config_path)
+ self.workflow_instance_id = str(uuid.uuid4())
self.initialization(workers, worker_config)
def initialization(self, workers, worker_config):
self.workers = {}
for worker in workers:
+ worker.workflow_instance_id = self.workflow_instance_id
self.workers[type(worker).__name__] = worker
for config in worker_config:
- worker_cls = registry.get_worker(config['name'])
- self.workers[config['name']] = worker_cls(**config)
+ worker_cls = registry.get_worker(config['name'])
+ worker = worker_cls(**config)
+ worker.workflow_instance_id = self.workflow_instance_id
+ self.workers[config['name']] = worker
def start_processor_with_input(self, workflow_input: dict):
self._interactor.start_workflow_with_input(workflow_input=workflow_input, workers=self.workers)
diff --git a/omagent-core/src/omagent_core/models/llms/vstar.py b/omagent-core/src/omagent_core/models/llms/vstar.py
new file mode 100644
index 00000000..225d5869
--- /dev/null
+++ b/omagent-core/src/omagent_core/models/llms/vstar.py
@@ -0,0 +1,180 @@
+import os
+import requests
+import io
+import base64
+from typing import Any, Dict, List, Union, Optional
+from PIL import Image
+
+from pydantic import Field
+
+from omagent_core.utils.registry import registry
+from omagent_core.models.llms.base import BaseLLM
+from omagent_core.models.llms.schemas import Content, Message
+
+
+@registry.register_llm()
+class VStarLLM(BaseLLM):
+ """VStar LLM backend for visual understanding tasks."""
+
+ endpoint: str = Field(
+ default=os.getenv("ENDPOINT", "https://vstar.om-ai.com"),
+ description="The endpoint of VStar service"
+ )
+ # temperature: float = Field(default=1.0, description="The temperature of LLM")
+ # max_tokens: int = Field(default=2048, description="The max tokens of LLM")
+
+ class Config:
+ """Configuration for this Pydantic object."""
+ protected_namespaces = ()
+ extra = "allow"
+
+ def model_post_init(self, __context: Any) -> None:
+ """Initialize the VStar client."""
+ # No specific client initialization needed for REST API calls
+ pass
+
+ def _image_to_base64(self, img: Image.Image) -> str:
+ """Convert a PIL.Image object to base64 encoding.
+
+ Args:
+ img (Image.Image): The image to convert.
+
+ Returns:
+ str: Base64 encoded string of the image.
+ """
+ buffered = io.BytesIO()
+ img.save(buffered, format="JPEG")
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+ def _prepare_vstar_request(self, prompt: str, image: Image.Image, **kwargs) -> Dict:
+ """Prepare VStar API request data from the prompt and image.
+
+ Args:
+ prompt (str): The text prompt for the VStar API.
+ image (Image.Image): The image to be processed.
+
+ Returns:
+ Dict: A dictionary containing the prepared request data.
+
+ Raises:
+ ValueError: If the image is None.
+ """
+ # Ensure prompt is not None
+ if prompt is None:
+ prompt = ""
+
+ # Ensure image is provided
+ if image is None:
+ raise ValueError("Image is required for VStar API")
+
+ # Prepare request data
+ encoded_image = self._image_to_base64(image)
+ data = {
+ "prompt": prompt,
+ "image_base64": encoded_image
+ }
+
+ # Update with any additional keyword arguments
+ data.update(kwargs)
+ return data
+
+    def _call(self, prompt: str, image: Image.Image, model: str = "vqa_llm", mode: Optional[str] = None, **kwargs) -> Dict:
+ """Call the VStar API and return the response.
+
+ Args:
+ prompt (str): The text prompt for the API.
+ image (Image.Image): The image to be processed.
+ model (str): The model type to use (default is "vqa_llm").
+ mode (str): The mode of operation (optional).
+
+ Returns:
+ Dict: The response from the VStar API formatted to match LLM output structure.
+
+ Raises:
+ RuntimeError: If the API response status code is not 200.
+ """
+ # Validate model and mode
+ assert model in ["vqa_llm", "visual_search_model", "vqa_llm_post"], f"Invalid model: {model}"
+
+ if model == "visual_search_model":
+ assert mode in ["vqa", "detection", "segmentation"], f"Invalid mode: {mode}"
+ data = self._prepare_vstar_request(prompt, image, mode=mode, **kwargs)
+ else:
+ assert mode is None, f"Mode should not be provided for model {model}"
+ data = self._prepare_vstar_request(prompt, image, **kwargs)
+
+ # Make API request
+ response = requests.post(f"{self.endpoint}/{model}", json=data)
+
+ # Check for successful response
+ if response.status_code != 200:
+ raise RuntimeError(f"Error: Received status code {response.status_code}")
+
+ response_data = response.json()
+
+ # Format response to match LLM output structure
+ result = {
+ "id": f"vstar-{model}-{mode}",
+ "object": "chat.completion",
+ "created": int(response.elapsed.total_seconds()),
+ "model": "vstar",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": response_data.get("generated_text", response_data.get("response", ""))
+ },
+ "finish_reason": "stop"
+ }
+ ]
+ }
+
+ return result
+
+ def vqa(self, prompt: str, image: Image.Image, **kwargs) -> Dict:
+ """Perform Visual Question Answering (VQA) using the VStar LLM.
+
+ Args:
+ prompt (str): The question to be answered.
+ image (Image.Image): The image to analyze.
+
+ Returns:
+ Dict: The response from the VStar API for the VQA task.
+ """
+ return self._call(prompt, image, model="vqa_llm", **kwargs)
+
+ def visual_search(self, target_object_name: str, image: Image.Image, mode: str, **kwargs) -> Dict:
+ """Perform a visual search for a target object using the VStar LLM.
+
+ Args:
+ target_object_name (str): The name of the object to locate.
+ image (Image.Image): The image to search within.
+ mode (str): The mode of the visual search (e.g., "vqa", "detection", "segmentation").
+
+ Returns:
+ Dict: The response from the VStar API for the visual search task.
+
+ Raises:
+ AssertionError: If the mode is invalid.
+ """
+ assert mode in ["vqa", "detection", "segmentation"], f"Invalid mode: {mode}"
+ if mode == "vqa":
+ prompt = f"According to the common sense knowledge and possible visual cues, what is the most likely location of the {target_object_name} in the image?"
+ else:
+ prompt = f"Please locate the {target_object_name} in this image."
+
+ return self._call(prompt, image, model="visual_search_model", mode=mode, **kwargs)
+
+ def vqa_post(self, prompt: str, image: Image.Image, **kwargs) -> Dict:
+ """Perform post-processing for Visual Question Answering (VQA) using the VStar LLM.
+
+ Args:
+ prompt (str): The question to be answered.
+ image (Image.Image): The image to analyze.
+
+ Returns:
+ Dict: The response from the VStar API for the VQA post-processing task.
+ """
+ return self._call(prompt, image, model="vqa_llm_post", **kwargs)
+
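+# Example usage (illustrative sketch; assumes the service endpoint is reachable and that
+# VStarLLM can be constructed with only an endpoint):
+#   llm = VStarLLM(endpoint="http://localhost:8000")
+#   img = Image.open("example.jpg")
+#   vqa_result = llm.vqa("What is on the table?", img)
+#   detection_result = llm.visual_search("red cup", img, mode="detection")
+#   answer = vqa_result["choices"][0]["message"]["content"]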