From 5cc0289bdd35c1a349cef5240a3c804241bade3f Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 1 Oct 2025 19:08:35 +0800 Subject: [PATCH 1/8] add vlrmbench dataset --- vlmeval/dataset/__init__.py | 24 +- vlmeval/dataset/vlrmbench.py | 473 +++++++++++++++++++++++++++++++++++ 2 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 vlmeval/dataset/vlrmbench.py diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 855049d4a..7db71e8d4 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -79,6 +79,23 @@ from .chartmimic import ChartMimic from .m4bench import M4Bench +# VLRMBench imports +from .vlrmbench import ( + VLRMBenchBase, + VLRMBenchAttributeHallucination, + VLRMBenchDetailError, + VLRMBenchStepCorrectness, + VLRMBenchForesight, + VLRMBenchErrorCorrection, + VLRMBenchErrorReasonAnalysis, + VLRMBenchExistenceHallucination, + VLRMBenchImageRefError, + VLRMBenchLocationError, + VLRMBenchMostConfidence, + VLRMBenchMultiSolution, + VLRMBenchRedundantDet, +) + class ConcatDataset(ImageBaseDataset): # This dataset takes multiple dataset names as input and aggregate them into a single dataset. @@ -205,7 +222,12 @@ def evaluate(self, eval_file, **judge_kwargs): ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset, MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, - AyaVisionBench, TopViewRS, VLMBias + AyaVisionBench, TopViewRS, VLMBias, + # VLRMBench datasets + VLRMBenchAttributeHallucination, VLRMBenchDetailError, VLRMBenchStepCorrectness, + VLRMBenchForesight, VLRMBenchErrorCorrection, VLRMBenchErrorReasonAnalysis, + VLRMBenchExistenceHallucination, VLRMBenchImageRefError, VLRMBenchLocationError, + VLRMBenchMostConfidence, VLRMBenchMultiSolution, VLRMBenchRedundantDet ] VIDEO_DATASET = [ diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py new file mode 100644 index 000000000..395e83103 --- /dev/null +++ b/vlmeval/dataset/vlrmbench.py @@ -0,0 +1,473 @@ +import os +import json +import zipfile +import re +import tempfile +from pathlib import Path +from typing import Dict, List, Any, Union, Optional +import pandas as pd +import numpy as np +from huggingface_hub import snapshot_download +from sklearn.metrics import f1_score +from ..smp import * +from .image_base import ImageBaseDataset + + +class VLRMBenchBase(ImageBaseDataset): + """ + VLRMBench数据集基础类 + 支持从HuggingFace下载和解压数据,处理JSONL格式的推理错误检测数据 + """ + + MODALITY = "IMAGE" + TYPE = "VQA" # 设置为VQA类型以支持开放式问答 + + # 支持的子集列表 + SUPPORTED_SUBSETS = [ + "attribute_hallucination", + "detail_error", + "error_correction", + "error_reason_analysis", + "existence_hallucination", + "foresight", + "image_ref_error", + "location_error", + "most_confidence", + "multi_solution", + "redundant_det", + "step_correctness", + ] + + # HuggingFace仓库信息 + HF_REPO = "Winston-Yuan/VLRMBench" + + @classmethod + def supported_datasets(cls): + """返回支持的数据集名称列表""" + return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] + + def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): + """ + 初始化VLRMBench数据集 + + Args: + dataset: 数据集名称,格式为VLRMBench_{subset} + **kwargs: 其他参数 + """ + # 从数据集名称中提取子集名称 + if dataset.startswith("VLRMBench_"): + subset = dataset[len("VLRMBench_") :] + else: + subset = "attribute_hallucination" # 默认子集 + + if subset not in self.SUPPORTED_SUBSETS: + raise ValueError(f"Unsupported subset: {subset}. 
Supported subsets: {self.SUPPORTED_SUBSETS}") + + self.subset = subset + self.dataset_name = dataset + + # 设置数据根目录 + ROOT = LMUDataRoot() + self.data_root = osp.join(ROOT, "datasets", "VLRMBench") + os.makedirs(self.data_root, exist_ok=True) + + # 下载和解压数据 + self.data_dir = self._download_and_extract() + + # 加载数据 + data = self._load_jsonl_data() + + # 确保数据有index字段 + for i, item in enumerate(data): + if 'index' not in item: + item['index'] = i + + # 转换为DataFrame格式 + self.data = pd.DataFrame(data) + + # 设置图片根目录 + self.img_root = osp.join(self.data_dir, "images") + + # 设置评估模式 + self.evaluation_mode = self._get_evaluation_mode() + + # 后处理 + self.post_build(dataset) + + def _download_and_extract(self) -> str: + """ + 从HuggingFace下载数据并解压 + + Returns: + str: 解压后的数据目录路径 + """ + local_dir = osp.join(self.data_root, "VLRMBench-HF") + + # 检查是否已经下载 + if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): + print(f"VLRMBench data already exists at {local_dir}") + return local_dir + + print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") + + + # 下载数据 + snapshot_download( + repo_id=self.HF_REPO, + repo_type="dataset", + local_dir=local_dir, + local_dir_use_symlinks=False, + tqdm_class=None, # 使用默认的tqdm进度条 + ) + + # 解压图片文件 + self._extract_images(local_dir) + + print(f"VLRMBench data downloaded and extracted to {local_dir}") + return local_dir + + def _extract_images(self, data_dir: str): + """ + 解压图片文件 + + Args: + data_dir: 数据目录路径 + """ + zip_file = osp.join(data_dir, "Image.zip") + images_dir = osp.join(data_dir, "images") + + if osp.exists(images_dir): + print("Images already extracted") + return + + if not osp.exists(zip_file): + raise FileNotFoundError(f"Image.zip not found at {zip_file}") + + print(f"Extracting images from {zip_file}") + os.makedirs(images_dir, exist_ok=True) + + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(images_dir) + + print(f"Images extracted to {images_dir}") + + def _load_jsonl_data(self) -> List[Dict]: + """ + 加载JSONL数据文件 + + Returns: + List[Dict]: 加载的数据列表 + """ + jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") + + if not osp.exists(jsonl_file): + raise FileNotFoundError(f"JSONL file not found: {jsonl_file}") + + data = [] + with open(jsonl_file, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + data.append(json.loads(line)) + + print(f"Loaded {len(data)} samples from {self.subset}") + return data + + def _get_evaluation_mode(self) -> str: + """ + 根据子集确定评估模式 + + Returns: + str: 评估模式 + """ + # 前瞻性推理使用任务级评估 + if self.subset == "foresight": + return "foresight" + + # 多解任务使用特殊评估 + if self.subset == "multi_solution": + return "multi_solution" + + # 生成任务使用judge评估 (暂时跳过) + if self.subset in ["error_correction", "error_reason_analysis"]: + return "generation" + + # 其他子集使用二进制分类评估 + return "binary_classification" + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + item = dict(self.data.iloc[idx]) + + # 处理图片路径 + if "image" in item and isinstance(item["image"], list): + # 将相对路径转换为绝对路径 + image_paths = [] + for img_path in item["image"]: + full_path = osp.join(self.img_root, img_path) + if osp.exists(full_path): + image_paths.append(full_path) + else: + print(f"Warning: Image not found: {full_path}") + + item["image"] = image_paths[0] if len(image_paths) == 1 else image_paths + + return item + + def post_build(self, dataset): + """后处理,设置数据集特定属性""" + # 设置评估指标 + if self.evaluation_mode == "binary_classification": + self.metrics = ["f1_positive", 
"f1_negative", "f1_weighted", "step_accuracy"] + elif self.evaluation_mode == "foresight": + self.metrics = ["task_accuracy"] + elif self.evaluation_mode == "multi_solution": + self.metrics = ["multi_solution_accuracy"] + elif self.evaluation_mode == "generation": + self.metrics = ["win_rate", "judge_score"] + else: + self.metrics = ["overall_accuracy"] + + def build_prompt(self, line): + """ + 构建提示词 - 根据评估模式构建不同的提示词 + + Args: + line: 数据行 + + Returns: + str: 构建的提示词 + """ + question = line["question"] + + if self.evaluation_mode == "binary_classification": + step_list = line.get("step_list", []) + prompt = f"Question: {question}\n\nReasoning Steps:\n" + for i, step in enumerate(step_list, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nPlease identify which steps contain errors. Output format: [0,1,0,1,...]" + + elif self.evaluation_mode == "foresight": + step_list = line.get("step_list", []) + prompt = f"Question: {question}\n\nReasoning Steps:\n" + for i, step in enumerate(step_list, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nDoes this reasoning show good foresight? Answer: yes/no" + + elif self.evaluation_mode == "multi_solution": + prompt = f"Question: {question}\n\nPlease provide two different solution approaches." + + elif self.evaluation_mode == "generation": + reasoning_error = line.get("reasoning_error", []) + prompt = f"Question: {question}\n\nReasoning with errors:\n" + for i, step in enumerate(reasoning_error, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nPlease analyze and correct the errors in this reasoning." + + return prompt + + def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化模型答案 - 基于原始评测脚本的逻辑 + + Args: + model_answer: 模型原始输出 + task_gt: 任务真实标签 + + Returns: + List[int]: 格式化后的预测结果 + """ + if self.evaluation_mode == "binary_classification": + return self._format_binary_classification_answer(model_answer, task_gt) + elif self.evaluation_mode == "multi_solution": + return self._format_multi_solution_answer(model_answer, task_gt) + else: + return [] + + def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化二进制分类任务的模型答案 + 基于 get_sc_mc_rd_eval_res.py 中的 format_model_answer_tolist 函数 + """ + # 提取数字 + numbers = re.findall(r"\d+", model_answer) + result = [int(num) for num in numbers] + + # 将非0/1的数字转换为1 + result = [num if num == 0 or num == 1 else 1 for num in result] + + # 调整长度以匹配真实标签 + if len(result) >= len(task_gt): + return result[: len(task_gt)] + else: + return result + [0] * (len(task_gt) - len(result)) + + def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化多解任务的模型答案 + 基于 get_ms_eval_res.py 中的 format_ms_ec_era_model_answer_tolist 函数 + """ + numbers = re.findall(r"\d+", model_answer) + result = [int(num) for num in numbers] + + if len(result) >= len(task_gt): + return result[-len(task_gt) :] + else: + return result + [0] * (len(task_gt) - len(result)) + + def evaluate_binary_classification( + self, predictions: List[List[int]], ground_truths: List[List[int]] + ) -> Dict[str, float]: + """ + 评估二进制分类任务 + 基于 get_sc_mc_rd_eval_res.py 中的评估逻辑 + """ + # 展平所有预测和真实标签 + flat_predictions = [] + flat_ground_truths = [] + + for pred, gt in zip(predictions, ground_truths): + flat_predictions.extend(pred) + flat_ground_truths.extend(gt) + + # 转换为numpy数组 + pred_array = np.array(flat_predictions) + gt_array = np.array(flat_ground_truths) + + # 计算F1分数 + f1_pos = f1_score(gt_array, pred_array, pos_label=1) + f1_neg = f1_score(gt_array, 
pred_array, pos_label=0) + + # 计算加权F1 + pos_count = np.sum(gt_array == 1) + neg_count = np.sum(gt_array == 0) + total_count = pos_count + neg_count + + if total_count > 0: + f1_weighted = (f1_pos * pos_count + f1_neg * neg_count) / total_count + else: + f1_weighted = 0.0 + + # 计算步骤准确率 + step_accuracy = np.mean(pred_array == gt_array) + + return { + "f1_positive": f1_pos, + "f1_negative": f1_neg, + "f1_weighted": f1_weighted, + "step_accuracy": step_accuracy, + } + + def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: + """ + 评估前瞻性推理任务 + 基于 get_fores_eval_res.py 中的评估逻辑 + """ + correct = 0 + total = len(predictions) + + for pred, gt in zip(predictions, ground_truths): + if gt == True: + if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): + correct += 1 + elif gt == False: + if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): + correct += 1 + + accuracy = correct / total if total > 0 else 0.0 + + return {"task_accuracy": accuracy} + + def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: + """ + 评估多解任务 + 基于 get_ms_eval_res.py 中的评估逻辑 + """ + correct = 0 + total = len(predictions) + + for pred, gt in zip(predictions, ground_truths): + # 假设predictions包含front和back两个答案 + front_answer = pred.get("front", [0, 0]) + back_answer = pred.get("back", [0, 0]) + + # 计算得分 + score1 = front_answer[0] + back_answer[1] + score2 = front_answer[1] + back_answer[0] + + if score1 > score2: + correct += 1 + + accuracy = correct / total if total > 0 else 0.0 + + return {"multi_solution_accuracy": accuracy} + + +# 为每个子集创建具体的类 +class VLRMBenchAttributeHallucination(VLRMBenchBase): + """属性幻觉检测子集""" + + def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchDetailError(VLRMBenchBase): + """细节错误检测子集""" + + def __init__(self, dataset="VLRMBench_detail_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchStepCorrectness(VLRMBenchBase): + """步骤正确性评估子集""" + + def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchForesight(VLRMBenchBase): + """前瞻性推理评估子集""" + + def __init__(self, dataset="VLRMBench_foresight", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchErrorCorrection(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_error_correction", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchExistenceHallucination(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchImageRefError(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchLocationError(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_location_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchMostConfidence(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchMultiSolution(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_multi_solution", **kwargs): + super().__init__(dataset=dataset, 
**kwargs) + + +class VLRMBenchRedundantDet(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): + super().__init__(dataset=dataset, **kwargs) From 7ca05b6a9021b6039be88c68e8d3588822e5de9c Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 1 Oct 2025 19:57:18 +0800 Subject: [PATCH 2/8] Translate comments to English --- vlmeval/dataset/vlrmbench.py | 159 +++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 71 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 395e83103..47b96cb62 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -15,14 +15,15 @@ class VLRMBenchBase(ImageBaseDataset): """ - VLRMBench数据集基础类 - 支持从HuggingFace下载和解压数据,处理JSONL格式的推理错误检测数据 + Base class for VLRMBench dataset. + Supports downloading and extracting data from HuggingFace, + and processing JSONL-formatted reasoning error detection data. """ MODALITY = "IMAGE" - TYPE = "VQA" # 设置为VQA类型以支持开放式问答 + TYPE = "VQA" # Set as VQA type to support open-ended QA - # 支持的子集列表 + # List of supported subsets SUPPORTED_SUBSETS = [ "attribute_hallucination", "detail_error", @@ -38,27 +39,27 @@ class VLRMBenchBase(ImageBaseDataset): "step_correctness", ] - # HuggingFace仓库信息 + # HuggingFace repository information HF_REPO = "Winston-Yuan/VLRMBench" @classmethod def supported_datasets(cls): - """返回支持的数据集名称列表""" + """Return list of supported dataset names.""" return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): """ - 初始化VLRMBench数据集 + Initialize VLRMBench dataset. Args: - dataset: 数据集名称,格式为VLRMBench_{subset} - **kwargs: 其他参数 + dataset: Dataset name in format VLRMBench_{subset} + **kwargs: Additional arguments """ - # 从数据集名称中提取子集名称 + # Extract subset name from dataset name if dataset.startswith("VLRMBench_"): subset = dataset[len("VLRMBench_") :] else: - subset = "attribute_hallucination" # 默认子集 + subset = "attribute_hallucination" # Default subset if subset not in self.SUPPORTED_SUBSETS: raise ValueError(f"Unsupported subset: {subset}. Supported subsets: {self.SUPPORTED_SUBSETS}") @@ -66,44 +67,44 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): self.subset = subset self.dataset_name = dataset - # 设置数据根目录 + # Set data root directory ROOT = LMUDataRoot() self.data_root = osp.join(ROOT, "datasets", "VLRMBench") os.makedirs(self.data_root, exist_ok=True) - # 下载和解压数据 + # Download and extract data self.data_dir = self._download_and_extract() - # 加载数据 + # Load data data = self._load_jsonl_data() - # 确保数据有index字段 + # Ensure data has index field for i, item in enumerate(data): if 'index' not in item: item['index'] = i - # 转换为DataFrame格式 + # Convert to DataFrame format self.data = pd.DataFrame(data) - # 设置图片根目录 + # Set image root directory self.img_root = osp.join(self.data_dir, "images") - # 设置评估模式 + # Set evaluation mode self.evaluation_mode = self._get_evaluation_mode() - # 后处理 + # Post-processing self.post_build(dataset) def _download_and_extract(self) -> str: """ - 从HuggingFace下载数据并解压 + Download data from HuggingFace and extract images. 
Returns: - str: 解压后的数据目录路径 + str: Path to the extracted data directory """ local_dir = osp.join(self.data_root, "VLRMBench-HF") - # 检查是否已经下载 + # Check if already downloaded if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): print(f"VLRMBench data already exists at {local_dir}") return local_dir @@ -111,16 +112,16 @@ def _download_and_extract(self) -> str: print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") - # 下载数据 + # Download data snapshot_download( repo_id=self.HF_REPO, repo_type="dataset", local_dir=local_dir, local_dir_use_symlinks=False, - tqdm_class=None, # 使用默认的tqdm进度条 + tqdm_class=None, # Use default tqdm progress bar ) - # 解压图片文件 + # Extract image files self._extract_images(local_dir) print(f"VLRMBench data downloaded and extracted to {local_dir}") @@ -128,10 +129,10 @@ def _download_and_extract(self) -> str: def _extract_images(self, data_dir: str): """ - 解压图片文件 + Extract image files from zip archive. Args: - data_dir: 数据目录路径 + data_dir: Path to the data directory """ zip_file = osp.join(data_dir, "Image.zip") images_dir = osp.join(data_dir, "images") @@ -153,10 +154,10 @@ def _extract_images(self, data_dir: str): def _load_jsonl_data(self) -> List[Dict]: """ - 加载JSONL数据文件 + Load JSONL data file. Returns: - List[Dict]: 加载的数据列表 + List[Dict]: List of loaded data items """ jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") @@ -174,24 +175,24 @@ def _load_jsonl_data(self) -> List[Dict]: def _get_evaluation_mode(self) -> str: """ - 根据子集确定评估模式 + Determine evaluation mode based on subset. Returns: - str: 评估模式 + str: Evaluation mode """ - # 前瞻性推理使用任务级评估 + # Foresight reasoning uses task-level evaluation if self.subset == "foresight": return "foresight" - # 多解任务使用特殊评估 + # Multi-solution task uses special evaluation if self.subset == "multi_solution": return "multi_solution" - # 生成任务使用judge评估 (暂时跳过) + # Generation tasks use judge evaluation (skipped for now) if self.subset in ["error_correction", "error_reason_analysis"]: return "generation" - # 其他子集使用二进制分类评估 + # Other subsets use binary classification evaluation return "binary_classification" def __len__(self): @@ -200,9 +201,9 @@ def __len__(self): def __getitem__(self, idx): item = dict(self.data.iloc[idx]) - # 处理图片路径 + # Process image paths if "image" in item and isinstance(item["image"], list): - # 将相对路径转换为绝对路径 + # Convert relative paths to absolute paths image_paths = [] for img_path in item["image"]: full_path = osp.join(self.img_root, img_path) @@ -216,8 +217,8 @@ def __getitem__(self, idx): return item def post_build(self, dataset): - """后处理,设置数据集特定属性""" - # 设置评估指标 + """Post-processing to set dataset-specific attributes.""" + # Set evaluation metrics if self.evaluation_mode == "binary_classification": self.metrics = ["f1_positive", "f1_negative", "f1_weighted", "step_accuracy"] elif self.evaluation_mode == "foresight": @@ -231,13 +232,13 @@ def post_build(self, dataset): def build_prompt(self, line): """ - 构建提示词 - 根据评估模式构建不同的提示词 + Build prompt based on evaluation mode. Args: - line: 数据行 + line: Data row Returns: - str: 构建的提示词 + str: Constructed prompt """ question = line["question"] @@ -269,14 +270,14 @@ def build_prompt(self, line): def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化模型答案 - 基于原始评测脚本的逻辑 + Format model answer based on original evaluation script logic. 
Args: - model_answer: 模型原始输出 - task_gt: 任务真实标签 + model_answer: Raw model output + task_gt: Ground truth labels Returns: - List[int]: 格式化后的预测结果 + List[int]: Formatted prediction results """ if self.evaluation_mode == "binary_classification": return self._format_binary_classification_answer(model_answer, task_gt) @@ -287,17 +288,17 @@ def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化二进制分类任务的模型答案 - 基于 get_sc_mc_rd_eval_res.py 中的 format_model_answer_tolist 函数 + Format binary classification task model answer. + Based on format_model_answer_tolist function in get_sc_mc_rd_eval_res.py """ - # 提取数字 + # Extract numbers numbers = re.findall(r"\d+", model_answer) result = [int(num) for num in numbers] - # 将非0/1的数字转换为1 + # Convert non-0/1 numbers to 1 result = [num if num == 0 or num == 1 else 1 for num in result] - # 调整长度以匹配真实标签 + # Adjust length to match ground truth if len(result) >= len(task_gt): return result[: len(task_gt)] else: @@ -305,8 +306,8 @@ def _format_binary_classification_answer(self, model_answer: str, task_gt: List) def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化多解任务的模型答案 - 基于 get_ms_eval_res.py 中的 format_ms_ec_era_model_answer_tolist 函数 + Format multi-solution task model answer. + Based on format_ms_ec_era_model_answer_tolist function in get_ms_eval_res.py """ numbers = re.findall(r"\d+", model_answer) result = [int(num) for num in numbers] @@ -320,10 +321,10 @@ def evaluate_binary_classification( self, predictions: List[List[int]], ground_truths: List[List[int]] ) -> Dict[str, float]: """ - 评估二进制分类任务 - 基于 get_sc_mc_rd_eval_res.py 中的评估逻辑 + Evaluate binary classification tasks. + Based on evaluation logic in get_sc_mc_rd_eval_res.py """ - # 展平所有预测和真实标签 + # Flatten all predictions and ground truths flat_predictions = [] flat_ground_truths = [] @@ -331,15 +332,15 @@ def evaluate_binary_classification( flat_predictions.extend(pred) flat_ground_truths.extend(gt) - # 转换为numpy数组 + # Convert to numpy arrays pred_array = np.array(flat_predictions) gt_array = np.array(flat_ground_truths) - # 计算F1分数 + # Calculate F1 scores f1_pos = f1_score(gt_array, pred_array, pos_label=1) f1_neg = f1_score(gt_array, pred_array, pos_label=0) - # 计算加权F1 + # Calculate weighted F1 pos_count = np.sum(gt_array == 1) neg_count = np.sum(gt_array == 0) total_count = pos_count + neg_count @@ -349,7 +350,7 @@ def evaluate_binary_classification( else: f1_weighted = 0.0 - # 计算步骤准确率 + # Calculate step accuracy step_accuracy = np.mean(pred_array == gt_array) return { @@ -361,8 +362,8 @@ def evaluate_binary_classification( def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: """ - 评估前瞻性推理任务 - 基于 get_fores_eval_res.py 中的评估逻辑 + Evaluate foresight reasoning tasks. + Based on evaluation logic in get_fores_eval_res.py """ correct = 0 total = len(predictions) @@ -381,18 +382,18 @@ def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: """ - 评估多解任务 - 基于 get_ms_eval_res.py 中的评估逻辑 + Evaluate multi-solution tasks. 
+ Based on evaluation logic in get_ms_eval_res.py """ correct = 0 total = len(predictions) for pred, gt in zip(predictions, ground_truths): - # 假设predictions包含front和back两个答案 + # Assume predictions contain front and back answers front_answer = pred.get("front", [0, 0]) back_answer = pred.get("back", [0, 0]) - # 计算得分 + # Calculate scores score1 = front_answer[0] + back_answer[1] score2 = front_answer[1] + back_answer[0] @@ -404,70 +405,86 @@ def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[L return {"multi_solution_accuracy": accuracy} -# 为每个子集创建具体的类 +# Create specific classes for each subset class VLRMBenchAttributeHallucination(VLRMBenchBase): - """属性幻觉检测子集""" + """Attribute hallucination detection subset.""" def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchDetailError(VLRMBenchBase): - """细节错误检测子集""" + """Detail error detection subset.""" def __init__(self, dataset="VLRMBench_detail_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchStepCorrectness(VLRMBenchBase): - """步骤正确性评估子集""" + """Step correctness evaluation subset.""" def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchForesight(VLRMBenchBase): - """前瞻性推理评估子集""" + """Foresight reasoning evaluation subset.""" def __init__(self, dataset="VLRMBench_foresight", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchErrorCorrection(VLRMBenchBase): + """Error correction subset.""" + def __init__(self, dataset="VLRMBench_error_correction", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): + """Error reason analysis subset.""" + def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchExistenceHallucination(VLRMBenchBase): + """Existence hallucination detection subset.""" + def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchImageRefError(VLRMBenchBase): + """Image reference error detection subset.""" + def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchLocationError(VLRMBenchBase): + """Location error detection subset.""" + def __init__(self, dataset="VLRMBench_location_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchMostConfidence(VLRMBenchBase): + """Most confidence evaluation subset.""" + def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchMultiSolution(VLRMBenchBase): + """Multi-solution evaluation subset.""" + def __init__(self, dataset="VLRMBench_multi_solution", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchRedundantDet(VLRMBenchBase): + """Redundant detection subset.""" + def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): super().__init__(dataset=dataset, **kwargs) From 86fbf9b002e064978fd77190671762dabe6e13ac Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 2 Oct 2025 15:37:09 +0800 Subject: [PATCH 3/8] Fix code formatting issues --- vlmeval/dataset/vlrmbench.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 47b96cb62..4a6fad77e 100644 --- a/vlmeval/dataset/vlrmbench.py +++ 
b/vlmeval/dataset/vlrmbench.py @@ -57,7 +57,7 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): """ # Extract subset name from dataset name if dataset.startswith("VLRMBench_"): - subset = dataset[len("VLRMBench_") :] + subset = dataset[len("VLRMBench_"):] else: subset = "attribute_hallucination" # Default subset @@ -82,7 +82,7 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): for i, item in enumerate(data): if 'index' not in item: item['index'] = i - + # Convert to DataFrame format self.data = pd.DataFrame(data) @@ -111,7 +111,6 @@ def _download_and_extract(self) -> str: print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") - # Download data snapshot_download( repo_id=self.HF_REPO, @@ -313,7 +312,7 @@ def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> Lis result = [int(num) for num in numbers] if len(result) >= len(task_gt): - return result[-len(task_gt) :] + return result[-len(task_gt):] else: return result + [0] * (len(task_gt) - len(result)) @@ -369,10 +368,10 @@ def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) total = len(predictions) for pred, gt in zip(predictions, ground_truths): - if gt == True: + if gt is True: if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): correct += 1 - elif gt == False: + elif gt is False: if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): correct += 1 From dbea3ff0f65fa68fb7e6379f991926edb7b4889b Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Tue, 14 Oct 2025 22:20:09 +0800 Subject: [PATCH 4/8] Reconstructed the dataset and the evaluation code of vlrmbench --- vlmeval/dataset/__init__.py | 23 +- vlmeval/dataset/vlrmbench.py | 623 ++++++++++------------------------- 2 files changed, 178 insertions(+), 468 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 7db71e8d4..068b4fc27 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -80,21 +80,7 @@ from .m4bench import M4Bench # VLRMBench imports -from .vlrmbench import ( - VLRMBenchBase, - VLRMBenchAttributeHallucination, - VLRMBenchDetailError, - VLRMBenchStepCorrectness, - VLRMBenchForesight, - VLRMBenchErrorCorrection, - VLRMBenchErrorReasonAnalysis, - VLRMBenchExistenceHallucination, - VLRMBenchImageRefError, - VLRMBenchLocationError, - VLRMBenchMostConfidence, - VLRMBenchMultiSolution, - VLRMBenchRedundantDet, -) +from .vlrmbench import VLRMBench class ConcatDataset(ImageBaseDataset): @@ -223,11 +209,8 @@ def evaluate(self, eval_file, **judge_kwargs): MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, AyaVisionBench, TopViewRS, VLMBias, - # VLRMBench datasets - VLRMBenchAttributeHallucination, VLRMBenchDetailError, VLRMBenchStepCorrectness, - VLRMBenchForesight, VLRMBenchErrorCorrection, VLRMBenchErrorReasonAnalysis, - VLRMBenchExistenceHallucination, VLRMBenchImageRefError, VLRMBenchLocationError, - VLRMBenchMostConfidence, VLRMBenchMultiSolution, VLRMBenchRedundantDet + # VLRMBench dataset + VLRMBench ] VIDEO_DATASET = [ diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 4a6fad77e..390986fb2 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -1,489 +1,216 @@ -import os -import json -import zipfile +from ast import literal_eval +from collections import defaultdict import re -import tempfile -from pathlib import Path -from typing import Dict, List, Any, Union, 
Optional -import pandas as pd import numpy as np -from huggingface_hub import snapshot_download from sklearn.metrics import f1_score -from ..smp import * + from .image_base import ImageBaseDataset +from ..smp import * -class VLRMBenchBase(ImageBaseDataset): - """ - Base class for VLRMBench dataset. - Supports downloading and extracting data from HuggingFace, - and processing JSONL-formatted reasoning error detection data. +def format_model_answer_tolist(model_answer, task_gt): """ + 从模型答案中提取0/1列表 - MODALITY = "IMAGE" - TYPE = "VQA" # Set as VQA type to support open-ended QA - - # List of supported subsets - SUPPORTED_SUBSETS = [ - "attribute_hallucination", - "detail_error", - "error_correction", - "error_reason_analysis", - "existence_hallucination", - "foresight", - "image_ref_error", - "location_error", - "most_confidence", - "multi_solution", - "redundant_det", - "step_correctness", - ] - - # HuggingFace repository information - HF_REPO = "Winston-Yuan/VLRMBench" - - @classmethod - def supported_datasets(cls): - """Return list of supported dataset names.""" - return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] - - def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): - """ - Initialize VLRMBench dataset. - - Args: - dataset: Dataset name in format VLRMBench_{subset} - **kwargs: Additional arguments - """ - # Extract subset name from dataset name - if dataset.startswith("VLRMBench_"): - subset = dataset[len("VLRMBench_"):] - else: - subset = "attribute_hallucination" # Default subset - - if subset not in self.SUPPORTED_SUBSETS: - raise ValueError(f"Unsupported subset: {subset}. Supported subsets: {self.SUPPORTED_SUBSETS}") - - self.subset = subset - self.dataset_name = dataset - - # Set data root directory - ROOT = LMUDataRoot() - self.data_root = osp.join(ROOT, "datasets", "VLRMBench") - os.makedirs(self.data_root, exist_ok=True) - - # Download and extract data - self.data_dir = self._download_and_extract() - - # Load data - data = self._load_jsonl_data() - - # Ensure data has index field - for i, item in enumerate(data): - if 'index' not in item: - item['index'] = i - - # Convert to DataFrame format - self.data = pd.DataFrame(data) - - # Set image root directory - self.img_root = osp.join(self.data_dir, "images") - - # Set evaluation mode - self.evaluation_mode = self._get_evaluation_mode() - - # Post-processing - self.post_build(dataset) - - def _download_and_extract(self) -> str: - """ - Download data from HuggingFace and extract images. - - Returns: - str: Path to the extracted data directory - """ - local_dir = osp.join(self.data_root, "VLRMBench-HF") - - # Check if already downloaded - if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): - print(f"VLRMBench data already exists at {local_dir}") - return local_dir - - print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") + 参数: + model_answer: 模型的预测答案(字符串) + task_gt: ground truth列表,用于确定期望的长度 - # Download data - snapshot_download( - repo_id=self.HF_REPO, - repo_type="dataset", - local_dir=local_dir, - local_dir_use_symlinks=False, - tqdm_class=None, # Use default tqdm progress bar - ) - - # Extract image files - self._extract_images(local_dir) + 返回: + list: 0/1列表 + """ + numbers = re.findall(r'\d+', str(model_answer)) - print(f"VLRMBench data downloaded and extracted to {local_dir}") - return local_dir + result = [int(num) for num in numbers] - def _extract_images(self, data_dir: str): - """ - Extract image files from zip archive. 
+ # 将非0/1的数字转换为1 + result = [num if num == 0 or num == 1 else 1 for num in result] - Args: - data_dir: Path to the data directory - """ - zip_file = osp.join(data_dir, "Image.zip") - images_dir = osp.join(data_dir, "images") + # 调整长度以匹配task_gt + if len(result) >= len(task_gt): + return result[:len(task_gt)] + else: + return result + [0] * (len(task_gt) - len(result)) - if osp.exists(images_dir): - print("Images already extracted") - return - if not osp.exists(zip_file): - raise FileNotFoundError(f"Image.zip not found at {zip_file}") +def get_F1Score(gathered_model_answer, gathered_task_gt): + """ + 计算F1分数 - print(f"Extracting images from {zip_file}") - os.makedirs(images_dir, exist_ok=True) + 参数: + gathered_model_answer: 所有模型答案的列表 + gathered_task_gt: 所有ground truth的列表 - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(images_dir) + 返回: + tuple: (F1_pos, F1_neg, F1_w) - 正类F1、负类F1、加权F1 + """ + model_answer = np.array(gathered_model_answer) + task_gt = np.array(gathered_task_gt) - print(f"Images extracted to {images_dir}") + pos_count = np.sum(task_gt == 1) + neg_count = np.sum(task_gt == 0) - def _load_jsonl_data(self) -> List[Dict]: - """ - Load JSONL data file. + F1_pos = f1_score(task_gt, model_answer, pos_label=1, zero_division=0) + F1_neg = f1_score(task_gt, model_answer, pos_label=0, zero_division=0) - Returns: - List[Dict]: List of loaded data items - """ - jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") + w_pos = neg_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0 + w_neg = pos_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0 - if not osp.exists(jsonl_file): - raise FileNotFoundError(f"JSONL file not found: {jsonl_file}") + F1_w = w_neg * F1_neg + w_pos * F1_pos - data = [] - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - data.append(json.loads(line)) + return F1_pos, F1_neg, F1_w - print(f"Loaded {len(data)} samples from {self.subset}") - return data - def _get_evaluation_mode(self) -> str: - """ - Determine evaluation mode based on subset. 
+class VLRMBench(ImageBaseDataset): + """ + VLRMBench Dataset - Visual Language Reasoning Model Benchmark + + A comprehensive benchmark for evaluating visual reasoning capabilities including: + - step_correctness: 步骤正确性检测 + - redundant_det: 冗余检测 + - most_confidence: 最高置信度判断 + - attribute_hallucination: 属性幻觉检测 + - existence_hallucination: 存在性幻觉检测 + - detail_error: 细节错误检测 + - image_ref_error: 图像引用错误检测 + - location_error: 位置错误检测 + """ - Returns: - str: Evaluation mode - """ - # Foresight reasoning uses task-level evaluation - if self.subset == "foresight": - return "foresight" - - # Multi-solution task uses special evaluation - if self.subset == "multi_solution": - return "multi_solution" - - # Generation tasks use judge evaluation (skipped for now) - if self.subset in ["error_correction", "error_reason_analysis"]: - return "generation" - - # Other subsets use binary classification evaluation - return "binary_classification" - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = dict(self.data.iloc[idx]) - - # Process image paths - if "image" in item and isinstance(item["image"], list): - # Convert relative paths to absolute paths - image_paths = [] - for img_path in item["image"]: - full_path = osp.join(self.img_root, img_path) - if osp.exists(full_path): - image_paths.append(full_path) - else: - print(f"Warning: Image not found: {full_path}") - - item["image"] = image_paths[0] if len(image_paths) == 1 else image_paths - - return item - - def post_build(self, dataset): - """Post-processing to set dataset-specific attributes.""" - # Set evaluation metrics - if self.evaluation_mode == "binary_classification": - self.metrics = ["f1_positive", "f1_negative", "f1_weighted", "step_accuracy"] - elif self.evaluation_mode == "foresight": - self.metrics = ["task_accuracy"] - elif self.evaluation_mode == "multi_solution": - self.metrics = ["multi_solution_accuracy"] - elif self.evaluation_mode == "generation": - self.metrics = ["win_rate", "judge_score"] - else: - self.metrics = ["overall_accuracy"] + TYPE = 'VQA' + DATASET_URL = { + 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' + } + DATASET_MD5 = { + 'VLRMBench': None # 可以后续添加MD5校验 + } def build_prompt(self, line): """ - Build prompt based on evaluation mode. + 构建提示信息 - Args: - line: Data row + 参数: + line: 数据行,可以是int索引或pd.Series - Returns: - str: Constructed prompt - """ - question = line["question"] - - if self.evaluation_mode == "binary_classification": - step_list = line.get("step_list", []) - prompt = f"Question: {question}\n\nReasoning Steps:\n" - for i, step in enumerate(step_list, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nPlease identify which steps contain errors. Output format: [0,1,0,1,...]" - - elif self.evaluation_mode == "foresight": - step_list = line.get("step_list", []) - prompt = f"Question: {question}\n\nReasoning Steps:\n" - for i, step in enumerate(step_list, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nDoes this reasoning show good foresight? Answer: yes/no" - - elif self.evaluation_mode == "multi_solution": - prompt = f"Question: {question}\n\nPlease provide two different solution approaches." - - elif self.evaluation_mode == "generation": - reasoning_error = line.get("reasoning_error", []) - prompt = f"Question: {question}\n\nReasoning with errors:\n" - for i, step in enumerate(reasoning_error, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nPlease analyze and correct the errors in this reasoning." 
- - return prompt - - def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: + 返回: + list: 多模态消息列表,格式为 [dict(type='image', value=path), dict(type='text', value=text), ...] """ - Format model answer based on original evaluation script logic. + if isinstance(line, int): + line = self.data.iloc[line] - Args: - model_answer: Raw model output - task_gt: Ground truth labels + # 创建line的副本避免SettingWithCopyWarning + line = line.copy() - Returns: - List[int]: Formatted prediction results - """ - if self.evaluation_mode == "binary_classification": - return self._format_binary_classification_answer(model_answer, task_gt) - elif self.evaluation_mode == "multi_solution": - return self._format_multi_solution_answer(model_answer, task_gt) - else: - return [] + # 使用父类方法保存图片(从base64解码并保存到本地) + tgt_path = self.dump_image(line) - def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: - """ - Format binary classification task model answer. - Based on format_model_answer_tolist function in get_sc_mc_rd_eval_res.py - """ - # Extract numbers - numbers = re.findall(r"\d+", model_answer) - result = [int(num) for num in numbers] - - # Convert non-0/1 numbers to 1 - result = [num if num == 0 or num == 1 else 1 for num in result] - - # Adjust length to match ground truth - if len(result) >= len(task_gt): - return result[: len(task_gt)] + # 构建消息 + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) else: - return result + [0] * (len(task_gt) - len(result)) + msgs = [dict(type='image', value=tgt_path)] - def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: - """ - Format multi-solution task model answer. - Based on format_ms_ec_era_model_answer_tolist function in get_ms_eval_res.py - """ - numbers = re.findall(r"\d+", model_answer) - result = [int(num) for num in numbers] - - if len(result) >= len(task_gt): - return result[-len(task_gt):] - else: - return result + [0] * (len(task_gt) - len(result)) - - def evaluate_binary_classification( - self, predictions: List[List[int]], ground_truths: List[List[int]] - ) -> Dict[str, float]: - """ - Evaluate binary classification tasks. 
- Based on evaluation logic in get_sc_mc_rd_eval_res.py - """ - # Flatten all predictions and ground truths - flat_predictions = [] - flat_ground_truths = [] + # 添加问题文本 + question = line.get('question', '') + if question: + msgs.append(dict(type='text', value=question)) - for pred, gt in zip(predictions, ground_truths): - flat_predictions.extend(pred) - flat_ground_truths.extend(gt) + return msgs - # Convert to numpy arrays - pred_array = np.array(flat_predictions) - gt_array = np.array(flat_ground_truths) - - # Calculate F1 scores - f1_pos = f1_score(gt_array, pred_array, pos_label=1) - f1_neg = f1_score(gt_array, pred_array, pos_label=0) - - # Calculate weighted F1 - pos_count = np.sum(gt_array == 1) - neg_count = np.sum(gt_array == 0) - total_count = pos_count + neg_count - - if total_count > 0: - f1_weighted = (f1_pos * pos_count + f1_neg * neg_count) / total_count + @classmethod + def evaluate(cls, eval_file, **judge_kwargs): + """ + 评估模型预测结果 + + 参数: + eval_file: 模型预测结果文件路径 + **judge_kwargs: 其他评估参数 + + 返回: + pd.DataFrame: 评估结果,包含各类别的F1分数 + """ + # 加载预测数据 + data = load(eval_file) + + # 确保必要的字段存在 + assert 'answer' in data.columns, "评估文件缺少 'answer' 字段" + assert 'prediction' in data.columns, "评估文件缺少 'prediction' 字段" + assert 'category' in data.columns, "评估文件缺少 'category' 字段" + + # 按类别收集模型答案和ground truth + category_model_answers = defaultdict(list) + category_task_gts = defaultdict(list) + category_total = defaultdict(int) + + for idx in range(len(data)): + item = data.iloc[idx] + category = item['category'] + + try: + # 解析task_gt(answer字段) + task_gt = item['answer'] + if isinstance(task_gt, str): + # 尝试将字符串解析为列表 + task_gt = literal_eval(task_gt) + + # 获取模型答案(prediction字段) + model_answer = item.get('prediction', '') + + # 使用format_model_answer_tolist格式化模型答案 + formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) + + # 收集每个类别的答案 + category_task_gts[category].extend(task_gt) + category_model_answers[category].extend(formatted_model_answer) + category_total[category] += 1 + except Exception as e: + # 如果解析失败,记录并跳过该样本 + print(f"处理样本失败 (idx={idx}, category={category}): {e}") + continue + + # 计算各类别的F1分数 + results = {} + for category in category_task_gts: + gathered_task_gt = category_task_gts[category] + gathered_model_answer = category_model_answers[category] + + if len(gathered_task_gt) > 0: + F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + + results[f'{category}_F1_pos'] = F1_pos + results[f'{category}_F1_neg'] = F1_neg + results[f'{category}_F1_weighted'] = F1_w + results[f'{category}_count'] = category_total[category] + else: + results[f'{category}_F1_pos'] = 0.0 + results[f'{category}_F1_neg'] = 0.0 + results[f'{category}_F1_weighted'] = 0.0 + results[f'{category}_count'] = 0 + + # 计算总体F1分数(所有类别合并) + all_task_gts = [] + all_model_answers = [] + for category in category_task_gts: + all_task_gts.extend(category_task_gts[category]) + all_model_answers.extend(category_model_answers[category]) + + if len(all_task_gts) > 0: + F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) + results['Overall_F1_pos'] = F1_pos_overall + results['Overall_F1_neg'] = F1_neg_overall + results['Overall_F1_weighted'] = F1_w_overall + results['Overall_count'] = sum(category_total.values()) else: - f1_weighted = 0.0 - - # Calculate step accuracy - step_accuracy = np.mean(pred_array == gt_array) - - return { - "f1_positive": f1_pos, - "f1_negative": f1_neg, - "f1_weighted": f1_weighted, - "step_accuracy": step_accuracy, - } - - 
def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: - """ - Evaluate foresight reasoning tasks. - Based on evaluation logic in get_fores_eval_res.py - """ - correct = 0 - total = len(predictions) - - for pred, gt in zip(predictions, ground_truths): - if gt is True: - if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): - correct += 1 - elif gt is False: - if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): - correct += 1 - - accuracy = correct / total if total > 0 else 0.0 - - return {"task_accuracy": accuracy} - - def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: - """ - Evaluate multi-solution tasks. - Based on evaluation logic in get_ms_eval_res.py - """ - correct = 0 - total = len(predictions) - - for pred, gt in zip(predictions, ground_truths): - # Assume predictions contain front and back answers - front_answer = pred.get("front", [0, 0]) - back_answer = pred.get("back", [0, 0]) - - # Calculate scores - score1 = front_answer[0] + back_answer[1] - score2 = front_answer[1] + back_answer[0] - - if score1 > score2: - correct += 1 - - accuracy = correct / total if total > 0 else 0.0 - - return {"multi_solution_accuracy": accuracy} - - -# Create specific classes for each subset -class VLRMBenchAttributeHallucination(VLRMBenchBase): - """Attribute hallucination detection subset.""" - - def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchDetailError(VLRMBenchBase): - """Detail error detection subset.""" - - def __init__(self, dataset="VLRMBench_detail_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchStepCorrectness(VLRMBenchBase): - """Step correctness evaluation subset.""" - - def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchForesight(VLRMBenchBase): - """Foresight reasoning evaluation subset.""" - - def __init__(self, dataset="VLRMBench_foresight", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchErrorCorrection(VLRMBenchBase): - """Error correction subset.""" - - def __init__(self, dataset="VLRMBench_error_correction", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): - """Error reason analysis subset.""" - - def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchExistenceHallucination(VLRMBenchBase): - """Existence hallucination detection subset.""" - - def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchImageRefError(VLRMBenchBase): - """Image reference error detection subset.""" - - def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchLocationError(VLRMBenchBase): - """Location error detection subset.""" - - def __init__(self, dataset="VLRMBench_location_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchMostConfidence(VLRMBenchBase): - """Most confidence evaluation subset.""" - - def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchMultiSolution(VLRMBenchBase): - """Multi-solution evaluation subset.""" - - def 
__init__(self, dataset="VLRMBench_multi_solution", **kwargs): - super().__init__(dataset=dataset, **kwargs) + results['Overall_F1_pos'] = 0.0 + results['Overall_F1_neg'] = 0.0 + results['Overall_F1_weighted'] = 0.0 + results['Overall_count'] = 0 + # 转换为DataFrame格式 + results_df = pd.DataFrame([results]) -class VLRMBenchRedundantDet(VLRMBenchBase): - """Redundant detection subset.""" + # 保存结果 + score_file = eval_file.replace('.xlsx', '_scores.csv') + dump(results_df, score_file) - def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): - super().__init__(dataset=dataset, **kwargs) + return results_df From ee9f8cce77f04fee932cfc9a46f88016a2c89958 Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 15 Oct 2025 18:05:48 +0800 Subject: [PATCH 5/8] Refactor and translate comments in VLRMBench dataset code to English --- vlmeval/dataset/__init__.py | 2 - vlmeval/dataset/vlrmbench.py | 113 ++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 068b4fc27..9b706e907 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -78,8 +78,6 @@ from .mmifeval import MMIFEval from .chartmimic import ChartMimic from .m4bench import M4Bench - -# VLRMBench imports from .vlrmbench import VLRMBench diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 390986fb2..91f7c150b 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -6,27 +6,28 @@ from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path def format_model_answer_tolist(model_answer, task_gt): """ - 从模型答案中提取0/1列表 + Extract 0/1 list from model answer - 参数: - model_answer: 模型的预测答案(字符串) - task_gt: ground truth列表,用于确定期望的长度 + Args: + model_answer: Model's prediction answer (string) + task_gt: Ground truth list, used to determine expected length - 返回: - list: 0/1列表 + Returns: + list: 0/1 list """ numbers = re.findall(r'\d+', str(model_answer)) result = [int(num) for num in numbers] - # 将非0/1的数字转换为1 + # Convert non-0/1 numbers to 1 result = [num if num == 0 or num == 1 else 1 for num in result] - # 调整长度以匹配task_gt + # Adjust length to match task_gt if len(result) >= len(task_gt): return result[:len(task_gt)] else: @@ -35,14 +36,14 @@ def format_model_answer_tolist(model_answer, task_gt): def get_F1Score(gathered_model_answer, gathered_task_gt): """ - 计算F1分数 + Calculate F1 score - 参数: - gathered_model_answer: 所有模型答案的列表 - gathered_task_gt: 所有ground truth的列表 + Args: + gathered_model_answer: List of all model answers + gathered_task_gt: List of all ground truth - 返回: - tuple: (F1_pos, F1_neg, F1_w) - 正类F1、负类F1、加权F1 + Returns: + tuple: (F1_pos, F1_neg, F1_w) - positive class F1, negative class F1, weighted F1 """ model_answer = np.array(gathered_model_answer) task_gt = np.array(gathered_task_gt) @@ -66,14 +67,14 @@ class VLRMBench(ImageBaseDataset): VLRMBench Dataset - Visual Language Reasoning Model Benchmark A comprehensive benchmark for evaluating visual reasoning capabilities including: - - step_correctness: 步骤正确性检测 - - redundant_det: 冗余检测 - - most_confidence: 最高置信度判断 - - attribute_hallucination: 属性幻觉检测 - - existence_hallucination: 存在性幻觉检测 - - detail_error: 细节错误检测 - - image_ref_error: 图像引用错误检测 - - location_error: 位置错误检测 + - step_correctness: Step correctness detection + - redundant_det: Redundancy detection + - most_confidence: Highest confidence judgment + - attribute_hallucination: Attribute hallucination detection + - 
existence_hallucination: Existence hallucination detection + - detail_error: Detail error detection + - image_ref_error: Image reference error detection + - location_error: Location error detection """ TYPE = 'VQA' @@ -81,36 +82,36 @@ class VLRMBench(ImageBaseDataset): 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' } DATASET_MD5 = { - 'VLRMBench': None # 可以后续添加MD5校验 + 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2' } def build_prompt(self, line): """ - 构建提示信息 + Build prompt information - 参数: - line: 数据行,可以是int索引或pd.Series + Args: + line: Data row, can be int index or pd.Series - 返回: - list: 多模态消息列表,格式为 [dict(type='image', value=path), dict(type='text', value=text), ...] + Returns: + list: Multimodal message list, format is [dict(type='image', value=path), dict(type='text', value=text),] """ if isinstance(line, int): line = self.data.iloc[line] - # 创建line的副本避免SettingWithCopyWarning + # Create a copy of line to avoid SettingWithCopyWarning line = line.copy() - # 使用父类方法保存图片(从base64解码并保存到本地) + # Use parent class method to save image (decode from base64 and save locally) tgt_path = self.dump_image(line) - # 构建消息 + # Build messages msgs = [] if isinstance(tgt_path, list): msgs.extend([dict(type='image', value=p) for p in tgt_path]) else: msgs = [dict(type='image', value=tgt_path)] - # 添加问题文本 + # Add question text question = line.get('question', '') if question: msgs.append(dict(type='text', value=question)) @@ -120,24 +121,24 @@ def build_prompt(self, line): @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ - 评估模型预测结果 + Evaluate model prediction results - 参数: - eval_file: 模型预测结果文件路径 - **judge_kwargs: 其他评估参数 + Args: + eval_file: Path to model prediction results file + **judge_kwargs: Other evaluation parameters - 返回: - pd.DataFrame: 评估结果,包含各类别的F1分数 + Returns: + pd.DataFrame: Evaluation results, including F1 scores for each category """ - # 加载预测数据 + # Load prediction data data = load(eval_file) - # 确保必要的字段存在 - assert 'answer' in data.columns, "评估文件缺少 'answer' 字段" - assert 'prediction' in data.columns, "评估文件缺少 'prediction' 字段" - assert 'category' in data.columns, "评估文件缺少 'category' 字段" + # Ensure necessary fields exist + assert 'answer' in data.columns, "Evaluation file missing 'answer' field" + assert 'prediction' in data.columns, "Evaluation file missing 'prediction' field" + assert 'category' in data.columns, "Evaluation file missing 'category' field" - # 按类别收集模型答案和ground truth + # Collect model answers and ground truth by category category_model_answers = defaultdict(list) category_task_gts = defaultdict(list) category_total = defaultdict(int) @@ -147,28 +148,28 @@ def evaluate(cls, eval_file, **judge_kwargs): category = item['category'] try: - # 解析task_gt(answer字段) + # Parse task_gt (answer field) task_gt = item['answer'] if isinstance(task_gt, str): - # 尝试将字符串解析为列表 + # Try to parse string as list task_gt = literal_eval(task_gt) - # 获取模型答案(prediction字段) + # Get model answer (prediction field) model_answer = item.get('prediction', '') - # 使用format_model_answer_tolist格式化模型答案 + # Format model answer using format_model_answer_tolist formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) - # 收集每个类别的答案 + # Collect answers for each category category_task_gts[category].extend(task_gt) category_model_answers[category].extend(formatted_model_answer) category_total[category] += 1 except Exception as e: - # 如果解析失败,记录并跳过该样本 - print(f"处理样本失败 (idx={idx}, category={category}): {e}") + # If parsing fails, log and 
skip the sample + print(f"Failed to process sample (idx={idx}, category={category}): {e}") continue - # 计算各类别的F1分数 + # Calculate F1 scores for each category results = {} for category in category_task_gts: gathered_task_gt = category_task_gts[category] @@ -187,7 +188,7 @@ def evaluate(cls, eval_file, **judge_kwargs): results[f'{category}_F1_weighted'] = 0.0 results[f'{category}_count'] = 0 - # 计算总体F1分数(所有类别合并) + # Calculate overall F1 score (all categories combined) all_task_gts = [] all_model_answers = [] for category in category_task_gts: @@ -206,11 +207,11 @@ def evaluate(cls, eval_file, **judge_kwargs): results['Overall_F1_weighted'] = 0.0 results['Overall_count'] = 0 - # 转换为DataFrame格式 + # Convert to DataFrame format results_df = pd.DataFrame([results]) - # 保存结果 - score_file = eval_file.replace('.xlsx', '_scores.csv') + # Save results + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') dump(results_df, score_file) return results_df From 52b88d33f15bfd5524cdfdf0761422f3ee78ebee Mon Sep 17 00:00:00 2001 From: miaomiao <147327746+Winston-Yuan@users.noreply.github.com> Date: Wed, 15 Oct 2025 18:14:12 +0800 Subject: [PATCH 6/8] Consolidate dataset imports in __init__.py --- vlmeval/dataset/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 9b706e907..c6dc8a1ad 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -206,9 +206,7 @@ def evaluate(self, eval_file, **judge_kwargs): ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset, MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, - AyaVisionBench, TopViewRS, VLMBias, - # VLRMBench dataset - VLRMBench + AyaVisionBench, TopViewRS, VLMBias, VLRMBench ] VIDEO_DATASET = [ From 9cc6032adfda2704b413dface5d057aa8f6f1d77 Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 16 Oct 2025 17:57:40 +0800 Subject: [PATCH 7/8] Add multi_solution evaluation support in VLRMBench dataset --- vlmeval/dataset/vlrmbench.py | 249 +++++++++++++++++++++++++---------- 1 file changed, 182 insertions(+), 67 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 91f7c150b..dbff3d225 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -34,6 +34,28 @@ def format_model_answer_tolist(model_answer, task_gt): return result + [0] * (len(task_gt) - len(result)) +def format_ms_model_answer(model_answer): + """ + Extract two scores from multi_solution model answer + + Args: + model_answer: Model's prediction answer (string) + Expected format: "[7, 8]" or "7, 8" or "Score 1: 7, Score 2: 8" + + Returns: + list: Two scores [score1, score2] + """ + numbers = re.findall(r'\d+', str(model_answer)) + result = [int(num) for num in numbers] + + # Return last two numbers (most likely the actual scores) + if len(result) >= 2: + return result[-2:] + else: + # If less than 2 numbers found, pad with 0 + return result + [0] * (2 - len(result)) + + def get_F1Score(gathered_model_answer, gathered_task_gt): """ Calculate F1 score @@ -75,14 +97,19 @@ class VLRMBench(ImageBaseDataset): - detail_error: Detail error detection - image_ref_error: Image reference error detection - location_error: Location error detection + - multi_solution: Position bias resistance evaluation """ TYPE = 'VQA' DATASET_URL = { - 'VLRMBench': 
'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' + 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv', + 'VLRMBench_MultiSolution': ( + 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_MultiSolution.tsv' + ) } DATASET_MD5 = { - 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2' + 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2', + 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387' } def build_prompt(self, line): @@ -118,94 +145,182 @@ def build_prompt(self, line): return msgs + @classmethod + def evaluate_multi_solution(cls, data): + """ + Evaluate multi_solution type data (position bias resistance) + + Args: + data: DataFrame containing multi_solution predictions + + Returns: + dict: Evaluation results with accuracy metric + """ + acc_sample = 0 + overall_sample = 0 + skipped = 0 + + # Group by index pairs (front: even, back: odd) + indices = sorted(data['index'].unique()) + + for i in range(0, len(indices), 2): + if i + 1 >= len(indices): + skipped += 1 + continue + + front_idx = indices[i] + back_idx = indices[i + 1] + + # Get front and back rows + front_rows = data[data['index'] == front_idx] + back_rows = data[data['index'] == back_idx] + + if len(front_rows) == 0 or len(back_rows) == 0: + skipped += 1 + continue + + front_row = front_rows.iloc[0] + back_row = back_rows.iloc[0] + + # Verify order field if exists + if 'order' in data.columns: + if front_row.get('order') != 'front' or back_row.get('order') != 'back': + print(f"Warning: Order mismatch at index {front_idx}, {back_idx}") + skipped += 1 + continue + + try: + # Parse model predictions + front_scores = format_ms_model_answer(front_row.get('prediction', '')) + back_scores = format_ms_model_answer(back_row.get('prediction', '')) + + # Apply evaluation formula: front[0] + back[1] vs front[1] + back[0] + # This checks if model consistently prefers the better response regardless of position + if front_scores[0] + back_scores[1] > front_scores[1] + back_scores[0]: + acc_sample += 1 + + overall_sample += 1 + except Exception as e: + print(f"Failed to process multi_solution pair ({front_idx}, {back_idx}): {e}") + skipped += 1 + + results = { + 'multi_solution_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0, + 'multi_solution_count': overall_sample, + 'multi_solution_skipped': skipped + } + + return results + @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ Evaluate model prediction results + Automatically detects and handles both step-based and multi_solution data Args: eval_file: Path to model prediction results file **judge_kwargs: Other evaluation parameters Returns: - pd.DataFrame: Evaluation results, including F1 scores for each category + pd.DataFrame: Evaluation results, including F1 scores and/or accuracy """ # Load prediction data data = load(eval_file) # Ensure necessary fields exist - assert 'answer' in data.columns, "Evaluation file missing 'answer' field" assert 'prediction' in data.columns, "Evaluation file missing 'prediction' field" assert 'category' in data.columns, "Evaluation file missing 'category' field" - # Collect model answers and ground truth by category - category_model_answers = defaultdict(list) - category_task_gts = defaultdict(list) - category_total = defaultdict(int) - - for idx in range(len(data)): - item = data.iloc[idx] - category = item['category'] + # Detect data types + categories = data['category'].unique() + has_multi_solution = 
'multi_solution' in categories + has_step_based = any(c != 'multi_solution' for c in categories) - try: - # Parse task_gt (answer field) - task_gt = item['answer'] - if isinstance(task_gt, str): - # Try to parse string as list - task_gt = literal_eval(task_gt) - - # Get model answer (prediction field) - model_answer = item.get('prediction', '') - - # Format model answer using format_model_answer_tolist - formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) - - # Collect answers for each category - category_task_gts[category].extend(task_gt) - category_model_answers[category].extend(formatted_model_answer) - category_total[category] += 1 - except Exception as e: - # If parsing fails, log and skip the sample - print(f"Failed to process sample (idx={idx}, category={category}): {e}") - continue - - # Calculate F1 scores for each category results = {} - for category in category_task_gts: - gathered_task_gt = category_task_gts[category] - gathered_model_answer = category_model_answers[category] - if len(gathered_task_gt) > 0: - F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + # Process step-based categories + if has_step_based: + # Filter step-based data + step_data = data[data['category'] != 'multi_solution'] - results[f'{category}_F1_pos'] = F1_pos - results[f'{category}_F1_neg'] = F1_neg - results[f'{category}_F1_weighted'] = F1_w - results[f'{category}_count'] = category_total[category] + # Ensure answer field exists for step-based data + if 'answer' not in step_data.columns: + print("Warning: Step-based data missing 'answer' field, skipping step-based evaluation") else: - results[f'{category}_F1_pos'] = 0.0 - results[f'{category}_F1_neg'] = 0.0 - results[f'{category}_F1_weighted'] = 0.0 - results[f'{category}_count'] = 0 - - # Calculate overall F1 score (all categories combined) - all_task_gts = [] - all_model_answers = [] - for category in category_task_gts: - all_task_gts.extend(category_task_gts[category]) - all_model_answers.extend(category_model_answers[category]) - - if len(all_task_gts) > 0: - F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) - results['Overall_F1_pos'] = F1_pos_overall - results['Overall_F1_neg'] = F1_neg_overall - results['Overall_F1_weighted'] = F1_w_overall - results['Overall_count'] = sum(category_total.values()) - else: - results['Overall_F1_pos'] = 0.0 - results['Overall_F1_neg'] = 0.0 - results['Overall_F1_weighted'] = 0.0 - results['Overall_count'] = 0 + # Collect model answers and ground truth by category + category_model_answers = defaultdict(list) + category_task_gts = defaultdict(list) + category_total = defaultdict(int) + + for idx in range(len(step_data)): + item = step_data.iloc[idx] + category = item['category'] + + try: + # Parse task_gt (answer field) + task_gt = item['answer'] + if isinstance(task_gt, str): + # Try to parse string as list + task_gt = literal_eval(task_gt) + + # Get model answer (prediction field) + model_answer = item.get('prediction', '') + + # Format model answer using format_model_answer_tolist + formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) + + # Collect answers for each category + category_task_gts[category].extend(task_gt) + category_model_answers[category].extend(formatted_model_answer) + category_total[category] += 1 + except Exception as e: + # If parsing fails, log and skip the sample + print(f"Failed to process sample (idx={idx}, category={category}): {e}") + continue + + # Calculate F1 scores for 
each category + for category in category_task_gts: + gathered_task_gt = category_task_gts[category] + gathered_model_answer = category_model_answers[category] + + if len(gathered_task_gt) > 0: + F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + + results[f'{category}_F1_pos'] = F1_pos + results[f'{category}_F1_neg'] = F1_neg + results[f'{category}_F1_weighted'] = F1_w + results[f'{category}_count'] = category_total[category] + else: + results[f'{category}_F1_pos'] = 0.0 + results[f'{category}_F1_neg'] = 0.0 + results[f'{category}_F1_weighted'] = 0.0 + results[f'{category}_count'] = 0 + + # Calculate overall F1 score (all step-based categories combined) + all_task_gts = [] + all_model_answers = [] + for category in category_task_gts: + all_task_gts.extend(category_task_gts[category]) + all_model_answers.extend(category_model_answers[category]) + + if len(all_task_gts) > 0: + F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) + results['Overall_F1_pos'] = F1_pos_overall + results['Overall_F1_neg'] = F1_neg_overall + results['Overall_F1_weighted'] = F1_w_overall + results['Overall_count'] = sum(category_total.values()) + else: + results['Overall_F1_pos'] = 0.0 + results['Overall_F1_neg'] = 0.0 + results['Overall_F1_weighted'] = 0.0 + results['Overall_count'] = 0 + + # Process multi_solution category + if has_multi_solution: + ms_data = data[data['category'] == 'multi_solution'] + ms_results = cls.evaluate_multi_solution(ms_data) + results.update(ms_results) # Convert to DataFrame format results_df = pd.DataFrame([results]) From 29ba4bb598387788582c080070a71dfb7ba9aa1e Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 23 Oct 2025 21:30:03 +0800 Subject: [PATCH 8/8] Add foresight evaluation support in VLRMBench dataset Updated initialization to warn about supported task types. Fix and update the md5 --- vlmeval/dataset/vlrmbench.py | 79 ++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index dbff3d225..14f3961a4 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -98,6 +98,10 @@ class VLRMBench(ImageBaseDataset): - image_ref_error: Image reference error detection - location_error: Location error detection - multi_solution: Position bias resistance evaluation + - foresight: Reasoning foresight capability evaluation + + Note: Currently only supports Outcome-based tasks and Step-based tasks. + Criticism-based tasks are not supported in this implementation. """ TYPE = 'VQA' @@ -105,13 +109,30 @@ class VLRMBench(ImageBaseDataset): 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv', 'VLRMBench_MultiSolution': ( 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_MultiSolution.tsv' + ), + 'VLRMBench_Foresight': ( + 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_Foresight.tsv' ) } DATASET_MD5 = { 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2', - 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387' + 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387', + 'VLRMBench_Foresight': '1e22f1b94afbd6f4f3a4028c91749311' } + def __init__(self, **kwargs): + """ + Initialize VLRMBench dataset with warning about supported task types. + """ + import warnings + warnings.warn( + "VLRMBench currently only supports Outcome-based tasks and Step-based tasks. 
" + "Criticism-based tasks are not supported in this implementation.", + UserWarning, + stacklevel=2 + ) + super().__init__(**kwargs) + def build_prompt(self, line): """ Build prompt information @@ -212,11 +233,54 @@ def evaluate_multi_solution(cls, data): return results + @classmethod + def evaluate_foresight(cls, data): + """ + Evaluate foresight type data (reasoning foresight capability) + + Args: + data: DataFrame containing foresight predictions + + Returns: + dict: Evaluation results with accuracy metric + """ + acc_sample = 0 + overall_sample = 0 + skipped = 0 + + for idx in range(len(data)): + item = data.iloc[idx] + + try: + task_gt = item['task_gt'] # True/False + model_answer = item.get('prediction', '') + + # 关键词匹配逻辑(与get_fores_eval_res.py一致) + if task_gt is True: + if re.search(r'\b(yes|true)\b', model_answer, re.IGNORECASE): + acc_sample += 1 + elif task_gt is False: + if re.search(r'\b(no|false)\b', model_answer, re.IGNORECASE): + acc_sample += 1 + + overall_sample += 1 + except Exception as e: + print(f"Failed to process foresight sample (idx={idx}): {e}") + skipped += 1 + + results = { + 'foresight_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0, + 'foresight_count': overall_sample, + 'foresight_skipped': skipped + } + + return results + @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ Evaluate model prediction results - Automatically detects and handles both step-based and multi_solution data + Automatically detects and handles step-based, multi_solution, and foresight data Args: eval_file: Path to model prediction results file @@ -235,14 +299,15 @@ def evaluate(cls, eval_file, **judge_kwargs): # Detect data types categories = data['category'].unique() has_multi_solution = 'multi_solution' in categories - has_step_based = any(c != 'multi_solution' for c in categories) + has_foresight = 'foresight' in categories + has_step_based = any(c not in ['multi_solution', 'foresight'] for c in categories) results = {} # Process step-based categories if has_step_based: # Filter step-based data - step_data = data[data['category'] != 'multi_solution'] + step_data = data[~data['category'].isin(['multi_solution', 'foresight'])] # Ensure answer field exists for step-based data if 'answer' not in step_data.columns: @@ -322,6 +387,12 @@ def evaluate(cls, eval_file, **judge_kwargs): ms_results = cls.evaluate_multi_solution(ms_data) results.update(ms_results) + # Process foresight category + if has_foresight: + foresight_data = data[data['category'] == 'foresight'] + foresight_results = cls.evaluate_foresight(foresight_data) + results.update(foresight_results) + # Convert to DataFrame format results_df = pd.DataFrame([results])