From 5cc0289bdd35c1a349cef5240a3c804241bade3f Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 1 Oct 2025 19:08:35 +0800 Subject: [PATCH 1/8] add vlrmbench dataset --- vlmeval/dataset/__init__.py | 24 +- vlmeval/dataset/vlrmbench.py | 473 +++++++++++++++++++++++++++++++++++ 2 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 vlmeval/dataset/vlrmbench.py diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 855049d4a..7db71e8d4 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -79,6 +79,23 @@ from .chartmimic import ChartMimic from .m4bench import M4Bench +# VLRMBench imports +from .vlrmbench import ( + VLRMBenchBase, + VLRMBenchAttributeHallucination, + VLRMBenchDetailError, + VLRMBenchStepCorrectness, + VLRMBenchForesight, + VLRMBenchErrorCorrection, + VLRMBenchErrorReasonAnalysis, + VLRMBenchExistenceHallucination, + VLRMBenchImageRefError, + VLRMBenchLocationError, + VLRMBenchMostConfidence, + VLRMBenchMultiSolution, + VLRMBenchRedundantDet, +) + class ConcatDataset(ImageBaseDataset): # This dataset takes multiple dataset names as input and aggregate them into a single dataset. @@ -205,7 +222,12 @@ def evaluate(self, eval_file, **judge_kwargs): ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset, MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, - AyaVisionBench, TopViewRS, VLMBias + AyaVisionBench, TopViewRS, VLMBias, + # VLRMBench datasets + VLRMBenchAttributeHallucination, VLRMBenchDetailError, VLRMBenchStepCorrectness, + VLRMBenchForesight, VLRMBenchErrorCorrection, VLRMBenchErrorReasonAnalysis, + VLRMBenchExistenceHallucination, VLRMBenchImageRefError, VLRMBenchLocationError, + VLRMBenchMostConfidence, VLRMBenchMultiSolution, VLRMBenchRedundantDet ] VIDEO_DATASET = [ diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py new file mode 100644 index 000000000..395e83103 --- /dev/null +++ b/vlmeval/dataset/vlrmbench.py @@ -0,0 +1,473 @@ +import os +import json +import zipfile +import re +import tempfile +from pathlib import Path +from typing import Dict, List, Any, Union, Optional +import pandas as pd +import numpy as np +from huggingface_hub import snapshot_download +from sklearn.metrics import f1_score +from ..smp import * +from .image_base import ImageBaseDataset + + +class VLRMBenchBase(ImageBaseDataset): + """ + VLRMBench数据集基础类 + 支持从HuggingFace下载和解压数据,处理JSONL格式的推理错误检测数据 + """ + + MODALITY = "IMAGE" + TYPE = "VQA" # 设置为VQA类型以支持开放式问答 + + # 支持的子集列表 + SUPPORTED_SUBSETS = [ + "attribute_hallucination", + "detail_error", + "error_correction", + "error_reason_analysis", + "existence_hallucination", + "foresight", + "image_ref_error", + "location_error", + "most_confidence", + "multi_solution", + "redundant_det", + "step_correctness", + ] + + # HuggingFace仓库信息 + HF_REPO = "Winston-Yuan/VLRMBench" + + @classmethod + def supported_datasets(cls): + """返回支持的数据集名称列表""" + return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] + + def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): + """ + 初始化VLRMBench数据集 + + Args: + dataset: 数据集名称,格式为VLRMBench_{subset} + **kwargs: 其他参数 + """ + # 从数据集名称中提取子集名称 + if dataset.startswith("VLRMBench_"): + subset = dataset[len("VLRMBench_") :] + else: + subset = "attribute_hallucination" # 默认子集 + + if subset not in self.SUPPORTED_SUBSETS: + raise ValueError(f"Unsupported subset: {subset}. 
Supported subsets: {self.SUPPORTED_SUBSETS}") + + self.subset = subset + self.dataset_name = dataset + + # 设置数据根目录 + ROOT = LMUDataRoot() + self.data_root = osp.join(ROOT, "datasets", "VLRMBench") + os.makedirs(self.data_root, exist_ok=True) + + # 下载和解压数据 + self.data_dir = self._download_and_extract() + + # 加载数据 + data = self._load_jsonl_data() + + # 确保数据有index字段 + for i, item in enumerate(data): + if 'index' not in item: + item['index'] = i + + # 转换为DataFrame格式 + self.data = pd.DataFrame(data) + + # 设置图片根目录 + self.img_root = osp.join(self.data_dir, "images") + + # 设置评估模式 + self.evaluation_mode = self._get_evaluation_mode() + + # 后处理 + self.post_build(dataset) + + def _download_and_extract(self) -> str: + """ + 从HuggingFace下载数据并解压 + + Returns: + str: 解压后的数据目录路径 + """ + local_dir = osp.join(self.data_root, "VLRMBench-HF") + + # 检查是否已经下载 + if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): + print(f"VLRMBench data already exists at {local_dir}") + return local_dir + + print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") + + + # 下载数据 + snapshot_download( + repo_id=self.HF_REPO, + repo_type="dataset", + local_dir=local_dir, + local_dir_use_symlinks=False, + tqdm_class=None, # 使用默认的tqdm进度条 + ) + + # 解压图片文件 + self._extract_images(local_dir) + + print(f"VLRMBench data downloaded and extracted to {local_dir}") + return local_dir + + def _extract_images(self, data_dir: str): + """ + 解压图片文件 + + Args: + data_dir: 数据目录路径 + """ + zip_file = osp.join(data_dir, "Image.zip") + images_dir = osp.join(data_dir, "images") + + if osp.exists(images_dir): + print("Images already extracted") + return + + if not osp.exists(zip_file): + raise FileNotFoundError(f"Image.zip not found at {zip_file}") + + print(f"Extracting images from {zip_file}") + os.makedirs(images_dir, exist_ok=True) + + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(images_dir) + + print(f"Images extracted to {images_dir}") + + def _load_jsonl_data(self) -> List[Dict]: + """ + 加载JSONL数据文件 + + Returns: + List[Dict]: 加载的数据列表 + """ + jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") + + if not osp.exists(jsonl_file): + raise FileNotFoundError(f"JSONL file not found: {jsonl_file}") + + data = [] + with open(jsonl_file, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + data.append(json.loads(line)) + + print(f"Loaded {len(data)} samples from {self.subset}") + return data + + def _get_evaluation_mode(self) -> str: + """ + 根据子集确定评估模式 + + Returns: + str: 评估模式 + """ + # 前瞻性推理使用任务级评估 + if self.subset == "foresight": + return "foresight" + + # 多解任务使用特殊评估 + if self.subset == "multi_solution": + return "multi_solution" + + # 生成任务使用judge评估 (暂时跳过) + if self.subset in ["error_correction", "error_reason_analysis"]: + return "generation" + + # 其他子集使用二进制分类评估 + return "binary_classification" + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + item = dict(self.data.iloc[idx]) + + # 处理图片路径 + if "image" in item and isinstance(item["image"], list): + # 将相对路径转换为绝对路径 + image_paths = [] + for img_path in item["image"]: + full_path = osp.join(self.img_root, img_path) + if osp.exists(full_path): + image_paths.append(full_path) + else: + print(f"Warning: Image not found: {full_path}") + + item["image"] = image_paths[0] if len(image_paths) == 1 else image_paths + + return item + + def post_build(self, dataset): + """后处理,设置数据集特定属性""" + # 设置评估指标 + if self.evaluation_mode == "binary_classification": + self.metrics = ["f1_positive", 
"f1_negative", "f1_weighted", "step_accuracy"] + elif self.evaluation_mode == "foresight": + self.metrics = ["task_accuracy"] + elif self.evaluation_mode == "multi_solution": + self.metrics = ["multi_solution_accuracy"] + elif self.evaluation_mode == "generation": + self.metrics = ["win_rate", "judge_score"] + else: + self.metrics = ["overall_accuracy"] + + def build_prompt(self, line): + """ + 构建提示词 - 根据评估模式构建不同的提示词 + + Args: + line: 数据行 + + Returns: + str: 构建的提示词 + """ + question = line["question"] + + if self.evaluation_mode == "binary_classification": + step_list = line.get("step_list", []) + prompt = f"Question: {question}\n\nReasoning Steps:\n" + for i, step in enumerate(step_list, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nPlease identify which steps contain errors. Output format: [0,1,0,1,...]" + + elif self.evaluation_mode == "foresight": + step_list = line.get("step_list", []) + prompt = f"Question: {question}\n\nReasoning Steps:\n" + for i, step in enumerate(step_list, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nDoes this reasoning show good foresight? Answer: yes/no" + + elif self.evaluation_mode == "multi_solution": + prompt = f"Question: {question}\n\nPlease provide two different solution approaches." + + elif self.evaluation_mode == "generation": + reasoning_error = line.get("reasoning_error", []) + prompt = f"Question: {question}\n\nReasoning with errors:\n" + for i, step in enumerate(reasoning_error, 1): + prompt += f"Step {i}: {step}\n" + prompt += "\nPlease analyze and correct the errors in this reasoning." + + return prompt + + def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化模型答案 - 基于原始评测脚本的逻辑 + + Args: + model_answer: 模型原始输出 + task_gt: 任务真实标签 + + Returns: + List[int]: 格式化后的预测结果 + """ + if self.evaluation_mode == "binary_classification": + return self._format_binary_classification_answer(model_answer, task_gt) + elif self.evaluation_mode == "multi_solution": + return self._format_multi_solution_answer(model_answer, task_gt) + else: + return [] + + def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化二进制分类任务的模型答案 + 基于 get_sc_mc_rd_eval_res.py 中的 format_model_answer_tolist 函数 + """ + # 提取数字 + numbers = re.findall(r"\d+", model_answer) + result = [int(num) for num in numbers] + + # 将非0/1的数字转换为1 + result = [num if num == 0 or num == 1 else 1 for num in result] + + # 调整长度以匹配真实标签 + if len(result) >= len(task_gt): + return result[: len(task_gt)] + else: + return result + [0] * (len(task_gt) - len(result)) + + def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: + """ + 格式化多解任务的模型答案 + 基于 get_ms_eval_res.py 中的 format_ms_ec_era_model_answer_tolist 函数 + """ + numbers = re.findall(r"\d+", model_answer) + result = [int(num) for num in numbers] + + if len(result) >= len(task_gt): + return result[-len(task_gt) :] + else: + return result + [0] * (len(task_gt) - len(result)) + + def evaluate_binary_classification( + self, predictions: List[List[int]], ground_truths: List[List[int]] + ) -> Dict[str, float]: + """ + 评估二进制分类任务 + 基于 get_sc_mc_rd_eval_res.py 中的评估逻辑 + """ + # 展平所有预测和真实标签 + flat_predictions = [] + flat_ground_truths = [] + + for pred, gt in zip(predictions, ground_truths): + flat_predictions.extend(pred) + flat_ground_truths.extend(gt) + + # 转换为numpy数组 + pred_array = np.array(flat_predictions) + gt_array = np.array(flat_ground_truths) + + # 计算F1分数 + f1_pos = f1_score(gt_array, pred_array, pos_label=1) + f1_neg = f1_score(gt_array, 
pred_array, pos_label=0) + + # 计算加权F1 + pos_count = np.sum(gt_array == 1) + neg_count = np.sum(gt_array == 0) + total_count = pos_count + neg_count + + if total_count > 0: + f1_weighted = (f1_pos * pos_count + f1_neg * neg_count) / total_count + else: + f1_weighted = 0.0 + + # 计算步骤准确率 + step_accuracy = np.mean(pred_array == gt_array) + + return { + "f1_positive": f1_pos, + "f1_negative": f1_neg, + "f1_weighted": f1_weighted, + "step_accuracy": step_accuracy, + } + + def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: + """ + 评估前瞻性推理任务 + 基于 get_fores_eval_res.py 中的评估逻辑 + """ + correct = 0 + total = len(predictions) + + for pred, gt in zip(predictions, ground_truths): + if gt == True: + if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): + correct += 1 + elif gt == False: + if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): + correct += 1 + + accuracy = correct / total if total > 0 else 0.0 + + return {"task_accuracy": accuracy} + + def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: + """ + 评估多解任务 + 基于 get_ms_eval_res.py 中的评估逻辑 + """ + correct = 0 + total = len(predictions) + + for pred, gt in zip(predictions, ground_truths): + # 假设predictions包含front和back两个答案 + front_answer = pred.get("front", [0, 0]) + back_answer = pred.get("back", [0, 0]) + + # 计算得分 + score1 = front_answer[0] + back_answer[1] + score2 = front_answer[1] + back_answer[0] + + if score1 > score2: + correct += 1 + + accuracy = correct / total if total > 0 else 0.0 + + return {"multi_solution_accuracy": accuracy} + + +# 为每个子集创建具体的类 +class VLRMBenchAttributeHallucination(VLRMBenchBase): + """属性幻觉检测子集""" + + def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchDetailError(VLRMBenchBase): + """细节错误检测子集""" + + def __init__(self, dataset="VLRMBench_detail_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchStepCorrectness(VLRMBenchBase): + """步骤正确性评估子集""" + + def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchForesight(VLRMBenchBase): + """前瞻性推理评估子集""" + + def __init__(self, dataset="VLRMBench_foresight", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchErrorCorrection(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_error_correction", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchExistenceHallucination(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchImageRefError(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchLocationError(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_location_error", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchMostConfidence(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): + super().__init__(dataset=dataset, **kwargs) + + +class VLRMBenchMultiSolution(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_multi_solution", **kwargs): + super().__init__(dataset=dataset, 
**kwargs) + + +class VLRMBenchRedundantDet(VLRMBenchBase): + def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): + super().__init__(dataset=dataset, **kwargs) From 7ca05b6a9021b6039be88c68e8d3588822e5de9c Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 1 Oct 2025 19:57:18 +0800 Subject: [PATCH 2/8] Translate comments to English --- vlmeval/dataset/vlrmbench.py | 159 +++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 71 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 395e83103..47b96cb62 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -15,14 +15,15 @@ class VLRMBenchBase(ImageBaseDataset): """ - VLRMBench数据集基础类 - 支持从HuggingFace下载和解压数据,处理JSONL格式的推理错误检测数据 + Base class for VLRMBench dataset. + Supports downloading and extracting data from HuggingFace, + and processing JSONL-formatted reasoning error detection data. """ MODALITY = "IMAGE" - TYPE = "VQA" # 设置为VQA类型以支持开放式问答 + TYPE = "VQA" # Set as VQA type to support open-ended QA - # 支持的子集列表 + # List of supported subsets SUPPORTED_SUBSETS = [ "attribute_hallucination", "detail_error", @@ -38,27 +39,27 @@ class VLRMBenchBase(ImageBaseDataset): "step_correctness", ] - # HuggingFace仓库信息 + # HuggingFace repository information HF_REPO = "Winston-Yuan/VLRMBench" @classmethod def supported_datasets(cls): - """返回支持的数据集名称列表""" + """Return list of supported dataset names.""" return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): """ - 初始化VLRMBench数据集 + Initialize VLRMBench dataset. Args: - dataset: 数据集名称,格式为VLRMBench_{subset} - **kwargs: 其他参数 + dataset: Dataset name in format VLRMBench_{subset} + **kwargs: Additional arguments """ - # 从数据集名称中提取子集名称 + # Extract subset name from dataset name if dataset.startswith("VLRMBench_"): subset = dataset[len("VLRMBench_") :] else: - subset = "attribute_hallucination" # 默认子集 + subset = "attribute_hallucination" # Default subset if subset not in self.SUPPORTED_SUBSETS: raise ValueError(f"Unsupported subset: {subset}. Supported subsets: {self.SUPPORTED_SUBSETS}") @@ -66,44 +67,44 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): self.subset = subset self.dataset_name = dataset - # 设置数据根目录 + # Set data root directory ROOT = LMUDataRoot() self.data_root = osp.join(ROOT, "datasets", "VLRMBench") os.makedirs(self.data_root, exist_ok=True) - # 下载和解压数据 + # Download and extract data self.data_dir = self._download_and_extract() - # 加载数据 + # Load data data = self._load_jsonl_data() - # 确保数据有index字段 + # Ensure data has index field for i, item in enumerate(data): if 'index' not in item: item['index'] = i - # 转换为DataFrame格式 + # Convert to DataFrame format self.data = pd.DataFrame(data) - # 设置图片根目录 + # Set image root directory self.img_root = osp.join(self.data_dir, "images") - # 设置评估模式 + # Set evaluation mode self.evaluation_mode = self._get_evaluation_mode() - # 后处理 + # Post-processing self.post_build(dataset) def _download_and_extract(self) -> str: """ - 从HuggingFace下载数据并解压 + Download data from HuggingFace and extract images. 
Returns: - str: 解压后的数据目录路径 + str: Path to the extracted data directory """ local_dir = osp.join(self.data_root, "VLRMBench-HF") - # 检查是否已经下载 + # Check if already downloaded if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): print(f"VLRMBench data already exists at {local_dir}") return local_dir @@ -111,16 +112,16 @@ def _download_and_extract(self) -> str: print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") - # 下载数据 + # Download data snapshot_download( repo_id=self.HF_REPO, repo_type="dataset", local_dir=local_dir, local_dir_use_symlinks=False, - tqdm_class=None, # 使用默认的tqdm进度条 + tqdm_class=None, # Use default tqdm progress bar ) - # 解压图片文件 + # Extract image files self._extract_images(local_dir) print(f"VLRMBench data downloaded and extracted to {local_dir}") @@ -128,10 +129,10 @@ def _download_and_extract(self) -> str: def _extract_images(self, data_dir: str): """ - 解压图片文件 + Extract image files from zip archive. Args: - data_dir: 数据目录路径 + data_dir: Path to the data directory """ zip_file = osp.join(data_dir, "Image.zip") images_dir = osp.join(data_dir, "images") @@ -153,10 +154,10 @@ def _extract_images(self, data_dir: str): def _load_jsonl_data(self) -> List[Dict]: """ - 加载JSONL数据文件 + Load JSONL data file. Returns: - List[Dict]: 加载的数据列表 + List[Dict]: List of loaded data items """ jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") @@ -174,24 +175,24 @@ def _load_jsonl_data(self) -> List[Dict]: def _get_evaluation_mode(self) -> str: """ - 根据子集确定评估模式 + Determine evaluation mode based on subset. Returns: - str: 评估模式 + str: Evaluation mode """ - # 前瞻性推理使用任务级评估 + # Foresight reasoning uses task-level evaluation if self.subset == "foresight": return "foresight" - # 多解任务使用特殊评估 + # Multi-solution task uses special evaluation if self.subset == "multi_solution": return "multi_solution" - # 生成任务使用judge评估 (暂时跳过) + # Generation tasks use judge evaluation (skipped for now) if self.subset in ["error_correction", "error_reason_analysis"]: return "generation" - # 其他子集使用二进制分类评估 + # Other subsets use binary classification evaluation return "binary_classification" def __len__(self): @@ -200,9 +201,9 @@ def __len__(self): def __getitem__(self, idx): item = dict(self.data.iloc[idx]) - # 处理图片路径 + # Process image paths if "image" in item and isinstance(item["image"], list): - # 将相对路径转换为绝对路径 + # Convert relative paths to absolute paths image_paths = [] for img_path in item["image"]: full_path = osp.join(self.img_root, img_path) @@ -216,8 +217,8 @@ def __getitem__(self, idx): return item def post_build(self, dataset): - """后处理,设置数据集特定属性""" - # 设置评估指标 + """Post-processing to set dataset-specific attributes.""" + # Set evaluation metrics if self.evaluation_mode == "binary_classification": self.metrics = ["f1_positive", "f1_negative", "f1_weighted", "step_accuracy"] elif self.evaluation_mode == "foresight": @@ -231,13 +232,13 @@ def post_build(self, dataset): def build_prompt(self, line): """ - 构建提示词 - 根据评估模式构建不同的提示词 + Build prompt based on evaluation mode. Args: - line: 数据行 + line: Data row Returns: - str: 构建的提示词 + str: Constructed prompt """ question = line["question"] @@ -269,14 +270,14 @@ def build_prompt(self, line): def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化模型答案 - 基于原始评测脚本的逻辑 + Format model answer based on original evaluation script logic. 
Args: - model_answer: 模型原始输出 - task_gt: 任务真实标签 + model_answer: Raw model output + task_gt: Ground truth labels Returns: - List[int]: 格式化后的预测结果 + List[int]: Formatted prediction results """ if self.evaluation_mode == "binary_classification": return self._format_binary_classification_answer(model_answer, task_gt) @@ -287,17 +288,17 @@ def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化二进制分类任务的模型答案 - 基于 get_sc_mc_rd_eval_res.py 中的 format_model_answer_tolist 函数 + Format binary classification task model answer. + Based on format_model_answer_tolist function in get_sc_mc_rd_eval_res.py """ - # 提取数字 + # Extract numbers numbers = re.findall(r"\d+", model_answer) result = [int(num) for num in numbers] - # 将非0/1的数字转换为1 + # Convert non-0/1 numbers to 1 result = [num if num == 0 or num == 1 else 1 for num in result] - # 调整长度以匹配真实标签 + # Adjust length to match ground truth if len(result) >= len(task_gt): return result[: len(task_gt)] else: @@ -305,8 +306,8 @@ def _format_binary_classification_answer(self, model_answer: str, task_gt: List) def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: """ - 格式化多解任务的模型答案 - 基于 get_ms_eval_res.py 中的 format_ms_ec_era_model_answer_tolist 函数 + Format multi-solution task model answer. + Based on format_ms_ec_era_model_answer_tolist function in get_ms_eval_res.py """ numbers = re.findall(r"\d+", model_answer) result = [int(num) for num in numbers] @@ -320,10 +321,10 @@ def evaluate_binary_classification( self, predictions: List[List[int]], ground_truths: List[List[int]] ) -> Dict[str, float]: """ - 评估二进制分类任务 - 基于 get_sc_mc_rd_eval_res.py 中的评估逻辑 + Evaluate binary classification tasks. + Based on evaluation logic in get_sc_mc_rd_eval_res.py """ - # 展平所有预测和真实标签 + # Flatten all predictions and ground truths flat_predictions = [] flat_ground_truths = [] @@ -331,15 +332,15 @@ def evaluate_binary_classification( flat_predictions.extend(pred) flat_ground_truths.extend(gt) - # 转换为numpy数组 + # Convert to numpy arrays pred_array = np.array(flat_predictions) gt_array = np.array(flat_ground_truths) - # 计算F1分数 + # Calculate F1 scores f1_pos = f1_score(gt_array, pred_array, pos_label=1) f1_neg = f1_score(gt_array, pred_array, pos_label=0) - # 计算加权F1 + # Calculate weighted F1 pos_count = np.sum(gt_array == 1) neg_count = np.sum(gt_array == 0) total_count = pos_count + neg_count @@ -349,7 +350,7 @@ def evaluate_binary_classification( else: f1_weighted = 0.0 - # 计算步骤准确率 + # Calculate step accuracy step_accuracy = np.mean(pred_array == gt_array) return { @@ -361,8 +362,8 @@ def evaluate_binary_classification( def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: """ - 评估前瞻性推理任务 - 基于 get_fores_eval_res.py 中的评估逻辑 + Evaluate foresight reasoning tasks. + Based on evaluation logic in get_fores_eval_res.py """ correct = 0 total = len(predictions) @@ -381,18 +382,18 @@ def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: """ - 评估多解任务 - 基于 get_ms_eval_res.py 中的评估逻辑 + Evaluate multi-solution tasks. 
+ Based on evaluation logic in get_ms_eval_res.py """ correct = 0 total = len(predictions) for pred, gt in zip(predictions, ground_truths): - # 假设predictions包含front和back两个答案 + # Assume predictions contain front and back answers front_answer = pred.get("front", [0, 0]) back_answer = pred.get("back", [0, 0]) - # 计算得分 + # Calculate scores score1 = front_answer[0] + back_answer[1] score2 = front_answer[1] + back_answer[0] @@ -404,70 +405,86 @@ def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[L return {"multi_solution_accuracy": accuracy} -# 为每个子集创建具体的类 +# Create specific classes for each subset class VLRMBenchAttributeHallucination(VLRMBenchBase): - """属性幻觉检测子集""" + """Attribute hallucination detection subset.""" def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchDetailError(VLRMBenchBase): - """细节错误检测子集""" + """Detail error detection subset.""" def __init__(self, dataset="VLRMBench_detail_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchStepCorrectness(VLRMBenchBase): - """步骤正确性评估子集""" + """Step correctness evaluation subset.""" def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchForesight(VLRMBenchBase): - """前瞻性推理评估子集""" + """Foresight reasoning evaluation subset.""" def __init__(self, dataset="VLRMBench_foresight", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchErrorCorrection(VLRMBenchBase): + """Error correction subset.""" + def __init__(self, dataset="VLRMBench_error_correction", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): + """Error reason analysis subset.""" + def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchExistenceHallucination(VLRMBenchBase): + """Existence hallucination detection subset.""" + def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchImageRefError(VLRMBenchBase): + """Image reference error detection subset.""" + def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchLocationError(VLRMBenchBase): + """Location error detection subset.""" + def __init__(self, dataset="VLRMBench_location_error", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchMostConfidence(VLRMBenchBase): + """Most confidence evaluation subset.""" + def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchMultiSolution(VLRMBenchBase): + """Multi-solution evaluation subset.""" + def __init__(self, dataset="VLRMBench_multi_solution", **kwargs): super().__init__(dataset=dataset, **kwargs) class VLRMBenchRedundantDet(VLRMBenchBase): + """Redundant detection subset.""" + def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): super().__init__(dataset=dataset, **kwargs) From 86fbf9b002e064978fd77190671762dabe6e13ac Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 2 Oct 2025 15:37:09 +0800 Subject: [PATCH 3/8] Fix code formatting issues --- vlmeval/dataset/vlrmbench.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 47b96cb62..4a6fad77e 100644 --- a/vlmeval/dataset/vlrmbench.py +++ 
b/vlmeval/dataset/vlrmbench.py @@ -57,7 +57,7 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): """ # Extract subset name from dataset name if dataset.startswith("VLRMBench_"): - subset = dataset[len("VLRMBench_") :] + subset = dataset[len("VLRMBench_"):] else: subset = "attribute_hallucination" # Default subset @@ -82,7 +82,7 @@ def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): for i, item in enumerate(data): if 'index' not in item: item['index'] = i - + # Convert to DataFrame format self.data = pd.DataFrame(data) @@ -111,7 +111,6 @@ def _download_and_extract(self) -> str: print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") - # Download data snapshot_download( repo_id=self.HF_REPO, @@ -313,7 +312,7 @@ def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> Lis result = [int(num) for num in numbers] if len(result) >= len(task_gt): - return result[-len(task_gt) :] + return result[-len(task_gt):] else: return result + [0] * (len(task_gt) - len(result)) @@ -369,10 +368,10 @@ def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) total = len(predictions) for pred, gt in zip(predictions, ground_truths): - if gt == True: + if gt is True: if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): correct += 1 - elif gt == False: + elif gt is False: if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): correct += 1 From dbea3ff0f65fa68fb7e6379f991926edb7b4889b Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Tue, 14 Oct 2025 22:20:09 +0800 Subject: [PATCH 4/8] Reconstructed the dataset and the evaluation code of vlrmbench --- vlmeval/dataset/__init__.py | 23 +- vlmeval/dataset/vlrmbench.py | 623 ++++++++++------------------------- 2 files changed, 178 insertions(+), 468 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 7db71e8d4..068b4fc27 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -80,21 +80,7 @@ from .m4bench import M4Bench # VLRMBench imports -from .vlrmbench import ( - VLRMBenchBase, - VLRMBenchAttributeHallucination, - VLRMBenchDetailError, - VLRMBenchStepCorrectness, - VLRMBenchForesight, - VLRMBenchErrorCorrection, - VLRMBenchErrorReasonAnalysis, - VLRMBenchExistenceHallucination, - VLRMBenchImageRefError, - VLRMBenchLocationError, - VLRMBenchMostConfidence, - VLRMBenchMultiSolution, - VLRMBenchRedundantDet, -) +from .vlrmbench import VLRMBench class ConcatDataset(ImageBaseDataset): @@ -223,11 +209,8 @@ def evaluate(self, eval_file, **judge_kwargs): MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, AyaVisionBench, TopViewRS, VLMBias, - # VLRMBench datasets - VLRMBenchAttributeHallucination, VLRMBenchDetailError, VLRMBenchStepCorrectness, - VLRMBenchForesight, VLRMBenchErrorCorrection, VLRMBenchErrorReasonAnalysis, - VLRMBenchExistenceHallucination, VLRMBenchImageRefError, VLRMBenchLocationError, - VLRMBenchMostConfidence, VLRMBenchMultiSolution, VLRMBenchRedundantDet + # VLRMBench dataset + VLRMBench ] VIDEO_DATASET = [ diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 4a6fad77e..390986fb2 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -1,489 +1,216 @@ -import os -import json -import zipfile +from ast import literal_eval +from collections import defaultdict import re -import tempfile -from pathlib import Path -from typing import Dict, List, Any, Union, 
Optional -import pandas as pd import numpy as np -from huggingface_hub import snapshot_download from sklearn.metrics import f1_score -from ..smp import * + from .image_base import ImageBaseDataset +from ..smp import * -class VLRMBenchBase(ImageBaseDataset): - """ - Base class for VLRMBench dataset. - Supports downloading and extracting data from HuggingFace, - and processing JSONL-formatted reasoning error detection data. +def format_model_answer_tolist(model_answer, task_gt): """ + 从模型答案中提取0/1列表 - MODALITY = "IMAGE" - TYPE = "VQA" # Set as VQA type to support open-ended QA - - # List of supported subsets - SUPPORTED_SUBSETS = [ - "attribute_hallucination", - "detail_error", - "error_correction", - "error_reason_analysis", - "existence_hallucination", - "foresight", - "image_ref_error", - "location_error", - "most_confidence", - "multi_solution", - "redundant_det", - "step_correctness", - ] - - # HuggingFace repository information - HF_REPO = "Winston-Yuan/VLRMBench" - - @classmethod - def supported_datasets(cls): - """Return list of supported dataset names.""" - return [f"VLRMBench_{subset}" for subset in cls.SUPPORTED_SUBSETS] - - def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): - """ - Initialize VLRMBench dataset. - - Args: - dataset: Dataset name in format VLRMBench_{subset} - **kwargs: Additional arguments - """ - # Extract subset name from dataset name - if dataset.startswith("VLRMBench_"): - subset = dataset[len("VLRMBench_"):] - else: - subset = "attribute_hallucination" # Default subset - - if subset not in self.SUPPORTED_SUBSETS: - raise ValueError(f"Unsupported subset: {subset}. Supported subsets: {self.SUPPORTED_SUBSETS}") - - self.subset = subset - self.dataset_name = dataset - - # Set data root directory - ROOT = LMUDataRoot() - self.data_root = osp.join(ROOT, "datasets", "VLRMBench") - os.makedirs(self.data_root, exist_ok=True) - - # Download and extract data - self.data_dir = self._download_and_extract() - - # Load data - data = self._load_jsonl_data() - - # Ensure data has index field - for i, item in enumerate(data): - if 'index' not in item: - item['index'] = i - - # Convert to DataFrame format - self.data = pd.DataFrame(data) - - # Set image root directory - self.img_root = osp.join(self.data_dir, "images") - - # Set evaluation mode - self.evaluation_mode = self._get_evaluation_mode() - - # Post-processing - self.post_build(dataset) - - def _download_and_extract(self) -> str: - """ - Download data from HuggingFace and extract images. - - Returns: - str: Path to the extracted data directory - """ - local_dir = osp.join(self.data_root, "VLRMBench-HF") - - # Check if already downloaded - if osp.exists(local_dir) and osp.exists(osp.join(local_dir, "benchmark_data")): - print(f"VLRMBench data already exists at {local_dir}") - return local_dir - - print(f"Downloading VLRMBench from HuggingFace: {self.HF_REPO}") + 参数: + model_answer: 模型的预测答案(字符串) + task_gt: ground truth列表,用于确定期望的长度 - # Download data - snapshot_download( - repo_id=self.HF_REPO, - repo_type="dataset", - local_dir=local_dir, - local_dir_use_symlinks=False, - tqdm_class=None, # Use default tqdm progress bar - ) - - # Extract image files - self._extract_images(local_dir) + 返回: + list: 0/1列表 + """ + numbers = re.findall(r'\d+', str(model_answer)) - print(f"VLRMBench data downloaded and extracted to {local_dir}") - return local_dir + result = [int(num) for num in numbers] - def _extract_images(self, data_dir: str): - """ - Extract image files from zip archive. 
+ # 将非0/1的数字转换为1 + result = [num if num == 0 or num == 1 else 1 for num in result] - Args: - data_dir: Path to the data directory - """ - zip_file = osp.join(data_dir, "Image.zip") - images_dir = osp.join(data_dir, "images") + # 调整长度以匹配task_gt + if len(result) >= len(task_gt): + return result[:len(task_gt)] + else: + return result + [0] * (len(task_gt) - len(result)) - if osp.exists(images_dir): - print("Images already extracted") - return - if not osp.exists(zip_file): - raise FileNotFoundError(f"Image.zip not found at {zip_file}") +def get_F1Score(gathered_model_answer, gathered_task_gt): + """ + 计算F1分数 - print(f"Extracting images from {zip_file}") - os.makedirs(images_dir, exist_ok=True) + 参数: + gathered_model_answer: 所有模型答案的列表 + gathered_task_gt: 所有ground truth的列表 - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(images_dir) + 返回: + tuple: (F1_pos, F1_neg, F1_w) - 正类F1、负类F1、加权F1 + """ + model_answer = np.array(gathered_model_answer) + task_gt = np.array(gathered_task_gt) - print(f"Images extracted to {images_dir}") + pos_count = np.sum(task_gt == 1) + neg_count = np.sum(task_gt == 0) - def _load_jsonl_data(self) -> List[Dict]: - """ - Load JSONL data file. + F1_pos = f1_score(task_gt, model_answer, pos_label=1, zero_division=0) + F1_neg = f1_score(task_gt, model_answer, pos_label=0, zero_division=0) - Returns: - List[Dict]: List of loaded data items - """ - jsonl_file = osp.join(self.data_dir, "benchmark_data", f"{self.subset}.jsonl") + w_pos = neg_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0 + w_neg = pos_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0 - if not osp.exists(jsonl_file): - raise FileNotFoundError(f"JSONL file not found: {jsonl_file}") + F1_w = w_neg * F1_neg + w_pos * F1_pos - data = [] - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - data.append(json.loads(line)) + return F1_pos, F1_neg, F1_w - print(f"Loaded {len(data)} samples from {self.subset}") - return data - def _get_evaluation_mode(self) -> str: - """ - Determine evaluation mode based on subset. 
+class VLRMBench(ImageBaseDataset): + """ + VLRMBench Dataset - Visual Language Reasoning Model Benchmark + + A comprehensive benchmark for evaluating visual reasoning capabilities including: + - step_correctness: 步骤正确性检测 + - redundant_det: 冗余检测 + - most_confidence: 最高置信度判断 + - attribute_hallucination: 属性幻觉检测 + - existence_hallucination: 存在性幻觉检测 + - detail_error: 细节错误检测 + - image_ref_error: 图像引用错误检测 + - location_error: 位置错误检测 + """ - Returns: - str: Evaluation mode - """ - # Foresight reasoning uses task-level evaluation - if self.subset == "foresight": - return "foresight" - - # Multi-solution task uses special evaluation - if self.subset == "multi_solution": - return "multi_solution" - - # Generation tasks use judge evaluation (skipped for now) - if self.subset in ["error_correction", "error_reason_analysis"]: - return "generation" - - # Other subsets use binary classification evaluation - return "binary_classification" - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = dict(self.data.iloc[idx]) - - # Process image paths - if "image" in item and isinstance(item["image"], list): - # Convert relative paths to absolute paths - image_paths = [] - for img_path in item["image"]: - full_path = osp.join(self.img_root, img_path) - if osp.exists(full_path): - image_paths.append(full_path) - else: - print(f"Warning: Image not found: {full_path}") - - item["image"] = image_paths[0] if len(image_paths) == 1 else image_paths - - return item - - def post_build(self, dataset): - """Post-processing to set dataset-specific attributes.""" - # Set evaluation metrics - if self.evaluation_mode == "binary_classification": - self.metrics = ["f1_positive", "f1_negative", "f1_weighted", "step_accuracy"] - elif self.evaluation_mode == "foresight": - self.metrics = ["task_accuracy"] - elif self.evaluation_mode == "multi_solution": - self.metrics = ["multi_solution_accuracy"] - elif self.evaluation_mode == "generation": - self.metrics = ["win_rate", "judge_score"] - else: - self.metrics = ["overall_accuracy"] + TYPE = 'VQA' + DATASET_URL = { + 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' + } + DATASET_MD5 = { + 'VLRMBench': None # 可以后续添加MD5校验 + } def build_prompt(self, line): """ - Build prompt based on evaluation mode. + 构建提示信息 - Args: - line: Data row + 参数: + line: 数据行,可以是int索引或pd.Series - Returns: - str: Constructed prompt - """ - question = line["question"] - - if self.evaluation_mode == "binary_classification": - step_list = line.get("step_list", []) - prompt = f"Question: {question}\n\nReasoning Steps:\n" - for i, step in enumerate(step_list, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nPlease identify which steps contain errors. Output format: [0,1,0,1,...]" - - elif self.evaluation_mode == "foresight": - step_list = line.get("step_list", []) - prompt = f"Question: {question}\n\nReasoning Steps:\n" - for i, step in enumerate(step_list, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nDoes this reasoning show good foresight? Answer: yes/no" - - elif self.evaluation_mode == "multi_solution": - prompt = f"Question: {question}\n\nPlease provide two different solution approaches." - - elif self.evaluation_mode == "generation": - reasoning_error = line.get("reasoning_error", []) - prompt = f"Question: {question}\n\nReasoning with errors:\n" - for i, step in enumerate(reasoning_error, 1): - prompt += f"Step {i}: {step}\n" - prompt += "\nPlease analyze and correct the errors in this reasoning." 
- - return prompt - - def format_model_answer(self, model_answer: str, task_gt: List) -> List[int]: + 返回: + list: 多模态消息列表,格式为 [dict(type='image', value=path), dict(type='text', value=text), ...] """ - Format model answer based on original evaluation script logic. + if isinstance(line, int): + line = self.data.iloc[line] - Args: - model_answer: Raw model output - task_gt: Ground truth labels + # 创建line的副本避免SettingWithCopyWarning + line = line.copy() - Returns: - List[int]: Formatted prediction results - """ - if self.evaluation_mode == "binary_classification": - return self._format_binary_classification_answer(model_answer, task_gt) - elif self.evaluation_mode == "multi_solution": - return self._format_multi_solution_answer(model_answer, task_gt) - else: - return [] + # 使用父类方法保存图片(从base64解码并保存到本地) + tgt_path = self.dump_image(line) - def _format_binary_classification_answer(self, model_answer: str, task_gt: List) -> List[int]: - """ - Format binary classification task model answer. - Based on format_model_answer_tolist function in get_sc_mc_rd_eval_res.py - """ - # Extract numbers - numbers = re.findall(r"\d+", model_answer) - result = [int(num) for num in numbers] - - # Convert non-0/1 numbers to 1 - result = [num if num == 0 or num == 1 else 1 for num in result] - - # Adjust length to match ground truth - if len(result) >= len(task_gt): - return result[: len(task_gt)] + # 构建消息 + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) else: - return result + [0] * (len(task_gt) - len(result)) + msgs = [dict(type='image', value=tgt_path)] - def _format_multi_solution_answer(self, model_answer: str, task_gt: List) -> List[int]: - """ - Format multi-solution task model answer. - Based on format_ms_ec_era_model_answer_tolist function in get_ms_eval_res.py - """ - numbers = re.findall(r"\d+", model_answer) - result = [int(num) for num in numbers] - - if len(result) >= len(task_gt): - return result[-len(task_gt):] - else: - return result + [0] * (len(task_gt) - len(result)) - - def evaluate_binary_classification( - self, predictions: List[List[int]], ground_truths: List[List[int]] - ) -> Dict[str, float]: - """ - Evaluate binary classification tasks. 
- Based on evaluation logic in get_sc_mc_rd_eval_res.py - """ - # Flatten all predictions and ground truths - flat_predictions = [] - flat_ground_truths = [] + # 添加问题文本 + question = line.get('question', '') + if question: + msgs.append(dict(type='text', value=question)) - for pred, gt in zip(predictions, ground_truths): - flat_predictions.extend(pred) - flat_ground_truths.extend(gt) + return msgs - # Convert to numpy arrays - pred_array = np.array(flat_predictions) - gt_array = np.array(flat_ground_truths) - - # Calculate F1 scores - f1_pos = f1_score(gt_array, pred_array, pos_label=1) - f1_neg = f1_score(gt_array, pred_array, pos_label=0) - - # Calculate weighted F1 - pos_count = np.sum(gt_array == 1) - neg_count = np.sum(gt_array == 0) - total_count = pos_count + neg_count - - if total_count > 0: - f1_weighted = (f1_pos * pos_count + f1_neg * neg_count) / total_count + @classmethod + def evaluate(cls, eval_file, **judge_kwargs): + """ + 评估模型预测结果 + + 参数: + eval_file: 模型预测结果文件路径 + **judge_kwargs: 其他评估参数 + + 返回: + pd.DataFrame: 评估结果,包含各类别的F1分数 + """ + # 加载预测数据 + data = load(eval_file) + + # 确保必要的字段存在 + assert 'answer' in data.columns, "评估文件缺少 'answer' 字段" + assert 'prediction' in data.columns, "评估文件缺少 'prediction' 字段" + assert 'category' in data.columns, "评估文件缺少 'category' 字段" + + # 按类别收集模型答案和ground truth + category_model_answers = defaultdict(list) + category_task_gts = defaultdict(list) + category_total = defaultdict(int) + + for idx in range(len(data)): + item = data.iloc[idx] + category = item['category'] + + try: + # 解析task_gt(answer字段) + task_gt = item['answer'] + if isinstance(task_gt, str): + # 尝试将字符串解析为列表 + task_gt = literal_eval(task_gt) + + # 获取模型答案(prediction字段) + model_answer = item.get('prediction', '') + + # 使用format_model_answer_tolist格式化模型答案 + formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) + + # 收集每个类别的答案 + category_task_gts[category].extend(task_gt) + category_model_answers[category].extend(formatted_model_answer) + category_total[category] += 1 + except Exception as e: + # 如果解析失败,记录并跳过该样本 + print(f"处理样本失败 (idx={idx}, category={category}): {e}") + continue + + # 计算各类别的F1分数 + results = {} + for category in category_task_gts: + gathered_task_gt = category_task_gts[category] + gathered_model_answer = category_model_answers[category] + + if len(gathered_task_gt) > 0: + F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + + results[f'{category}_F1_pos'] = F1_pos + results[f'{category}_F1_neg'] = F1_neg + results[f'{category}_F1_weighted'] = F1_w + results[f'{category}_count'] = category_total[category] + else: + results[f'{category}_F1_pos'] = 0.0 + results[f'{category}_F1_neg'] = 0.0 + results[f'{category}_F1_weighted'] = 0.0 + results[f'{category}_count'] = 0 + + # 计算总体F1分数(所有类别合并) + all_task_gts = [] + all_model_answers = [] + for category in category_task_gts: + all_task_gts.extend(category_task_gts[category]) + all_model_answers.extend(category_model_answers[category]) + + if len(all_task_gts) > 0: + F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) + results['Overall_F1_pos'] = F1_pos_overall + results['Overall_F1_neg'] = F1_neg_overall + results['Overall_F1_weighted'] = F1_w_overall + results['Overall_count'] = sum(category_total.values()) else: - f1_weighted = 0.0 - - # Calculate step accuracy - step_accuracy = np.mean(pred_array == gt_array) - - return { - "f1_positive": f1_pos, - "f1_negative": f1_neg, - "f1_weighted": f1_weighted, - "step_accuracy": step_accuracy, - } - - 
def evaluate_foresight(self, predictions: List[str], ground_truths: List[bool]) -> Dict[str, float]: - """ - Evaluate foresight reasoning tasks. - Based on evaluation logic in get_fores_eval_res.py - """ - correct = 0 - total = len(predictions) - - for pred, gt in zip(predictions, ground_truths): - if gt is True: - if re.search(r"\b(yes|true)\b", pred, re.IGNORECASE): - correct += 1 - elif gt is False: - if re.search(r"\b(no|false)\b", pred, re.IGNORECASE): - correct += 1 - - accuracy = correct / total if total > 0 else 0.0 - - return {"task_accuracy": accuracy} - - def evaluate_multi_solution(self, predictions: List[Dict], ground_truths: List[List[int]]) -> Dict[str, float]: - """ - Evaluate multi-solution tasks. - Based on evaluation logic in get_ms_eval_res.py - """ - correct = 0 - total = len(predictions) - - for pred, gt in zip(predictions, ground_truths): - # Assume predictions contain front and back answers - front_answer = pred.get("front", [0, 0]) - back_answer = pred.get("back", [0, 0]) - - # Calculate scores - score1 = front_answer[0] + back_answer[1] - score2 = front_answer[1] + back_answer[0] - - if score1 > score2: - correct += 1 - - accuracy = correct / total if total > 0 else 0.0 - - return {"multi_solution_accuracy": accuracy} - - -# Create specific classes for each subset -class VLRMBenchAttributeHallucination(VLRMBenchBase): - """Attribute hallucination detection subset.""" - - def __init__(self, dataset="VLRMBench_attribute_hallucination", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchDetailError(VLRMBenchBase): - """Detail error detection subset.""" - - def __init__(self, dataset="VLRMBench_detail_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchStepCorrectness(VLRMBenchBase): - """Step correctness evaluation subset.""" - - def __init__(self, dataset="VLRMBench_step_correctness", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchForesight(VLRMBenchBase): - """Foresight reasoning evaluation subset.""" - - def __init__(self, dataset="VLRMBench_foresight", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchErrorCorrection(VLRMBenchBase): - """Error correction subset.""" - - def __init__(self, dataset="VLRMBench_error_correction", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchErrorReasonAnalysis(VLRMBenchBase): - """Error reason analysis subset.""" - - def __init__(self, dataset="VLRMBench_error_reason_analysis", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchExistenceHallucination(VLRMBenchBase): - """Existence hallucination detection subset.""" - - def __init__(self, dataset="VLRMBench_existence_hallucination", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchImageRefError(VLRMBenchBase): - """Image reference error detection subset.""" - - def __init__(self, dataset="VLRMBench_image_ref_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchLocationError(VLRMBenchBase): - """Location error detection subset.""" - - def __init__(self, dataset="VLRMBench_location_error", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchMostConfidence(VLRMBenchBase): - """Most confidence evaluation subset.""" - - def __init__(self, dataset="VLRMBench_most_confidence", **kwargs): - super().__init__(dataset=dataset, **kwargs) - - -class VLRMBenchMultiSolution(VLRMBenchBase): - """Multi-solution evaluation subset.""" - - def 
__init__(self, dataset="VLRMBench_multi_solution", **kwargs): - super().__init__(dataset=dataset, **kwargs) + results['Overall_F1_pos'] = 0.0 + results['Overall_F1_neg'] = 0.0 + results['Overall_F1_weighted'] = 0.0 + results['Overall_count'] = 0 + # 转换为DataFrame格式 + results_df = pd.DataFrame([results]) -class VLRMBenchRedundantDet(VLRMBenchBase): - """Redundant detection subset.""" + # 保存结果 + score_file = eval_file.replace('.xlsx', '_scores.csv') + dump(results_df, score_file) - def __init__(self, dataset="VLRMBench_redundant_det", **kwargs): - super().__init__(dataset=dataset, **kwargs) + return results_df From ee9f8cce77f04fee932cfc9a46f88016a2c89958 Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Wed, 15 Oct 2025 18:05:48 +0800 Subject: [PATCH 5/8] Refactor and translate comments in VLRMBench dataset code to English --- vlmeval/dataset/__init__.py | 2 - vlmeval/dataset/vlrmbench.py | 113 ++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 068b4fc27..9b706e907 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -78,8 +78,6 @@ from .mmifeval import MMIFEval from .chartmimic import ChartMimic from .m4bench import M4Bench - -# VLRMBench imports from .vlrmbench import VLRMBench diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 390986fb2..91f7c150b 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -6,27 +6,28 @@ from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path def format_model_answer_tolist(model_answer, task_gt): """ - 从模型答案中提取0/1列表 + Extract 0/1 list from model answer - 参数: - model_answer: 模型的预测答案(字符串) - task_gt: ground truth列表,用于确定期望的长度 + Args: + model_answer: Model's prediction answer (string) + task_gt: Ground truth list, used to determine expected length - 返回: - list: 0/1列表 + Returns: + list: 0/1 list """ numbers = re.findall(r'\d+', str(model_answer)) result = [int(num) for num in numbers] - # 将非0/1的数字转换为1 + # Convert non-0/1 numbers to 1 result = [num if num == 0 or num == 1 else 1 for num in result] - # 调整长度以匹配task_gt + # Adjust length to match task_gt if len(result) >= len(task_gt): return result[:len(task_gt)] else: @@ -35,14 +36,14 @@ def format_model_answer_tolist(model_answer, task_gt): def get_F1Score(gathered_model_answer, gathered_task_gt): """ - 计算F1分数 + Calculate F1 score - 参数: - gathered_model_answer: 所有模型答案的列表 - gathered_task_gt: 所有ground truth的列表 + Args: + gathered_model_answer: List of all model answers + gathered_task_gt: List of all ground truth - 返回: - tuple: (F1_pos, F1_neg, F1_w) - 正类F1、负类F1、加权F1 + Returns: + tuple: (F1_pos, F1_neg, F1_w) - positive class F1, negative class F1, weighted F1 """ model_answer = np.array(gathered_model_answer) task_gt = np.array(gathered_task_gt) @@ -66,14 +67,14 @@ class VLRMBench(ImageBaseDataset): VLRMBench Dataset - Visual Language Reasoning Model Benchmark A comprehensive benchmark for evaluating visual reasoning capabilities including: - - step_correctness: 步骤正确性检测 - - redundant_det: 冗余检测 - - most_confidence: 最高置信度判断 - - attribute_hallucination: 属性幻觉检测 - - existence_hallucination: 存在性幻觉检测 - - detail_error: 细节错误检测 - - image_ref_error: 图像引用错误检测 - - location_error: 位置错误检测 + - step_correctness: Step correctness detection + - redundant_det: Redundancy detection + - most_confidence: Highest confidence judgment + - attribute_hallucination: Attribute hallucination detection + - 
existence_hallucination: Existence hallucination detection + - detail_error: Detail error detection + - image_ref_error: Image reference error detection + - location_error: Location error detection """ TYPE = 'VQA' @@ -81,36 +82,36 @@ class VLRMBench(ImageBaseDataset): 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' } DATASET_MD5 = { - 'VLRMBench': None # 可以后续添加MD5校验 + 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2' } def build_prompt(self, line): """ - 构建提示信息 + Build prompt information - 参数: - line: 数据行,可以是int索引或pd.Series + Args: + line: Data row, can be int index or pd.Series - 返回: - list: 多模态消息列表,格式为 [dict(type='image', value=path), dict(type='text', value=text), ...] + Returns: + list: Multimodal message list, format is [dict(type='image', value=path), dict(type='text', value=text),] """ if isinstance(line, int): line = self.data.iloc[line] - # 创建line的副本避免SettingWithCopyWarning + # Create a copy of line to avoid SettingWithCopyWarning line = line.copy() - # 使用父类方法保存图片(从base64解码并保存到本地) + # Use parent class method to save image (decode from base64 and save locally) tgt_path = self.dump_image(line) - # 构建消息 + # Build messages msgs = [] if isinstance(tgt_path, list): msgs.extend([dict(type='image', value=p) for p in tgt_path]) else: msgs = [dict(type='image', value=tgt_path)] - # 添加问题文本 + # Add question text question = line.get('question', '') if question: msgs.append(dict(type='text', value=question)) @@ -120,24 +121,24 @@ def build_prompt(self, line): @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ - 评估模型预测结果 + Evaluate model prediction results - 参数: - eval_file: 模型预测结果文件路径 - **judge_kwargs: 其他评估参数 + Args: + eval_file: Path to model prediction results file + **judge_kwargs: Other evaluation parameters - 返回: - pd.DataFrame: 评估结果,包含各类别的F1分数 + Returns: + pd.DataFrame: Evaluation results, including F1 scores for each category """ - # 加载预测数据 + # Load prediction data data = load(eval_file) - # 确保必要的字段存在 - assert 'answer' in data.columns, "评估文件缺少 'answer' 字段" - assert 'prediction' in data.columns, "评估文件缺少 'prediction' 字段" - assert 'category' in data.columns, "评估文件缺少 'category' 字段" + # Ensure necessary fields exist + assert 'answer' in data.columns, "Evaluation file missing 'answer' field" + assert 'prediction' in data.columns, "Evaluation file missing 'prediction' field" + assert 'category' in data.columns, "Evaluation file missing 'category' field" - # 按类别收集模型答案和ground truth + # Collect model answers and ground truth by category category_model_answers = defaultdict(list) category_task_gts = defaultdict(list) category_total = defaultdict(int) @@ -147,28 +148,28 @@ def evaluate(cls, eval_file, **judge_kwargs): category = item['category'] try: - # 解析task_gt(answer字段) + # Parse task_gt (answer field) task_gt = item['answer'] if isinstance(task_gt, str): - # 尝试将字符串解析为列表 + # Try to parse string as list task_gt = literal_eval(task_gt) - # 获取模型答案(prediction字段) + # Get model answer (prediction field) model_answer = item.get('prediction', '') - # 使用format_model_answer_tolist格式化模型答案 + # Format model answer using format_model_answer_tolist formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) - # 收集每个类别的答案 + # Collect answers for each category category_task_gts[category].extend(task_gt) category_model_answers[category].extend(formatted_model_answer) category_total[category] += 1 except Exception as e: - # 如果解析失败,记录并跳过该样本 - print(f"处理样本失败 (idx={idx}, category={category}): {e}") + # If parsing fails, log and 
skip the sample + print(f"Failed to process sample (idx={idx}, category={category}): {e}") continue - # 计算各类别的F1分数 + # Calculate F1 scores for each category results = {} for category in category_task_gts: gathered_task_gt = category_task_gts[category] @@ -187,7 +188,7 @@ def evaluate(cls, eval_file, **judge_kwargs): results[f'{category}_F1_weighted'] = 0.0 results[f'{category}_count'] = 0 - # 计算总体F1分数(所有类别合并) + # Calculate overall F1 score (all categories combined) all_task_gts = [] all_model_answers = [] for category in category_task_gts: @@ -206,11 +207,11 @@ def evaluate(cls, eval_file, **judge_kwargs): results['Overall_F1_weighted'] = 0.0 results['Overall_count'] = 0 - # 转换为DataFrame格式 + # Convert to DataFrame format results_df = pd.DataFrame([results]) - # 保存结果 - score_file = eval_file.replace('.xlsx', '_scores.csv') + # Save results + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') dump(results_df, score_file) return results_df From 52b88d33f15bfd5524cdfdf0761422f3ee78ebee Mon Sep 17 00:00:00 2001 From: miaomiao <147327746+Winston-Yuan@users.noreply.github.com> Date: Wed, 15 Oct 2025 18:14:12 +0800 Subject: [PATCH 6/8] Consolidate dataset imports in __init__.py --- vlmeval/dataset/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 9b706e907..c6dc8a1ad 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -206,9 +206,7 @@ def evaluate(self, eval_file, **judge_kwargs): ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset, MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench, OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, - AyaVisionBench, TopViewRS, VLMBias, - # VLRMBench dataset - VLRMBench + AyaVisionBench, TopViewRS, VLMBias, VLRMBench ] VIDEO_DATASET = [ From 9cc6032adfda2704b413dface5d057aa8f6f1d77 Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 16 Oct 2025 17:57:40 +0800 Subject: [PATCH 7/8] Add multi_solution evaluation support in VLRMBench dataset --- vlmeval/dataset/vlrmbench.py | 249 +++++++++++++++++++++++++---------- 1 file changed, 182 insertions(+), 67 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index 91f7c150b..dbff3d225 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -34,6 +34,28 @@ def format_model_answer_tolist(model_answer, task_gt): return result + [0] * (len(task_gt) - len(result)) +def format_ms_model_answer(model_answer): + """ + Extract two scores from multi_solution model answer + + Args: + model_answer: Model's prediction answer (string) + Expected format: "[7, 8]" or "7, 8" or "Score 1: 7, Score 2: 8" + + Returns: + list: Two scores [score1, score2] + """ + numbers = re.findall(r'\d+', str(model_answer)) + result = [int(num) for num in numbers] + + # Return last two numbers (most likely the actual scores) + if len(result) >= 2: + return result[-2:] + else: + # If less than 2 numbers found, pad with 0 + return result + [0] * (2 - len(result)) + + def get_F1Score(gathered_model_answer, gathered_task_gt): """ Calculate F1 score @@ -75,14 +97,19 @@ class VLRMBench(ImageBaseDataset): - detail_error: Detail error detection - image_ref_error: Image reference error detection - location_error: Location error detection + - multi_solution: Position bias resistance evaluation """ TYPE = 'VQA' DATASET_URL = { - 'VLRMBench': 
'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv?download=true' + 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv', + 'VLRMBench_MultiSolution': ( + 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_MultiSolution.tsv' + ) } DATASET_MD5 = { - 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2' + 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2', + 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387' } def build_prompt(self, line): @@ -118,94 +145,182 @@ def build_prompt(self, line): return msgs + @classmethod + def evaluate_multi_solution(cls, data): + """ + Evaluate multi_solution type data (position bias resistance) + + Args: + data: DataFrame containing multi_solution predictions + + Returns: + dict: Evaluation results with accuracy metric + """ + acc_sample = 0 + overall_sample = 0 + skipped = 0 + + # Group by index pairs (front: even, back: odd) + indices = sorted(data['index'].unique()) + + for i in range(0, len(indices), 2): + if i + 1 >= len(indices): + skipped += 1 + continue + + front_idx = indices[i] + back_idx = indices[i + 1] + + # Get front and back rows + front_rows = data[data['index'] == front_idx] + back_rows = data[data['index'] == back_idx] + + if len(front_rows) == 0 or len(back_rows) == 0: + skipped += 1 + continue + + front_row = front_rows.iloc[0] + back_row = back_rows.iloc[0] + + # Verify order field if exists + if 'order' in data.columns: + if front_row.get('order') != 'front' or back_row.get('order') != 'back': + print(f"Warning: Order mismatch at index {front_idx}, {back_idx}") + skipped += 1 + continue + + try: + # Parse model predictions + front_scores = format_ms_model_answer(front_row.get('prediction', '')) + back_scores = format_ms_model_answer(back_row.get('prediction', '')) + + # Apply evaluation formula: front[0] + back[1] vs front[1] + back[0] + # This checks if model consistently prefers the better response regardless of position + if front_scores[0] + back_scores[1] > front_scores[1] + back_scores[0]: + acc_sample += 1 + + overall_sample += 1 + except Exception as e: + print(f"Failed to process multi_solution pair ({front_idx}, {back_idx}): {e}") + skipped += 1 + + results = { + 'multi_solution_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0, + 'multi_solution_count': overall_sample, + 'multi_solution_skipped': skipped + } + + return results + @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ Evaluate model prediction results + Automatically detects and handles both step-based and multi_solution data Args: eval_file: Path to model prediction results file **judge_kwargs: Other evaluation parameters Returns: - pd.DataFrame: Evaluation results, including F1 scores for each category + pd.DataFrame: Evaluation results, including F1 scores and/or accuracy """ # Load prediction data data = load(eval_file) # Ensure necessary fields exist - assert 'answer' in data.columns, "Evaluation file missing 'answer' field" assert 'prediction' in data.columns, "Evaluation file missing 'prediction' field" assert 'category' in data.columns, "Evaluation file missing 'category' field" - # Collect model answers and ground truth by category - category_model_answers = defaultdict(list) - category_task_gts = defaultdict(list) - category_total = defaultdict(int) - - for idx in range(len(data)): - item = data.iloc[idx] - category = item['category'] + # Detect data types + categories = data['category'].unique() + has_multi_solution = 
'multi_solution' in categories + has_step_based = any(c != 'multi_solution' for c in categories) - try: - # Parse task_gt (answer field) - task_gt = item['answer'] - if isinstance(task_gt, str): - # Try to parse string as list - task_gt = literal_eval(task_gt) - - # Get model answer (prediction field) - model_answer = item.get('prediction', '') - - # Format model answer using format_model_answer_tolist - formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) - - # Collect answers for each category - category_task_gts[category].extend(task_gt) - category_model_answers[category].extend(formatted_model_answer) - category_total[category] += 1 - except Exception as e: - # If parsing fails, log and skip the sample - print(f"Failed to process sample (idx={idx}, category={category}): {e}") - continue - - # Calculate F1 scores for each category results = {} - for category in category_task_gts: - gathered_task_gt = category_task_gts[category] - gathered_model_answer = category_model_answers[category] - if len(gathered_task_gt) > 0: - F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + # Process step-based categories + if has_step_based: + # Filter step-based data + step_data = data[data['category'] != 'multi_solution'] - results[f'{category}_F1_pos'] = F1_pos - results[f'{category}_F1_neg'] = F1_neg - results[f'{category}_F1_weighted'] = F1_w - results[f'{category}_count'] = category_total[category] + # Ensure answer field exists for step-based data + if 'answer' not in step_data.columns: + print("Warning: Step-based data missing 'answer' field, skipping step-based evaluation") else: - results[f'{category}_F1_pos'] = 0.0 - results[f'{category}_F1_neg'] = 0.0 - results[f'{category}_F1_weighted'] = 0.0 - results[f'{category}_count'] = 0 - - # Calculate overall F1 score (all categories combined) - all_task_gts = [] - all_model_answers = [] - for category in category_task_gts: - all_task_gts.extend(category_task_gts[category]) - all_model_answers.extend(category_model_answers[category]) - - if len(all_task_gts) > 0: - F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) - results['Overall_F1_pos'] = F1_pos_overall - results['Overall_F1_neg'] = F1_neg_overall - results['Overall_F1_weighted'] = F1_w_overall - results['Overall_count'] = sum(category_total.values()) - else: - results['Overall_F1_pos'] = 0.0 - results['Overall_F1_neg'] = 0.0 - results['Overall_F1_weighted'] = 0.0 - results['Overall_count'] = 0 + # Collect model answers and ground truth by category + category_model_answers = defaultdict(list) + category_task_gts = defaultdict(list) + category_total = defaultdict(int) + + for idx in range(len(step_data)): + item = step_data.iloc[idx] + category = item['category'] + + try: + # Parse task_gt (answer field) + task_gt = item['answer'] + if isinstance(task_gt, str): + # Try to parse string as list + task_gt = literal_eval(task_gt) + + # Get model answer (prediction field) + model_answer = item.get('prediction', '') + + # Format model answer using format_model_answer_tolist + formatted_model_answer = format_model_answer_tolist(model_answer, task_gt) + + # Collect answers for each category + category_task_gts[category].extend(task_gt) + category_model_answers[category].extend(formatted_model_answer) + category_total[category] += 1 + except Exception as e: + # If parsing fails, log and skip the sample + print(f"Failed to process sample (idx={idx}, category={category}): {e}") + continue + + # Calculate F1 scores for 
each category + for category in category_task_gts: + gathered_task_gt = category_task_gts[category] + gathered_model_answer = category_model_answers[category] + + if len(gathered_task_gt) > 0: + F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt) + + results[f'{category}_F1_pos'] = F1_pos + results[f'{category}_F1_neg'] = F1_neg + results[f'{category}_F1_weighted'] = F1_w + results[f'{category}_count'] = category_total[category] + else: + results[f'{category}_F1_pos'] = 0.0 + results[f'{category}_F1_neg'] = 0.0 + results[f'{category}_F1_weighted'] = 0.0 + results[f'{category}_count'] = 0 + + # Calculate overall F1 score (all step-based categories combined) + all_task_gts = [] + all_model_answers = [] + for category in category_task_gts: + all_task_gts.extend(category_task_gts[category]) + all_model_answers.extend(category_model_answers[category]) + + if len(all_task_gts) > 0: + F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts) + results['Overall_F1_pos'] = F1_pos_overall + results['Overall_F1_neg'] = F1_neg_overall + results['Overall_F1_weighted'] = F1_w_overall + results['Overall_count'] = sum(category_total.values()) + else: + results['Overall_F1_pos'] = 0.0 + results['Overall_F1_neg'] = 0.0 + results['Overall_F1_weighted'] = 0.0 + results['Overall_count'] = 0 + + # Process multi_solution category + if has_multi_solution: + ms_data = data[data['category'] == 'multi_solution'] + ms_results = cls.evaluate_multi_solution(ms_data) + results.update(ms_results) # Convert to DataFrame format results_df = pd.DataFrame([results]) From 29ba4bb598387788582c080070a71dfb7ba9aa1e Mon Sep 17 00:00:00 2001 From: Yuan Wenzhen Date: Thu, 23 Oct 2025 21:30:03 +0800 Subject: [PATCH 8/8] Add foresight evaluation support in VLRMBench dataset Updated initialization to warn about supported task types. Fix and update the md5 --- vlmeval/dataset/vlrmbench.py | 79 ++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py index dbff3d225..14f3961a4 100644 --- a/vlmeval/dataset/vlrmbench.py +++ b/vlmeval/dataset/vlrmbench.py @@ -98,6 +98,10 @@ class VLRMBench(ImageBaseDataset): - image_ref_error: Image reference error detection - location_error: Location error detection - multi_solution: Position bias resistance evaluation + - foresight: Reasoning foresight capability evaluation + + Note: Currently only supports Outcome-based tasks and Step-based tasks. + Criticism-based tasks are not supported in this implementation. """ TYPE = 'VQA' @@ -105,13 +109,30 @@ class VLRMBench(ImageBaseDataset): 'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv', 'VLRMBench_MultiSolution': ( 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_MultiSolution.tsv' + ), + 'VLRMBench_Foresight': ( + 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_Foresight.tsv' ) } DATASET_MD5 = { 'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2', - 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387' + 'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387', + 'VLRMBench_Foresight': '1e22f1b94afbd6f4f3a4028c91749311' } + def __init__(self, **kwargs): + """ + Initialize VLRMBench dataset with warning about supported task types. + """ + import warnings + warnings.warn( + "VLRMBench currently only supports Outcome-based tasks and Step-based tasks. 
" + "Criticism-based tasks are not supported in this implementation.", + UserWarning, + stacklevel=2 + ) + super().__init__(**kwargs) + def build_prompt(self, line): """ Build prompt information @@ -212,11 +233,54 @@ def evaluate_multi_solution(cls, data): return results + @classmethod + def evaluate_foresight(cls, data): + """ + Evaluate foresight type data (reasoning foresight capability) + + Args: + data: DataFrame containing foresight predictions + + Returns: + dict: Evaluation results with accuracy metric + """ + acc_sample = 0 + overall_sample = 0 + skipped = 0 + + for idx in range(len(data)): + item = data.iloc[idx] + + try: + task_gt = item['task_gt'] # True/False + model_answer = item.get('prediction', '') + + # 关键词匹配逻辑(与get_fores_eval_res.py一致) + if task_gt is True: + if re.search(r'\b(yes|true)\b', model_answer, re.IGNORECASE): + acc_sample += 1 + elif task_gt is False: + if re.search(r'\b(no|false)\b', model_answer, re.IGNORECASE): + acc_sample += 1 + + overall_sample += 1 + except Exception as e: + print(f"Failed to process foresight sample (idx={idx}): {e}") + skipped += 1 + + results = { + 'foresight_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0, + 'foresight_count': overall_sample, + 'foresight_skipped': skipped + } + + return results + @classmethod def evaluate(cls, eval_file, **judge_kwargs): """ Evaluate model prediction results - Automatically detects and handles both step-based and multi_solution data + Automatically detects and handles step-based, multi_solution, and foresight data Args: eval_file: Path to model prediction results file @@ -235,14 +299,15 @@ def evaluate(cls, eval_file, **judge_kwargs): # Detect data types categories = data['category'].unique() has_multi_solution = 'multi_solution' in categories - has_step_based = any(c != 'multi_solution' for c in categories) + has_foresight = 'foresight' in categories + has_step_based = any(c not in ['multi_solution', 'foresight'] for c in categories) results = {} # Process step-based categories if has_step_based: # Filter step-based data - step_data = data[data['category'] != 'multi_solution'] + step_data = data[~data['category'].isin(['multi_solution', 'foresight'])] # Ensure answer field exists for step-based data if 'answer' not in step_data.columns: @@ -322,6 +387,12 @@ def evaluate(cls, eval_file, **judge_kwargs): ms_results = cls.evaluate_multi_solution(ms_data) results.update(ms_results) + # Process foresight category + if has_foresight: + foresight_data = data[data['category'] == 'foresight'] + foresight_results = cls.evaluate_foresight(foresight_data) + results.update(foresight_results) + # Convert to DataFrame format results_df = pd.DataFrame([results])