diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 0007b2561..353f81a95 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -80,6 +80,7 @@ from .mmifeval import MMIFEval
 from .chartmimic import ChartMimic
 from .m4bench import M4Bench
+from .vlrmbench import VLRMBench
 from .mmhelix import MMHELIX
 from .medqbench_mcq import MedqbenchMCQDataset
 from .medqbench_caption import MedqbenchCaptionDataset
@@ -218,7 +219,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench, AyaVisionBench,
     TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MedqbenchPairedDescriptionDataset,
     MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
-    olmOCRBench, OceanOCRBench, MATBench
+    olmOCRBench, OceanOCRBench, MATBench, VLRMBench
 ]
 
 VIDEO_DATASET = [
diff --git a/vlmeval/dataset/vlrmbench.py b/vlmeval/dataset/vlrmbench.py
new file mode 100644
index 000000000..14f3961a4
--- /dev/null
+++ b/vlmeval/dataset/vlrmbench.py
@@ -0,0 +1,403 @@
+from ast import literal_eval
+from collections import defaultdict
+import re
+import numpy as np
+from sklearn.metrics import f1_score
+
+from .image_base import ImageBaseDataset
+from ..smp import *
+from ..smp.file import get_intermediate_file_path
+
+
+def format_model_answer_tolist(model_answer, task_gt):
+    """
+    Extract a 0/1 list from a model answer
+
+    Args:
+        model_answer: Model's prediction answer (string)
+        task_gt: Ground-truth list, used to determine the expected length
+
+    Returns:
+        list: 0/1 list
+    """
+    numbers = re.findall(r'\d+', str(model_answer))
+
+    result = [int(num) for num in numbers]
+
+    # Convert non-0/1 numbers to 1
+    result = [num if num == 0 or num == 1 else 1 for num in result]
+
+    # Adjust length to match task_gt
+    if len(result) >= len(task_gt):
+        return result[:len(task_gt)]
+    else:
+        return result + [0] * (len(task_gt) - len(result))
+
+
+def format_ms_model_answer(model_answer):
+    """
+    Extract two scores from a multi_solution model answer
+
+    Args:
+        model_answer: Model's prediction answer (string)
+            Expected format: "[7, 8]" or "7, 8"; for free-form text, the last
+            two integers in the reply are taken as the scores
+
+    Returns:
+        list: Two scores [score1, score2]
+    """
+    numbers = re.findall(r'\d+', str(model_answer))
+    result = [int(num) for num in numbers]
+
+    # Return the last two numbers (most likely the actual scores)
+    if len(result) >= 2:
+        return result[-2:]
+    else:
+        # If fewer than 2 numbers are found, pad with 0
+        return result + [0] * (2 - len(result))
+
+
+def get_F1Score(gathered_model_answer, gathered_task_gt):
+    """
+    Calculate per-class and weighted F1 scores
+
+    Args:
+        gathered_model_answer: List of all model answers
+        gathered_task_gt: List of all ground-truth labels
+
+    Returns:
+        tuple: (F1_pos, F1_neg, F1_w) - positive class F1, negative class F1, weighted F1
+    """
+    model_answer = np.array(gathered_model_answer)
+    task_gt = np.array(gathered_task_gt)
+
+    pos_count = np.sum(task_gt == 1)
+    neg_count = np.sum(task_gt == 0)
+
+    F1_pos = f1_score(task_gt, model_answer, pos_label=1, zero_division=0)
+    F1_neg = f1_score(task_gt, model_answer, pos_label=0, zero_division=0)
+
+    # Inverse-frequency weighting: each class is weighted by the share of the other
+    # class, so the rarer class contributes more to the combined score
+    w_pos = neg_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0
+    w_neg = pos_count / (pos_count + neg_count) if (pos_count + neg_count) > 0 else 0
+
+    F1_w = w_neg * F1_neg + w_pos * F1_pos
+
+    return F1_pos, F1_neg, F1_w
+
+
+class VLRMBench(ImageBaseDataset):
+    """
+    VLRMBench Dataset - Vision-Language Reward Model Benchmark
+
+    A comprehensive benchmark for evaluating reward models on visual reasoning processes, including:
+    - step_correctness: Step correctness detection
+    - redundant_det: Redundancy detection
+    - most_confidence: Highest confidence judgment
+    - attribute_hallucination: Attribute hallucination detection
+    - existence_hallucination: Existence hallucination detection
+    - detail_error: Detail error detection
+    - image_ref_error: Image reference error detection
+    - location_error: Location error detection
+    - multi_solution: Position bias resistance evaluation
+    - foresight: Reasoning foresight capability evaluation
+
+    Note: Currently only supports Outcome-based tasks and Step-based tasks.
+    Criticism-based tasks are not supported in this implementation.
+    """
+
+    TYPE = 'VQA'
+    DATASET_URL = {
+        'VLRMBench': 'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench.tsv',
+        'VLRMBench_MultiSolution': (
+            'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_MultiSolution.tsv'
+        ),
+        'VLRMBench_Foresight': (
+            'https://huggingface.co/datasets/Winston-Yuan/VLRMBench/resolve/main/VLRMBench_Foresight.tsv'
+        )
+    }
+    DATASET_MD5 = {
+        'VLRMBench': 'f1dedeac74fc1112545390d6e2ecf4a2',
+        'VLRMBench_MultiSolution': 'e8c15ab7c24568ba4d72375530389387',
+        'VLRMBench_Foresight': '1e22f1b94afbd6f4f3a4028c91749311'
+    }
+
+    def __init__(self, **kwargs):
+        """
+        Initialize the VLRMBench dataset with a warning about supported task types.
+        """
+        import warnings
+        warnings.warn(
+            "VLRMBench currently only supports Outcome-based tasks and Step-based tasks. "
+            "Criticism-based tasks are not supported in this implementation.",
+            UserWarning,
+            stacklevel=2
+        )
+        super().__init__(**kwargs)
+
+    def build_prompt(self, line):
+        """
+        Build prompt information
+
+        Args:
+            line: Data row, can be an int index or a pd.Series
+
+        Returns:
+            list: Multimodal message list, e.g. [dict(type='image', value=path), dict(type='text', value=text)]
+        """
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        # Create a copy of line to avoid SettingWithCopyWarning
+        line = line.copy()
+
+        # Use the parent class method to save the image (decode from base64 and save locally)
+        tgt_path = self.dump_image(line)
+
+        # Build messages
+        msgs = []
+        if isinstance(tgt_path, list):
+            msgs.extend([dict(type='image', value=p) for p in tgt_path])
+        else:
+            msgs = [dict(type='image', value=tgt_path)]
+
+        # Add question text
+        question = line.get('question', '')
+        if question:
+            msgs.append(dict(type='text', value=question))
+
+        return msgs
+
+    @classmethod
+    def evaluate_multi_solution(cls, data):
+        """
+        Evaluate multi_solution type data (position bias resistance)
+
+        Args:
+            data: DataFrame containing multi_solution predictions
+
+        Returns:
+            dict: Evaluation results with accuracy metric
+        """
+        acc_sample = 0
+        overall_sample = 0
+        skipped = 0
+
+        # Group by index pairs (front: even, back: odd)
+        indices = sorted(data['index'].unique())
+
+        for i in range(0, len(indices), 2):
+            if i + 1 >= len(indices):
+                skipped += 1
+                continue
+
+            front_idx = indices[i]
+            back_idx = indices[i + 1]
+
+            # Get front and back rows
+            front_rows = data[data['index'] == front_idx]
+            back_rows = data[data['index'] == back_idx]
+
+            if len(front_rows) == 0 or len(back_rows) == 0:
+                skipped += 1
+                continue
+
+            front_row = front_rows.iloc[0]
+            back_row = back_rows.iloc[0]
+
+            # Verify the order field if it exists
+            if 'order' in data.columns:
+                if front_row.get('order') != 'front' or back_row.get('order') != 'back':
+                    print(f"Warning: Order mismatch at index {front_idx}, {back_idx}")
+                    skipped += 1
+                    continue
+
+            try:
+                # Parse model predictions
+                front_scores = format_ms_model_answer(front_row.get('prediction', ''))
+                back_scores = format_ms_model_answer(back_row.get('prediction', ''))
+
+                # Apply evaluation formula: front[0] + back[1] vs front[1] + back[0]
+                # This checks whether the model consistently prefers the better response regardless of position
+                if front_scores[0] + back_scores[1] > front_scores[1] + back_scores[0]:
+                    acc_sample += 1
+
+                overall_sample += 1
+            except Exception as e:
+                print(f"Failed to process multi_solution pair ({front_idx}, {back_idx}): {e}")
+                skipped += 1
+
+        results = {
+            'multi_solution_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0,
+            'multi_solution_count': overall_sample,
+            'multi_solution_skipped': skipped
+        }
+
+        return results
+
+    @classmethod
+    def evaluate_foresight(cls, data):
+        """
+        Evaluate foresight type data (reasoning foresight capability)
+
+        Args:
+            data: DataFrame containing foresight predictions
+
+        Returns:
+            dict: Evaluation results with accuracy metric
+        """
+        acc_sample = 0
+        overall_sample = 0
+        skipped = 0
+
+        for idx in range(len(data)):
+            item = data.iloc[idx]
+
+            try:
+                task_gt = item['task_gt']  # True/False
+                model_answer = item.get('prediction', '')
+
+                # Keyword matching logic (consistent with get_fores_eval_res.py)
+                if task_gt is True:
+                    if re.search(r'\b(yes|true)\b', model_answer, re.IGNORECASE):
+                        acc_sample += 1
+                elif task_gt is False:
+                    if re.search(r'\b(no|false)\b', model_answer, re.IGNORECASE):
+                        acc_sample += 1
+
+                overall_sample += 1
+            except Exception as e:
+                print(f"Failed to process foresight sample (idx={idx}): {e}")
+                skipped += 1
+
+        results = {
+            'foresight_accuracy': acc_sample / overall_sample if overall_sample > 0 else 0.0,
+            'foresight_count': overall_sample,
+            'foresight_skipped': skipped
+        }
+
+        return results
+
+    @classmethod
+    def evaluate(cls, eval_file, **judge_kwargs):
+        """
+        Evaluate model prediction results.
+        Automatically detects and handles step-based, multi_solution, and foresight data.
+
+        Args:
+            eval_file: Path to the model prediction results file
+            **judge_kwargs: Other evaluation parameters
+
+        Returns:
+            pd.DataFrame: Evaluation results, including F1 scores and/or accuracy
+        """
+        # Load prediction data
+        data = load(eval_file)
+
+        # Ensure necessary fields exist
+        assert 'prediction' in data.columns, "Evaluation file missing 'prediction' field"
+        assert 'category' in data.columns, "Evaluation file missing 'category' field"
+
+        # Detect data types
+        categories = data['category'].unique()
+        has_multi_solution = 'multi_solution' in categories
+        has_foresight = 'foresight' in categories
+        has_step_based = any(c not in ['multi_solution', 'foresight'] for c in categories)
+
+        results = {}
+
+        # Process step-based categories
+        if has_step_based:
+            # Filter step-based data
+            step_data = data[~data['category'].isin(['multi_solution', 'foresight'])]
+
+            # Ensure the answer field exists for step-based data
+            if 'answer' not in step_data.columns:
+                print("Warning: Step-based data missing 'answer' field, skipping step-based evaluation")
+            else:
+                # Collect model answers and ground truth by category
+                category_model_answers = defaultdict(list)
+                category_task_gts = defaultdict(list)
+                category_total = defaultdict(int)
+
+                for idx in range(len(step_data)):
+                    item = step_data.iloc[idx]
+                    category = item['category']
+
+                    try:
+                        # Parse task_gt (answer field)
+                        task_gt = item['answer']
+                        if isinstance(task_gt, str):
+                            # Try to parse the string as a list
+                            task_gt = literal_eval(task_gt)
+
+                        # Get the model answer (prediction field)
+                        model_answer = item.get('prediction', '')
+
+                        # Format the model answer using format_model_answer_tolist
+                        formatted_model_answer = format_model_answer_tolist(model_answer, task_gt)
+
+                        # Collect answers for each category
+                        category_task_gts[category].extend(task_gt)
+                        category_model_answers[category].extend(formatted_model_answer)
+                        category_total[category] += 1
+                    except Exception as e:
+                        # If parsing fails, log and skip the sample
+                        print(f"Failed to process sample (idx={idx}, category={category}): {e}")
+                        continue
+
+                # Calculate F1 scores for each category
+                for category in category_task_gts:
+                    gathered_task_gt = category_task_gts[category]
+                    gathered_model_answer = category_model_answers[category]
+
+                    if len(gathered_task_gt) > 0:
+                        F1_pos, F1_neg, F1_w = get_F1Score(gathered_model_answer, gathered_task_gt)
+
+                        results[f'{category}_F1_pos'] = F1_pos
+                        results[f'{category}_F1_neg'] = F1_neg
+                        results[f'{category}_F1_weighted'] = F1_w
+                        results[f'{category}_count'] = category_total[category]
+                    else:
+                        results[f'{category}_F1_pos'] = 0.0
+                        results[f'{category}_F1_neg'] = 0.0
+                        results[f'{category}_F1_weighted'] = 0.0
+                        results[f'{category}_count'] = 0
+
+                # Calculate the overall F1 score (all step-based categories combined)
+                all_task_gts = []
+                all_model_answers = []
+                for category in category_task_gts:
+                    all_task_gts.extend(category_task_gts[category])
+                    all_model_answers.extend(category_model_answers[category])
+
+                if len(all_task_gts) > 0:
+                    F1_pos_overall, F1_neg_overall, F1_w_overall = get_F1Score(all_model_answers, all_task_gts)
+                    results['Overall_F1_pos'] = F1_pos_overall
+                    results['Overall_F1_neg'] = F1_neg_overall
+                    results['Overall_F1_weighted'] = F1_w_overall
+                    results['Overall_count'] = sum(category_total.values())
+                else:
+                    results['Overall_F1_pos'] = 0.0
+                    results['Overall_F1_neg'] = 0.0
+                    results['Overall_F1_weighted'] = 0.0
+                    results['Overall_count'] = 0
+
+        # Process the multi_solution category
+        if has_multi_solution:
+            ms_data = data[data['category'] == 'multi_solution']
+            ms_results = cls.evaluate_multi_solution(ms_data)
+            results.update(ms_results)
+
+        # Process the foresight category
+        if has_foresight:
+            foresight_data = data[data['category'] == 'foresight']
+            foresight_results = cls.evaluate_foresight(foresight_data)
+            results.update(foresight_results)
+
+        # Convert to DataFrame format
+        results_df = pd.DataFrame([results])
+
+        # Save results
+        score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
+        dump(results_df, score_file)
+
+        return results_df
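
A quick sanity-check sketch for the scoring helpers introduced above. It is not part of the patch itself: it assumes the patch has been applied so the helpers import from vlmeval.dataset.vlrmbench, and the toy inputs and expected outputs are illustrative only.

# Sanity-check sketch (not part of the patch); toy values are illustrative only.
from vlmeval.dataset.vlrmbench import (
    format_model_answer_tolist,
    format_ms_model_answer,
    get_F1Score,
)

# Step-based parsing: every integer in the reply is read, mapped to 0/1, then the
# list is truncated or zero-padded to the ground-truth length.
gt = [1, 0, 1, 1]
print(format_model_answer_tolist("[1, 0, 1]", gt))   # -> [1, 0, 1, 0] (zero-padded)

# multi_solution parsing: the last two integers in the reply are taken as the scores,
# so prompts should elicit replies that contain only the two scores, e.g. "[7, 8]".
print(format_ms_model_answer("[7, 8]"))              # -> [7, 8]

# Weighted F1: per-class F1 combined with inverse-frequency weights.
gts = [1, 1, 1, 0, 1, 0]
preds = [1, 0, 1, 0, 1, 1]
f1_pos, f1_neg, f1_w = get_F1Score(preds, gts)
print(f1_pos, f1_neg, f1_w)                          # -> 0.75 0.5 ~0.583

With the registration in vlmeval/dataset/__init__.py, the three splits should also be runnable through VLMEvalKit's standard entry point, e.g. python run.py --data VLRMBench VLRMBench_MultiSolution VLRMBench_Foresight --model <model_name>, though the exact invocation depends on the local setup.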