diff --git a/run.py b/run.py index ae6c4c4f0..627f96d8c 100644 --- a/run.py +++ b/run.py @@ -271,7 +271,8 @@ def main(): dist.barrier() try: - result_file_base = f'{model_name}_{dataset_name}.xlsx' + pred_format = get_pred_file_format() + result_file_base = f'{model_name}_{dataset_name}.{pred_format}' if use_config: if WORLD_SIZE > 1: @@ -299,9 +300,6 @@ def main(): continue # Handling Multi-Turn Dataset - if dataset.TYPE == 'MT': - result_file_base = result_file_base.replace('.xlsx', '.tsv') - result_file = osp.join(pred_root, result_file_base) # Reuse the previous prediction file if exists if RANK == 0 and len(prev_pred_roots): diff --git a/scripts/apires_scan.py b/scripts/apires_scan.py index c6036625f..890aea3da 100644 --- a/scripts/apires_scan.py +++ b/scripts/apires_scan.py @@ -10,7 +10,9 @@ model_name = root.split('/')[-1] for d in SUPPORTED_DATASETS: - fname = f'{model_name}_{d}.xlsx' + from vlmeval.smp import get_pred_file_format + pred_format = get_pred_file_format() + fname = f'{model_name}_{d}.{pred_format}' pth = osp.join(root, fname) if osp.exists(pth): data = load(pth) diff --git a/scripts/auto_run.py b/scripts/auto_run.py index f3cd1bbf3..381c3432f 100644 --- a/scripts/auto_run.py +++ b/scripts/auto_run.py @@ -26,7 +26,9 @@ def is_large(x): models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] for m in models: - unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] + from vlmeval.smp import get_pred_file_format + pred_format = get_pred_file_format() + unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')] if len(unknown_datasets) == 0: continue dataset_str = ' '.join(unknown_datasets) diff --git a/vlmeval/dataset/CGAVCounting/cg_av_counting.py b/vlmeval/dataset/CGAVCounting/cg_av_counting.py index 59445bb8c..8626c7fd4 100644 --- a/vlmeval/dataset/CGAVCounting/cg_av_counting.py +++ b/vlmeval/dataset/CGAVCounting/cg_av_counting.py @@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1): def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \ + 'data file should be a supported format (xlsx/json/tsv) file' - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') data = load(eval_file) diff --git a/vlmeval/dataset/EgoExoBench/egoexobench.py b/vlmeval/dataset/EgoExoBench/egoexobench.py index a49c20f02..9400966c0 100644 --- a/vlmeval/dataset/EgoExoBench/egoexobench.py +++ b/vlmeval/dataset/EgoExoBench/egoexobench.py @@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm): def evaluate(self, eval_file, **judge_kwargs): from .utils import get_dimension_rating, extract_characters_regex, extract_option - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \ + 'data file should be a supported format (xlsx/json/tsv) file' - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = 
get_intermediate_file_path(eval_file, '_score', 'csv') if not osp.exists(score_file): model = judge_kwargs.get('model', 'exact_matching') diff --git a/vlmeval/dataset/GUI/screenspot.py b/vlmeval/dataset/GUI/screenspot.py index ac2cbe3e2..842d61ad0 100644 --- a/vlmeval/dataset/GUI/screenspot.py +++ b/vlmeval/dataset/GUI/screenspot.py @@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs): results_dict[key] = str(0) else: results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key])) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(results_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) @@ -437,7 +437,7 @@ def make_safe(value): sub_stats = itertools.chain(*sub_stats) final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100 - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) diff --git a/vlmeval/dataset/GUI/screenspot_pro.py b/vlmeval/dataset/GUI/screenspot_pro.py index 26fde4114..c926a29c2 100644 --- a/vlmeval/dataset/GUI/screenspot_pro.py +++ b/vlmeval/dataset/GUI/screenspot_pro.py @@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs): results_dict[key] = str(0) else: results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key])) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(results_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) @@ -422,7 +422,7 @@ def make_safe(value): sub_stats = itertools.chain(*sub_stats) final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100 - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) diff --git a/vlmeval/dataset/OmniDocBench/omnidocbench.py b/vlmeval/dataset/OmniDocBench/omnidocbench.py index 1d688404d..28aca5a09 100644 --- a/vlmeval/dataset/OmniDocBench/omnidocbench.py +++ b/vlmeval/dataset/OmniDocBench/omnidocbench.py @@ -4,10 +4,12 @@ import pandas as pd import tempfile import base64 +import numpy as np from tqdm import tqdm import torch.distributed as dist from ..image_base import ImageBaseDataset from ...smp import * +from .utils import get_intermediate_file_path, load, dump class OmniDocBench(ImageBaseDataset): @@ -75,9 +77,6 @@ def __init__(self, tsv_path, match_method:str='quick_match', filter_types:dict=None): - self.result_foler='../../../outputs/OmniDocBench' - if not os.path.exists(self.result_foler): - os.makedirs(self.result_foler) self.eval_file=eval_file self.match_method=match_method self.references=[] @@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m 'group':group_result, 'page':page_result } - if not os.path.exists('./output/OmniDocBench'): - os.makedirs('./output/OmniDocBench') if isinstance(cur_samples,list): saved_samples=cur_samples else: saved_samples=cur_samples.samples - with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f: - json.dump(saved_samples,f,indent=4,ensure_ascii=False) + # NOTE: The original code has a bug here, it will overwrite the result file in each 
iteration. + # I will fix it by adding element to the filename. + # NOTE: Fixed typo .josn -> .json + result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json') + dump(saved_samples, result_file) - with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) + metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json') + dump(result_all, metric_result_file) dict_list = [] save_dict={} @@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m dict_list.append(save_dict) df = pd.DataFrame(dict_list,index=['end2end',]).round(3) - with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) - df.to_csv(os.path.join(self.result_foler,'overall.csv')) - over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json') - print(f"The save path of overall.csv is :{over_all_path}") + e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json') + dump(result_all, e2e_eval_file) + + overall_file = get_intermediate_file_path(self.eval_file, '_overall') + dump(df, overall_file) + + print(f"The save path of End2End_Evaluation is: {e2e_eval_file}") + print(f"The save path of overall metrics is: {overall_file}") return df class table_evalutor(): def __init__(self,eval_file,tsv_path): - - self.result_foler='../../../outputs/OmniDocBench' - if not os.path.exists(self.result_foler): - os.makedirs(self.result_foler) + self.eval_file = eval_file gt_key='html' pred_key='pred' self.category_filter='table' @@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key): from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table samples=[] preds=[] - predictions=pd.read_excel(eval_file)['prediction'].tolist() - gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist() + predictions=load(eval_file)['prediction'].tolist() + gt_samples=load(gt_file)['answer'].tolist() load_success,load_fail=0,0 for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'): try: @@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'): 'page':page_result } - with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) + metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json') + dump(result_all, metric_result_file) dict_list=[] dict_list.append(result_all["group"]["TEDS"]) @@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'): selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line", "with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]] - selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv')) - table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv') - print(f'The save path of table_attribute.csv is :{table_attribute_path}') - selected_columns - - + table_attr_file = 
get_intermediate_file_path(self.eval_file, '_table_attribute') + dump(selected_columns, table_attr_file) + print(f'The save path of table_attribute is :{table_attr_file}') return selected_columns diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 579444829..855049d4a 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -151,7 +151,6 @@ def supported_datasets(cls): return list(cls.DATASET_SETS) def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] # First, split the eval_file by dataset data_all = load(eval_file) for dname in self.datasets: @@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs): if len(df_all): result = pd.concat(df_all) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result, score_file) return result else: - score_file = eval_file.replace(f'.{suffix}', '_score.json') + score_file = get_intermediate_file_path(eval_file, '_score', 'json') dump(dict_all, score_file) return dict_all diff --git a/vlmeval/dataset/cgbench.py b/vlmeval/dataset/cgbench.py index 8ca5b5f12..aada9da6a 100644 --- a/vlmeval/dataset/cgbench.py +++ b/vlmeval/dataset/cgbench.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from .utils.cgbench import * @@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=- def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') data = load(eval_file) @@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.cgbench import get_dimention_rating_open_ended, post_process_open - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") - step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") - step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl') + step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl') data = load(eval_file) @@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs): axis=1, ) - data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] - data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] - if judge_kwargs.get("model", None) != "gpt-4o-0806": judge_kwargs["model"] = "gpt-4o-0806" print("The judge model in cg-bench is gpt-4o-0806!") + data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] + data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] + model_step_1 = 
build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs) nproc = judge_kwargs.pop("nproc", 32) @@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=- def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') data = load(eval_file) @@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.cgbench import get_dimention_rating_open_ended, post_process_open - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") - step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") - step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl') + step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl') data = load(eval_file) diff --git a/vlmeval/dataset/chartmimic.py b/vlmeval/dataset/chartmimic.py index 8151f0af1..f970f17a5 100644 --- a/vlmeval/dataset/chartmimic.py +++ b/vlmeval/dataset/chartmimic.py @@ -570,19 +570,12 @@ def judge_one_item_success(item): infer_data_all = load(eval_file).to_dict(orient="records") - suffix = eval_file.split(".")[-1] print(f"judge_kwargs: {judge_kwargs}") infer_model = judge_kwargs["model"] - storage = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}.jsonl") - ) - score_file = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}_score.csv") - ) + storage = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}', 'jsonl')) + score_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_score', 'csv')) # use abs path because of using os.chdir() - tmp_file = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}_tmp.pkl") - ) + tmp_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_tmp', 'pkl')) # actually the --api-nproc nproc = judge_kwargs.pop("nproc", 8) logger.info(f"nproc: {nproc}") diff --git a/vlmeval/dataset/charxiv.py b/vlmeval/dataset/charxiv.py index 0427632ba..3a3c01e13 100644 --- a/vlmeval/dataset/charxiv.py +++ b/vlmeval/dataset/charxiv.py @@ -6,6 +6,7 @@ from vlmeval.dataset.image_base import ImageBaseDataset from vlmeval.smp import misc, file +from vlmeval.smp.file import get_intermediate_file_path from vlmeval import utils from vlmeval.dataset.utils import build_judge @@ -203,10 +204,9 @@ def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame: judge_model_name = judge_model.model # Define file paths - suffix = eval_file.split(".")[-1] - result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.xlsx") - temp_result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.pkl") - score_file = result_file.replace(".xlsx", "_acc.csv") + result_file = 
get_intermediate_file_path(eval_file, f"_{judge_model_name}") + temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl") + score_file = get_intermediate_file_path(result_file, "_acc", "csv") # Return existing results if available if os.path.exists(result_file): diff --git a/vlmeval/dataset/cmmmu.py b/vlmeval/dataset/cmmmu.py index 12c583f29..d96a241e6 100644 --- a/vlmeval/dataset/cmmmu.py +++ b/vlmeval/dataset/cmmmu.py @@ -5,6 +5,7 @@ import re import tempfile from ..smp import * +from ..smp.file import get_intermediate_file_path def get_multi_choice_prediction(response, all_choices, index2ans): @@ -223,8 +224,7 @@ def dump_image(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc', 'csv') if not osp.exists(result_file): data = load(eval_file) diff --git a/vlmeval/dataset/creation.py b/vlmeval/dataset/creation.py index 4e37102fe..38a5d3d51 100644 --- a/vlmeval/dataset/creation.py +++ b/vlmeval/dataset/creation.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from ..smp import * +from ..smp.file import get_intermediate_file_path from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich import re @@ -662,20 +663,18 @@ def evaluate(self, eval_file, **judge_kwargs): tgt = load(eval_file) tgt['reference_answer_by_gpt4o'] = src['prediction'] tgt['prediction'] = src['reference_answer_by_gpt4o'] - tgt_file_name = eval_file.replace('.xlsx', '_rev.xlsx') + tgt_file_name = get_intermediate_file_path(eval_file, '_rev') dump(tgt, tgt_file_name) judge_kwargs['dual_eval'] = False rating_rev = self.evaluate(tgt_file_name, **judge_kwargs) judge_kwargs.pop('dual_eval', None) - suffix = '.' + eval_file.split('.')[-1] - - score_file = eval_file.replace(suffix, '_score.csv') - tgt_file = eval_file.replace(suffix, '_rating.json') + score_file = get_intermediate_file_path(eval_file, '_score') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') model = judge_kwargs.pop('model', 'gpt-4o-0806') model_name = model.split('/')[-1] if '/' in model else model - tmp_file = eval_file.replace(suffix, f'_{model_name}.pkl') + tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) diff --git a/vlmeval/dataset/dude.py b/vlmeval/dataset/dude.py index c520c7d28..e024d9821 100644 --- a/vlmeval/dataset/dude.py +++ b/vlmeval/dataset/dude.py @@ -5,6 +5,7 @@ from .image_base import ImageBaseDataset from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute from ..smp import * +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -165,9 +166,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. 
') @@ -203,7 +203,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = DUDE_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/dynamath.py b/vlmeval/dataset/dynamath.py index a463276d7..e66797ac1 100644 --- a/vlmeval/dataset/dynamath.py +++ b/vlmeval/dataset/dynamath.py @@ -12,6 +12,7 @@ from .utils import build_judge from ..utils import track_progress_rich from ..smp import load, dump, d2df, toliststr +from ..smp.file import get_intermediate_file_path def preprocess(str1): @@ -170,11 +171,10 @@ def evaluate(self, eval_file, **judge_kwargs): judge_name = judge_kwargs.pop('model', 'gpt-4o-mini') model = build_judge(model=judge_name, **judge_kwargs) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 - score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv') # noqa: F841 - tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + storage = get_intermediate_file_path(eval_file, f'_{judge_name}') + score_file = get_intermediate_file_path(eval_file, f'_{judge_name}_score', 'csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl') nproc = judge_kwargs.pop('nproc', 6) # noqa: F841 res = load(tmp_file) if os.path.exists(tmp_file) else {} diff --git a/vlmeval/dataset/gobench.py b/vlmeval/dataset/gobench.py index 3e9990c13..7667b934a 100644 --- a/vlmeval/dataset/gobench.py +++ b/vlmeval/dataset/gobench.py @@ -152,7 +152,7 @@ def evaluate(self, eval_file, **judge_kwargs): 'Instruction_Consistency_Score': [avg_scores.get('consistency', 0) * 100] }) - score_file = eval_file.replace('.xlsx', '_score.xlsx') + score_file = get_intermediate_file_path(eval_file, '_score') dump(final_df, score_file) print(f"Detailed scores including failed attempts saved to {score_file}") diff --git a/vlmeval/dataset/image_caption.py b/vlmeval/dataset/image_caption.py index 23282805c..6a9d806f5 100644 --- a/vlmeval/dataset/image_caption.py +++ b/vlmeval/dataset/image_caption.py @@ -70,6 +70,6 @@ def evaluate(self, eval_file, **kwargs): scorer = COCO_Caption_Scorer(ref, gt) coco_caption_score_dict = scorer.compute_scores() - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(coco_caption_score_dict, score_pth) return coco_caption_score_dict diff --git a/vlmeval/dataset/image_ccocr.py b/vlmeval/dataset/image_ccocr.py index b1286daba..e70403d64 100644 --- a/vlmeval/dataset/image_ccocr.py +++ b/vlmeval/dataset/image_ccocr.py @@ -9,6 +9,7 @@ from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path # should be the same as FAIL_MSG definded in vlmeval/inference.py FAIL_MSG = 'Failed to obtain answer via API.' 
@@ -230,13 +231,12 @@ def evaluate(self, eval_file, **judge_kwargs): print(f"Failed to evaluate {sub_dataset_id}") # Save comprehensive results - base_name = os.path.splitext(os.path.abspath(eval_file))[0] + result_file = get_intermediate_file_path(eval_file, '_comprehensive_eval', 'json') comprehensive_result = { "meta": {"total_datasets": len(all_results), "datasets": list(all_results.keys())}, "results": all_results, "summaries": all_summaries } - result_file = base_name + "_comprehensive_eval.json" dump(comprehensive_result, result_file) print(f"Comprehensive results saved to: {result_file}") @@ -298,5 +298,6 @@ def evaluate(self, eval_file, **judge_kwargs): print(f" {k.upper():<20}: {v:.4f}") print("="*80) df = d2df(res) - dump(df, base_name + '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + dump(df, score_file) return res diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index b0892ad90..44ef1e9c3 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -258,7 +258,6 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, eval_file) circular = True - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -276,7 +275,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -299,7 +298,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # load split - eval_record = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}') + eval_record = get_intermediate_file_path(eval_file, f'_{name_str}_result') dump(data, eval_record) data = load(eval_record) @@ -311,7 +310,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): else: acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) # The piece of code is for internal use, to check vanilla acc (circ0 & all) for circular datasets @@ -327,16 +326,16 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): else: offset = 1e6 circ0 = data[data['index'] <= offset] - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_result', 'pkl') data0 = mcq_vanilla_eval(model, circ0, meta, nproc, result_file, self.dataset_name) - dump(data0, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}')) + dump(data0, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result')) acc_map['vanilla_0'] = report_acc(data) # Vanilla ALL Acc data = load(eval_file) dataall = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) - dump(dataall, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}')) - data = 
load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}')) + dump(dataall, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result')) acc_map['vanilla_all'] = report_acc(data) # Merge & Print the Evaluation Results for k, v in acc_map.items(): @@ -350,7 +349,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): score_all = [acc_map['vanilla_0'], acc_map['vanilla_all'], acc_map['circular']] score_all = pd.concat(score_all) print(score_all) - score_file = eval_file.replace(f'.{suffix}', '_acc_all.csv') + score_file = get_intermediate_file_path(eval_file, '_acc_all', 'csv') dump(score_all, score_file) if dataset == 'AesBench_VAL': @@ -382,7 +381,6 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): if circular: raise ValueError("circular is not supported for verifier evaluation") - suffix = eval_file.split('.')[-1] data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] @@ -418,7 +416,7 @@ data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def report_acc_verifier(result_file): @@ -462,7 +460,7 @@ res_df = pd.DataFrame(res) return res_df acc = report_acc_verifier(detailed_result_file) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc @@ -615,11 +613,11 @@ def evaluate(self, eval_file, **judge_kwargs): if 'COT' in self.dataset_name: data = load(eval_file) data['prediction'] = [self.cot_postproc(x) for x in data['prediction']] - tgt = eval_file.replace('.xlsx', '_cotpost.xlsx') + tgt = get_intermediate_file_path(eval_file, '_cotpost') dump(data, tgt) res = super().evaluate(tgt, **judge_kwargs) - acc_org = eval_file.replace('.xlsx', '_acc.csv') - acc_now = eval_file.replace('.xlsx', '_cotpost_acc.csv') + acc_org = get_intermediate_file_path(eval_file, '_acc', 'csv') + acc_now = get_intermediate_file_path(eval_file, '_cotpost_acc', 'csv') shutil.copy(acc_now, acc_org) return res else: @@ -1017,11 +1015,11 @@ def build_prompt(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import extract_characters_regex, get_dimension_rating - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 FAIL_MSG = 'Failed to obtain answer via API.' 
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): @@ -1036,6 +1034,17 @@ def evaluate(self, eval_file, **judge_kwargs): ans = data.loc[data['index'] == idx, 'answer'].values[0] pred = data.loc[data['index'] == idx, 'prediction'].values[0] + match_cot = re.search(r"<think>(.*?)</think>", pred, re.DOTALL) + cot = match_cot.group(1).strip() if match_cot else pred + + target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0]) + iou = self.evaluate_box_iou(cot, target_instances) + + data.loc[data['index'] == idx, 'iou'] = iou + + match_pred = re.search(r"<answer>(.*?)</answer>", pred, re.DOTALL) + pred = match_pred.group(1).strip().upper() if match_pred else pred + extract_pred = extract_characters_regex(pred) if extract_pred == '': cnt_rejected += 1 @@ -1055,6 +1064,86 @@ def evaluate(self, eval_file, **judge_kwargs): dump(rating, tgt_file) return rating + def evaluate_box_iou(predict_str: str, target_instances: list) -> float: + pattern = r"<box>(.*?)</box>" + matches = re.findall(pattern, predict_str, re.DOTALL) + + all_boxes = [] + + for match in matches: + box = match.strip() + + coord_pattern = r'\[(\d+),(\d+),(\d+),(\d+)\]' + coord_match = re.match(coord_pattern, box) + + if coord_match: + x1, y1, x2, y2 = map(int, coord_match.groups()) + + if x1 < x2 and y1 < y2: + # all_boxes.append([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1]) + all_boxes.append([x1, y1, x2, y2]) + + if len(all_boxes) == 0: + return 0 + + target_boxes = target_instances + if len(target_boxes) == 0: + return len(all_boxes) > 0 + + def calculate_average_iou(pred_boxes, target_boxes): + """ + For each target box, take the predicted box with the highest IoU and average these best-match IoUs. + + Args: + pred_boxes (List[List[float]]): predicted boxes, each given as [x1, y1, x2, y2] + target_boxes (List[List[float]]): target boxes, each given as [x1, y1, x2, y2] + + Returns: + float: average IoU of the matched boxes + """ + def compute_iou(box1, box2): + """Compute the IoU between two boxes""" + x1_min, y1_min, x1_max, y1_max = box1 + x2_min, y2_min, x2_max, y2_max = box2 + + inter_x_min = max(x1_min, x2_min) + inter_y_min = max(y1_min, y2_min) + inter_x_max = min(x1_max, x2_max) + inter_y_max = min(y1_max, y2_max) + + inter_width = max(0, inter_x_max - inter_x_min) + inter_height = max(0, inter_y_max - inter_y_min) + inter_area = inter_width * inter_height + + area1 = (x1_max - x1_min) * (y1_max - y1_min) + area2 = (x2_max - x2_min) * (y2_max - y2_min) + + union_area = area1 + area2 - inter_area + + return inter_area / union_area if union_area > 0 else 0.0 + + pred_coords = pred_boxes + target_coords = target_boxes + + total_iou = 0.0 + num_targets = len(target_boxes) + + if num_targets == 0: + return 0.0 + + # For each target box, find the predicted box with the highest IoU + for t_coord in target_coords: + best_iou = 0.0 + for p_coord in pred_coords: + iou = compute_iou(t_coord, p_coord) + if iou > best_iou: + best_iou = iou + total_iou += best_iou + + return total_iou / num_targets + + return calculate_average_iou(all_boxes, target_boxes) + class CVBench(ImageMCQDataset): """CV-Bench, composed of two sub datasets: @@ -1101,7 +1190,6 @@ def evaluate(self, eval_file, **judge_kwargs): nproc = judge_kwargs.pop("nproc", 4) - suffix = eval_file.split(".")[-1] model_name = judge_kwargs.get("model", "extract_matching") if model_name == "exact_matching": @@ -1117,7 +1205,7 @@ def 
evaluate(self, eval_file, **judge_kwargs): ) model = None - result_file = eval_file.replace(f".{suffix}", f"_{model_name}_result.pkl") + result_file = get_intermediate_file_path(eval_file, f"_{model_name}_result", "pkl") data = load(eval_file) data = data.sort_values(by="index") @@ -1136,7 +1224,7 @@ def evaluate(self, eval_file, **judge_kwargs): k in meta_q_map ), f"eval_file should be the same as or a subset of dataset {self.dataset_name}" - score_file = eval_file.replace(f".{suffix}", "_acc.csv") + score_file = get_intermediate_file_path(eval_file, "_acc", "csv") if osp.exists(score_file): acc = load(score_file) @@ -1144,15 +1232,14 @@ def evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval( model, data, meta, nproc, result_file, self.dataset_name ) - dump(data, eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}")) - data = load(eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}")) + dump(data, get_intermediate_file_path(eval_file, f"_{model_name}_result")) + data = load(get_intermediate_file_path(eval_file, f"_{model_name}_result")) if all(data["split"] == "2D"): # 2D acc = self.report_accuracy(data) else: # 3D, use default evaluation strategy acc = report_acc(data) - score_file = eval_file.replace(f".{suffix}", "_acc.csv") dump(acc, score_file) return acc @@ -1198,7 +1285,6 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.hrbench import report_acc_hrbench nproc = judge_kwargs.pop('nproc', 4) - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'extract_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -1216,7 +1302,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1233,18 +1319,17 @@ def evaluate(self, eval_file, **judge_kwargs): f'eval_file should be the same as or a subset of dataset {self.dataset_name}' ) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') if osp.exists(score_file): acc = load(score_file) return acc data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) - dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + dump(data, get_intermediate_file_path(eval_file, f'_{name_str}_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_result')) acc = report_acc_hrbench(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') dump(acc, score_file) return acc @@ -1323,7 +1408,7 @@ def evaluate(self, eval_file, **judge_kwargs): scores = get_scores(results) print(scores) - score_file = 'NaturalBench_acc.csv' + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score']) dump(df, score_file) @@ -1401,13 +1486,12 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + storage = 
get_intermediate_file_path(eval_file, f'_{name_str}') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage) and model is not None: data = load(eval_file) - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1437,7 +1521,7 @@ def evaluate(self, eval_file, **judge_kwargs): four_dim_scores = wemath_accuracy(eval_file) combine_score = {**accuracy_scores, **four_dim_scores} combine_score = pd.DataFrame(combine_score) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(combine_score, score_pth) return combine_score @@ -1488,15 +1572,14 @@ def build_prompt(self, line): def evaluate(self, eval_file, **judge_kwargs): from .utils.vmcbench import get_mc_score, report_vmc_acc - suffix = eval_file.split('.')[-1] data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] data['hit'] = data.apply(get_mc_score, axis=1) - result_file = eval_file.replace(f'.{suffix}', f'_result.{suffix}') + result_file = get_intermediate_file_path(eval_file, '_result') dump(data, result_file) acc = report_vmc_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc @@ -1638,8 +1721,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + storage = get_intermediate_file_path(eval_file, f'_{name_str}') if osp.exists(storage): accuracy_scores = VisuLogic_acc(storage) @@ -1647,7 +1729,7 @@ def evaluate(self, eval_file, **judge_kwargs): accuracy_scores = VisuLogic_acc(eval_file) combine_score = {**accuracy_scores,} combine_score = pd.DataFrame(combine_score) - score_pth = storage.replace('.xlsx', '_acc.csv') + score_pth = get_intermediate_file_path(storage, '_acc', 'csv') dump(combine_score, score_pth) return combine_score @@ -1698,7 +1780,6 @@ def do_evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import report_acc, mcq_vanilla_eval nproc = judge_kwargs.pop('nproc', 4) - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125', 'gpt-4o-mini'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4', 'gpt-4o-mini': 'gpt4omini'} @@ -1716,7 +1797,7 @@ def do_evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1736,12 +1817,12 @@ def do_evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # Save evaluation results - judged_result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}') + judged_result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result') dump(data, judged_result_file) acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = 
get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc, judged_result_file @@ -1920,8 +2001,7 @@ def evaluate(self, eval_file, **judge_kwargs): result_df = pd.DataFrame(accuracy_dict) result_df['Overall macro'] = result_df.mean(axis=1) result_df['Overall micro'] = micro_metric['correct'] / micro_metric['total'] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result_df, score_file) return result_df @@ -2062,8 +2142,7 @@ def evaluate(self, eval_file, **judge_kwargs): result_df[f"Sphere macro: {sphere}"] = sum(accs) / len(accs) result_df["Overall macro"] = result_df.mean(axis=1) result_df["Overall micro"] = micro_metric["correct"] / micro_metric["total"] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result_df, score_file) return result_df @@ -2254,31 +2333,15 @@ def evaluate(self, eval_file, **judge_kwargs): ans = self.extract_content_in_braces(data_item["prediction"]) if ans == data_item["answers"]: task_stats[task]['correct'] += 1 - elif data_item["task"] == "Touching Circles": - if str.lower(data_item["answers"]) in str.lower(data_item["prediction"]): - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Counting Grid - Word Grids": + elif data_item["task"] == "Touchdown Reading": if self.compare_string_with_values(data_item["prediction"], data_item["answers"]): task_stats[task]['correct'] += 1 - elif data_item["task"] == "Counting Grid - Blank Grids": - if self.compare_string_with_values(data_item["prediction"], data_item["answers"]): - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Olympic Counting - Pentagons": - if data_item["answers"] in data_item["prediction"]: - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Olympic Counting - Circles": - if data_item["answers"] in data_item["prediction"]: - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Circled Letter": - ans = self.extract_content_in_braces(data_item["prediction"]) - if ans == data_item["answers"]: - task_stats[task]['correct'] += 1 - - accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in task_stats.items()} - result_df = pd.DataFrame(accuracy_dict) - result_df['overall'] = result_df.mean(axis=1) - return result_df + accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in sorted(task_stats.items())} + accuracy_df = pd.DataFrame(accuracy_dict) + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + dump(accuracy_df, score_file) + return accuracy_df class SCAM(ImageMCQDataset): @@ -2330,54 +2393,23 @@ class _3DSRBench(ImageMCQDataset): DATASET_MD5 = {'3DSRBench': '610516a0b4710595545b7613c60524e8'} def evaluate(self, eval_file, **judge_kwargs): - super().evaluate(eval_file, **judge_kwargs) from .utils.multiple_choice import report_acc - dname = osp.dirname(eval_file) - base = osp.basename(eval_file).split('.')[:-1] - base = '.'.join(base) - result_file = ls(dname, match=[base + '_', 'result.xlsx']) - assert len(result_file) == 1, result_file - result_file = result_file[0] - data = load(result_file) - - acc_map = {} - acc_map['vanilla'] = report_acc(data) - # Flip Acc - qid2key = {x: x.replace('-flip', '') for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in 
zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['flip_eval'] = report_acc(main) - # Circ Acc - qid2key = {x: x[:8] if '-flip' not in x else x[:13] for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['circ_eval'] = report_acc(main) - # Flip Circ Acc - qid2key = {x: x[:8] for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['flip_circ_eval'] = report_acc(main) - - metrics = [] - for k in acc_map: - acc_map[k].pop('split') - acc_map[k]['setting'] = [k] * len(acc_map[k]) - metrics.append(acc_map[k]) - res_all = pd.concat(metrics) - dump(res_all, eval_file.replace('.xlsx', '_acc_all.csv')) - return res_all + from .utils.sr3d import parse_3dsr_prediction, eval_3dsr + from ..smp import dump, load + from ..utils.dataset_util import TDBench_grounding_eval + from ..dataset import parse_img_path_list + from ..config import VLM_EVAL_WITH_SUBSET + data = load(eval_file) + # parse the model predictions + data = parse_img_path_list(data) + data = parse_3dsr_prediction(data) + # rotate the image and boxes + data['hit'] = eval_3dsr(data) + result_file = get_intermediate_file_path(eval_file, '_acc') + if VLM_EVAL_WITH_SUBSET: + data['subset'] = [x.split('|')[0] for x in data['index']] + dump(data, result_file) + return report_acc(data) class AffordanceDataset(ImageMCQDataset): @@ -2556,57 +2588,14 @@ def build_prompt(self, line): # It returns a dictionary @classmethod def evaluate(self, eval_file, **judge_kwargs): - import ast - from .utils.multiple_choice import extract_characters_regex - from .utils.treebench import get_dimension_rating - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' - FAIL_MSG = 'Failed to obtain answer via API.' 
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') - - if not osp.exists(score_file): - - res = {} if not osp.exists(tmp_file) else load(tmp_file) - res = {k: v for k, v in res.items() if FAIL_MSG not in v} - - data = load(eval_file) - cnt_rejected = 0 - data_un = data[~pd.isna(data['prediction'])] - - for idx in data['index']: - ans = data.loc[data['index'] == idx, 'answer'].values[0] - pred = data.loc[data['index'] == idx, 'prediction'].values[0] - - match_cot = re.search(r"(.*?)", pred, re.DOTALL) - cot = match_cot.group(1).strip() if match_cot else pred - - target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0]) - iou = self.evaluate_box_iou(cot, target_instances) - - data.loc[data['index'] == idx, 'iou'] = iou - - match_pred = re.search(r"(.*?)", pred, re.DOTALL) - pred = match_pred.group(1).strip().upper() if match_pred else pred - - extract_pred = extract_characters_regex(pred) - if extract_pred == '': - cnt_rejected += 1 - data.loc[data['index'] == idx, 'score'] = 0 - else: - data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans) - - print( - f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' - f'failed to obtain the score for another {cnt_rejected} questions. ' - f'Those questions will be counted as 0 score in ALL rating.' - ) - - dump(data, score_file) - - rating = get_dimension_rating(score_file) - dump(rating, tgt_file) - return rating + from .utils.treebench import get_acc + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + try: + res = get_acc(eval_file) + dump(res, score_file) + return res + except: + return 0 def evaluate_box_iou(predict_str: str, target_instances: list) -> float: pattern = r"(.*?)" diff --git a/vlmeval/dataset/image_mt.py b/vlmeval/dataset/image_mt.py index 07658948a..3cd72d726 100644 --- a/vlmeval/dataset/image_mt.py +++ b/vlmeval/dataset/image_mt.py @@ -1,6 +1,7 @@ from .image_base import ImageBaseDataset from .utils.judge_util import build_judge from ..smp import * +from ..smp.file import get_intermediate_file_path from ..utils import track_progress_rich @@ -86,11 +87,10 @@ def calculat_metric(self, ans): return pd.DataFrame([sp1, sp2]) def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) data = load(eval_file) diff --git a/vlmeval/dataset/image_shortqa.py b/vlmeval/dataset/image_shortqa.py index 3650730cb..0d60ded33 100644 --- a/vlmeval/dataset/image_shortqa.py +++ b/vlmeval/dataset/image_shortqa.py @@ -4,6 +4,7 @@ from .utils.multiple_choice import report_acc, eval_vanilla, eval_circular_group from .utils.shortqa import ShortQA_prompt from ..utils import track_progress_rich +from ..smp.file import get_intermediate_file_path def ShortQA_auxeval(model, line): @@ -89,8 +90,8 @@ def evaluate(self, eval_file, **judge_kwargs): data['prediction'] = [str(x) for x in data['prediction']] data['answer'] = [str(x) for x in data['answer']] - storage = eval_file.replace('.xlsx', '_judge.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = 
get_intermediate_file_path(eval_file, '_judge') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -137,7 +138,7 @@ def evaluate(self, eval_file, **judge_kwargs): data = load(storage) acc = report_acc(data) - score_file = eval_file.replace('.xlsx', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py index ad202d9e8..27800547c 100644 --- a/vlmeval/dataset/image_vqa.py +++ b/vlmeval/dataset/image_vqa.py @@ -9,6 +9,7 @@ from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from ..utils import track_progress_rich @@ -89,8 +90,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): data['eval_match'] = [r['match'] for r in res] data['eval_score'] = [np.mean(r['match']) for r in res] - suffix = eval_file.split('.')[-1] - detailed_result_file = eval_file.replace(f'.{suffix}', '_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_results') dump(data, detailed_result_file) hit = hit_calculate(res, dataset) @@ -118,8 +118,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -146,8 +145,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = scores data['verifier_match'] = [1.0 if score else 0.0 for score in scores] - suffix = eval_file.split('.')[-1] - detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def hit_calculate(result): @@ -177,8 +175,7 @@ def hit_calculate(result): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -194,8 +191,7 @@ class VizWiz(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.vqa_eval import hit_calculate, process_line - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') if not osp.exists(result_file): data = load(eval_file) @@ -217,7 +213,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(ret, result_file) - retz = pd.read_csv(result_file) + retz = load(result_file) return retz @@ -292,7 +288,7 @@ def evaluate(self, eval_file, **judge_kwargs): + final_score_dict['Handwritten Mathematical Expression Recognition']) final_score_dict['Final Score Norm'] = ( float(final_score_dict['Final Score']) / 10) - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -317,9 +313,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): from .utils.mathvista import MathVista_auxeval, MathVista_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', 
f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -357,7 +352,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, storage) score = MathVista_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -383,7 +378,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def MathVista_acc_verifier(result_file): @@ -422,7 +417,7 @@ def MathVista_acc_verifier(result_file): return res score = MathVista_acc_verifier(detailed_result_file) - score_pth = eval_file.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_pth) return score @@ -483,11 +478,10 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') - storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract', 'pkl') + storage_score = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score', 'pkl') nproc = judge_kwargs.pop('nproc', 4) # stage1: extract the answer if not osp.exists(storage_extract): @@ -517,8 +511,8 @@ def evaluate(self, eval_file, **judge_kwargs): ans = load(tmp_file_extract) for k, v in zip(indices, new_results): assert k in ans - assert ans[k]['log_extract'] == v['log_extract'] and ans[ - k]['extract'] == v['extract'] + assert ans[k]['log_extract'] == v['log_extract'] and ans[k][ + 'extract'] == v['extract'] data['extract'] = [ans[idx]['extract'] for idx in data['index']] data['log_extract'] = [ @@ -564,7 +558,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage_score) score = MathVerse_acc(storage_score) - score_pth = storage_score.replace('.xlsx', '.csv') + score_pth = get_intermediate_file_path(storage_score, '', 'csv') dump(score, score_pth) return score @@ -595,9 +589,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): model = judge_kwargs['model'] else: model = os.path.basename(os.environ.get('LOCAL_LLM')) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -635,7 +628,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, storage) score = MATH_V_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') 
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -662,11 +655,11 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) else: - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') if not osp.exists(detailed_result_file): dump(data, detailed_result_file) @@ -697,7 +690,7 @@ def MathVision_acc_verifier(result_file): return res score = MathVision_acc_verifier(detailed_result_file) - score_pth = eval_file.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_pth) return score @@ -797,9 +790,8 @@ def evaluate(self, eval_file, **judge_kwargs): print(f'Using local model as judge model for PHYSICS: {model}') else: model = judge_kwargs.setdefault('model', 'gpt-4o-mini') - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -839,7 +831,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = PHYSIC_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -962,12 +954,11 @@ def evaluate(self, eval_file, **judge_kwargs): if use_api_judger: from .utils.olympiadbench import Olympiad_auxeval_extract, Olympiad_auxeval_score model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') - result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp', 'pkl') + result_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_result_file = get_intermediate_file_path(eval_file, f'_{model}_score_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) # stage1: extract the answer if not osp.exists(storage_extract): @@ -1046,46 +1037,47 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.olympiadbench import MathJudger, extract_answer judger = MathJudger() - suffix = eval_file.split('.')[-1] - name_str1 = 'judge' - name_str2 = 'score' - result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx') - score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv') + name_str1 = 'judge' + name_str2 = 'score' + result_file = get_intermediate_file_path(eval_file, f'_{name_str1}_result') + score_file = get_intermediate_file_path(eval_file, f'_{name_str2}_result', 'csv') - if not osp.exists(result_file): - data = 
load(eval_file) - scorez = [] + if not osp.exists(result_file): + data = load(eval_file) + scorez = [] - for i in tqdm(data.iterrows()): - line = i[1] - model_answer = line['prediction'] - is_chinese = 'zh' in line['source'] - model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False) - answer_type = line['answer_type'] + for i in tqdm(data.iterrows()): + line = i[1] + model_answer = line['prediction'] + is_chinese = 'zh' in line['source'] + model_answer = extract_answer(is_chinese, + model_answer, + is_deepseek=False) + answer_type = line['answer_type'] - final_answer = line['final_answer'][2:-2] + final_answer = line['final_answer'][2:-2] - if str(answer_type) != 'nan' and 'Tuple' in answer_type: - judge_result = judger.judge(model_answer, final_answer) - else: - if str(line['error']) != 'nan': - if ',' in line['error']: - precisions = line['error'].split(',') - precisions = [ - float(p) if p else 1e-8 for p in precisions - ] - judge_result = judger.judge( - model_answer, final_answer, precisions) - else: - precision = float(line['error']) - judge_result = judger.judge( - model_answer, final_answer, precision) + if str(answer_type) != 'nan' and 'Tuple' in answer_type: + judge_result = judger.judge(model_answer, final_answer) + else: + if str(line['error']) != 'nan': + if ',' in line['error']: + precisions = line['error'].split(',') + precisions = [ + float(p) if p else 1e-8 for p in precisions + ] + judge_result = judger.judge( + model_answer, final_answer, precisions) else: - judge_result = judger.judge(model_answer, final_answer) - scorez.append(judge_result) + precision = float(line['error']) + judge_result = judger.judge( + model_answer, final_answer, precision) + else: + judge_result = judger.judge(model_answer, final_answer) + scorez.append(judge_result) - data['score'] = scorez - dump(data, result_file) + data['score'] = scorez + dump(data, result_file) judge_file = load(result_file) @@ -1153,9 +1145,9 @@ def evaluate(self, eval_file, **judge_kwargs): acc_dict['AVG'] = [acc] acc_pd = pd.DataFrame(acc_dict) - acc_pd.to_csv(score_file, index=False, encoding='gbk') + dump(acc_pd, score_file) - accdz = pd.read_csv(score_file) + accdz = load(score_file) return accdz @@ -1228,9 +1220,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.seephys import extract, eval_acc model = judge_kwargs.pop('model', 'deepseek') - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): data = load(eval_file) @@ -1268,7 +1259,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = eval_acc(storage) - score_pth = storage.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(storage, '_score', 'json') dump(score, score_pth) return score @@ -1312,9 +1303,8 @@ def evaluate(self, eval_file, **judge_kwargs): ) model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{name_str}') + tmp_file = get_intermediate_file_path(eval_file, f'_{name_str}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage) and model is not None: @@ -1354,7 +1344,7 @@ def 
evaluate(self, eval_file, **judge_kwargs): dump(data, storage) if osp.exists(storage): accuracy_scores = evaluate_logicvista(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(accuracy_scores, score_pth) return accuracy_scores @@ -1478,7 +1468,6 @@ class LLaVABench(ImageBaseDataset): } DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'} - # It returns a DataFrame @classmethod def evaluate(self, eval_file, **judge_kwargs): from .utils.llavabench import ( @@ -1487,9 +1476,8 @@ def evaluate(self, eval_file, **judge_kwargs): LLaVABench_score, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.' @@ -1534,9 +1522,8 @@ def evaluate(self, eval_file, **judge_kwargs): LLaVABench_score, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.' @@ -1583,9 +1570,8 @@ def evaluate(self, eval_file, **judge_kwargs): VGRPBench_get_system_prompt, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) @@ -1649,10 +1635,9 @@ class MMVet(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.mmvet import MMVet_auxeval, MMVet_acc - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): data = load(eval_file) @@ -1687,8 +1672,8 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score, score_fine = MMVet_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') - score_fine_pth = storage.replace('.xlsx', '_score_fine.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') + score_fine_pth = get_intermediate_file_path(storage, '_score_fine', 'csv') dump(score, score_pth) dump(score_fine, score_fine_pth) return score @@ -1727,8 +1712,7 @@ def evaluate(self, eval_file, **judge_kwargs): for category, scores in category_scores.items() } - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.json') + result_file = get_intermediate_file_path(eval_file, '_acc', 'json') dump(category_averages, result_file) return category_averages @@ -1908,6 +1892,8 @@ def evaluate(self, eval_file, **judge_kwargs): for task, metrics in eval_results.items() for metric, 
score in metrics.items() ]) + result_file = get_intermediate_file_path(eval_file, '_acc') + dump(ret_df, result_file) return ret_df # WildDoc adopts a custom prompt for each subset @@ -1979,8 +1965,7 @@ def evaluate(self, eval_file, **judge_kwargs): eval_result['average_scores'].append( split_eval_meta['average_scores']) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc', 'csv') eval_result = pd.DataFrame(eval_result) dump(eval_result, result_file) @@ -2089,7 +2074,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: final_score_dict[category] = None - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2255,9 +2240,8 @@ def evaluate(self, eval_file, **judge_kwargs): # extract using model model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -2353,7 +2337,7 @@ def evaluate(self, eval_file, **judge_kwargs): delta_1_point_5_per_question_type }) - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2477,7 +2461,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: final_score_dict[category] = None - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2562,12 +2546,9 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mmsci import (get_all_metrics_for_g_eval_score, get_all_metrics_for_reference_based_metrics, merge_rating, fact_score_generate) - refer_based_metrics_output_file = eval_file.replace( - '.xlsx', '_reference_based_metrics.xlsx') - g_eval_metrics_output_file = eval_file.replace('.xlsx', - '_g_eval_metrics.xlsx') - fact_score_metrics_output_file = eval_file.replace( - '.xlsx', '_fact_score.xlsx') + refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics') + g_eval_metrics_output_file = get_intermediate_file_path(eval_file, '_g_eval_metrics') + fact_score_metrics_output_file = get_intermediate_file_path(eval_file, '_fact_score') # calculate reference-based metrics if not osp.exists(refer_based_metrics_output_file): @@ -2592,8 +2573,7 @@ def evaluate(self, eval_file, **judge_kwargs): if isinstance(references[0], str): references = [[r] for r in references] - reference_based_metrics_file = eval_file.replace( - '.xlsx', '_reference_based_metrics.pkl') + reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl') existing_data = get_all_metrics_for_reference_based_metrics( references, candidates, image_id_list, reference_based_metrics_file) @@ -2643,8 +2623,7 @@ def evaluate(self, eval_file, **judge_kwargs): assert judge_model.working(), ( 'Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) - suffix = '.' 
+ eval_file.split('.')[-1] - tmp_file = eval_file.replace(suffix, f'_{model}_G_eval.pkl') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}_G_eval', 'pkl') tmp_result = get_all_metrics_for_g_eval_score( references, @@ -2666,7 +2645,7 @@ def evaluate(self, eval_file, **judge_kwargs): rating = merge_rating(refer_based_metrics_output_file, g_eval_metrics_output_file, fact_score_metrics_output_file) - dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx')) + dump(rating, get_intermediate_file_path(eval_file, '_final_rating')) return rating @@ -2681,7 +2660,7 @@ class BMMR(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.bmmr import get_acc_for_reference_based_metrics, merge_rating - refer_based_metrics_output_file = eval_file.replace('.xlsx', '_reference_based_metrics.xlsx') + refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics') if not osp.exists(refer_based_metrics_output_file): data = load(eval_file) old_candidates = {} @@ -2707,7 +2686,7 @@ def evaluate(self, eval_file, **judge_kwargs): if isinstance(references[0], str): references = [[r] for r in references] - reference_based_metrics_file = eval_file.replace('.xlsx', '_reference_based_metrics.pkl') + reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl') assert len(references) == len(candidates) == len(image_id_list) == len(task_type_list) existing_data = get_acc_for_reference_based_metrics( references, candidates, image_id_list, task_type_list, reference_based_metrics_file @@ -2720,7 +2699,7 @@ def evaluate(self, eval_file, **judge_kwargs): rating = merge_rating( refer_based_metrics_output_file, ) - dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx')) + dump(rating, get_intermediate_file_path(eval_file, '_final_rating')) return rating def build_prompt(self, line): @@ -2756,7 +2735,6 @@ class TDBenchGrounding(ImageVQADataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.tdbench import evaluate_bbox, extract_bbox_from_string, rotational_eval - suffix = eval_file.split('.')[-1] method = judge_kwargs.get('model', 'centroid') assert method in ['centroid', 'iou'], '--judge should be either centroid or iou' @@ -2786,16 +2764,16 @@ def evaluate(self, eval_file, **judge_kwargs): data['hit'] = scores data['category'] = 'visual_grounding' - result_file = eval_file.replace(f'.{suffix}', f'_{method}_result.xlsx') - data.to_excel(result_file, index=False) + result_file = get_intermediate_file_path(eval_file, f'_{method}_result') + dump(data, result_file) metric_name = 'Average Centroid Containment' if method == 'centroid' else 'Average IoU' summary_scores = {metric_name: avg_score, 'Total Samples': len(scores)} score_df = pd.DataFrame(list(summary_scores.items()), columns=['Metric', 'Score']) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') - score_df.to_csv(score_file, index=False) + score_file = get_intermediate_file_path(eval_file, '_acc') + dump(score_df, score_file) re_result = rotational_eval(result_file) if method == 'centroid' and re_result is not None and re_result is not False: file_addr = osp.abspath( @@ -2902,7 +2880,11 @@ def evaluate(self, eval_file, **judge_kwargs): if ans in pred: correct_count += 1 accuracy = correct_count / total_count if total_count > 0 else 0 - return {'accuracy': accuracy} + + result = {'accuracy': accuracy * 100} + result_file = get_intermediate_file_path(eval_file, '_acc') + dump(d2df(result), result_file) + return result class 
OCR_Reasoning(ImageBaseDataset): @@ -2919,9 +2901,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.ocr_reasoning import OcrR_auxeval, OcrR_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) nproc = 1 if not osp.exists(storage): @@ -2932,7 +2913,6 @@ def evaluate(self, eval_file, **judge_kwargs): lines = [data.iloc[i] for i in range(lt)] tups = [(model, line) for line in lines] indices = [line['index'] for line in lines] - ans = {} if osp.exists(tmp_file): ans = load(tmp_file) @@ -2961,7 +2941,7 @@ def evaluate(self, eval_file, **judge_kwargs): ] dump(data, storage) score = OcrR_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -3058,8 +3038,7 @@ def evaluate(self, eval_file, **judge_kwargs): # Open ended mode res = pool.map(partial(PhyX_process_line), lines) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_predict.xlsx') + result_file = get_intermediate_file_path(eval_file, '_predict') df = pd.DataFrame(res) df.to_excel(result_file, index=False) @@ -3077,8 +3056,7 @@ def evaluate(self, eval_file, **judge_kwargs): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -3086,9 +3064,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.phyx import PhyX_auxeval, PhyX_acc, PhyX_auxeval_MC model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -3141,7 +3118,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = PhyX_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -3232,9 +3209,9 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mme_reasoning import MMEReasoning_extract, MMEReasoning_openeval, MMEReasoning_acc, FAIL_MSG, mme_reasoning_eval_functions # noqa model = judge_kwargs.get('model', 'gpt-4o-mini') - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') nproc = judge_kwargs.pop('nproc', 4) # stage 1: extract answers using LLM @@ -3282,11 +3259,9 @@ def evaluate(self, eval_file, **judge_kwargs): data['log'] = log_list dump(data, storage_extract) - storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file_score = eval_file.replace(f'.{suffix}', 
f'_{model}_score.pkl') - + tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score_tmp', 'pkl') # stage 2: evaluate score - if not osp.exists(storage_score): + if not osp.exists(score_file): data = load(storage_extract) data = data.replace({float('nan'): None}) model = build_judge(max_tokens=1024, **judge_kwargs) @@ -3390,10 +3365,10 @@ def evaluate(self, eval_file, **judge_kwargs): data['score'] = [ans[idx]['score'] for idx in data['index']] data['log_score'] = [ans[idx]['log_score'] for idx in data['index']] - dump(data, storage_score) + dump(data, score_file) - score = MMEReasoning_acc(storage_score) - score_pth = storage_score.replace('.xlsx', '.csv') + score = MMEReasoning_acc(score_file) + score_pth = get_intermediate_file_path(score_file, '', 'csv') dump(score, score_pth) return score @@ -3454,14 +3429,12 @@ def report_acc_mmatch(scores, match_types_int): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs['model'] nproc = judge_kwargs.pop('nproc', 4) - - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') - acc_file = eval_file.replace('.xlsx', f'_{judge}_acc.xlsx') - + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') + acc_file = get_intermediate_file_path(eval_file, f'_{judge}_acc') judge_kwargs['temperature'] = 0.0 model = build_judge(**judge_kwargs) @@ -3571,7 +3544,7 @@ def evaluate(self, eval_file, **judge_kwargs): final_score_dict = {**en_scores, **cn_scores} final_score_dict["English Overall Score"] = score_en_overall final_score_dict["Chinese Overall Score"] = score_cn_overall - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -3605,10 +3578,8 @@ def evaluate(self, eval_file, **judge_kwargs): model = build_judge(**judge_kwargs) if not model.working(): raise RuntimeError("OPENAI API is not working properly. 
Please check your API key and configuration.") - - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model_name}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model_name}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model_name}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) data = load(eval_file) @@ -3652,6 +3623,6 @@ def evaluate(self, eval_file, **judge_kwargs): ret.round(2) - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret diff --git a/vlmeval/dataset/image_yorn.py b/vlmeval/dataset/image_yorn.py index 63ccd2b24..844ed0227 100644 --- a/vlmeval/dataset/image_yorn.py +++ b/vlmeval/dataset/image_yorn.py @@ -42,8 +42,8 @@ def evaluate(self, eval_file, **judge_kwargs): dataset = self.dataset_name data = load(eval_file) data['prediction'] = [str(x) for x in data['prediction']] - storage = eval_file.replace('.xlsx', '_auxmatch.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = get_intermediate_file_path(eval_file, '_auxmatch') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -104,6 +104,6 @@ def evaluate(self, eval_file, **judge_kwargs): else: score = default_rating(storage) - score_tgt = eval_file.replace('.xlsx', '_score.csv') + score_tgt = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_tgt) return score diff --git a/vlmeval/dataset/longvideobench.py b/vlmeval/dataset/longvideobench.py index f4e6470d5..ea2ce0de2 100644 --- a/vlmeval/dataset/longvideobench.py +++ b/vlmeval/dataset/longvideobench.py @@ -278,11 +278,11 @@ def build_prompt(self, line, video_llm): def evaluate(self, eval_file, **judge_kwargs): from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.get('model', 'exact_matching') diff --git a/vlmeval/dataset/m4bench.py b/vlmeval/dataset/m4bench.py index 32ded4d44..2695c043c 100644 --- a/vlmeval/dataset/m4bench.py +++ b/vlmeval/dataset/m4bench.py @@ -6,7 +6,7 @@ from os import path as osp from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE -from ..smp import decode_base64_to_image_file, load, dump +from ..smp import decode_base64_to_image_file, load, dump, get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' 
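# NOTE (editorial sketch, not part of the patch): the hunks above and below all lean on three
# helpers that this excerpt never defines -- get_pred_file_format, get_file_extension and
# get_intermediate_file_path -- presumably added to vlmeval/smp/file.py elsewhere in this PR.
# A minimal sketch of the assumed behaviour follows; the PRED_FILE_FORMAT switch and the exact
# signatures are assumptions, not the actual implementation.
import os

def get_pred_file_format():
    # Assumed: output format for prediction files, overridable via an env var, 'xlsx' as default.
    return os.environ.get('PRED_FILE_FORMAT', 'xlsx')

def get_file_extension(path):
    # 'outputs/model_dataset.xlsx' -> 'xlsx'
    return os.path.splitext(path)[1].lstrip('.')

def get_intermediate_file_path(base_file, suffix, target_format=None):
    # Insert a suffix before the extension, optionally forcing a different extension:
    #   ('pred.xlsx', '_gpt-4o_score', 'csv') -> 'pred_gpt-4o_score.csv'
    #   ('pred.json', '_tmp', 'pkl')          -> 'pred_tmp.pkl'
    root, ext = os.path.splitext(base_file)
    return f'{root}{suffix}.{target_format or ext.lstrip(".")}'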
@@ -183,8 +183,7 @@ def extract_options(q): df['score'] = (df['parsed_pred'] == df['response']) # Save detailed results - base_name = os.path.splitext(os.path.abspath(eval_file))[0] - details_file = base_name + '_details.xlsx' + details_file = get_intermediate_file_path(eval_file, '_details') dump(df, details_file) # Calculate and return accuracy diff --git a/vlmeval/dataset/megabench.py b/vlmeval/dataset/megabench.py index cc1cb85c7..7be235cc8 100644 --- a/vlmeval/dataset/megabench.py +++ b/vlmeval/dataset/megabench.py @@ -395,7 +395,7 @@ def process_text_and_media(text, media_list, is_demo=False): return message def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 data = load(eval_file) result = [] @@ -424,7 +424,7 @@ def process_media_path(media_str): # save the result to json output_path = os.path.join(os.path.dirname(eval_file), f'megabench_result_{self.subset_name}.json') result_path = os.path.join(os.path.dirname(eval_file), f'megabench_score_{self.subset_name}.json') - score_path = eval_file.replace('.xlsx','_acc_{self.subset_name}.json') + score_path = get_intermediate_file_path(eval_file, f'_acc_{self.subset_name}', 'json') if not os.path.exists(output_path) or not os.path.exists(result_path): for task_name, group in data.groupby('task_name'): task_dict = { diff --git a/vlmeval/dataset/miabench.py b/vlmeval/dataset/miabench.py index 2e99d39ec..c33f3510b 100644 --- a/vlmeval/dataset/miabench.py +++ b/vlmeval/dataset/miabench.py @@ -114,10 +114,9 @@ def evaluate(self, eval_file, **judge_kwargs): judge_name = judge_kwargs.pop('model', 'gpt-4o') model = build_judge(model=judge_name, **judge_kwargs) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 - tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + storage = get_intermediate_file_path(eval_file, f'_{judge_name}') # noqa: F841 + tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl') # noqa: F841 nproc = judge_kwargs.pop('nproc', 4) # noqa: F841 if not osp.exists(storage): @@ -160,7 +159,7 @@ def evaluate(self, eval_file, **judge_kwargs): goresult = load(storage) results = get_score_dict(goresult, goresult['score_raw']) - result_pth = storage.replace('.xlsx', '_score.csv') + result_pth = get_intermediate_file_path(storage, '_score', 'csv') results_pd = pd.DataFrame.from_dict(list(results.items())) dump(results_pd, result_pth) diff --git a/vlmeval/dataset/mlvu.py b/vlmeval/dataset/mlvu.py index 6244502d2..bcad3e961 100644 --- a/vlmeval/dataset/mlvu.py +++ b/vlmeval/dataset/mlvu.py @@ -1,6 +1,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path from .video_concat_dataset import ConcatVideoDataset from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE @@ -34,8 +35,7 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc') for key in self.type_data_dict: result.loc[key] = 0.0 for name, item in result.iterrows(): @@ -211,10 +211,10 @@ def build_prompt(self, line, 
video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') @@ -423,9 +423,8 @@ def evaluate(self, eval_file, **judge_kwargs): print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125') judge_kwargs['model'] = 'gpt-4-0125' - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/mmalignbench.py b/vlmeval/dataset/mmalignbench.py index 6d8c6bb0f..fd77deccd 100644 --- a/vlmeval/dataset/mmalignbench.py +++ b/vlmeval/dataset/mmalignbench.py @@ -171,11 +171,10 @@ def gen_eval_base(self, eval_file, b64_map): @classmethod def evaluate(self, eval_file, **judge_kwargs): # We adopt pairwise evaluation (twice for a pair) for this dataset - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): diff --git a/vlmeval/dataset/mmbench_video.py b/vlmeval/dataset/mmbench_video.py index 816ec1db6..f2ada230c 100644 --- a/vlmeval/dataset/mmbench_video.py +++ b/vlmeval/dataset/mmbench_video.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -208,13 +209,13 @@ def load_pack_answers(self, data_raw): def evaluate(self, eval_file, **judge_kwargs): from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs['model'] nproc = judge_kwargs.pop('nproc', 4) - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') model = build_judge(system_prompt=system_prompt, **judge_kwargs) assert 
model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE diff --git a/vlmeval/dataset/mmifeval.py b/vlmeval/dataset/mmifeval.py index 7e68b6b37..6dcfd1f38 100644 --- a/vlmeval/dataset/mmifeval.py +++ b/vlmeval/dataset/mmifeval.py @@ -4,6 +4,7 @@ from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path from ..utils import track_progress_rich from ..dataset.utils.mmif.function_and_compare import * @@ -370,11 +371,10 @@ def build_prompt(self, line): def evaluate(self, eval_file, **judge_kwargs): raw_bench_data = MMIFEval("MM-IFEval").data global aux_data_dict - suffix = eval_file.split(".")[-1] model = judge_kwargs["model"] - storage = eval_file.replace(f".{suffix}", f"_{model}.jsonl") - score_file = eval_file.replace(f".{suffix}", f"_{model}_score.csv") - tmp_file = eval_file.replace(f".{suffix}", f"_{model}_tmp.pkl") + storage = get_intermediate_file_path(eval_file, f"_{model}", "jsonl") + score_file = get_intermediate_file_path(eval_file, f"_{model}_score", "csv") + tmp_file = get_intermediate_file_path(eval_file, f"_{model}_tmp", "pkl") nproc = judge_kwargs.pop("nproc", 4) data_all = load(eval_file).to_dict(orient="records") diff --git a/vlmeval/dataset/mmlongbench.py b/vlmeval/dataset/mmlongbench.py index 2b5dd3619..3379d6af6 100644 --- a/vlmeval/dataset/mmlongbench.py +++ b/vlmeval/dataset/mmlongbench.py @@ -7,6 +7,7 @@ from vlmeval.dataset.utils import build_judge, levenshtein_distance from vlmeval.smp import * from .image_base import ImageBaseDataset +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -538,9 +539,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. 
') @@ -576,7 +576,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = MMLongBench_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/mmmath.py b/vlmeval/dataset/mmmath.py index e70f592be..d71bb1263 100644 --- a/vlmeval/dataset/mmmath.py +++ b/vlmeval/dataset/mmmath.py @@ -11,7 +11,7 @@ from .image_base import ImageBaseDataset from ..utils import track_progress_rich -from ..smp import load, dump +from ..smp import load, dump, get_intermediate_file_path try: import sympy as sp @@ -432,7 +432,7 @@ def evaluate(self, eval_file, **kwargs): data['hit'] = res dump(data, eval_file) - score_file = eval_file.replace('.xlsx', '_score.json') + score_file = get_intermediate_file_path(eval_file, '_score', 'json') score = {} score['overall'] = np.mean(data['hit']) # Results by Difficulty diff --git a/vlmeval/dataset/moat.py b/vlmeval/dataset/moat.py index 928fc587f..123825799 100644 --- a/vlmeval/dataset/moat.py +++ b/vlmeval/dataset/moat.py @@ -4,6 +4,7 @@ from ..utils import track_progress_rich from ..smp import load, dump, decode_base64_to_image from .utils import DEBUG_MESSAGE +from ..smp.file import get_intermediate_file_path import zipfile from random import shuffle, seed @@ -99,8 +100,7 @@ def build_prompt(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - result_path = eval_file.replace(f'.{suffix}', f"_{model}.xlsx") + result_path = get_intermediate_file_path(eval_file, f"_{model}") nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(result_path): @@ -164,7 +164,7 @@ def verdict_one(model, line): 'result_path': result_path, 'capability_acc': capability_score_map, } - score_pth = eval_file.replace(f'.{suffix}', "_score.json") + score_pth = get_intermediate_file_path(eval_file, "_score", "json") dump(metrics, score_pth) return metrics diff --git a/vlmeval/dataset/moviechat1k.py b/vlmeval/dataset/moviechat1k.py index 84dba33d6..fed877536 100644 --- a/vlmeval/dataset/moviechat1k.py +++ b/vlmeval/dataset/moviechat1k.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -215,16 +216,16 @@ def load_pack_answers(self, data_raw): def evaluate(self, eval_file, **judge_kwargs): from .utils.moviechat1k import get_dimension_rating, prepare_score_prompt - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs.setdefault('model', 'chatgpt-0125') assert judge in ['chatgpt-0125'], f'Invalid judge model for MovieChat1k: {judge}' nproc = judge_kwargs.pop('nproc', 4) _ = judge_kwargs.pop('verbose', None) _ = judge_kwargs.pop('retry', None) - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, 
f'_{judge}_rating', 'json') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') model = build_judge(**judge_kwargs) diff --git a/vlmeval/dataset/mvbench.py b/vlmeval/dataset/mvbench.py index 4f0aa7f03..69a49c0af 100644 --- a/vlmeval/dataset/mvbench.py +++ b/vlmeval/dataset/mvbench.py @@ -362,11 +362,11 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') @@ -609,11 +609,11 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') diff --git a/vlmeval/dataset/qbench_video.py b/vlmeval/dataset/qbench_video.py index a208ebaf2..317fa019c 100644 --- a/vlmeval/dataset/qbench_video.py +++ b/vlmeval/dataset/qbench_video.py @@ -2,6 +2,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_concat_dataset import ConcatVideoDataset from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE @@ -31,8 +32,7 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc') result.at['open_ended', 'acc'] /= 2 dump(result, score_file) return result @@ -159,10 +159,10 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'exact_matching') @@ -318,9 +318,8 @@ def evaluate(self, eval_file, **judge_kwargs): model = judge_kwargs.setdefault('model', 
'gpt-4o-0806') assert model in ['gpt-4o-0806', 'gpt-4o'] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/sfebench.py b/vlmeval/dataset/sfebench.py index 7d672bb8b..b2aa24bc2 100644 --- a/vlmeval/dataset/sfebench.py +++ b/vlmeval/dataset/sfebench.py @@ -1,5 +1,7 @@ import string from vlmeval import * +from ..smp import * +from ..smp.file import get_intermediate_file_path from .image_vqa import ImageVQADataset from .utils.judge_util import build_judge from ..utils import track_progress_rich @@ -172,8 +174,8 @@ def evaluate(self, eval_file, **judge_kwargs): assert 'answer' in data and 'prediction' in data data['prediction'] = [str(x) for x in data['prediction']] data['answer'] = [str(x) for x in data['answer']] - storage = eval_file.replace('.xlsx', '_judge.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = get_intermediate_file_path(eval_file, '_judge') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): ans_map = {} if not osp.exists(tmp_file) else load(tmp_file) @@ -216,6 +218,6 @@ def evaluate(self, eval_file, **judge_kwargs): data = load(storage) score = report_score(data) - score_file = eval_file.replace('.xlsx', '_score.csv') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_file) return score diff --git a/vlmeval/dataset/slidevqa.py b/vlmeval/dataset/slidevqa.py index ae7104d43..c6aa68575 100644 --- a/vlmeval/dataset/slidevqa.py +++ b/vlmeval/dataset/slidevqa.py @@ -6,6 +6,7 @@ from vlmeval.smp import * from .image_base import ImageBaseDataset from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -143,9 +144,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. 
') @@ -181,7 +181,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = SlideVQA_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/spatial457.py b/vlmeval/dataset/spatial457.py index 15475f1c3..4026e4c1c 100644 --- a/vlmeval/dataset/spatial457.py +++ b/vlmeval/dataset/spatial457.py @@ -133,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs): all_results[f"{level}_correct"] / all_results[level] if all_results[level] > 0 else 0 ) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, "_score", "json") dump(all_results, score_pth) return all_results diff --git a/vlmeval/dataset/tamperbench.py b/vlmeval/dataset/tamperbench.py index 9c90e5e3d..7aebb4813 100644 --- a/vlmeval/dataset/tamperbench.py +++ b/vlmeval/dataset/tamperbench.py @@ -1,6 +1,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE import torchvision.transforms as T @@ -11,6 +12,7 @@ import os import glob from .utils.tamperbench import * +import warnings # constants FAIL_MSG = 'Failed to obtain answer via API.' @@ -25,8 +27,6 @@ class MVTamperBench(VideoBaseDataset): 'MVTamperBenchEnd': 'aa2c19dd02e1b006ee2d4be9f6f2b62b', } SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ -the detail and movement of objects, and the action and pose of persons. \ -Based on your observations, select the best option that accurately addresses the question. """ TYPE = 'Video-MCQ' @@ -87,14 +87,14 @@ def prepare_dataset(self, dataset_name='MVTamperBench', repo_id=None): def check_integrity(pth): """ - Verifies the completeness and consistency of the dataset located at the specified path. + Verifies the completeness and consistency of the dataset located at the specified path. - Args: - path_to_dataset (str): The directory path where the dataset is stored. + Args: + path_to_dataset (str): The directory path where the dataset is stored. - Returns: - bool: True if the dataset is intact, False otherwise. - """ + Returns: + bool: True if the dataset is intact, False otherwise. + """ # Construct the full path to the data file data_file = osp.join(pth, f'{dataset_name}.tsv') @@ -436,14 +436,14 @@ def evaluate(self, eval_file, **judge_kwargs): Evaluates the given evaluation file and generates ratings based on different dimensions. Args: - eval_file (str): Path to the evaluation file. The file should be in .xlsx format. + eval_file (str): Path to the evaluation file. The file should be in a supported format (xlsx/json/tsv). **judge_kwargs: Additional keyword arguments for the judge model. Returns: dict: A dictionary containing ratings for task type, tamper type, and task-tamper type. Raises: - AssertionError: If the eval_file does not end with '.xlsx'. + AssertionError: If the eval_file is not a supported format. Warning: If the OPENAI API is not working properly or the API key is not set, exact matching will be used for evaluation. @@ -454,15 +454,15 @@ def evaluate(self, eval_file, **judge_kwargs): - Ratings are generated for different dimensions and saved to respective files. 
""" - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be an supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_task_type_file = eval_file.replace('.xlsx', '_task_type_rating.json') - tgt_tamper_type_file = eval_file.replace('.xlsx', '_tamper_type_rating.json') - tgt_task_tamper_type_file = eval_file.replace('.xlsx', '_task_tamper_type_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') - score_metrics_file = eval_file.replace('.xlsx', '_score_f1.xlsx') - action_metrics_file = eval_file.replace('.xlsx', '_action_f1.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_task_type_file = get_intermediate_file_path(eval_file, '_task_type_rating', 'json') + tgt_tamper_type_file = get_intermediate_file_path(eval_file, '_tamper_type_rating', 'json') + tgt_task_tamper_type_file = get_intermediate_file_path(eval_file, '_task_tamper_type_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + score_metrics_file = get_intermediate_file_path(eval_file, '_score_f1') + action_metrics_file = get_intermediate_file_path(eval_file, '_action_f1') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') diff --git a/vlmeval/dataset/tempcompass.py b/vlmeval/dataset/tempcompass.py index 2cc10429c..6c409334e 100644 --- a/vlmeval/dataset/tempcompass.py +++ b/vlmeval/dataset/tempcompass.py @@ -25,9 +25,8 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] result = result.reset_index().rename(columns={'index': 'dim.task_type'}) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') avg_dict = {} for idx, item in result.iterrows(): dim, task_type = item['dim.task_type'].split('. 
') @@ -214,9 +213,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): @@ -412,9 +410,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): @@ -609,9 +606,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/text_mcq.py b/vlmeval/dataset/text_mcq.py index 9db53893d..2879551a2 100644 --- a/vlmeval/dataset/text_mcq.py +++ b/vlmeval/dataset/text_mcq.py @@ -1,6 +1,7 @@ from .text_base import TextBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path class TextMCQDataset(TextBaseDataset): @@ -52,8 +53,6 @@ def evaluate(self, eval_file, **judge_kwargs): nproc = judge_kwargs.pop('nproc', 4) circular = False - - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -71,7 +70,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -94,8 +93,9 @@ def evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # load split - dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + eval_name_result = get_intermediate_file_path(eval_file, f'_{name_str}_result') + dump(data, eval_name_result) + data = load(eval_name_result) # May have different report acc functions for different datasets if 'MMT' in dataset: @@ -103,7 +103,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc diff --git a/vlmeval/dataset/utils/multiple_choice.py b/vlmeval/dataset/utils/multiple_choice.py index d36c62341..e965808aa 100644 --- a/vlmeval/dataset/utils/multiple_choice.py +++ 
b/vlmeval/dataset/utils/multiple_choice.py @@ -562,7 +562,8 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): if k not in result: result[k] = v - tmp_pth = f'/tmp/{timestr()}.xlsx' + tmp_ext = get_pred_file_format() + tmp_pth = f'/tmp/{timestr()}.{tmp_ext}' dump(data_main, tmp_pth) data_main = load(tmp_pth) indices = data_main['index'] diff --git a/vlmeval/dataset/utils/ocrbench.py b/vlmeval/dataset/utils/ocrbench.py index f88bb246c..dedee4bfc 100644 --- a/vlmeval/dataset/utils/ocrbench.py +++ b/vlmeval/dataset/utils/ocrbench.py @@ -57,7 +57,7 @@ def OCRBench_eval(eval_file): + final_score_dict['Handwritten Mathematical Expression Recognition'] ) final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') logger.info('Score: ') diff --git a/vlmeval/dataset/vcr.py b/vlmeval/dataset/vcr.py index c659c60f4..e63fab6fc 100644 --- a/vlmeval/dataset/vcr.py +++ b/vlmeval/dataset/vcr.py @@ -2,6 +2,8 @@ from functools import partial from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path + rouge = None nlp_en = None @@ -323,9 +325,7 @@ def evaluate(self, eval_file, **judge_kwargs): 'Jaccard': vcr_score['Jaccard'], 'Predictions': results_out, } - score_pth = eval_file.replace( - '.xlsx', f'{self.language}_{self.difficulty}_score.json' - ) + score_pth = get_intermediate_file_path(eval_file, f'_{self.language}_{self.difficulty}_score', 'json') dump(results_with_metrics, score_pth) logger.info( f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}' ) diff --git a/vlmeval/dataset/vcrbench.py b/vlmeval/dataset/vcrbench.py index 7b35f708a..13efa628a 100644 --- a/vlmeval/dataset/vcrbench.py +++ b/vlmeval/dataset/vcrbench.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -141,14 +142,14 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.vcrbench.eval import precision, recall from .utils.vcrbench.cau_total import calu_pre_recall - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs.pop('model','gpt-4o-0806') nproc = judge_kwargs.pop('nproc', 4) # step1: extract answer print("running step 1: extracting answer") - tmp_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer_tmp.pkl') - extracted_answer_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer_tmp', 'pkl') + extracted_answer_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer') model = build_judge(system_prompt=Answer_Extraction_Prompt_part1, model=judge, **judge_kwargs) if not osp.exists(extracted_answer_file): @@ -179,8 +180,8 @@ def evaluate(self, eval_file, **judge_kwargs): # step2: scoring print("running step 2: acc scoring") - tmp_file = eval_file.replace('.xlsx', f'_{judge}_answer_score_tmp.pkl') - 
diff --git a/vlmeval/dataset/vcrbench.py b/vlmeval/dataset/vcrbench.py
index 7b35f708a..13efa628a 100644
--- a/vlmeval/dataset/vcrbench.py
+++ b/vlmeval/dataset/vcrbench.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -141,14 +142,14 @@ def evaluate(self, eval_file, **judge_kwargs):
         from .utils.vcrbench.eval import precision, recall
         from .utils.vcrbench.cau_total import calu_pre_recall
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
         judge = judge_kwargs.pop('model','gpt-4o-0806')
         nproc = judge_kwargs.pop('nproc', 4)
 
         # step1: extract answer
         print("running step 1: extracting answer")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer_tmp.pkl')
-        extracted_answer_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer_tmp', 'pkl')
+        extracted_answer_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer')
         model = build_judge(system_prompt=Answer_Extraction_Prompt_part1, model=judge, **judge_kwargs)
 
         if not osp.exists(extracted_answer_file):
@@ -179,8 +180,8 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         # step2: scoring
         print("running step 2: acc scoring")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_answer_score_tmp.pkl')
-        answer_score_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score_tmp', 'pkl')
+        answer_score_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score')
         model = build_judge(system_prompt=Answer_Scoring_Prompt_part1, model=judge, **judge_kwargs)
 
         if not osp.exists(answer_score_file):
@@ -206,15 +207,15 @@ def evaluate(self, eval_file, **judge_kwargs):
         data['answer_scoring'] = [answer_score_map[idx] if idx in answer_score_map else -1 for idx in data['index']]
         dump(data, answer_score_file)
 
-        txt_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.txt')
-        answer_score_json = eval_file.replace('.xlsx', f'_{judge}_answer_score.json')
+        txt_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'txt')
+        answer_score_json = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'json')
         xlsx2json(answer_score_file, answer_score_json)
         calu_acc_main(answer_score_json, txt_file)
 
         # step3: calulate precision_score
         print("running step 3: calulate precision_score")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_pre_score_tmp.pkl')
-        pre_score_file = eval_file.replace('.xlsx', f'_{judge}_pre_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score_tmp', 'pkl')
+        pre_score_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score')
         model = build_judge(system_prompt=Precision_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -253,13 +254,13 @@ def evaluate(self, eval_file, **judge_kwargs):
         data = data.loc[valid_indices]
         dump(data, pre_score_file)
 
-        pre_score_json = eval_file.replace('.xlsx', f'_{judge}_pre_score.json')
+        pre_score_json = get_intermediate_file_path(eval_file, f'_{judge}_pre_score', 'json')
         xlsx2json(pre_score_file, pre_score_json)
 
         # step4: calulate recall_score
         print("running step 4: calulate recall_score")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_recall_score_tmp.pkl')
-        recall_score_file = eval_file.replace('.xlsx', f'_{judge}_recall_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score_tmp', 'pkl')
+        recall_score_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score')
         model = build_judge(system_prompt=Recall_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -295,7 +296,7 @@ def evaluate(self, eval_file, **judge_kwargs):
         data = data.loc[valid_indices]
         dump(data, recall_score_file)
 
-        txt_file = eval_file.replace('.xlsx', f'_{judge}_precision_recall_score.txt')
-        recall_score_json = eval_file.replace('.xlsx', f'_{judge}_recall_score.json')
+        txt_file = get_intermediate_file_path(eval_file, f'_{judge}_precision_recall_score', 'txt')
+        recall_score_json = get_intermediate_file_path(eval_file, f'_{judge}_recall_score', 'json')
         xlsx2json(recall_score_file, recall_score_json)
         calu_pre_recall(pre_score_json, recall_score_json, txt_file)
diff --git a/vlmeval/dataset/vdc.py b/vlmeval/dataset/vdc.py
index dce63cb42..75e1051bc 100644
--- a/vlmeval/dataset/vdc.py
+++ b/vlmeval/dataset/vdc.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -346,16 +347,16 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.vdc import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT, SYSTEM_GENER_PRED_PROMPT
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/video_concat_dataset.py b/vlmeval/dataset/video_concat_dataset.py
index dab1ae1d7..fcf3e8227 100644
--- a/vlmeval/dataset/video_concat_dataset.py
+++ b/vlmeval/dataset/video_concat_dataset.py
@@ -1,4 +1,5 @@
 from ..smp import *
+from ..smp.file import get_intermediate_file_path
 from .video_base import VideoBaseDataset
 
 
@@ -59,7 +60,6 @@ def supported_datasets(cls):
         return []  # list(cls.DATASET_SETS)
 
     def evaluate(self, eval_file, **judge_kwargs):
-        suffix = eval_file.split('.')[-1]
         # First, split the eval_file by dataset
         data_all = load(eval_file)
         for dname in self.datasets:
@@ -80,6 +80,6 @@ def evaluate(self, eval_file, **judge_kwargs):
         result = result.T
         for idx, item in result.iterrows():
             result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
-        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
         dump(result, score_file)
         return result
diff --git a/vlmeval/dataset/video_holmes.py b/vlmeval/dataset/video_holmes.py
index c267755e3..3d6ff37b2 100644
--- a/vlmeval/dataset/video_holmes.py
+++ b/vlmeval/dataset/video_holmes.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
@@ -204,11 +205,11 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         from .utils.videoholmes import get_dimension_rating, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
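The `evaluate` entry points above (VCR-Bench, VDC, Video-Holmes) all relax the same precondition: rather than demanding an `.xlsx` prediction file, they only check the extension and let `load`/`dump` deal with the concrete format. A minimal sketch of that guard, assuming the helpers from `vlmeval/smp/file.py`; the wrapper function is illustrative, not part of the patch.

from vlmeval.smp.file import get_file_extension, get_intermediate_file_path

def plan_eval_outputs(eval_file):
    # Any supported prediction format passes the guard.
    assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
        'data file should be in a supported format (xlsx/json/tsv)'
    # Derived artifacts keep fixed formats regardless of the prediction format.
    return {
        'tmp': get_intermediate_file_path(eval_file, '_tmp', 'pkl'),
        'rating': get_intermediate_file_path(eval_file, '_rating', 'json'),
        'score': get_intermediate_file_path(eval_file, '_score'),
    }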
diff --git a/vlmeval/dataset/video_mmlu.py b/vlmeval/dataset/video_mmlu.py
index a229a95bc..977cbff9e 100644
--- a/vlmeval/dataset/video_mmlu.py
+++ b/vlmeval/dataset/video_mmlu.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -276,16 +277,16 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.video_mmlu import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_CAP, SYSTEM_GENER_PRED_PROMPT
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         judge_kwargs['temperature'] = 0.0
         model = build_judge(**judge_kwargs)
@@ -564,15 +565,15 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.video_mmlu import get_dimension_rating, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_QA
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         judge_kwargs['temperature'] = 0.0
         model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/videomme.py b/vlmeval/dataset/videomme.py
index c084ad796..84a20eeb4 100644
--- a/vlmeval/dataset/videomme.py
+++ b/vlmeval/dataset/videomme.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
@@ -231,11 +232,11 @@ def build_prompt(self, line, video_llm):
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/visfactor.py b/vlmeval/dataset/visfactor.py
index c3a0f8126..6b8313fbd 100644
--- a/vlmeval/dataset/visfactor.py
+++ b/vlmeval/dataset/visfactor.py
@@ -1,6 +1,7 @@
 import re
 from vlmeval import *
 from .image_base import ImageBaseDataset
+from ..smp.file import get_intermediate_file_path
 
 
 class VisFactor(ImageBaseDataset):
@@ -141,9 +142,11 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         accuracy['ALL'] = sum([accuracy[s] for s in accuracy]) / len([accuracy[s] for s in accuracy])
 
-        data.to_csv(eval_file.replace('.xlsx', '.csv'), index=False)
-        with open(eval_file.replace('.xlsx', '_acc.csv'), 'w') as f:
-            for key in accuracy:
-                f.write(f'{key},{accuracy[key]}\n')
+        verbose_file = get_intermediate_file_path(eval_file, '_verbose')
+        dump(data, verbose_file)
+
+        score_df = d2df(accuracy)
+        score_file = get_intermediate_file_path(eval_file, '_acc')
+        dump(score_df, score_file)
 
         return accuracy
diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py
index d8dad7383..ce8b397a8 100644
--- a/vlmeval/dataset/vl_rewardbench.py
+++ b/vlmeval/dataset/vl_rewardbench.py
@@ -102,11 +102,10 @@ def build_prompt(self, line):
     # It returns a DataFrame
     @classmethod
     def evaluate(self, eval_file, **judge_kwargs):
-        suffix = eval_file.split('.')[-1]
         model = judge_kwargs['model']
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
         nproc = judge_kwargs.pop('nproc', 4)
 
         if not osp.exists(storage):
diff --git a/vlmeval/dataset/vlm2bench.py b/vlmeval/dataset/vlm2bench.py
index 5cd04e283..bbe86d554 100644
--- a/vlmeval/dataset/vlm2bench.py
+++ b/vlmeval/dataset/vlm2bench.py
@@ -8,6 +8,8 @@
     cnt_aggregate_metric,
     grp_aggregate_accuracy,
 )
+from ..smp import *
+from ..smp.file import get_intermediate_file_path
 
 
 class VLM2Bench(ImageBaseDataset):
@@ -69,25 +71,15 @@ def evaluate(cls, eval_file, **judge_kwargs):
         """
         model = judge_kwargs.get("model")
         if model:
-            suffix = eval_file.split('.')[-1]
-            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-            score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+            storage = get_intermediate_file_path(eval_file, f'_{model}')
+            score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+            tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
             if os.path.exists(storage):
-                if storage.lower().endswith(".xlsx"):
-                    data = pd.read_excel(storage)
-                else:
-                    data = pd.read_csv(storage, sep="\t", encoding="latin1", engine="python")
+                data = load(storage)
             else:
-                if eval_file.lower().endswith(".xlsx"):
-                    data = pd.read_excel(eval_file)
-                else:
-                    data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+                data = load(eval_file)
         else:
-            if eval_file.lower().endswith(".xlsx"):
-                data = pd.read_excel(eval_file)
-            else:
-                data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+            data = load(eval_file)
 
         results = data.to_dict(orient="records")
         processed = common_process_results(results)
@@ -117,7 +109,6 @@ def evaluate(cls, eval_file, **judge_kwargs):
         if model:
             final_score_file = score_file
         else:
-            suffix = os.path.splitext(eval_file)[1]
-            final_score_file = eval_file.replace(suffix, "_score.csv")
-        score_df.to_csv(final_score_file, index=False)
+            final_score_file = get_intermediate_file_path(eval_file, "_score", "csv")
+        dump(score_df, final_score_file)
         return score_df
diff --git a/vlmeval/dataset/vlmbias.py b/vlmeval/dataset/vlmbias.py
index b3b42e582..45e0ebb9b 100644
--- a/vlmeval/dataset/vlmbias.py
+++ b/vlmeval/dataset/vlmbias.py
@@ -16,9 +16,8 @@ class VLMBias(ImageVQADataset):
 
     def evaluate(self, eval_file, **judge_kwargs):
         model = judge_kwargs.pop('model', 'gpt-4o')
-        suffix = eval_file.split('.')[-1]
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}_tmp', 'pkl')
         nproc = judge_kwargs.pop('nproc', 16)
 
         if not osp.exists(storage):
@@ -51,6 +50,6 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         data = load(storage)
         acc = report_acc(data)
-        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        score_file = get_intermediate_file_path(eval_file, '_acc')
         dump(acc, score_file)
         return acc
diff --git a/vlmeval/dataset/wildvision.py b/vlmeval/dataset/wildvision.py
index b1ad1fd26..3552a0b8e 100644
--- a/vlmeval/dataset/wildvision.py
+++ b/vlmeval/dataset/wildvision.py
@@ -4,6 +4,7 @@
 from .image_base import ImageBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..smp import *
+from ..smp.file import get_intermediate_file_path
 from ..utils import track_progress_rich
 
 
@@ -141,11 +142,10 @@ def gen_eval_base(self, eval_file, b64_map):
 
     @classmethod
     def evaluate(self, eval_file, **judge_kwargs):
         # We adopt pairwise evaluation (twice for a pair) for this dataset
-        suffix = eval_file.split('.')[-1]
         model = judge_kwargs['model']
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
         nproc = judge_kwargs.pop('nproc', 4)
 
         if not osp.exists(storage):
diff --git a/vlmeval/dataset/worldsense.py b/vlmeval/dataset/worldsense.py
index 6e51541d9..fe59c65ab 100644
--- a/vlmeval/dataset/worldsense.py
+++ b/vlmeval/dataset/worldsense.py
@@ -283,11 +283,11 @@ def build_prompt(self, line, video_llm):
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.worldsense import get_dimension_rating, extract_characters_regex, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
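The VLM2Bench hunk earlier in this group leans on the fact that `load` in `vlmeval.smp` already dispatches on the file extension, so the per-format `pd.read_excel` / `pd.read_csv` branches collapse into a single call. A rough before/after sketch; the wrapper names are illustrative.

import pandas as pd
from vlmeval.smp import load

# Before: every call site branched on the extension itself.
def read_predictions_old(path):
    if path.lower().endswith('.xlsx'):
        return pd.read_excel(path)
    return pd.read_csv(path, sep='\t', encoding='latin1', engine='python')

# After: load() picks the reader from the suffix (xlsx/tsv/csv/json/pkl),
# so adding a new prediction format needs no call-site edits.
def read_predictions_new(path):
    return load(path)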
diff --git a/vlmeval/inference.py b/vlmeval/inference.py
index de3bd34c0..ffac276dd 100644
--- a/vlmeval/inference.py
+++ b/vlmeval/inference.py
@@ -47,7 +47,8 @@ def infer_data_api(model, work_dir, model_name, dataset, index_set=None, api_npr
 
     # To reuse records in MMBench_V11
     if dataset_name in ['MMBench', 'MMBench_CN']:
-        v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.xlsx'
+        pred_format = get_pred_file_format()
+        v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.{pred_format}'
         if osp.exists(v11_pred):
             try:
                 reuse_inds = load('http://opencompass.openxlab.space/utils/mmb_reuse.pkl')
@@ -184,12 +185,13 @@ def infer_data_job(
 ):
     rank, world_size = get_rank_and_world_size()
     dataset_name = dataset.dataset_name
-    result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+    # The prediction file format is controlled by the PRED_FORMAT environment variable
+    result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
     prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl'
 
     if osp.exists(result_file):
         if rank == 0:
             data = load(result_file)
             results = {k: v for k, v in zip(data['index'], data['prediction'])}
             if not ignore_failed:
                 results = {k: v for k, v in results.items() if FAIL_MSG not in str(v)}
diff --git a/vlmeval/inference_mt.py b/vlmeval/inference_mt.py
index 298f2a208..25c7ce935 100644
--- a/vlmeval/inference_mt.py
+++ b/vlmeval/inference_mt.py
@@ -169,7 +169,7 @@ def infer_data_job_mt(
 ):
     rank, world_size = get_rank_and_world_size()
     dataset_name = dataset.dataset_name
-    result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+    result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
     tmpl = osp.join(work_dir, '{}' + f'{world_size}_{dataset_name}.pkl')
     out_file = tmpl.format(rank)
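With the two hunks above, `infer_data_job` and `infer_data_job_mt` both resolve the prediction file through `get_pred_file_path`, so the on-disk format follows the `PRED_FORMAT` environment variable instead of being hard-coded per code path. A rough sketch of the resulting naming; the work dir, model and dataset names are hypothetical.

import os
from vlmeval.smp.file import get_pred_file_path

os.environ['PRED_FORMAT'] = 'json'
print(get_pred_file_path('outputs/demo', 'my_model', 'MMBench_DEV_EN'))
# outputs/demo/my_model_MMBench_DEV_EN.json

del os.environ['PRED_FORMAT']   # unset -> falls back to the historical .xlsx
print(get_pred_file_path('outputs/demo', 'my_model', 'MMBench_DEV_EN'))
# outputs/demo/my_model_MMBench_DEV_EN.xlsx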
diff --git a/vlmeval/smp/file.py b/vlmeval/smp/file.py
index ecd2edefc..ac658d271 100644
--- a/vlmeval/smp/file.py
+++ b/vlmeval/smp/file.py
@@ -116,9 +116,9 @@ def MMBenchOfficialServer(dataset_name):
 
 class NumpyEncoder(json.JSONEncoder):
     def default(self, obj):
-        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
-                            np.int16, np.int32, np.int64, np.uint8,
-                            np.uint16, np.uint32, np.uint64)):
+        if isinstance(obj,
+                      (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64,
+                       np.uint8, np.uint16, np.uint32, np.uint64)):
             return int(obj)
         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
             return float(obj)
@@ -139,6 +139,10 @@ def dump_pkl(data, pth, **kwargs):
         pickle.dump(data, open(pth, 'wb'))
 
     def dump_json(data, pth, **kwargs):
+        # Handle DataFrame objects
+        if isinstance(data, pd.DataFrame):
+            # Convert to the records format (a list of row dicts)
+            data = data.to_dict('records')
         json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
 
     def dump_jsonl(data, f, **kwargs):
@@ -160,6 +164,65 @@ def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
     return handlers[suffix](data, f, **kwargs)
 
 
+def get_pred_file_format():
+    pred_format = os.getenv('PRED_FORMAT', '').lower()
+    if pred_format in ['tsv', 'xlsx', 'json']:
+        return pred_format
+    return 'xlsx'  # default format
+
+
+def get_eval_file_format():
+    eval_format = os.getenv('EVAL_FORMAT', '').lower()
+    if eval_format in ['csv', 'json']:
+        return eval_format
+    return 'csv'  # default format
+
+
+def get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True):
+    if use_env_format:
+        file_format = get_pred_file_format()
+        if file_format == 'xlsx':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+        elif file_format == 'tsv':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+        elif file_format == 'json':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.json')
+    else:
+        # keep the original behavior
+        return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+
+
+def get_eval_file_path(eval_file, judge_model, use_env_format=True):
+    suffix = eval_file.split('.')[-1]
+    if use_env_format:
+        file_format = get_eval_file_format()
+        if file_format == 'csv':
+            return eval_file.replace(f'.{suffix}', f'_{judge_model}.csv')
+        elif file_format == 'json':
+            return eval_file.replace(f'.{suffix}', f'_{judge_model}.json')
+    else:
+        # keep the original behavior
+        return eval_file.replace(f'.{suffix}', f'_{judge_model}.xlsx')
+
+
+def _should_convert_to_dataframe(data):
+    if not isinstance(data, dict):
+        return False
+    if not data:
+        return False
+    if 'columns' in data and 'data' in data:
+        return True
+    values = list(data.values())
+    if all(not isinstance(v, (list, dict)) for v in values):
+        return False
+    if any(isinstance(v, list) for v in values):
+        lists = [v for v in values if isinstance(v, list)]
+        if lists and all(len(lst) == len(lists[0]) for lst in lists):
+            return True
+
+    return False
+
+
 def load(f, fmt=None):
     def load_pkl(pth):
         return pickle.load(open(pth, 'rb'))
@@ -382,6 +445,26 @@ def fetch_aux_files(eval_file):
     return fs
 
 
+def get_file_extension(file_path):
+    return file_path.split('.')[-1]
+
+
+def get_intermediate_file_path(eval_file, suffix, target_format=None):
+    original_ext = get_file_extension(eval_file)
+
+    if target_format is None:
+        if suffix in ['_tmp', '_response', '_processed']:
+            target_format = 'pkl'
+        elif suffix in ['_rating', '_config', '_meta']:
+            target_format = 'json'
+        elif suffix in ['_acc', '_fine', '_metrics']:
+            target_format = get_eval_file_format()
+        else:
+            target_format = get_pred_file_format()
+
+    return eval_file.replace(f'.{original_ext}', f'{suffix}.{target_format}')
+
+
 def prepare_reuse_files(pred_root_meta, eval_id, model_name, dataset_name, reuse, reuse_aux):
     import shutil
     from .misc import timestr
diff --git a/vlmeval/tools.py b/vlmeval/tools.py
index 126fb76ae..98449f841 100644
--- a/vlmeval/tools.py
+++ b/vlmeval/tools.py
@@ -497,7 +497,8 @@ def SCAN_ONE(root, model, dataset):
     from termcolor import colored
     FAIL_MSG = 'Failed to obtain answer via API.'
     root = osp.join(root, model)
-    fname = f'{model}_{dataset}.xlsx'
+    pred_format = get_pred_file_format()
+    fname = f'{model}_{dataset}.{pred_format}'
     pth = osp.join(root, fname)
     if osp.exists(pth):
         data = load(pth)
@@ -549,7 +550,8 @@ def SCAN(root, models, datasets):
         cur_datasets = []
         if len(datasets) == 0:
             for d in SUPPORTED_DATASETS:
-                if osp.exists(osp.join(root, m, f'{m}_{d}.xlsx')):
+                pred_format = get_pred_file_format()
+                if osp.exists(osp.join(root, m, f'{m}_{d}.{pred_format}')):
                     cur_datasets.append(d)
         else:
             cur_datasets = datasets
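Taken together, the helpers added to vlmeval/smp/file.py centralise the naming rules: bookkeeping suffixes keep fixed formats, score-style suffixes follow EVAL_FORMAT, everything else follows PRED_FORMAT, and an explicit format argument always wins. A quick sanity sketch of those defaults, assuming neither PRED_FORMAT nor EVAL_FORMAT is set and a hypothetical prediction file:

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/demo/model_dataset.xlsx'   # hypothetical

# Fixed formats for bookkeeping suffixes.
assert get_intermediate_file_path(eval_file, '_tmp') == 'outputs/demo/model_dataset_tmp.pkl'
assert get_intermediate_file_path(eval_file, '_rating') == 'outputs/demo/model_dataset_rating.json'

# '_acc' follows EVAL_FORMAT (csv by default).
assert get_intermediate_file_path(eval_file, '_acc') == 'outputs/demo/model_dataset_acc.csv'

# Unknown suffixes follow PRED_FORMAT (xlsx by default); an explicit format wins.
assert get_intermediate_file_path(eval_file, '_gpt-4o_score') == 'outputs/demo/model_dataset_gpt-4o_score.xlsx'
assert get_intermediate_file_path(eval_file, '_gpt-4o', 'pkl') == 'outputs/demo/model_dataset_gpt-4o.pkl'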