diff --git a/run.py b/run.py
index ae6c4c4f0..627f96d8c 100644
--- a/run.py
+++ b/run.py
@@ -271,7 +271,8 @@ def main():
dist.barrier()
try:
- result_file_base = f'{model_name}_{dataset_name}.xlsx'
+ pred_format = get_pred_file_format()
+ result_file_base = f'{model_name}_{dataset_name}.{pred_format}'
if use_config:
if WORLD_SIZE > 1:
@@ -299,9 +300,6 @@ def main():
continue
# Handling Multi-Turn Dataset
- if dataset.TYPE == 'MT':
- result_file_base = result_file_base.replace('.xlsx', '.tsv')
-
result_file = osp.join(pred_root, result_file_base)
# Reuse the previous prediction file if exists
if RANK == 0 and len(prev_pred_roots):
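Note: `get_pred_file_format` is the new helper that decides which serialization the prediction files use; its definition lives in `vlmeval/smp/file.py` and is not part of the hunks shown here. A minimal sketch of the assumed contract (the environment-variable name is an assumption, not taken from this diff):

    import os

    def get_pred_file_format():
        # Assumed behaviour: pick the format from an env var and fall back to xlsx,
        # so existing runs keep producing the same file names as before.
        fmt = os.environ.get('PRED_FILE_FORMAT', 'xlsx').lower()
        assert fmt in ['xlsx', 'json', 'tsv'], f'unsupported prediction file format: {fmt}'
        return fmt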
diff --git a/scripts/apires_scan.py b/scripts/apires_scan.py
index c6036625f..890aea3da 100644
--- a/scripts/apires_scan.py
+++ b/scripts/apires_scan.py
@@ -10,7 +10,9 @@
model_name = root.split('/')[-1]
+ from vlmeval.smp import get_pred_file_format
+ pred_format = get_pred_file_format()
for d in SUPPORTED_DATASETS:
- fname = f'{model_name}_{d}.xlsx'
+ fname = f'{model_name}_{d}.{pred_format}'
pth = osp.join(root, fname)
if osp.exists(pth):
data = load(pth)
diff --git a/scripts/auto_run.py b/scripts/auto_run.py
index f3cd1bbf3..381c3432f 100644
--- a/scripts/auto_run.py
+++ b/scripts/auto_run.py
@@ -26,7 +26,9 @@ def is_large(x):
models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]
+ from vlmeval.smp import get_pred_file_format
+ pred_format = get_pred_file_format()
for m in models:
- unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
+ unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')]
if len(unknown_datasets) == 0:
continue
dataset_str = ' '.join(unknown_datasets)
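With the helper in place, all three entry points above derive the same prediction filename. An illustrative check (model and dataset names are placeholders):

    model, dataset = 'my_model', 'MMBench_DEV_EN'   # placeholder names
    result_file_base = f'{model}_{dataset}.{get_pred_file_format()}'
    # -> 'my_model_MMBench_DEV_EN.xlsx' under the default format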
diff --git a/vlmeval/dataset/CGAVCounting/cg_av_counting.py b/vlmeval/dataset/CGAVCounting/cg_av_counting.py
index 59445bb8c..8626c7fd4 100644
--- a/vlmeval/dataset/CGAVCounting/cg_av_counting.py
+++ b/vlmeval/dataset/CGAVCounting/cg_av_counting.py
@@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1):
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
+ 'data file should be a supported format (xlsx/json/tsv) file'
- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
data = load(eval_file)
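The pair of helpers used above recurs in nearly every file of this diff: `get_file_extension` inspects the incoming prediction file, and `get_intermediate_file_path(eval_file, suffix, ext=None)` replaces the old `eval_file.replace('.xlsx', ...)` calls. Their implementations are not shown in these hunks; the following is a sketch of the contract implied by the call sites (argument names assumed):

    import os.path as osp

    def get_file_extension(file_path):
        # Extension without the leading dot, e.g. 'xlsx', 'json', 'tsv'.
        return osp.splitext(file_path)[1].lstrip('.')

    def get_intermediate_file_path(eval_file, suffix, ext=None):
        # Keep the directory and stem of eval_file, append the suffix, and either switch to
        # the requested extension or (when ext is None) reuse the extension of eval_file.
        base, old_ext = osp.splitext(eval_file)
        ext = ext if ext is not None else old_ext.lstrip('.')
        return f'{base}{suffix}.{ext}'

Under this reading, `get_intermediate_file_path('pred.json', '_score', 'csv')` gives `pred_score.csv`, while omitting `ext` keeps the prediction format (`pred_score.json`).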
diff --git a/vlmeval/dataset/EgoExoBench/egoexobench.py b/vlmeval/dataset/EgoExoBench/egoexobench.py
index a49c20f02..9400966c0 100644
--- a/vlmeval/dataset/EgoExoBench/egoexobench.py
+++ b/vlmeval/dataset/EgoExoBench/egoexobench.py
@@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils import get_dimension_rating, extract_characters_regex, extract_option
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
+ 'data file should be a supported format (xlsx/json/tsv) file'
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/GUI/screenspot.py b/vlmeval/dataset/GUI/screenspot.py
index ac2cbe3e2..842d61ad0 100644
--- a/vlmeval/dataset/GUI/screenspot.py
+++ b/vlmeval/dataset/GUI/screenspot.py
@@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)
failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -437,7 +437,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
diff --git a/vlmeval/dataset/GUI/screenspot_pro.py b/vlmeval/dataset/GUI/screenspot_pro.py
index 26fde4114..c926a29c2 100644
--- a/vlmeval/dataset/GUI/screenspot_pro.py
+++ b/vlmeval/dataset/GUI/screenspot_pro.py
@@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)
failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -422,7 +422,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
diff --git a/vlmeval/dataset/OmniDocBench/omnidocbench.py b/vlmeval/dataset/OmniDocBench/omnidocbench.py
index 1d688404d..28aca5a09 100644
--- a/vlmeval/dataset/OmniDocBench/omnidocbench.py
+++ b/vlmeval/dataset/OmniDocBench/omnidocbench.py
@@ -4,10 +4,12 @@
import pandas as pd
import tempfile
import base64
+import numpy as np
from tqdm import tqdm
import torch.distributed as dist
from ..image_base import ImageBaseDataset
from ...smp import *
+from ...smp.file import get_intermediate_file_path
class OmniDocBench(ImageBaseDataset):
@@ -75,9 +77,6 @@ def __init__(self,
tsv_path,
match_method:str='quick_match',
filter_types:dict=None):
- self.result_foler='../../../outputs/OmniDocBench'
- if not os.path.exists(self.result_foler):
- os.makedirs(self.result_foler)
self.eval_file=eval_file
self.match_method=match_method
self.references=[]
@@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
'group':group_result,
'page':page_result
}
- if not os.path.exists('./output/OmniDocBench'):
- os.makedirs('./output/OmniDocBench')
if isinstance(cur_samples,list):
saved_samples=cur_samples
else:
saved_samples=cur_samples.samples
- with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f:
- json.dump(saved_samples,f,indent=4,ensure_ascii=False)
+ # NOTE: the original code overwrote this result file on every iteration and also saved it
+ # under a '.josn' typo. Include the element name in the filename so each element keeps
+ # its own result file, and write it with the correct .json extension.
+ result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json')
+ dump(saved_samples, result_file)
- with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
+ metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
+ dump(result_all, metric_result_file)
dict_list = []
save_dict={}
@@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
dict_list.append(save_dict)
df = pd.DataFrame(dict_list,index=['end2end',]).round(3)
- with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
- df.to_csv(os.path.join(self.result_foler,'overall.csv'))
- over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json')
- print(f"The save path of overall.csv is :{over_all_path}")
+ e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json')
+ dump(result_all, e2e_eval_file)
+
+ overall_file = get_intermediate_file_path(self.eval_file, '_overall')
+ dump(df, overall_file)
+
+ print(f"The save path of End2End_Evaluation is: {e2e_eval_file}")
+ print(f"The save path of overall metrics is: {overall_file}")
return df
class table_evalutor():
def __init__(self,eval_file,tsv_path):
-
- self.result_foler='../../../outputs/OmniDocBench'
- if not os.path.exists(self.result_foler):
- os.makedirs(self.result_foler)
+ self.eval_file = eval_file
gt_key='html'
pred_key='pred'
self.category_filter='table'
@@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key):
from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table
samples=[]
preds=[]
- predictions=pd.read_excel(eval_file)['prediction'].tolist()
- gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist()
+ predictions=load(eval_file)['prediction'].tolist()
+ gt_samples=load(gt_file)['answer'].tolist()
load_success,load_fail=0,0
for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'):
try:
@@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
'page':page_result
}
- with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
- json.dump(result_all,f,indent=4,ensure_ascii=False)
+ metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
+ dump(result_all, metric_result_file)
dict_list=[]
dict_list.append(result_all["group"]["TEDS"])
@@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line",
"with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]]
- selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv'))
- table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv')
- print(f'The save path of table_attribute.csv is :{table_attribute_path}')
- selected_columns
-
-
+ table_attr_file = get_intermediate_file_path(self.eval_file, '_table_attribute')
+ dump(selected_columns, table_attr_file)
+ print(f'The save path of table_attribute is: {table_attr_file}')
return selected_columns
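The rewritten OmniDocBench code drops the direct `pd.read_excel` / `pd.read_csv` / `json.dump` calls in favour of the format-agnostic `load` / `dump` helpers, which dispatch on the file extension. A simplified sketch of the behaviour relied upon here (the real helpers in `vlmeval/smp/file.py` also cover pkl and other formats):

    import json
    import pandas as pd

    def dump(data, path):
        # Dispatch on extension; DataFrames go to tabular formats, plain objects to json.
        if path.endswith('.json'):
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4, ensure_ascii=False)
        elif path.endswith('.xlsx'):
            data.to_excel(path, index=False)
        elif path.endswith('.tsv'):
            data.to_csv(path, sep='\t', index=False)
        elif path.endswith('.csv'):
            data.to_csv(path, index=False)
        else:
            raise NotImplementedError(path)

    def load(path):
        if path.endswith('.xlsx'):
            return pd.read_excel(path)
        if path.endswith('.tsv'):
            return pd.read_csv(path, sep='\t')
        if path.endswith('.csv'):
            return pd.read_csv(path)
        if path.endswith('.json'):
            with open(path, encoding='utf-8') as f:
                return json.load(f)
        raise NotImplementedError(path)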
diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 579444829..855049d4a 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -151,7 +151,6 @@ def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
@@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs):
if len(df_all):
result = pd.concat(df_all)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result, score_file)
return result
else:
- score_file = eval_file.replace(f'.{suffix}', '_score.json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'json')
dump(dict_all, score_file)
return dict_all
diff --git a/vlmeval/dataset/cgbench.py b/vlmeval/dataset/cgbench.py
index 8ca5b5f12..aada9da6a 100644
--- a/vlmeval/dataset/cgbench.py
+++ b/vlmeval/dataset/cgbench.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from .utils.cgbench import *
@@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"
- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
data = load(eval_file)
@@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"
- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
- step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
- step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
+ step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')
data = load(eval_file)
@@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs):
axis=1,
)
- data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
- data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
-
if judge_kwargs.get("model", None) != "gpt-4o-0806":
judge_kwargs["model"] = "gpt-4o-0806"
print("The judge model in cg-bench is gpt-4o-0806!")
+ data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
+ data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
+
model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
nproc = judge_kwargs.pop("nproc", 32)
@@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"
- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
data = load(eval_file)
@@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
- assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"
- tgt_file = eval_file.replace(".xlsx", "_rating.json")
- score_file = eval_file.replace(".xlsx", "_score.xlsx")
- step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
- step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
+ step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')
data = load(eval_file)
diff --git a/vlmeval/dataset/chartmimic.py b/vlmeval/dataset/chartmimic.py
index 8151f0af1..f970f17a5 100644
--- a/vlmeval/dataset/chartmimic.py
+++ b/vlmeval/dataset/chartmimic.py
@@ -570,19 +570,12 @@ def judge_one_item_success(item):
infer_data_all = load(eval_file).to_dict(orient="records")
- suffix = eval_file.split(".")[-1]
print(f"judge_kwargs: {judge_kwargs}")
infer_model = judge_kwargs["model"]
- storage = os.path.abspath(
- eval_file.replace(f".{suffix}", f"_{infer_model}.jsonl")
- )
- score_file = os.path.abspath(
- eval_file.replace(f".{suffix}", f"_{infer_model}_score.csv")
- )
+ storage = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}', 'jsonl'))
+ score_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_score', 'csv'))
# use abs path because of using os.chdir()
- tmp_file = os.path.abspath(
- eval_file.replace(f".{suffix}", f"_{infer_model}_tmp.pkl")
- )
+ tmp_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_tmp', 'pkl'))
# actually the --api-nproc
nproc = judge_kwargs.pop("nproc", 8)
logger.info(f"nproc: {nproc}")
diff --git a/vlmeval/dataset/charxiv.py b/vlmeval/dataset/charxiv.py
index 0427632ba..3a3c01e13 100644
--- a/vlmeval/dataset/charxiv.py
+++ b/vlmeval/dataset/charxiv.py
@@ -6,6 +6,7 @@
from vlmeval.dataset.image_base import ImageBaseDataset
from vlmeval.smp import misc, file
+from vlmeval.smp.file import get_intermediate_file_path
from vlmeval import utils
from vlmeval.dataset.utils import build_judge
@@ -203,10 +204,9 @@ def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame:
judge_model_name = judge_model.model
# Define file paths
- suffix = eval_file.split(".")[-1]
- result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.xlsx")
- temp_result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.pkl")
- score_file = result_file.replace(".xlsx", "_acc.csv")
+ result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}")
+ temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl")
+ score_file = get_intermediate_file_path(result_file, "_acc", "csv")
# Return existing results if available
if os.path.exists(result_file):
diff --git a/vlmeval/dataset/cmmmu.py b/vlmeval/dataset/cmmmu.py
index 12c583f29..d96a241e6 100644
--- a/vlmeval/dataset/cmmmu.py
+++ b/vlmeval/dataset/cmmmu.py
@@ -5,6 +5,7 @@
import re
import tempfile
from ..smp import *
+from ..smp.file import get_intermediate_file_path
def get_multi_choice_prediction(response, all_choices, index2ans):
@@ -223,8 +224,7 @@ def dump_image(self, line):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
if not osp.exists(result_file):
data = load(eval_file)
diff --git a/vlmeval/dataset/creation.py b/vlmeval/dataset/creation.py
index 4e37102fe..38a5d3d51 100644
--- a/vlmeval/dataset/creation.py
+++ b/vlmeval/dataset/creation.py
@@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import re
@@ -662,20 +663,18 @@ def evaluate(self, eval_file, **judge_kwargs):
tgt = load(eval_file)
tgt['reference_answer_by_gpt4o'] = src['prediction']
tgt['prediction'] = src['reference_answer_by_gpt4o']
- tgt_file_name = eval_file.replace('.xlsx', '_rev.xlsx')
+ tgt_file_name = get_intermediate_file_path(eval_file, '_rev')
dump(tgt, tgt_file_name)
judge_kwargs['dual_eval'] = False
rating_rev = self.evaluate(tgt_file_name, **judge_kwargs)
judge_kwargs.pop('dual_eval', None)
- suffix = '.' + eval_file.split('.')[-1]
-
- score_file = eval_file.replace(suffix, '_score.csv')
- tgt_file = eval_file.replace(suffix, '_rating.json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
model = judge_kwargs.pop('model', 'gpt-4o-0806')
model_name = model.split('/')[-1] if '/' in model else model
- tmp_file = eval_file.replace(suffix, f'_{model_name}.pkl')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
diff --git a/vlmeval/dataset/dude.py b/vlmeval/dataset/dude.py
index c520c7d28..e024d9821 100644
--- a/vlmeval/dataset/dude.py
+++ b/vlmeval/dataset/dude.py
@@ -5,6 +5,7 @@
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
+from ..smp.file import get_intermediate_file_path
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -165,9 +166,8 @@ def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
@@ -203,7 +203,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = DUDE_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
diff --git a/vlmeval/dataset/dynamath.py b/vlmeval/dataset/dynamath.py
index a463276d7..e66797ac1 100644
--- a/vlmeval/dataset/dynamath.py
+++ b/vlmeval/dataset/dynamath.py
@@ -12,6 +12,7 @@
from .utils import build_judge
from ..utils import track_progress_rich
from ..smp import load, dump, d2df, toliststr
+from ..smp.file import get_intermediate_file_path
def preprocess(str1):
@@ -170,11 +171,10 @@ def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')
model = build_judge(model=judge_name, **judge_kwargs)
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841
- score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv') # noqa: F841
- tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841
+ storage = get_intermediate_file_path(eval_file, f'_{judge_name}')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge_name}_score', 'csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl')
nproc = judge_kwargs.pop('nproc', 6) # noqa: F841
res = load(tmp_file) if os.path.exists(tmp_file) else {}
diff --git a/vlmeval/dataset/gobench.py b/vlmeval/dataset/gobench.py
index 3e9990c13..7667b934a 100644
--- a/vlmeval/dataset/gobench.py
+++ b/vlmeval/dataset/gobench.py
@@ -152,7 +152,7 @@ def evaluate(self, eval_file, **judge_kwargs):
'Instruction_Consistency_Score': [avg_scores.get('consistency', 0) * 100]
})
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ score_file = get_intermediate_file_path(eval_file, '_score')
dump(final_df, score_file)
print(f"Detailed scores including failed attempts saved to {score_file}")
diff --git a/vlmeval/dataset/image_caption.py b/vlmeval/dataset/image_caption.py
index 23282805c..6a9d806f5 100644
--- a/vlmeval/dataset/image_caption.py
+++ b/vlmeval/dataset/image_caption.py
@@ -70,6 +70,6 @@ def evaluate(self, eval_file, **kwargs):
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict
diff --git a/vlmeval/dataset/image_ccocr.py b/vlmeval/dataset/image_ccocr.py
index b1286daba..e70403d64 100644
--- a/vlmeval/dataset/image_ccocr.py
+++ b/vlmeval/dataset/image_ccocr.py
@@ -9,6 +9,7 @@
from .image_base import ImageBaseDataset
from ..smp import *
+from ..smp.file import get_intermediate_file_path
# should be the same as FAIL_MSG definded in vlmeval/inference.py
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -230,13 +231,12 @@ def evaluate(self, eval_file, **judge_kwargs):
print(f"Failed to evaluate {sub_dataset_id}")
# Save comprehensive results
- base_name = os.path.splitext(os.path.abspath(eval_file))[0]
+ result_file = get_intermediate_file_path(eval_file, '_comprehensive_eval', 'json')
comprehensive_result = {
"meta": {"total_datasets": len(all_results), "datasets": list(all_results.keys())},
"results": all_results,
"summaries": all_summaries
}
- result_file = base_name + "_comprehensive_eval.json"
dump(comprehensive_result, result_file)
print(f"Comprehensive results saved to: {result_file}")
@@ -298,5 +298,6 @@ def evaluate(self, eval_file, **judge_kwargs):
print(f" {k.upper():<20}: {v:.4f}")
print("="*80)
df = d2df(res)
- dump(df, base_name + '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
+ dump(df, score_file)
return res
diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py
index b0892ad90..44ef1e9c3 100644
--- a/vlmeval/dataset/image_mcq.py
+++ b/vlmeval/dataset/image_mcq.py
@@ -258,7 +258,6 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
dump(data, eval_file)
circular = True
- suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
@@ -276,7 +275,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl')
data = load(eval_file)
data = data.sort_values(by='index')
@@ -299,7 +298,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
- eval_record = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')
+ eval_record = get_intermediate_file_path(eval_file, f'_{name_str}_result')
dump(data, eval_record)
data = load(eval_record)
@@ -311,7 +310,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
else:
acc = report_acc(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
# The piece of code is for internal use, to check vanilla acc (circ0 & all) for circular datasets
@@ -327,16 +326,16 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
else:
offset = 1e6
circ0 = data[data['index'] <= offset]
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_result', 'pkl')
data0 = mcq_vanilla_eval(model, circ0, meta, nproc, result_file, self.dataset_name)
- dump(data0, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}'))
- data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}'))
+ dump(data0, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result'))
+ data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result'))
acc_map['vanilla_0'] = report_acc(data)
# Vanilla ALL Acc
data = load(eval_file)
dataall = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
- dump(dataall, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}'))
- data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}'))
+ dump(dataall, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result'))
+ data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result'))
acc_map['vanilla_all'] = report_acc(data)
# Merge & Print the Evaluation Results
for k, v in acc_map.items():
@@ -350,7 +349,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
score_all = [acc_map['vanilla_0'], acc_map['vanilla_all'], acc_map['circular']]
score_all = pd.concat(score_all)
print(score_all)
- score_file = eval_file.replace(f'.{suffix}', '_acc_all.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc_all', 'csv')
dump(score_all, score_file)
if dataset == 'AesBench_VAL':
@@ -382,7 +381,6 @@ def evaluate_verifier(self, eval_file, **judge_kwargs):
if circular:
raise ValueError("circular is not supported for verifier evaluation")
- suffix = eval_file.split('.')[-1]
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
@@ -418,7 +416,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs):
data['verifier_score'] = verifier_scores
data['verifier_match'] = verifier_matches
- detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results')
dump(data, detailed_result_file)
def report_acc_verifier(result_file):
@@ -462,7 +460,7 @@ def report_acc_verifier(result_file):
res_df = pd.DataFrame(res)
return res_df
acc = report_acc_verifier(detailed_result_file)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
return acc
@@ -615,11 +613,11 @@ def evaluate(self, eval_file, **judge_kwargs):
if 'COT' in self.dataset_name:
data = load(eval_file)
data['prediction'] = [self.cot_postproc(x) for x in data['prediction']]
- tgt = eval_file.replace('.xlsx', '_cotpost.xlsx')
+ tgt = get_intermediate_file_path(eval_file, '_cotpost')
dump(data, tgt)
res = super().evaluate(tgt, **judge_kwargs)
- acc_org = eval_file.replace('.xlsx', '_acc.csv')
- acc_now = eval_file.replace('.xlsx', '_cotpost_acc.csv')
+ acc_org = get_intermediate_file_path(eval_file, '_acc', 'csv')
+ acc_now = get_intermediate_file_path(eval_file, '_cotpost_acc', 'csv')
shutil.copy(acc_now, acc_org)
return res
else:
@@ -1017,11 +1015,11 @@ def build_prompt(self, line):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
FAIL_MSG = 'Failed to obtain answer via API.'
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
@@ -1036,6 +1034,17 @@ def evaluate(self, eval_file, **judge_kwargs):
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
+ match_cot = re.search(r"(.*?)", pred, re.DOTALL)
+ cot = match_cot.group(1).strip() if match_cot else pred
+
+ target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0])
+ iou = self.evaluate_box_iou(cot, target_instances)
+
+ data.loc[data['index'] == idx, 'iou'] = iou
+
+ match_pred = re.search(r"(.*?)", pred, re.DOTALL)
+ pred = match_pred.group(1).strip().upper() if match_pred else pred
+
extract_pred = extract_characters_regex(pred)
if extract_pred == '':
cnt_rejected += 1
@@ -1055,6 +1064,86 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(rating, tgt_file)
return rating
+ def evaluate_box_iou(predict_str: str, target_instances: list) -> float:
+ pattern = r"(.*?)"
+ matches = re.findall(pattern, predict_str, re.DOTALL)
+
+ all_boxes = []
+
+ for match in matches:
+ box = match.strip()
+
+ coord_pattern = r'\[(\d+),(\d+),(\d+),(\d+)\]'
+ coord_match = re.match(coord_pattern, box)
+
+ if coord_match:
+ x1, y1, x2, y2 = map(int, coord_match.groups())
+
+ if x1 < x2 and y1 < y2:
+ # all_boxes.append([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1])
+ all_boxes.append([x1, y1, x2, y2])
+
+ if len(all_boxes) == 0:
+ return 0
+
+ target_boxes = target_instances
+ if len(target_boxes) == 0:
+ return len(all_boxes) > 0
+
+ def calculate_average_iou(pred_boxes, target_boxes):
+ """
+ 计算每个目标框与预测框中 IoU 最大的预测框之间的平均 IoU。
+
+ 参数:
+ pred_boxes (List[List[float]]): 预测框列表,每个框为 [cx, cy, w, h]
+ target_boxes (List[List[float]]): 目标框列表,每个框为 [cx, cy, w, h]
+
+ 返回:
+ float: 匹配上的平均 IoU
+ """
+ def compute_iou(box1, box2):
+ """计算两个框之间的 IoU"""
+ x1_min, y1_min, x1_max, y1_max = box1
+ x2_min, y2_min, x2_max, y2_max = box2
+
+ inter_x_min = max(x1_min, x2_min)
+ inter_y_min = max(y1_min, y2_min)
+ inter_x_max = min(x1_max, x2_max)
+ inter_y_max = min(y1_max, y2_max)
+
+ inter_width = max(0, inter_x_max - inter_x_min)
+ inter_height = max(0, inter_y_max - inter_y_min)
+ inter_area = inter_width * inter_height
+
+ area1 = (x1_max - x1_min) * (y1_max - y1_min)
+ area2 = (x2_max - x2_min) * (y2_max - y2_min)
+
+ union_area = area1 + area2 - inter_area
+
+ return inter_area / union_area if union_area > 0 else 0.0
+
+ pred_coords = pred_boxes
+ target_coords = target_boxes
+
+ total_iou = 0.0
+ num_targets = len(target_boxes)
+
+ if num_targets == 0:
+ return 0.0
+
+ # For each target box, find the predicted box with the highest IoU
+ for t_coord in target_coords:
+ best_iou = 0.0
+ for p_coord in pred_coords:
+ iou = compute_iou(t_coord, p_coord)
+ if iou > best_iou:
+ best_iou = iou
+ total_iou += best_iou
+
+ return total_iou / num_targets
+
+ return calculate_average_iou(all_boxes, target_boxes)
+
class CVBench(ImageMCQDataset):
"""CV-Bench, composed of two sub datasets:
@@ -1101,7 +1190,6 @@ def evaluate(self, eval_file, **judge_kwargs):
nproc = judge_kwargs.pop("nproc", 4)
- suffix = eval_file.split(".")[-1]
model_name = judge_kwargs.get("model", "extract_matching")
if model_name == "exact_matching":
@@ -1117,7 +1205,7 @@ def evaluate(self, eval_file, **judge_kwargs):
)
model = None
- result_file = eval_file.replace(f".{suffix}", f"_{model_name}_result.pkl")
+ result_file = get_intermediate_file_path(eval_file, f"_{model_name}_result", "pkl")
data = load(eval_file)
data = data.sort_values(by="index")
@@ -1136,7 +1224,7 @@ def evaluate(self, eval_file, **judge_kwargs):
k in meta_q_map
), f"eval_file should be the same as or a subset of dataset {self.dataset_name}"
- score_file = eval_file.replace(f".{suffix}", "_acc.csv")
+ score_file = get_intermediate_file_path(eval_file, "_acc", "csv")
if osp.exists(score_file):
acc = load(score_file)
@@ -1144,15 +1232,14 @@ def evaluate(self, eval_file, **judge_kwargs):
data = mcq_vanilla_eval(
model, data, meta, nproc, result_file, self.dataset_name
)
- dump(data, eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}"))
- data = load(eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}"))
+ dump(data, get_intermediate_file_path(eval_file, f"_{model_name}_result"))
+ data = load(get_intermediate_file_path(eval_file, f"_{model_name}_result"))
if all(data["split"] == "2D"): # 2D
acc = self.report_accuracy(data)
else: # 3D, use default evaluation strategy
acc = report_acc(data)
- score_file = eval_file.replace(f".{suffix}", "_acc.csv")
dump(acc, score_file)
return acc
@@ -1198,7 +1285,6 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.hrbench import report_acc_hrbench
nproc = judge_kwargs.pop('nproc', 4)
- suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'extract_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
@@ -1216,7 +1302,7 @@ def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl')
data = load(eval_file)
data = data.sort_values(by='index')
@@ -1233,18 +1319,17 @@ def evaluate(self, eval_file, **judge_kwargs):
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
if osp.exists(score_file):
acc = load(score_file)
return acc
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
- dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
- data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+ dump(data, get_intermediate_file_path(eval_file, f'_{name_str}_result'))
+ data = load(get_intermediate_file_path(eval_file, f'_{name_str}_result'))
acc = report_acc_hrbench(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
@@ -1323,7 +1408,7 @@ def evaluate(self, eval_file, **judge_kwargs):
scores = get_scores(results)
print(scores)
- score_file = 'NaturalBench_acc.csv'
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
dump(df, score_file)
@@ -1401,13 +1486,12 @@ def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
+ storage = get_intermediate_file_path(eval_file, f'_{name_str}')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage) and model is not None:
data = load(eval_file)
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl')
data = load(eval_file)
data = data.sort_values(by='index')
@@ -1437,7 +1521,7 @@ def evaluate(self, eval_file, **judge_kwargs):
four_dim_scores = wemath_accuracy(eval_file)
combine_score = {**accuracy_scores, **four_dim_scores}
combine_score = pd.DataFrame(combine_score)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(combine_score, score_pth)
return combine_score
@@ -1488,15 +1572,14 @@ def build_prompt(self, line):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vmcbench import get_mc_score, report_vmc_acc
- suffix = eval_file.split('.')[-1]
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
data['hit'] = data.apply(get_mc_score, axis=1)
- result_file = eval_file.replace(f'.{suffix}', f'_result.{suffix}')
+ result_file = get_intermediate_file_path(eval_file, '_result')
dump(data, result_file)
acc = report_vmc_acc(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
return acc
@@ -1638,8 +1721,7 @@ def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
+ storage = get_intermediate_file_path(eval_file, f'_{name_str}')
if osp.exists(storage):
accuracy_scores = VisuLogic_acc(storage)
@@ -1647,7 +1729,7 @@ def evaluate(self, eval_file, **judge_kwargs):
accuracy_scores = VisuLogic_acc(eval_file)
combine_score = {**accuracy_scores,}
combine_score = pd.DataFrame(combine_score)
- score_pth = storage.replace('.xlsx', '_acc.csv')
+ score_pth = get_intermediate_file_path(storage, '_acc', 'csv')
dump(combine_score, score_pth)
return combine_score
@@ -1698,7 +1780,6 @@ def do_evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
- suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125', 'gpt-4o-mini']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4', 'gpt-4o-mini': 'gpt4omini'}
@@ -1716,7 +1797,7 @@ def do_evaluate(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl')
data = load(eval_file)
data = data.sort_values(by='index')
@@ -1736,12 +1817,12 @@ def do_evaluate(self, eval_file, **judge_kwargs):
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# Save evaluation results
- judged_result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')
+ judged_result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result')
dump(data, judged_result_file)
acc = report_acc(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
return acc, judged_result_file
@@ -1920,8 +2001,7 @@ def evaluate(self, eval_file, **judge_kwargs):
result_df = pd.DataFrame(accuracy_dict)
result_df['Overall macro'] = result_df.mean(axis=1)
result_df['Overall micro'] = micro_metric['correct'] / micro_metric['total']
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result_df, score_file)
return result_df
@@ -2062,8 +2142,7 @@ def evaluate(self, eval_file, **judge_kwargs):
result_df[f"Sphere macro: {sphere}"] = sum(accs) / len(accs)
result_df["Overall macro"] = result_df.mean(axis=1)
result_df["Overall micro"] = micro_metric["correct"] / micro_metric["total"]
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result_df, score_file)
return result_df
@@ -2254,31 +2333,15 @@ def evaluate(self, eval_file, **judge_kwargs):
ans = self.extract_content_in_braces(data_item["prediction"])
if ans == data_item["answers"]:
task_stats[task]['correct'] += 1
- elif data_item["task"] == "Touching Circles":
- if str.lower(data_item["answers"]) in str.lower(data_item["prediction"]):
- task_stats[task]['correct'] += 1
- elif data_item["task"] == "Counting Grid - Word Grids":
+ elif data_item["task"] == "Touchdown Reading":
if self.compare_string_with_values(data_item["prediction"], data_item["answers"]):
task_stats[task]['correct'] += 1
- elif data_item["task"] == "Counting Grid - Blank Grids":
- if self.compare_string_with_values(data_item["prediction"], data_item["answers"]):
- task_stats[task]['correct'] += 1
- elif data_item["task"] == "Olympic Counting - Pentagons":
- if data_item["answers"] in data_item["prediction"]:
- task_stats[task]['correct'] += 1
- elif data_item["task"] == "Olympic Counting - Circles":
- if data_item["answers"] in data_item["prediction"]:
- task_stats[task]['correct'] += 1
- elif data_item["task"] == "Circled Letter":
- ans = self.extract_content_in_braces(data_item["prediction"])
- if ans == data_item["answers"]:
- task_stats[task]['correct'] += 1
-
- accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in task_stats.items()}
- result_df = pd.DataFrame(accuracy_dict)
- result_df['overall'] = result_df.mean(axis=1)
- return result_df
+ accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in sorted(task_stats.items())}
+ accuracy_df = pd.DataFrame(accuracy_dict)
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
+ dump(accuracy_df, score_file)
+ return accuracy_df
class SCAM(ImageMCQDataset):
@@ -2330,54 +2393,23 @@ class _3DSRBench(ImageMCQDataset):
DATASET_MD5 = {'3DSRBench': '610516a0b4710595545b7613c60524e8'}
def evaluate(self, eval_file, **judge_kwargs):
- super().evaluate(eval_file, **judge_kwargs)
from .utils.multiple_choice import report_acc
- dname = osp.dirname(eval_file)
- base = osp.basename(eval_file).split('.')[:-1]
- base = '.'.join(base)
- result_file = ls(dname, match=[base + '_', 'result.xlsx'])
- assert len(result_file) == 1, result_file
- result_file = result_file[0]
- data = load(result_file)
-
- acc_map = {}
- acc_map['vanilla'] = report_acc(data)
- # Flip Acc
- qid2key = {x: x.replace('-flip', '') for x in data['qid']}
- key_set = set(list(qid2key.values()))
- main = cp.deepcopy(data[data['qid'].isin(key_set)])
- hit_map = {x: y for x, y in zip(main['qid'], main['hit'])}
- for x, y in zip(data['qid'], data['hit']):
- hit_map[qid2key[x]] *= y
- main['hit'] = [hit_map[x] for x in main['qid']]
- acc_map['flip_eval'] = report_acc(main)
- # Circ Acc
- qid2key = {x: x[:8] if '-flip' not in x else x[:13] for x in data['qid']}
- key_set = set(list(qid2key.values()))
- main = cp.deepcopy(data[data['qid'].isin(key_set)])
- hit_map = {x: y for x, y in zip(main['qid'], main['hit'])}
- for x, y in zip(data['qid'], data['hit']):
- hit_map[qid2key[x]] *= y
- main['hit'] = [hit_map[x] for x in main['qid']]
- acc_map['circ_eval'] = report_acc(main)
- # Flip Circ Acc
- qid2key = {x: x[:8] for x in data['qid']}
- key_set = set(list(qid2key.values()))
- main = cp.deepcopy(data[data['qid'].isin(key_set)])
- hit_map = {x: y for x, y in zip(main['qid'], main['hit'])}
- for x, y in zip(data['qid'], data['hit']):
- hit_map[qid2key[x]] *= y
- main['hit'] = [hit_map[x] for x in main['qid']]
- acc_map['flip_circ_eval'] = report_acc(main)
-
- metrics = []
- for k in acc_map:
- acc_map[k].pop('split')
- acc_map[k]['setting'] = [k] * len(acc_map[k])
- metrics.append(acc_map[k])
- res_all = pd.concat(metrics)
- dump(res_all, eval_file.replace('.xlsx', '_acc_all.csv'))
- return res_all
+ from .utils.sr3d import parse_3dsr_prediction, eval_3dsr
+ from ..smp import dump, load
+ from ..utils.dataset_util import TDBench_grounding_eval
+ from ..dataset import parse_img_path_list
+ from ..config import VLM_EVAL_WITH_SUBSET
+ data = load(eval_file)
+ # parse the model predictions
+ data = parse_img_path_list(data)
+ data = parse_3dsr_prediction(data)
+ # rotate the image and boxes
+ data['hit'] = eval_3dsr(data)
+ result_file = get_intermediate_file_path(eval_file, '_acc')
+ if VLM_EVAL_WITH_SUBSET:
+ data['subset'] = [x.split('|')[0] for x in data['index']]
+ dump(data, result_file)
+ return report_acc(data)
class AffordanceDataset(ImageMCQDataset):
@@ -2556,57 +2588,14 @@ def build_prompt(self, line):
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- import ast
- from .utils.multiple_choice import extract_characters_regex
- from .utils.treebench import get_dimension_rating
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
- FAIL_MSG = 'Failed to obtain answer via API.'
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
-
- if not osp.exists(score_file):
-
- res = {} if not osp.exists(tmp_file) else load(tmp_file)
- res = {k: v for k, v in res.items() if FAIL_MSG not in v}
-
- data = load(eval_file)
- cnt_rejected = 0
- data_un = data[~pd.isna(data['prediction'])]
-
- for idx in data['index']:
- ans = data.loc[data['index'] == idx, 'answer'].values[0]
- pred = data.loc[data['index'] == idx, 'prediction'].values[0]
-
- match_cot = re.search(r"(.*?)", pred, re.DOTALL)
- cot = match_cot.group(1).strip() if match_cot else pred
-
- target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0])
- iou = self.evaluate_box_iou(cot, target_instances)
-
- data.loc[data['index'] == idx, 'iou'] = iou
-
- match_pred = re.search(r"(.*?)", pred, re.DOTALL)
- pred = match_pred.group(1).strip().upper() if match_pred else pred
-
- extract_pred = extract_characters_regex(pred)
- if extract_pred == '':
- cnt_rejected += 1
- data.loc[data['index'] == idx, 'score'] = 0
- else:
- data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
-
- print(
- f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
- f'failed to obtain the score for another {cnt_rejected} questions. '
- f'Those questions will be counted as 0 score in ALL rating.'
- )
-
- dump(data, score_file)
-
- rating = get_dimension_rating(score_file)
- dump(rating, tgt_file)
- return rating
+ from .utils.treebench import get_acc
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
+ try:
+ res = get_acc(eval_file)
+ dump(res, score_file)
+ return res
+ except Exception:
+ return 0
def evaluate_box_iou(predict_str: str, target_instances: list) -> float:
pattern = r"(.*?)"
diff --git a/vlmeval/dataset/image_mt.py b/vlmeval/dataset/image_mt.py
index 07658948a..3cd72d726 100644
--- a/vlmeval/dataset/image_mt.py
+++ b/vlmeval/dataset/image_mt.py
@@ -1,6 +1,7 @@
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from ..utils import track_progress_rich
@@ -86,11 +87,10 @@ def calculat_metric(self, ans):
return pd.DataFrame([sp1, sp2])
def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
nproc = judge_kwargs.pop('nproc', 4)
data = load(eval_file)
diff --git a/vlmeval/dataset/image_shortqa.py b/vlmeval/dataset/image_shortqa.py
index 3650730cb..0d60ded33 100644
--- a/vlmeval/dataset/image_shortqa.py
+++ b/vlmeval/dataset/image_shortqa.py
@@ -4,6 +4,7 @@
from .utils.multiple_choice import report_acc, eval_vanilla, eval_circular_group
from .utils.shortqa import ShortQA_prompt
from ..utils import track_progress_rich
+from ..smp.file import get_intermediate_file_path
def ShortQA_auxeval(model, line):
@@ -89,8 +90,8 @@ def evaluate(self, eval_file, **judge_kwargs):
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
- storage = eval_file.replace('.xlsx', '_judge.xlsx')
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
+ storage = get_intermediate_file_path(eval_file, '_judge')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -137,7 +138,7 @@ def evaluate(self, eval_file, **judge_kwargs):
data = load(storage)
acc = report_acc(data)
- score_file = eval_file.replace('.xlsx', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
return acc
diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
index ad202d9e8..27800547c 100644
--- a/vlmeval/dataset/image_vqa.py
+++ b/vlmeval/dataset/image_vqa.py
@@ -9,6 +9,7 @@
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from ..utils import track_progress_rich
@@ -89,8 +90,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
data['eval_match'] = [r['match'] for r in res]
data['eval_score'] = [np.mean(r['match']) for r in res]
- suffix = eval_file.split('.')[-1]
- detailed_result_file = eval_file.replace(f'.{suffix}', '_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_results')
dump(data, detailed_result_file)
hit = hit_calculate(res, dataset)
@@ -118,8 +118,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
ret = d2df(ret)
ret.round(2)
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc')
dump(ret, result_file)
return ret
@@ -146,8 +145,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs):
data['verifier_score'] = scores
data['verifier_match'] = [1.0 if score else 0.0 for score in scores]
- suffix = eval_file.split('.')[-1]
- detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results')
dump(data, detailed_result_file)
def hit_calculate(result):
@@ -177,8 +175,7 @@ def hit_calculate(result):
ret = d2df(ret)
ret.round(2)
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc')
dump(ret, result_file)
return ret
@@ -194,8 +191,7 @@ class VizWiz(ImageBaseDataset):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc')
if not osp.exists(result_file):
data = load(eval_file)
@@ -217,7 +213,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(ret, result_file)
- retz = pd.read_csv(result_file)
+ retz = load(result_file)
return retz
@@ -292,7 +288,7 @@ def evaluate(self, eval_file, **judge_kwargs):
+ final_score_dict['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score Norm'] = (
float(final_score_dict['Final Score']) / 10)
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
return final_score_dict
@@ -317,9 +313,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
from .utils.mathvista import MathVista_auxeval, MathVista_acc
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -357,7 +352,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
dump(data, storage)
score = MathVista_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
return score
@@ -383,7 +378,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs):
data['verifier_score'] = verifier_scores
data['verifier_match'] = verifier_matches
- detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results')
dump(data, detailed_result_file)
def MathVista_acc_verifier(result_file):
@@ -422,7 +417,7 @@ def MathVista_acc_verifier(result_file):
return res
score = MathVista_acc_verifier(detailed_result_file)
- score_pth = eval_file.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'csv')
dump(score, score_pth)
return score
@@ -483,11 +478,10 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
- tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
- storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
+ storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract')
+ tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract', 'pkl')
+ storage_score = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
# stage1: extract the answer
if not osp.exists(storage_extract):
@@ -517,8 +511,8 @@ def evaluate(self, eval_file, **judge_kwargs):
ans = load(tmp_file_extract)
for k, v in zip(indices, new_results):
assert k in ans
- assert ans[k]['log_extract'] == v['log_extract'] and ans[
- k]['extract'] == v['extract']
+ assert ans[k]['log_extract'] == v['log_extract'] and ans[k][
+ 'extract'] == v['extract']
data['extract'] = [ans[idx]['extract'] for idx in data['index']]
data['log_extract'] = [
@@ -564,7 +558,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage_score)
score = MathVerse_acc(storage_score)
- score_pth = storage_score.replace('.xlsx', '.csv')
+ score_pth = get_intermediate_file_path(storage_score, '', 'csv')
dump(score, score_pth)
return score
@@ -595,9 +589,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
model = judge_kwargs['model']
else:
model = os.path.basename(os.environ.get('LOCAL_LLM'))
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -635,7 +628,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs):
dump(data, storage)
score = MATH_V_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
return score
@@ -662,11 +655,11 @@ def evaluate_verifier(self, eval_file, **judge_kwargs):
data['verifier_score'] = verifier_scores
data['verifier_match'] = verifier_matches
- detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results')
dump(data, detailed_result_file)
else:
- detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx')
+ detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results')
if not osp.exists(detailed_result_file):
dump(data, detailed_result_file)
@@ -697,7 +690,7 @@ def MathVision_acc_verifier(result_file):
return res
score = MathVision_acc_verifier(detailed_result_file)
- score_pth = eval_file.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'csv')
dump(score, score_pth)
return score
@@ -797,9 +790,8 @@ def evaluate(self, eval_file, **judge_kwargs):
print(f'Using local model as judge model for PHYSICS: {model}')
else:
model = judge_kwargs.setdefault('model', 'gpt-4o-mini')
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -839,7 +831,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = PHYSIC_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
return score
@@ -962,12 +954,11 @@ def evaluate(self, eval_file, **judge_kwargs):
if use_api_judger:
from .utils.olympiadbench import Olympiad_auxeval_extract, Olympiad_auxeval_score
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
- tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
- result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
+ storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract')
+ tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp')
+ result_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_result_file = get_intermediate_file_path(eval_file, f'_{model}_score_tmp')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
nproc = judge_kwargs.pop('nproc', 4)
# stage1: extract the answer
if not osp.exists(storage_extract):
@@ -1046,46 +1037,47 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.olympiadbench import MathJudger, extract_answer
judger = MathJudger()
- suffix = eval_file.split('.')[-1]
- name_str1 = 'judge'
- name_str2 = 'score'
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
- score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')
+ name_str1 = 'judge'
+ name_str2 = 'score'
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str1}_result')
+ score_file = get_intermediate_file_path(eval_file, f'_{name_str2}_result', 'csv')
- if not osp.exists(result_file):
- data = load(eval_file)
- scorez = []
+ if not osp.exists(result_file):
+ data = load(eval_file)
+ scorez = []
- for i in tqdm(data.iterrows()):
- line = i[1]
- model_answer = line['prediction']
- is_chinese = 'zh' in line['source']
- model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
- answer_type = line['answer_type']
+ for i in tqdm(data.iterrows()):
+ line = i[1]
+ model_answer = line['prediction']
+ is_chinese = 'zh' in line['source']
+ model_answer = extract_answer(is_chinese,
+ model_answer,
+ is_deepseek=False)
+ answer_type = line['answer_type']
- final_answer = line['final_answer'][2:-2]
+ final_answer = line['final_answer'][2:-2]
- if str(answer_type) != 'nan' and 'Tuple' in answer_type:
- judge_result = judger.judge(model_answer, final_answer)
- else:
- if str(line['error']) != 'nan':
- if ',' in line['error']:
- precisions = line['error'].split(',')
- precisions = [
- float(p) if p else 1e-8 for p in precisions
- ]
- judge_result = judger.judge(
- model_answer, final_answer, precisions)
- else:
- precision = float(line['error'])
- judge_result = judger.judge(
- model_answer, final_answer, precision)
+ if str(answer_type) != 'nan' and 'Tuple' in answer_type:
+ judge_result = judger.judge(model_answer, final_answer)
+ else:
+ if str(line['error']) != 'nan':
+ if ',' in line['error']:
+ precisions = line['error'].split(',')
+ precisions = [
+ float(p) if p else 1e-8 for p in precisions
+ ]
+ judge_result = judger.judge(
+ model_answer, final_answer, precisions)
else:
- judge_result = judger.judge(model_answer, final_answer)
- scorez.append(judge_result)
+ precision = float(line['error'])
+ judge_result = judger.judge(
+ model_answer, final_answer, precision)
+ else:
+ judge_result = judger.judge(model_answer, final_answer)
+ scorez.append(judge_result)
- data['score'] = scorez
- dump(data, result_file)
+ data['score'] = scorez
+ dump(data, result_file)
judge_file = load(result_file)
@@ -1153,9 +1145,9 @@ def evaluate(self, eval_file, **judge_kwargs):
acc_dict['AVG'] = [acc]
acc_pd = pd.DataFrame(acc_dict)
- acc_pd.to_csv(score_file, index=False, encoding='gbk')
+ dump(acc_pd, score_file)
- accdz = pd.read_csv(score_file)
+ accdz = load(score_file)
return accdz
@@ -1228,9 +1220,8 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.seephys import extract, eval_acc
model = judge_kwargs.pop('model', 'deepseek')
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
@@ -1268,7 +1259,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = eval_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(storage, '_score', 'json')
dump(score, score_pth)
return score
@@ -1312,9 +1303,8 @@ def evaluate(self, eval_file, **judge_kwargs):
)
model = None
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{name_str}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{name_str}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage) and model is not None:
@@ -1354,7 +1344,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
if osp.exists(storage):
accuracy_scores = evaluate_logicvista(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(accuracy_scores, score_pth)
return accuracy_scores
@@ -1478,7 +1468,6 @@ class LLaVABench(ImageBaseDataset):
}
DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
- # It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.llavabench import (
@@ -1487,9 +1476,8 @@ def evaluate(self, eval_file, **judge_kwargs):
LLaVABench_score,
)
- suffix = '.' + eval_file.split('.')[-1]
- record_file = eval_file.replace(suffix, '_openai_result' + suffix)
- score_file = eval_file.replace(suffix, '_score.csv')
+ record_file = get_intermediate_file_path(eval_file, '_openai_result')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
nproc = judge_kwargs.pop('nproc', 4)
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
@@ -1534,9 +1522,8 @@ def evaluate(self, eval_file, **judge_kwargs):
LLaVABench_score,
)
- suffix = '.' + eval_file.split('.')[-1]
- record_file = eval_file.replace(suffix, '_openai_result' + suffix)
- score_file = eval_file.replace(suffix, '_score.csv')
+ record_file = get_intermediate_file_path(eval_file, '_openai_result')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
nproc = judge_kwargs.pop('nproc', 4)
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
@@ -1583,9 +1570,8 @@ def evaluate(self, eval_file, **judge_kwargs):
VGRPBench_get_system_prompt,
)
- suffix = '.' + eval_file.split('.')[-1]
- record_file = eval_file.replace(suffix, '_openai_result' + suffix)
- score_file = eval_file.replace(suffix, '_score.csv')
+ record_file = get_intermediate_file_path(eval_file, '_openai_result')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
nproc = judge_kwargs.pop('nproc', 4)
@@ -1649,10 +1635,9 @@ class MMVet(ImageBaseDataset):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmvet import MMVet_auxeval, MMVet_acc
- suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
@@ -1687,8 +1672,8 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score, score_fine = MMVet_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
- score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
+ score_fine_pth = get_intermediate_file_path(storage, '_score_fine', 'csv')
dump(score, score_pth)
dump(score_fine, score_fine_pth)
return score
@@ -1727,8 +1712,7 @@ def evaluate(self, eval_file, **judge_kwargs):
for category, scores in category_scores.items()
}
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.json')
+ result_file = get_intermediate_file_path(eval_file, '_acc', 'json')
dump(category_averages, result_file)
return category_averages
@@ -1908,6 +1892,8 @@ def evaluate(self, eval_file, **judge_kwargs):
for task, metrics in eval_results.items()
for metric, score in metrics.items()
])
+ result_file = get_intermediate_file_path(eval_file, '_acc')
+ dump(ret_df, result_file)
return ret_df
# WildDoc adopts a custom prompt for each subset
@@ -1979,8 +1965,7 @@ def evaluate(self, eval_file, **judge_kwargs):
eval_result['average_scores'].append(
split_eval_meta['average_scores'])
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
eval_result = pd.DataFrame(eval_result)
dump(eval_result, result_file)
@@ -2089,7 +2074,7 @@ def evaluate(self, eval_file, **judge_kwargs):
else:
final_score_dict[category] = None
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
return final_score_dict
@@ -2255,9 +2240,8 @@ def evaluate(self, eval_file, **judge_kwargs):
# extract using model
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -2353,7 +2337,7 @@ def evaluate(self, eval_file, **judge_kwargs):
delta_1_point_5_per_question_type
})
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
return final_score_dict
@@ -2477,7 +2461,7 @@ def evaluate(self, eval_file, **judge_kwargs):
else:
final_score_dict[category] = None
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
return final_score_dict
@@ -2562,12 +2546,9 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmsci import (get_all_metrics_for_g_eval_score,
get_all_metrics_for_reference_based_metrics,
merge_rating, fact_score_generate)
- refer_based_metrics_output_file = eval_file.replace(
- '.xlsx', '_reference_based_metrics.xlsx')
- g_eval_metrics_output_file = eval_file.replace('.xlsx',
- '_g_eval_metrics.xlsx')
- fact_score_metrics_output_file = eval_file.replace(
- '.xlsx', '_fact_score.xlsx')
+ refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics')
+ g_eval_metrics_output_file = get_intermediate_file_path(eval_file, '_g_eval_metrics')
+ fact_score_metrics_output_file = get_intermediate_file_path(eval_file, '_fact_score')
# calculate reference-based metrics
if not osp.exists(refer_based_metrics_output_file):
@@ -2592,8 +2573,7 @@ def evaluate(self, eval_file, **judge_kwargs):
if isinstance(references[0], str):
references = [[r] for r in references]
- reference_based_metrics_file = eval_file.replace(
- '.xlsx', '_reference_based_metrics.pkl')
+ reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl')
existing_data = get_all_metrics_for_reference_based_metrics(
references, candidates, image_id_list,
reference_based_metrics_file)
@@ -2643,8 +2623,7 @@ def evaluate(self, eval_file, **judge_kwargs):
assert judge_model.working(), (
'Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
- suffix = '.' + eval_file.split('.')[-1]
- tmp_file = eval_file.replace(suffix, f'_{model}_G_eval.pkl')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}_G_eval', 'pkl')
tmp_result = get_all_metrics_for_g_eval_score(
references,
@@ -2666,7 +2645,7 @@ def evaluate(self, eval_file, **judge_kwargs):
rating = merge_rating(refer_based_metrics_output_file,
g_eval_metrics_output_file,
fact_score_metrics_output_file)
- dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx'))
+ dump(rating, get_intermediate_file_path(eval_file, '_final_rating'))
return rating
@@ -2681,7 +2660,7 @@ class BMMR(ImageBaseDataset):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.bmmr import get_acc_for_reference_based_metrics, merge_rating
- refer_based_metrics_output_file = eval_file.replace('.xlsx', '_reference_based_metrics.xlsx')
+ refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics')
if not osp.exists(refer_based_metrics_output_file):
data = load(eval_file)
old_candidates = {}
@@ -2707,7 +2686,7 @@ def evaluate(self, eval_file, **judge_kwargs):
if isinstance(references[0], str):
references = [[r] for r in references]
- reference_based_metrics_file = eval_file.replace('.xlsx', '_reference_based_metrics.pkl')
+ reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl')
assert len(references) == len(candidates) == len(image_id_list) == len(task_type_list)
existing_data = get_acc_for_reference_based_metrics(
references, candidates, image_id_list, task_type_list, reference_based_metrics_file
@@ -2720,7 +2699,7 @@ def evaluate(self, eval_file, **judge_kwargs):
rating = merge_rating(
refer_based_metrics_output_file,
)
- dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx'))
+ dump(rating, get_intermediate_file_path(eval_file, '_final_rating'))
return rating
def build_prompt(self, line):
@@ -2756,7 +2735,6 @@ class TDBenchGrounding(ImageVQADataset):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.tdbench import evaluate_bbox, extract_bbox_from_string, rotational_eval
- suffix = eval_file.split('.')[-1]
method = judge_kwargs.get('model', 'centroid')
assert method in ['centroid',
'iou'], '--judge should be either centroid or iou'
@@ -2786,16 +2764,16 @@ def evaluate(self, eval_file, **judge_kwargs):
data['hit'] = scores
data['category'] = 'visual_grounding'
- result_file = eval_file.replace(f'.{suffix}', f'_{method}_result.xlsx')
- data.to_excel(result_file, index=False)
+ result_file = get_intermediate_file_path(eval_file, f'_{method}_result')
+ dump(data, result_file)
metric_name = 'Average Centroid Containment' if method == 'centroid' else 'Average IoU'
summary_scores = {metric_name: avg_score, 'Total Samples': len(scores)}
score_df = pd.DataFrame(list(summary_scores.items()),
columns=['Metric', 'Score'])
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
- score_df.to_csv(score_file, index=False)
+ score_file = get_intermediate_file_path(eval_file, '_acc')
+ dump(score_df, score_file)
re_result = rotational_eval(result_file)
if method == 'centroid' and re_result is not None and re_result is not False:
file_addr = osp.abspath(
@@ -2902,7 +2880,11 @@ def evaluate(self, eval_file, **judge_kwargs):
if ans in pred:
correct_count += 1
accuracy = correct_count / total_count if total_count > 0 else 0
- return {'accuracy': accuracy}
+
+ result = {'accuracy': accuracy * 100}
+ result_file = get_intermediate_file_path(eval_file, '_acc')
+ dump(d2df(result), result_file)
+ return result
class OCR_Reasoning(ImageBaseDataset):
@@ -2919,9 +2901,8 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.ocr_reasoning import OcrR_auxeval, OcrR_acc
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
nproc = 1
if not osp.exists(storage):
@@ -2932,7 +2913,6 @@ def evaluate(self, eval_file, **judge_kwargs):
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
-
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
@@ -2961,7 +2941,7 @@ def evaluate(self, eval_file, **judge_kwargs):
]
dump(data, storage)
score = OcrR_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
return score
@@ -3058,8 +3038,7 @@ def evaluate(self, eval_file, **judge_kwargs):
# Open ended mode
res = pool.map(partial(PhyX_process_line), lines)
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_predict.xlsx')
+ result_file = get_intermediate_file_path(eval_file, '_predict')
df = pd.DataFrame(res)
- df.to_excel(result_file, index=False)
+ dump(df, result_file)
@@ -3077,8 +3056,7 @@ def evaluate(self, eval_file, **judge_kwargs):
ret = d2df(ret)
ret.round(2)
- suffix = eval_file.split('.')[-1]
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc')
dump(ret, result_file)
return ret
@@ -3086,9 +3064,8 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.phyx import PhyX_auxeval, PhyX_acc, PhyX_auxeval_MC
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -3141,7 +3118,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = PhyX_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
return score
@@ -3232,9 +3209,9 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.mme_reasoning import MMEReasoning_extract, MMEReasoning_openeval, MMEReasoning_acc, FAIL_MSG, mme_reasoning_eval_functions # noqa
model = judge_kwargs.get('model', 'gpt-4o-mini')
- suffix = eval_file.split('.')[-1]
- storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
- tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
+ storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract')
+ tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
nproc = judge_kwargs.pop('nproc', 4)
# stage 1: extract answers using LLM
@@ -3282,11 +3259,9 @@ def evaluate(self, eval_file, **judge_kwargs):
data['log'] = log_list
dump(data, storage_extract)
- storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
-
+ tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score_tmp')
# stage 2: evaluate score
- if not osp.exists(storage_score):
+ if not osp.exists(score_file):
data = load(storage_extract)
data = data.replace({float('nan'): None})
model = build_judge(max_tokens=1024, **judge_kwargs)
@@ -3390,10 +3365,10 @@ def evaluate(self, eval_file, **judge_kwargs):
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
- dump(data, storage_score)
+ dump(data, score_file)
- score = MMEReasoning_acc(storage_score)
- score_pth = storage_score.replace('.xlsx', '.csv')
+ score = MMEReasoning_acc(score_file)
+ score_pth = get_intermediate_file_path(score_file, '', 'csv')
dump(score, score_pth)
return score
@@ -3454,14 +3429,12 @@ def report_acc_mmatch(scores, match_types_int):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
-
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
- acc_file = eval_file.replace('.xlsx', f'_{judge}_acc.xlsx')
-
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
+ acc_file = get_intermediate_file_path(eval_file, f'_{judge}_acc')
judge_kwargs['temperature'] = 0.0
model = build_judge(**judge_kwargs)
@@ -3571,7 +3544,7 @@ def evaluate(self, eval_file, **judge_kwargs):
final_score_dict = {**en_scores, **cn_scores}
final_score_dict["English Overall Score"] = score_en_overall
final_score_dict["Chinese Overall Score"] = score_cn_overall
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
return final_score_dict
@@ -3605,10 +3578,8 @@ def evaluate(self, eval_file, **judge_kwargs):
model = build_judge(**judge_kwargs)
if not model.working():
raise RuntimeError("OPENAI API is not working properly. Please check your API key and configuration.")
-
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model_name}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model_name}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model_name}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}_tmp')
nproc = judge_kwargs.pop('nproc', 4)
data = load(eval_file)
@@ -3652,6 +3623,6 @@ def evaluate(self, eval_file, **judge_kwargs):
ret.round(2)
- result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ result_file = get_intermediate_file_path(eval_file, '_acc')
dump(ret, result_file)
return ret
diff --git a/vlmeval/dataset/image_yorn.py b/vlmeval/dataset/image_yorn.py
index 63ccd2b24..844ed0227 100644
--- a/vlmeval/dataset/image_yorn.py
+++ b/vlmeval/dataset/image_yorn.py
@@ -42,8 +42,8 @@ def evaluate(self, eval_file, **judge_kwargs):
dataset = self.dataset_name
data = load(eval_file)
data['prediction'] = [str(x) for x in data['prediction']]
- storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
+ storage = get_intermediate_file_path(eval_file, '_auxmatch')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
@@ -104,6 +104,6 @@ def evaluate(self, eval_file, **judge_kwargs):
else:
score = default_rating(storage)
- score_tgt = eval_file.replace('.xlsx', '_score.csv')
+ score_tgt = get_intermediate_file_path(eval_file, '_score', 'csv')
dump(score, score_tgt)
return score
diff --git a/vlmeval/dataset/longvideobench.py b/vlmeval/dataset/longvideobench.py
index f4e6470d5..ea2ce0de2 100644
--- a/vlmeval/dataset/longvideobench.py
+++ b/vlmeval/dataset/longvideobench.py
@@ -278,11 +278,11 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/m4bench.py b/vlmeval/dataset/m4bench.py
index 32ded4d44..2695c043c 100644
--- a/vlmeval/dataset/m4bench.py
+++ b/vlmeval/dataset/m4bench.py
@@ -6,7 +6,7 @@
from os import path as osp
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
-from ..smp import decode_base64_to_image_file, load, dump
+from ..smp import decode_base64_to_image_file, load, dump, get_intermediate_file_path
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -183,8 +183,7 @@ def extract_options(q):
df['score'] = (df['parsed_pred'] == df['response'])
# Save detailed results
- base_name = os.path.splitext(os.path.abspath(eval_file))[0]
- details_file = base_name + '_details.xlsx'
+ details_file = get_intermediate_file_path(eval_file, '_details')
dump(df, details_file)
# Calculate and return accuracy
diff --git a/vlmeval/dataset/megabench.py b/vlmeval/dataset/megabench.py
index cc1cb85c7..7be235cc8 100644
--- a/vlmeval/dataset/megabench.py
+++ b/vlmeval/dataset/megabench.py
@@ -395,7 +395,7 @@ def process_text_and_media(text, media_list, is_demo=False):
return message
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
data = load(eval_file)
result = []
@@ -424,7 +424,7 @@ def process_media_path(media_str):
# save the result to json
output_path = os.path.join(os.path.dirname(eval_file), f'megabench_result_{self.subset_name}.json')
result_path = os.path.join(os.path.dirname(eval_file), f'megabench_score_{self.subset_name}.json')
- score_path = eval_file.replace('.xlsx','_acc_{self.subset_name}.json')
+ score_path = get_intermediate_file_path(eval_file, f'_acc_{self.subset_name}', 'json')
if not os.path.exists(output_path) or not os.path.exists(result_path):
for task_name, group in data.groupby('task_name'):
task_dict = {
diff --git a/vlmeval/dataset/miabench.py b/vlmeval/dataset/miabench.py
index 2e99d39ec..c33f3510b 100644
--- a/vlmeval/dataset/miabench.py
+++ b/vlmeval/dataset/miabench.py
@@ -114,10 +114,9 @@ def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o')
model = build_judge(model=judge_name, **judge_kwargs)
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841
- tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841
+ storage = get_intermediate_file_path(eval_file, f'_{judge_name}') # noqa: F841
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl') # noqa: F841
nproc = judge_kwargs.pop('nproc', 4) # noqa: F841
if not osp.exists(storage):
@@ -160,7 +159,7 @@ def evaluate(self, eval_file, **judge_kwargs):
goresult = load(storage)
results = get_score_dict(goresult, goresult['score_raw'])
- result_pth = storage.replace('.xlsx', '_score.csv')
+ result_pth = get_intermediate_file_path(storage, '_score', 'csv')
results_pd = pd.DataFrame.from_dict(list(results.items()))
dump(results_pd, result_pth)
diff --git a/vlmeval/dataset/mlvu.py b/vlmeval/dataset/mlvu.py
index 6244502d2..bcad3e961 100644
--- a/vlmeval/dataset/mlvu.py
+++ b/vlmeval/dataset/mlvu.py
@@ -1,6 +1,7 @@
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
@@ -34,8 +35,7 @@ def supported_datasets(cls):
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc')
for key in self.type_data_dict:
result.loc[key] = 0.0
for name, item in result.iterrows():
@@ -211,10 +211,10 @@ def build_prompt(self, line, video_llm):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
@@ -423,9 +423,8 @@ def evaluate(self, eval_file, **judge_kwargs):
print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125')
judge_kwargs['model'] = 'gpt-4-0125'
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
diff --git a/vlmeval/dataset/mmalignbench.py b/vlmeval/dataset/mmalignbench.py
index 6d8c6bb0f..fd77deccd 100644
--- a/vlmeval/dataset/mmalignbench.py
+++ b/vlmeval/dataset/mmalignbench.py
@@ -171,11 +171,10 @@ def gen_eval_base(self, eval_file, b64_map):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
# We adopt pairwise evaluation (twice for a pair) for this dataset
- suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
diff --git a/vlmeval/dataset/mmbench_video.py b/vlmeval/dataset/mmbench_video.py
index 816ec1db6..f2ada230c 100644
--- a/vlmeval/dataset/mmbench_video.py
+++ b/vlmeval/dataset/mmbench_video.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
@@ -208,13 +209,13 @@ def load_pack_answers(self, data_raw):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
model = build_judge(system_prompt=system_prompt, **judge_kwargs)
assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
diff --git a/vlmeval/dataset/mmifeval.py b/vlmeval/dataset/mmifeval.py
index 7e68b6b37..6dcfd1f38 100644
--- a/vlmeval/dataset/mmifeval.py
+++ b/vlmeval/dataset/mmifeval.py
@@ -4,6 +4,7 @@
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from ..utils import track_progress_rich
from ..dataset.utils.mmif.function_and_compare import *
@@ -370,11 +371,10 @@ def build_prompt(self, line):
def evaluate(self, eval_file, **judge_kwargs):
raw_bench_data = MMIFEval("MM-IFEval").data
global aux_data_dict
- suffix = eval_file.split(".")[-1]
model = judge_kwargs["model"]
- storage = eval_file.replace(f".{suffix}", f"_{model}.jsonl")
- score_file = eval_file.replace(f".{suffix}", f"_{model}_score.csv")
- tmp_file = eval_file.replace(f".{suffix}", f"_{model}_tmp.pkl")
+ storage = get_intermediate_file_path(eval_file, f"_{model}", "jsonl")
+ score_file = get_intermediate_file_path(eval_file, f"_{model}_score", "csv")
+ tmp_file = get_intermediate_file_path(eval_file, f"_{model}_tmp", "pkl")
nproc = judge_kwargs.pop("nproc", 4)
data_all = load(eval_file).to_dict(orient="records")
diff --git a/vlmeval/dataset/mmlongbench.py b/vlmeval/dataset/mmlongbench.py
index 2b5dd3619..3379d6af6 100644
--- a/vlmeval/dataset/mmlongbench.py
+++ b/vlmeval/dataset/mmlongbench.py
@@ -7,6 +7,7 @@
from vlmeval.dataset.utils import build_judge, levenshtein_distance
from vlmeval.smp import *
from .image_base import ImageBaseDataset
+from ..smp.file import get_intermediate_file_path
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -538,9 +539,8 @@ def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
@@ -576,7 +576,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = MMLongBench_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
diff --git a/vlmeval/dataset/mmmath.py b/vlmeval/dataset/mmmath.py
index e70f592be..d71bb1263 100644
--- a/vlmeval/dataset/mmmath.py
+++ b/vlmeval/dataset/mmmath.py
@@ -11,7 +11,7 @@
from .image_base import ImageBaseDataset
from ..utils import track_progress_rich
-from ..smp import load, dump
+from ..smp import load, dump, get_intermediate_file_path
try:
import sympy as sp
@@ -432,7 +432,7 @@ def evaluate(self, eval_file, **kwargs):
data['hit'] = res
dump(data, eval_file)
- score_file = eval_file.replace('.xlsx', '_score.json')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'json')
score = {}
score['overall'] = np.mean(data['hit'])
# Results by Difficulty
diff --git a/vlmeval/dataset/moat.py b/vlmeval/dataset/moat.py
index 928fc587f..123825799 100644
--- a/vlmeval/dataset/moat.py
+++ b/vlmeval/dataset/moat.py
@@ -4,6 +4,7 @@
from ..utils import track_progress_rich
from ..smp import load, dump, decode_base64_to_image
from .utils import DEBUG_MESSAGE
+from ..smp.file import get_intermediate_file_path
import zipfile
from random import shuffle, seed
@@ -99,8 +100,7 @@ def build_prompt(self, line):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- result_path = eval_file.replace(f'.{suffix}', f"_{model}.xlsx")
+ result_path = get_intermediate_file_path(eval_file, f"_{model}")
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(result_path):
@@ -164,7 +164,7 @@ def verdict_one(model, line):
'result_path': result_path,
'capability_acc': capability_score_map,
}
- score_pth = eval_file.replace(f'.{suffix}', "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, "_score", "json")
dump(metrics, score_pth)
return metrics
diff --git a/vlmeval/dataset/moviechat1k.py b/vlmeval/dataset/moviechat1k.py
index 84dba33d6..fed877536 100644
--- a/vlmeval/dataset/moviechat1k.py
+++ b/vlmeval/dataset/moviechat1k.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
@@ -215,16 +216,16 @@ def load_pack_answers(self, data_raw):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.moviechat1k import get_dimension_rating, prepare_score_prompt
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
judge = judge_kwargs.setdefault('model', 'chatgpt-0125')
assert judge in ['chatgpt-0125'], f'Invalid judge model for MovieChat1k: {judge}'
nproc = judge_kwargs.pop('nproc', 4)
_ = judge_kwargs.pop('verbose', None)
_ = judge_kwargs.pop('retry', None)
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/mvbench.py b/vlmeval/dataset/mvbench.py
index 4f0aa7f03..69a49c0af 100644
--- a/vlmeval/dataset/mvbench.py
+++ b/vlmeval/dataset/mvbench.py
@@ -362,11 +362,11 @@ def build_prompt(self, line, video_llm):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
@@ -609,11 +609,11 @@ def build_prompt(self, line, video_llm):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
diff --git a/vlmeval/dataset/qbench_video.py b/vlmeval/dataset/qbench_video.py
index a208ebaf2..317fa019c 100644
--- a/vlmeval/dataset/qbench_video.py
+++ b/vlmeval/dataset/qbench_video.py
@@ -2,6 +2,7 @@
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
@@ -31,8 +32,7 @@ def supported_datasets(cls):
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc')
result.at['open_ended', 'acc'] /= 2
dump(result, score_file)
return result
@@ -159,10 +159,10 @@ def build_prompt(self, line, video_llm):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'exact_matching')
@@ -318,9 +318,8 @@ def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs.setdefault('model', 'gpt-4o-0806')
assert model in ['gpt-4o-0806', 'gpt-4o']
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
diff --git a/vlmeval/dataset/sfebench.py b/vlmeval/dataset/sfebench.py
index 7d672bb8b..b2aa24bc2 100644
--- a/vlmeval/dataset/sfebench.py
+++ b/vlmeval/dataset/sfebench.py
@@ -1,5 +1,7 @@
import string
from vlmeval import *
+from ..smp import *
+from ..smp.file import get_intermediate_file_path
from .image_vqa import ImageVQADataset
from .utils.judge_util import build_judge
from ..utils import track_progress_rich
@@ -172,8 +174,8 @@ def evaluate(self, eval_file, **judge_kwargs):
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
- storage = eval_file.replace('.xlsx', '_judge.xlsx')
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
+ storage = get_intermediate_file_path(eval_file, '_judge')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
ans_map = {} if not osp.exists(tmp_file) else load(tmp_file)
@@ -216,6 +218,6 @@ def evaluate(self, eval_file, **judge_kwargs):
data = load(storage)
score = report_score(data)
- score_file = eval_file.replace('.xlsx', '_score.csv')
+ score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
dump(score, score_file)
return score
diff --git a/vlmeval/dataset/slidevqa.py b/vlmeval/dataset/slidevqa.py
index ae7104d43..c6aa68575 100644
--- a/vlmeval/dataset/slidevqa.py
+++ b/vlmeval/dataset/slidevqa.py
@@ -6,6 +6,7 @@
from vlmeval.smp import *
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
+from ..smp.file import get_intermediate_file_path
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -143,9 +144,8 @@ def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
@@ -181,7 +181,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)
score = SlideVQA_acc(storage)
- score_pth = storage.replace('.xlsx', '_score.csv')
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
dump(score, score_pth)
logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
diff --git a/vlmeval/dataset/spatial457.py b/vlmeval/dataset/spatial457.py
index 15475f1c3..4026e4c1c 100644
--- a/vlmeval/dataset/spatial457.py
+++ b/vlmeval/dataset/spatial457.py
@@ -133,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs):
all_results[f"{level}_correct"] / all_results[level] if all_results[level] > 0 else 0
)
- score_pth = eval_file.replace(".xlsx", "_score.json")
+ score_pth = get_intermediate_file_path(eval_file, "_score", "json")
dump(all_results, score_pth)
return all_results
diff --git a/vlmeval/dataset/tamperbench.py b/vlmeval/dataset/tamperbench.py
index 9c90e5e3d..7aebb4813 100644
--- a/vlmeval/dataset/tamperbench.py
+++ b/vlmeval/dataset/tamperbench.py
@@ -1,6 +1,7 @@
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
import torchvision.transforms as T
@@ -11,6 +12,7 @@
import os
import glob
from .utils.tamperbench import *
+import warnings
# constants
FAIL_MSG = 'Failed to obtain answer via API.'
@@ -25,8 +27,6 @@ class MVTamperBench(VideoBaseDataset):
'MVTamperBenchEnd': 'aa2c19dd02e1b006ee2d4be9f6f2b62b',
}
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
-the detail and movement of objects, and the action and pose of persons. \
-Based on your observations, select the best option that accurately addresses the question.
"""
TYPE = 'Video-MCQ'
@@ -87,14 +87,14 @@ def prepare_dataset(self, dataset_name='MVTamperBench', repo_id=None):
def check_integrity(pth):
"""
- Verifies the completeness and consistency of the dataset located at the specified path.
+ Verifies the completeness and consistency of the dataset located at the specified path.
- Args:
- path_to_dataset (str): The directory path where the dataset is stored.
+ Args:
+ path_to_dataset (str): The directory path where the dataset is stored.
- Returns:
- bool: True if the dataset is intact, False otherwise.
- """
+ Returns:
+ bool: True if the dataset is intact, False otherwise.
+ """
# Construct the full path to the data file
data_file = osp.join(pth, f'{dataset_name}.tsv')
@@ -436,14 +436,14 @@ def evaluate(self, eval_file, **judge_kwargs):
Evaluates the given evaluation file and generates ratings based on different dimensions.
Args:
- eval_file (str): Path to the evaluation file. The file should be in .xlsx format.
+ eval_file (str): Path to the evaluation file. The file should be in a supported format (xlsx/json/tsv).
**judge_kwargs: Additional keyword arguments for the judge model.
Returns:
dict: A dictionary containing ratings for task type, tamper type, and task-tamper type.
Raises:
- AssertionError: If the eval_file does not end with '.xlsx'.
+ AssertionError: If the eval_file is not in a supported format.
Warning: If the OPENAI API is not working properly or the API key is not set,
exact matching will be used for evaluation.
@@ -454,15 +454,15 @@ def evaluate(self, eval_file, **judge_kwargs):
- Ratings are generated for different dimensions and saved to respective files.
"""
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_task_type_file = eval_file.replace('.xlsx', '_task_type_rating.json')
- tgt_tamper_type_file = eval_file.replace('.xlsx', '_tamper_type_rating.json')
- tgt_task_tamper_type_file = eval_file.replace('.xlsx', '_task_tamper_type_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
- score_metrics_file = eval_file.replace('.xlsx', '_score_f1.xlsx')
- action_metrics_file = eval_file.replace('.xlsx', '_action_f1.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_task_type_file = get_intermediate_file_path(eval_file, '_task_type_rating', 'json')
+ tgt_tamper_type_file = get_intermediate_file_path(eval_file, '_tamper_type_rating', 'json')
+ tgt_task_tamper_type_file = get_intermediate_file_path(eval_file, '_task_tamper_type_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
+ score_metrics_file = get_intermediate_file_path(eval_file, '_score_f1')
+ action_metrics_file = get_intermediate_file_path(eval_file, '_action_f1')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
diff --git a/vlmeval/dataset/tempcompass.py b/vlmeval/dataset/tempcompass.py
index 2cc10429c..6c409334e 100644
--- a/vlmeval/dataset/tempcompass.py
+++ b/vlmeval/dataset/tempcompass.py
@@ -25,9 +25,8 @@ def supported_datasets(cls):
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
- suffix = eval_file.split('.')[-1]
result = result.reset_index().rename(columns={'index': 'dim.task_type'})
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
avg_dict = {}
for idx, item in result.iterrows():
dim, task_type = item['dim.task_type'].split('. ')
@@ -214,9 +213,8 @@ def evaluate(self, eval_file, **judge_kwargs):
"presence_penalty": 1,
})
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
@@ -412,9 +410,8 @@ def evaluate(self, eval_file, **judge_kwargs):
"presence_penalty": 1,
})
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
@@ -609,9 +606,8 @@ def evaluate(self, eval_file, **judge_kwargs):
"presence_penalty": 1,
})
- suffix = eval_file.split('.')[-1]
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
diff --git a/vlmeval/dataset/text_mcq.py b/vlmeval/dataset/text_mcq.py
index 9db53893d..2879551a2 100644
--- a/vlmeval/dataset/text_mcq.py
+++ b/vlmeval/dataset/text_mcq.py
@@ -1,6 +1,7 @@
from .text_base import TextBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
+from ..smp.file import get_intermediate_file_path
class TextMCQDataset(TextBaseDataset):
@@ -52,8 +53,6 @@ def evaluate(self, eval_file, **judge_kwargs):
nproc = judge_kwargs.pop('nproc', 4)
circular = False
-
- suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
@@ -71,7 +70,7 @@ def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
- result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+ result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl')
data = load(eval_file)
data = data.sort_values(by='index')
@@ -94,8 +93,9 @@ def evaluate(self, eval_file, **judge_kwargs):
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
- dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
- data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+ eval_name_result = get_intermediate_file_path(eval_file, f'_{name_str}_result')
+ dump(data, eval_name_result)
+ data = load(eval_name_result)
# May have different report acc functions for different datasets
if 'MMT' in dataset:
@@ -103,7 +103,7 @@ def evaluate(self, eval_file, **judge_kwargs):
else:
acc = report_acc(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(acc, score_file)
return acc
diff --git a/vlmeval/dataset/utils/multiple_choice.py b/vlmeval/dataset/utils/multiple_choice.py
index d36c62341..e965808aa 100644
--- a/vlmeval/dataset/utils/multiple_choice.py
+++ b/vlmeval/dataset/utils/multiple_choice.py
@@ -562,7 +562,8 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
if k not in result:
result[k] = v
- tmp_pth = f'/tmp/{timestr()}.xlsx'
+ tmp_ext = get_pred_file_format()
+ tmp_pth = f'/tmp/{timestr()}.{tmp_ext}'
dump(data_main, tmp_pth)
data_main = load(tmp_pth)
indices = data_main['index']
diff --git a/vlmeval/dataset/utils/ocrbench.py b/vlmeval/dataset/utils/ocrbench.py
index f88bb246c..dedee4bfc 100644
--- a/vlmeval/dataset/utils/ocrbench.py
+++ b/vlmeval/dataset/utils/ocrbench.py
@@ -57,7 +57,7 @@ def OCRBench_eval(eval_file):
+ final_score_dict['Handwritten Mathematical Expression Recognition']
)
final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
- score_pth = eval_file.replace('.xlsx', '_score.json')
+ score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)
logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
diff --git a/vlmeval/dataset/vcr.py b/vlmeval/dataset/vcr.py
index c659c60f4..e63fab6fc 100644
--- a/vlmeval/dataset/vcr.py
+++ b/vlmeval/dataset/vcr.py
@@ -2,6 +2,8 @@
from functools import partial
from .image_base import ImageBaseDataset
from ..smp import *
+from ..smp.file import get_intermediate_file_path
+
rouge = None
nlp_en = None
@@ -323,9 +325,7 @@ def evaluate(self, eval_file, **judge_kwargs):
'Jaccard': vcr_score['Jaccard'],
'Predictions': results_out,
}
- score_pth = eval_file.replace(
- '.xlsx', f'{self.language}_{self.difficulty}_score.json'
- )
+ score_pth = get_intermediate_file_path(eval_file, f'_{self.language}_{self.difficulty}_score', 'json')
dump(results_with_metrics, score_pth)
logger.info(
f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
diff --git a/vlmeval/dataset/vcrbench.py b/vlmeval/dataset/vcrbench.py
index 7b35f708a..13efa628a 100644
--- a/vlmeval/dataset/vcrbench.py
+++ b/vlmeval/dataset/vcrbench.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
@@ -141,14 +142,14 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.vcrbench.eval import precision, recall
from .utils.vcrbench.cau_total import calu_pre_recall
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
judge = judge_kwargs.pop('model','gpt-4o-0806')
nproc = judge_kwargs.pop('nproc', 4)
# step1: extract answer
print("running step 1: extracting answer")
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer_tmp.pkl')
- extracted_answer_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer_tmp', 'pkl')
+ extracted_answer_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer')
model = build_judge(system_prompt=Answer_Extraction_Prompt_part1, model=judge, **judge_kwargs)
if not osp.exists(extracted_answer_file):
@@ -179,8 +180,8 @@ def evaluate(self, eval_file, **judge_kwargs):
# step2: scoring
print("running step 2: acc scoring")
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_answer_score_tmp.pkl')
- answer_score_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score_tmp', 'pkl')
+ answer_score_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score')
model = build_judge(system_prompt=Answer_Scoring_Prompt_part1, model=judge, **judge_kwargs)
if not osp.exists(answer_score_file):
@@ -206,15 +207,15 @@ def evaluate(self, eval_file, **judge_kwargs):
data['answer_scoring'] = [answer_score_map[idx] if idx in answer_score_map else -1 for idx in data['index']]
dump(data, answer_score_file)
- txt_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.txt')
- answer_score_json = eval_file.replace('.xlsx', f'_{judge}_answer_score.json')
+ txt_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'txt')
+ answer_score_json = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'json')
xlsx2json(answer_score_file, answer_score_json)
calu_acc_main(answer_score_json, txt_file)
# step3: calulate precision_score
print("running step 3: calulate precision_score")
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_pre_score_tmp.pkl')
- pre_score_file = eval_file.replace('.xlsx', f'_{judge}_pre_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score_tmp', 'pkl')
+ pre_score_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score')
model = build_judge(system_prompt=Precision_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -253,13 +254,13 @@ def evaluate(self, eval_file, **judge_kwargs):
data = data.loc[valid_indices]
dump(data, pre_score_file)
- pre_score_json = eval_file.replace('.xlsx', f'_{judge}_pre_score.json')
+ pre_score_json = get_intermediate_file_path(eval_file, f'_{judge}_pre_score', 'json')
xlsx2json(pre_score_file, pre_score_json)
# step4: calulate recall_score
print("running step 4: calulate recall_score")
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_recall_score_tmp.pkl')
- recall_score_file = eval_file.replace('.xlsx', f'_{judge}_recall_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score_tmp', 'pkl')
+ recall_score_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score')
model = build_judge(system_prompt=Recall_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -295,7 +296,7 @@ def evaluate(self, eval_file, **judge_kwargs):
data = data.loc[valid_indices]
dump(data, recall_score_file)
- txt_file = eval_file.replace('.xlsx', f'_{judge}_precision_recall_score.txt')
- recall_score_json = eval_file.replace('.xlsx', f'_{judge}_recall_score.json')
+ txt_file = get_intermediate_file_path(eval_file, f'_{judge}_precision_recall_score', 'txt')
+ recall_score_json = get_intermediate_file_path(eval_file, f'_{judge}_recall_score', 'json')
xlsx2json(recall_score_file, recall_score_json)
calu_pre_recall(pre_score_json, recall_score_json, txt_file)
diff --git a/vlmeval/dataset/vdc.py b/vlmeval/dataset/vdc.py
index dce63cb42..75e1051bc 100644
--- a/vlmeval/dataset/vdc.py
+++ b/vlmeval/dataset/vdc.py
@@ -1,6 +1,7 @@
# flake8: noqa
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
@@ -346,16 +347,16 @@ def load_pack_answers(self, data_raw):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vdc import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT, SYSTEM_GENER_PRED_PROMPT
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
_ = judge_kwargs.pop('verbose', None)
_ = judge_kwargs.pop('retry', None)
- response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+ response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/video_concat_dataset.py b/vlmeval/dataset/video_concat_dataset.py
index dab1ae1d7..fcf3e8227 100644
--- a/vlmeval/dataset/video_concat_dataset.py
+++ b/vlmeval/dataset/video_concat_dataset.py
@@ -1,4 +1,5 @@
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from .video_base import VideoBaseDataset
@@ -59,7 +60,6 @@ def supported_datasets(cls):
return [] # list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
@@ -80,6 +80,6 @@ def evaluate(self, eval_file, **judge_kwargs):
result = result.T
for idx, item in result.iterrows():
result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result, score_file)
return result
diff --git a/vlmeval/dataset/video_holmes.py b/vlmeval/dataset/video_holmes.py
index c267755e3..3d6ff37b2 100644
--- a/vlmeval/dataset/video_holmes.py
+++ b/vlmeval/dataset/video_holmes.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
@@ -204,11 +205,11 @@ def evaluate(self, eval_file, **judge_kwargs):
from .utils.videoholmes import get_dimension_rating, extract_option
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/video_mmlu.py b/vlmeval/dataset/video_mmlu.py
index a229a95bc..977cbff9e 100644
--- a/vlmeval/dataset/video_mmlu.py
+++ b/vlmeval/dataset/video_mmlu.py
@@ -1,6 +1,7 @@
# flake8: noqa
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
@@ -276,16 +277,16 @@ def load_pack_answers(self, data_raw):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.video_mmlu import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_CAP, SYSTEM_GENER_PRED_PROMPT
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
_ = judge_kwargs.pop('verbose', None)
_ = judge_kwargs.pop('retry', None)
- response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+ response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
judge_kwargs['temperature'] = 0.0
model = build_judge(**judge_kwargs)
@@ -564,15 +565,15 @@ def load_pack_answers(self, data_raw):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.video_mmlu import get_dimension_rating, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_QA
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
_ = judge_kwargs.pop('verbose', None)
_ = judge_kwargs.pop('retry', None)
- tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
- score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
judge_kwargs['temperature'] = 0.0
model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/videomme.py b/vlmeval/dataset/videomme.py
index c084ad796..84a20eeb4 100644
--- a/vlmeval/dataset/videomme.py
+++ b/vlmeval/dataset/videomme.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
@@ -231,11 +232,11 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/visfactor.py b/vlmeval/dataset/visfactor.py
index c3a0f8126..6b8313fbd 100644
--- a/vlmeval/dataset/visfactor.py
+++ b/vlmeval/dataset/visfactor.py
@@ -1,6 +1,7 @@
import re
from vlmeval import *
from .image_base import ImageBaseDataset
+from ..smp.file import get_intermediate_file_path
class VisFactor(ImageBaseDataset):
@@ -141,9 +142,11 @@ def evaluate(self, eval_file, **judge_kwargs):
accuracy['ALL'] = sum([accuracy[s] for s in accuracy]) / len([accuracy[s] for s in accuracy])
- data.to_csv(eval_file.replace('.xlsx', '.csv'), index=False)
- with open(eval_file.replace('.xlsx', '_acc.csv'), 'w') as f:
- for key in accuracy:
- f.write(f'{key},{accuracy[key]}\n')
+ verbose_file = get_intermediate_file_path(eval_file, '_verbose')
+ dump(data, verbose_file)
+
+ score_df = d2df(accuracy)
+ score_file = get_intermediate_file_path(eval_file, '_acc')
+ dump(score_df, score_file)
return accuracy
diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py
index d8dad7383..ce8b397a8 100644
--- a/vlmeval/dataset/vl_rewardbench.py
+++ b/vlmeval/dataset/vl_rewardbench.py
@@ -102,11 +102,10 @@ def build_prompt(self, line):
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
- suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
diff --git a/vlmeval/dataset/vlm2bench.py b/vlmeval/dataset/vlm2bench.py
index 5cd04e283..bbe86d554 100644
--- a/vlmeval/dataset/vlm2bench.py
+++ b/vlmeval/dataset/vlm2bench.py
@@ -8,6 +8,8 @@
cnt_aggregate_metric,
grp_aggregate_accuracy,
)
+from ..smp import *
+from ..smp.file import get_intermediate_file_path
class VLM2Bench(ImageBaseDataset):
@@ -69,25 +71,15 @@ def evaluate(cls, eval_file, **judge_kwargs):
"""
model = judge_kwargs.get("model")
if model:
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
if os.path.exists(storage):
- if storage.lower().endswith(".xlsx"):
- data = pd.read_excel(storage)
- else:
- data = pd.read_csv(storage, sep="\t", encoding="latin1", engine="python")
+ data = load(storage)
else:
- if eval_file.lower().endswith(".xlsx"):
- data = pd.read_excel(eval_file)
- else:
- data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+ data = load(eval_file)
else:
- if eval_file.lower().endswith(".xlsx"):
- data = pd.read_excel(eval_file)
- else:
- data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+ data = load(eval_file)
results = data.to_dict(orient="records")
processed = common_process_results(results)
@@ -117,7 +109,6 @@ def evaluate(cls, eval_file, **judge_kwargs):
if model:
final_score_file = score_file
else:
- suffix = os.path.splitext(eval_file)[1]
- final_score_file = eval_file.replace(suffix, "_score.csv")
- score_df.to_csv(final_score_file, index=False)
+ final_score_file = get_intermediate_file_path(eval_file, "_score", "csv")
+ dump(score_df, final_score_file)
return score_df
diff --git a/vlmeval/dataset/vlmbias.py b/vlmeval/dataset/vlmbias.py
index b3b42e582..45e0ebb9b 100644
--- a/vlmeval/dataset/vlmbias.py
+++ b/vlmeval/dataset/vlmbias.py
@@ -16,9 +16,8 @@ class VLMBias(ImageVQADataset):
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs.pop('model', 'gpt-4o')
- suffix = eval_file.split('.')[-1]
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}_tmp', 'pkl')
nproc = judge_kwargs.pop('nproc', 16)
if not osp.exists(storage):
@@ -51,6 +50,6 @@ def evaluate(self, eval_file, **judge_kwargs):
data = load(storage)
acc = report_acc(data)
- score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ score_file = get_intermediate_file_path(eval_file, '_acc')
dump(acc, score_file)
return acc
diff --git a/vlmeval/dataset/wildvision.py b/vlmeval/dataset/wildvision.py
index b1ad1fd26..3552a0b8e 100644
--- a/vlmeval/dataset/wildvision.py
+++ b/vlmeval/dataset/wildvision.py
@@ -4,6 +4,7 @@
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
+from ..smp.file import get_intermediate_file_path
from ..utils import track_progress_rich
@@ -141,11 +142,10 @@ def gen_eval_base(self, eval_file, b64_map):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
# We adopt pairwise evaluation (twice for a pair) for this dataset
- suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
- storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
- score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
- tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+ storage = get_intermediate_file_path(eval_file, f'_{model}')
+ score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
diff --git a/vlmeval/dataset/worldsense.py b/vlmeval/dataset/worldsense.py
index 6e51541d9..fe59c65ab 100644
--- a/vlmeval/dataset/worldsense.py
+++ b/vlmeval/dataset/worldsense.py
@@ -283,11 +283,11 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils.worldsense import get_dimension_rating, extract_characters_regex, extract_option
- assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+ assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
- tgt_file = eval_file.replace('.xlsx', '_rating.json')
- score_file = eval_file.replace('.xlsx', '_score.xlsx')
+ tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+ tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+ score_file = get_intermediate_file_path(eval_file, '_score')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/inference.py b/vlmeval/inference.py
index de3bd34c0..ffac276dd 100644
--- a/vlmeval/inference.py
+++ b/vlmeval/inference.py
@@ -47,7 +47,8 @@ def infer_data_api(model, work_dir, model_name, dataset, index_set=None, api_npr
# To reuse records in MMBench_V11
if dataset_name in ['MMBench', 'MMBench_CN']:
- v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.xlsx'
+ pred_format = get_pred_file_format()
+ v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.{pred_format}'
if osp.exists(v11_pred):
try:
reuse_inds = load('http://opencompass.openxlab.space/utils/mmb_reuse.pkl')
@@ -184,12 +185,14 @@ def infer_data_job(
):
rank, world_size = get_rank_and_world_size()
dataset_name = dataset.dataset_name
- result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+ # Prediction file format is controlled via environment variable
+ result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl'
if osp.exists(result_file):
if rank == 0:
data = load(result_file)
results = {k: v for k, v in zip(data['index'], data['prediction'])}
if not ignore_failed:
results = {k: v for k, v in results.items() if FAIL_MSG not in str(v)}
diff --git a/vlmeval/inference_mt.py b/vlmeval/inference_mt.py
index 298f2a208..25c7ce935 100644
--- a/vlmeval/inference_mt.py
+++ b/vlmeval/inference_mt.py
@@ -169,7 +169,7 @@ def infer_data_job_mt(
):
rank, world_size = get_rank_and_world_size()
dataset_name = dataset.dataset_name
- result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+ result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
tmpl = osp.join(work_dir, '{}' + f'{world_size}_{dataset_name}.pkl')
out_file = tmpl.format(rank)
diff --git a/vlmeval/smp/file.py b/vlmeval/smp/file.py
index ecd2edefc..ac658d271 100644
--- a/vlmeval/smp/file.py
+++ b/vlmeval/smp/file.py
@@ -116,9 +116,9 @@ def MMBenchOfficialServer(dataset_name):
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
- if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
- np.int16, np.int32, np.int64, np.uint8,
- np.uint16, np.uint32, np.uint64)):
+ if isinstance(obj,
+ (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64,
+ np.uint8, np.uint16, np.uint32, np.uint64)):
return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
return float(obj)
@@ -139,6 +139,10 @@ def dump_pkl(data, pth, **kwargs):
pickle.dump(data, open(pth, 'wb'))
def dump_json(data, pth, **kwargs):
+ # Handle DataFrame objects
+ if isinstance(data, pd.DataFrame):
+ # Convert to records format (a list of dicts)
+ data = data.to_dict('records')
json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
def dump_jsonl(data, f, **kwargs):
@@ -160,6 +164,65 @@ def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
return handlers[suffix](data, f, **kwargs)
+def get_pred_file_format():
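+ """Prediction file format, read from the PRED_FORMAT env var ('tsv', 'xlsx' or 'json'); defaults to 'xlsx'."""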
+ pred_format = os.getenv('PRED_FORMAT', '').lower()
+ if pred_format in ['tsv', 'xlsx', 'json']:
+ return pred_format
+ return 'xlsx' # default format
+
+
+def get_eval_file_format():
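+ """Evaluation score file format, read from the EVAL_FORMAT env var ('csv' or 'json'); defaults to 'csv'."""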
+ eval_format = os.getenv('EVAL_FORMAT', '').lower()
+ if eval_format in ['csv', 'json']:
+ return eval_format
+ return 'csv' # default format
+
+
+def get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True):
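+ """Build the prediction file path for a model/dataset pair; the extension follows
+ get_pred_file_format() when use_env_format is True, otherwise '.xlsx'."""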
+ if use_env_format:
+ file_format = get_pred_file_format()
+ if file_format == 'xlsx':
+ return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+ elif file_format == 'tsv':
+ return osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+ elif file_format == 'json':
+ return osp.join(work_dir, f'{model_name}_{dataset_name}.json')
+ else:
+ # Keep the original behavior
+ return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+
+
+def get_eval_file_path(eval_file, judge_model, use_env_format=True):
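+ """Derive the judge result file path from eval_file; the extension follows
+ get_eval_file_format() when use_env_format is True, otherwise '.xlsx'."""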
+ suffix = eval_file.split('.')[-1]
+ if use_env_format:
+ file_format = get_eval_file_format()
+ if file_format == 'csv':
+ return eval_file.replace(f'.{suffix}', f'_{judge_model}.csv')
+ elif file_format == 'json':
+ return eval_file.replace(f'.{suffix}', f'_{judge_model}.json')
+ else:
+ # Keep the original behavior
+ return eval_file.replace(f'.{suffix}', f'_{judge_model}.xlsx')
+
+
+def _should_convert_to_dataframe(data):
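+ """Heuristic: a dict carrying 'columns'/'data' keys or equal-length list values
+ is treated as tabular and may be converted back to a DataFrame."""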
+ if not isinstance(data, dict):
+ return False
+ if not data:
+ return False
+ if 'columns' in data and 'data' in data:
+ return True
+ values = list(data.values())
+ if all(not isinstance(v, (list, dict)) for v in values):
+ return False
+ if any(isinstance(v, list) for v in values):
+ lists = [v for v in values if isinstance(v, list)]
+ if lists and all(len(lst) == len(lists[0]) for lst in lists):
+ return True
+
+ return False
+
+
def load(f, fmt=None):
def load_pkl(pth):
return pickle.load(open(pth, 'rb'))
@@ -382,6 +445,26 @@ def fetch_aux_files(eval_file):
return fs
+def get_file_extension(file_path):
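+ """Return the file extension without the leading dot."""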
+ return file_path.split('.')[-1]
+
+
+def get_intermediate_file_path(eval_file, suffix, target_format=None):
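+ """Insert `suffix` before the extension of eval_file and switch to `target_format`.
+ If target_format is None, it is inferred from the suffix: 'pkl' for temp/response files,
+ 'json' for ratings/configs, the EVAL_FORMAT default for metric files, and the PRED_FORMAT
+ default otherwise."""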
+ original_ext = get_file_extension(eval_file)
+
+ if target_format is None:
+ if suffix in ['_tmp', '_response', '_processed']:
+ target_format = 'pkl'
+ elif suffix in ['_rating', '_config', '_meta']:
+ target_format = 'json'
+ elif suffix in ['_acc', '_fine', '_metrics']:
+ target_format = get_eval_file_format()
+ else:
+ target_format = get_pred_file_format()
+
+ return eval_file.replace(f'.{original_ext}', f'{suffix}.{target_format}')
+
+
def prepare_reuse_files(pred_root_meta, eval_id, model_name, dataset_name, reuse, reuse_aux):
import shutil
from .misc import timestr
diff --git a/vlmeval/tools.py b/vlmeval/tools.py
index 126fb76ae..98449f841 100644
--- a/vlmeval/tools.py
+++ b/vlmeval/tools.py
@@ -497,7 +497,8 @@ def SCAN_ONE(root, model, dataset):
from termcolor import colored
FAIL_MSG = 'Failed to obtain answer via API.'
root = osp.join(root, model)
- fname = f'{model}_{dataset}.xlsx'
+ pred_format = get_pred_file_format()
+ fname = f'{model}_{dataset}.{pred_format}'
pth = osp.join(root, fname)
if osp.exists(pth):
data = load(pth)
@@ -549,7 +550,8 @@ def SCAN(root, models, datasets):
cur_datasets = []
if len(datasets) == 0:
for d in SUPPORTED_DATASETS:
- if osp.exists(osp.join(root, m, f'{m}_{d}.xlsx')):
+ pred_format = get_pred_file_format()
+ if osp.exists(osp.join(root, m, f'{m}_{d}.{pred_format}')):
cur_datasets.append(d)
else:
cur_datasets = datasets