diff --git a/run.py b/run.py index ae6c4c4f0..627f96d8c 100644 --- a/run.py +++ b/run.py @@ -271,7 +271,8 @@ def main(): dist.barrier() try: - result_file_base = f'{model_name}_{dataset_name}.xlsx' + pred_format = get_pred_file_format() + result_file_base = f'{model_name}_{dataset_name}.{pred_format}' if use_config: if WORLD_SIZE > 1: @@ -299,9 +300,6 @@ def main(): continue # Handling Multi-Turn Dataset - if dataset.TYPE == 'MT': - result_file_base = result_file_base.replace('.xlsx', '.tsv') - result_file = osp.join(pred_root, result_file_base) # Reuse the previous prediction file if exists if RANK == 0 and len(prev_pred_roots): diff --git a/scripts/apires_scan.py b/scripts/apires_scan.py index c6036625f..890aea3da 100644 --- a/scripts/apires_scan.py +++ b/scripts/apires_scan.py @@ -10,7 +10,9 @@ model_name = root.split('/')[-1] for d in SUPPORTED_DATASETS: - fname = f'{model_name}_{d}.xlsx' + from vlmeval.smp import get_pred_file_format + pred_format = get_pred_file_format() + fname = f'{model_name}_{d}.{pred_format}' pth = osp.join(root, fname) if osp.exists(pth): data = load(pth) diff --git a/scripts/auto_run.py b/scripts/auto_run.py index f3cd1bbf3..381c3432f 100644 --- a/scripts/auto_run.py +++ b/scripts/auto_run.py @@ -26,7 +26,9 @@ def is_large(x): models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] for m in models: - unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] + from vlmeval.smp import get_pred_file_format + pred_format = get_pred_file_format() + unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')] if len(unknown_datasets) == 0: continue dataset_str = ' '.join(unknown_datasets) diff --git a/vlmeval/dataset/CGAVCounting/cg_av_counting.py b/vlmeval/dataset/CGAVCounting/cg_av_counting.py index 59445bb8c..8626c7fd4 100644 --- a/vlmeval/dataset/CGAVCounting/cg_av_counting.py +++ b/vlmeval/dataset/CGAVCounting/cg_av_counting.py @@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1): def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \ + 'data file should be a supported format (xlsx/json/tsv) file' - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') data = load(eval_file) diff --git a/vlmeval/dataset/EgoExoBench/egoexobench.py b/vlmeval/dataset/EgoExoBench/egoexobench.py index a49c20f02..9400966c0 100644 --- a/vlmeval/dataset/EgoExoBench/egoexobench.py +++ b/vlmeval/dataset/EgoExoBench/egoexobench.py @@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm): def evaluate(self, eval_file, **judge_kwargs): from .utils import get_dimension_rating, extract_characters_regex, extract_option - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \ + 'data file should be a supported format (xlsx/json/tsv) file' - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = 
get_intermediate_file_path(eval_file, '_score', 'csv') if not osp.exists(score_file): model = judge_kwargs.get('model', 'exact_matching') diff --git a/vlmeval/dataset/GUI/screenspot.py b/vlmeval/dataset/GUI/screenspot.py index ac2cbe3e2..842d61ad0 100644 --- a/vlmeval/dataset/GUI/screenspot.py +++ b/vlmeval/dataset/GUI/screenspot.py @@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs): results_dict[key] = str(0) else: results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key])) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(results_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) @@ -437,7 +437,7 @@ def make_safe(value): sub_stats = itertools.chain(*sub_stats) final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100 - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) diff --git a/vlmeval/dataset/GUI/screenspot_pro.py b/vlmeval/dataset/GUI/screenspot_pro.py index 26fde4114..c926a29c2 100644 --- a/vlmeval/dataset/GUI/screenspot_pro.py +++ b/vlmeval/dataset/GUI/screenspot_pro.py @@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs): results_dict[key] = str(0) else: results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key])) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(results_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) @@ -422,7 +422,7 @@ def make_safe(value): sub_stats = itertools.chain(*sub_stats) final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100 - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None) diff --git a/vlmeval/dataset/OmniDocBench/omnidocbench.py b/vlmeval/dataset/OmniDocBench/omnidocbench.py index 1d688404d..28aca5a09 100644 --- a/vlmeval/dataset/OmniDocBench/omnidocbench.py +++ b/vlmeval/dataset/OmniDocBench/omnidocbench.py @@ -4,10 +4,12 @@ import pandas as pd import tempfile import base64 +import numpy as np from tqdm import tqdm import torch.distributed as dist from ..image_base import ImageBaseDataset from ...smp import * +from .utils import get_intermediate_file_path, load, dump class OmniDocBench(ImageBaseDataset): @@ -75,9 +77,6 @@ def __init__(self, tsv_path, match_method:str='quick_match', filter_types:dict=None): - self.result_foler='../../../outputs/OmniDocBench' - if not os.path.exists(self.result_foler): - os.makedirs(self.result_foler) self.eval_file=eval_file self.match_method=match_method self.references=[] @@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m 'group':group_result, 'page':page_result } - if not os.path.exists('./output/OmniDocBench'): - os.makedirs('./output/OmniDocBench') if isinstance(cur_samples,list): saved_samples=cur_samples else: saved_samples=cur_samples.samples - with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f: - json.dump(saved_samples,f,indent=4,ensure_ascii=False) + # NOTE: The original code has a bug here, it will overwrite the result file in each 
iteration. + # I will fix it by adding element to the filename. + # NOTE: Fixed typo .josn -> .json + result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json') + dump(saved_samples, result_file) - with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) + metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json') + dump(result_all, metric_result_file) dict_list = [] save_dict={} @@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m dict_list.append(save_dict) df = pd.DataFrame(dict_list,index=['end2end',]).round(3) - with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) - df.to_csv(os.path.join(self.result_foler,'overall.csv')) - over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json') - print(f"The save path of overall.csv is :{over_all_path}") + e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json') + dump(result_all, e2e_eval_file) + + overall_file = get_intermediate_file_path(self.eval_file, '_overall') + dump(df, overall_file) + + print(f"The save path of End2End_Evaluation is: {e2e_eval_file}") + print(f"The save path of overall metrics is: {overall_file}") return df class table_evalutor(): def __init__(self,eval_file,tsv_path): - - self.result_foler='../../../outputs/OmniDocBench' - if not os.path.exists(self.result_foler): - os.makedirs(self.result_foler) + self.eval_file = eval_file gt_key='html' pred_key='pred' self.category_filter='table' @@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key): from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table samples=[] preds=[] - predictions=pd.read_excel(eval_file)['prediction'].tolist() - gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist() + predictions=load(eval_file)['prediction'].tolist() + gt_samples=load(gt_file)['answer'].tolist() load_success,load_fail=0,0 for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'): try: @@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'): 'page':page_result } - with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f: - json.dump(result_all,f,indent=4,ensure_ascii=False) + metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json') + dump(result_all, metric_result_file) dict_list=[] dict_list.append(result_all["group"]["TEDS"]) @@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'): selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line", "with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]] - selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv')) - table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv') - print(f'The save path of table_attribute.csv is :{table_attribute_path}') - selected_columns - - + table_attr_file = 
get_intermediate_file_path(self.eval_file, '_table_attribute') + dump(selected_columns, table_attr_file) + print(f'The save path of table_attribute is :{table_attr_file}') return selected_columns diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 579444829..855049d4a 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -151,7 +151,6 @@ def supported_datasets(cls): return list(cls.DATASET_SETS) def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] # First, split the eval_file by dataset data_all = load(eval_file) for dname in self.datasets: @@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs): if len(df_all): result = pd.concat(df_all) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result, score_file) return result else: - score_file = eval_file.replace(f'.{suffix}', '_score.json') + score_file = get_intermediate_file_path(eval_file, '_score', 'json') dump(dict_all, score_file) return dict_all diff --git a/vlmeval/dataset/cgbench.py b/vlmeval/dataset/cgbench.py index 8ca5b5f12..aada9da6a 100644 --- a/vlmeval/dataset/cgbench.py +++ b/vlmeval/dataset/cgbench.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from .utils.cgbench import * @@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=- def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') data = load(eval_file) @@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.cgbench import get_dimention_rating_open_ended, post_process_open - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") - step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") - step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl') + step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl') data = load(eval_file) @@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs): axis=1, ) - data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] - data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] - if judge_kwargs.get("model", None) != "gpt-4o-0806": judge_kwargs["model"] = "gpt-4o-0806" print("The judge model in cg-bench is gpt-4o-0806!") + data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] + data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] + model_step_1 = 
build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs) nproc = judge_kwargs.pop("nproc", 32) @@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=- def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') data = load(eval_file) @@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.cgbench import get_dimention_rating_open_ended, post_process_open - assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format" - tgt_file = eval_file.replace(".xlsx", "_rating.json") - score_file = eval_file.replace(".xlsx", "_score.xlsx") - step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") - step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl') + step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl') data = load(eval_file) diff --git a/vlmeval/dataset/chartmimic.py b/vlmeval/dataset/chartmimic.py index 8151f0af1..f970f17a5 100644 --- a/vlmeval/dataset/chartmimic.py +++ b/vlmeval/dataset/chartmimic.py @@ -570,19 +570,12 @@ def judge_one_item_success(item): infer_data_all = load(eval_file).to_dict(orient="records") - suffix = eval_file.split(".")[-1] print(f"judge_kwargs: {judge_kwargs}") infer_model = judge_kwargs["model"] - storage = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}.jsonl") - ) - score_file = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}_score.csv") - ) + storage = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}', 'jsonl')) + score_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_score', 'csv')) # use abs path because of using os.chdir() - tmp_file = os.path.abspath( - eval_file.replace(f".{suffix}", f"_{infer_model}_tmp.pkl") - ) + tmp_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_tmp', 'pkl')) # actually the --api-nproc nproc = judge_kwargs.pop("nproc", 8) logger.info(f"nproc: {nproc}") diff --git a/vlmeval/dataset/charxiv.py b/vlmeval/dataset/charxiv.py index 0427632ba..3a3c01e13 100644 --- a/vlmeval/dataset/charxiv.py +++ b/vlmeval/dataset/charxiv.py @@ -6,6 +6,7 @@ from vlmeval.dataset.image_base import ImageBaseDataset from vlmeval.smp import misc, file +from vlmeval.smp.file import get_intermediate_file_path from vlmeval import utils from vlmeval.dataset.utils import build_judge @@ -203,10 +204,9 @@ def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame: judge_model_name = judge_model.model # Define file paths - suffix = eval_file.split(".")[-1] - result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.xlsx") - temp_result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.pkl") - score_file = result_file.replace(".xlsx", "_acc.csv") + result_file = 
get_intermediate_file_path(eval_file, f"_{judge_model_name}") + temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl") + score_file = get_intermediate_file_path(result_file, "_acc", "csv") # Return existing results if available if os.path.exists(result_file): diff --git a/vlmeval/dataset/cmmmu.py b/vlmeval/dataset/cmmmu.py index 12c583f29..d96a241e6 100644 --- a/vlmeval/dataset/cmmmu.py +++ b/vlmeval/dataset/cmmmu.py @@ -5,6 +5,7 @@ import re import tempfile from ..smp import * +from ..smp.file import get_intermediate_file_path def get_multi_choice_prediction(response, all_choices, index2ans): @@ -223,8 +224,7 @@ def dump_image(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc', 'csv') if not osp.exists(result_file): data = load(eval_file) diff --git a/vlmeval/dataset/creation.py b/vlmeval/dataset/creation.py index 4e37102fe..38a5d3d51 100644 --- a/vlmeval/dataset/creation.py +++ b/vlmeval/dataset/creation.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from ..smp import * +from ..smp.file import get_intermediate_file_path from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich import re @@ -662,20 +663,18 @@ def evaluate(self, eval_file, **judge_kwargs): tgt = load(eval_file) tgt['reference_answer_by_gpt4o'] = src['prediction'] tgt['prediction'] = src['reference_answer_by_gpt4o'] - tgt_file_name = eval_file.replace('.xlsx', '_rev.xlsx') + tgt_file_name = get_intermediate_file_path(eval_file, '_rev') dump(tgt, tgt_file_name) judge_kwargs['dual_eval'] = False rating_rev = self.evaluate(tgt_file_name, **judge_kwargs) judge_kwargs.pop('dual_eval', None) - suffix = '.' + eval_file.split('.')[-1] - - score_file = eval_file.replace(suffix, '_score.csv') - tgt_file = eval_file.replace(suffix, '_rating.json') + score_file = get_intermediate_file_path(eval_file, '_score') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') model = judge_kwargs.pop('model', 'gpt-4o-0806') model_name = model.split('/')[-1] if '/' in model else model - tmp_file = eval_file.replace(suffix, f'_{model_name}.pkl') + tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) diff --git a/vlmeval/dataset/dude.py b/vlmeval/dataset/dude.py index c520c7d28..e024d9821 100644 --- a/vlmeval/dataset/dude.py +++ b/vlmeval/dataset/dude.py @@ -5,6 +5,7 @@ from .image_base import ImageBaseDataset from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute from ..smp import * +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -165,9 +166,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. 
') @@ -203,7 +203,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = DUDE_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/dynamath.py b/vlmeval/dataset/dynamath.py index a463276d7..e66797ac1 100644 --- a/vlmeval/dataset/dynamath.py +++ b/vlmeval/dataset/dynamath.py @@ -12,6 +12,7 @@ from .utils import build_judge from ..utils import track_progress_rich from ..smp import load, dump, d2df, toliststr +from ..smp.file import get_intermediate_file_path def preprocess(str1): @@ -170,11 +171,10 @@ def evaluate(self, eval_file, **judge_kwargs): judge_name = judge_kwargs.pop('model', 'gpt-4o-mini') model = build_judge(model=judge_name, **judge_kwargs) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 - score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv') # noqa: F841 - tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + storage = get_intermediate_file_path(eval_file, f'_{judge_name}') + score_file = get_intermediate_file_path(eval_file, f'_{judge_name}_score', 'csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl') nproc = judge_kwargs.pop('nproc', 6) # noqa: F841 res = load(tmp_file) if os.path.exists(tmp_file) else {} diff --git a/vlmeval/dataset/gobench.py b/vlmeval/dataset/gobench.py index 3e9990c13..7667b934a 100644 --- a/vlmeval/dataset/gobench.py +++ b/vlmeval/dataset/gobench.py @@ -152,7 +152,7 @@ def evaluate(self, eval_file, **judge_kwargs): 'Instruction_Consistency_Score': [avg_scores.get('consistency', 0) * 100] }) - score_file = eval_file.replace('.xlsx', '_score.xlsx') + score_file = get_intermediate_file_path(eval_file, '_score') dump(final_df, score_file) print(f"Detailed scores including failed attempts saved to {score_file}") diff --git a/vlmeval/dataset/image_caption.py b/vlmeval/dataset/image_caption.py index 23282805c..6a9d806f5 100644 --- a/vlmeval/dataset/image_caption.py +++ b/vlmeval/dataset/image_caption.py @@ -70,6 +70,6 @@ def evaluate(self, eval_file, **kwargs): scorer = COCO_Caption_Scorer(ref, gt) coco_caption_score_dict = scorer.compute_scores() - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(coco_caption_score_dict, score_pth) return coco_caption_score_dict diff --git a/vlmeval/dataset/image_ccocr.py b/vlmeval/dataset/image_ccocr.py index b1286daba..e70403d64 100644 --- a/vlmeval/dataset/image_ccocr.py +++ b/vlmeval/dataset/image_ccocr.py @@ -9,6 +9,7 @@ from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path # should be the same as FAIL_MSG definded in vlmeval/inference.py FAIL_MSG = 'Failed to obtain answer via API.' 
@@ -230,13 +231,12 @@ def evaluate(self, eval_file, **judge_kwargs): print(f"Failed to evaluate {sub_dataset_id}") # Save comprehensive results - base_name = os.path.splitext(os.path.abspath(eval_file))[0] + result_file = get_intermediate_file_path(eval_file, '_comprehensive_eval', 'json') comprehensive_result = { "meta": {"total_datasets": len(all_results), "datasets": list(all_results.keys())}, "results": all_results, "summaries": all_summaries } - result_file = base_name + "_comprehensive_eval.json" dump(comprehensive_result, result_file) print(f"Comprehensive results saved to: {result_file}") @@ -298,5 +298,6 @@ def evaluate(self, eval_file, **judge_kwargs): print(f" {k.upper():<20}: {v:.4f}") print("="*80) df = d2df(res) - dump(df, base_name + '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + dump(df, score_file) return res diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index b0892ad90..44ef1e9c3 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -258,7 +258,6 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, eval_file) circular = True - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -276,7 +275,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -299,7 +298,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # load split - eval_record = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}') + eval_record = get_intermediate_file_path(eval_file, f'_{name_str}_result') dump(data, eval_record) data = load(eval_record) @@ -311,7 +310,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): else: acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) # The piece of code is for internal use, to check vanilla acc (circ0 & all) for circular datasets @@ -327,16 +326,16 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): else: offset = 1e6 circ0 = data[data['index'] <= offset] - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_result', 'pkl') data0 = mcq_vanilla_eval(model, circ0, meta, nproc, result_file, self.dataset_name) - dump(data0, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_circ0_result.{suffix}')) + dump(data0, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_circ0_result')) acc_map['vanilla_0'] = report_acc(data) # Vanilla ALL Acc data = load(eval_file) dataall = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) - dump(dataall, eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}')) - data = 
load(eval_file.replace(f'.{suffix}', f'_{name_str}_vanilla_all_result.{suffix}')) + dump(dataall, get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_vanilla_all_result')) acc_map['vanilla_all'] = report_acc(data) # Merge & Print the Evaluation Results for k, v in acc_map.items(): @@ -350,7 +349,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): score_all = [acc_map['vanilla_0'], acc_map['vanilla_all'], acc_map['circular']] score_all = pd.concat(score_all) print(score_all) - score_file = eval_file.replace(f'.{suffix}', '_acc_all.csv') + score_file = get_intermediate_file_path(eval_file, '_acc_all', 'csv') dump(score_all, score_file) if dataset == 'AesBench_VAL': @@ -382,7 +381,6 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): if circular: raise ValueError("circular is not supported for verifier evaluation") - suffix = eval_file.split('.')[-1] data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] @@ -418,7 +416,7 @@ data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def report_acc_verifier(result_file): @@ -462,7 +460,7 @@ res_df = pd.DataFrame(res) return res_df acc = report_acc_verifier(detailed_result_file) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc @@ -615,11 +613,11 @@ def evaluate(self, eval_file, **judge_kwargs): if 'COT' in self.dataset_name: data = load(eval_file) data['prediction'] = [self.cot_postproc(x) for x in data['prediction']] - tgt = eval_file.replace('.xlsx', '_cotpost.xlsx') + tgt = get_intermediate_file_path(eval_file, '_cotpost') dump(data, tgt) res = super().evaluate(tgt, **judge_kwargs) - acc_org = eval_file.replace('.xlsx', '_acc.csv') - acc_now = eval_file.replace('.xlsx', '_cotpost_acc.csv') + acc_org = get_intermediate_file_path(eval_file, '_acc', 'csv') + acc_now = get_intermediate_file_path(eval_file, '_cotpost_acc', 'csv') shutil.copy(acc_now, acc_org) return res else: @@ -1017,11 +1015,11 @@ def build_prompt(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import extract_characters_regex, get_dimension_rating - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 FAIL_MSG = 'Failed to obtain answer via API.' 
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): @@ -1036,6 +1034,17 @@ def evaluate(self, eval_file, **judge_kwargs): ans = data.loc[data['index'] == idx, 'answer'].values[0] pred = data.loc[data['index'] == idx, 'prediction'].values[0] + match_cot = re.search(r"<think>(.*?)</think>", pred, re.DOTALL) + cot = match_cot.group(1).strip() if match_cot else pred + + target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0]) + iou = self.evaluate_box_iou(cot, target_instances) + + data.loc[data['index'] == idx, 'iou'] = iou + + match_pred = re.search(r"<answer>(.*?)</answer>", pred, re.DOTALL) + pred = match_pred.group(1).strip().upper() if match_pred else pred + extract_pred = extract_characters_regex(pred) if extract_pred == '': cnt_rejected += 1 @@ -1055,6 +1064,86 @@ def evaluate(self, eval_file, **judge_kwargs): dump(rating, tgt_file) return rating + def evaluate_box_iou(predict_str: str, target_instances: list) -> float: + pattern = r"<box>(.*?)</box>" + matches = re.findall(pattern, predict_str, re.DOTALL) + + all_boxes = [] + + for match in matches: + box = match.strip() + + coord_pattern = r'\[(\d+),(\d+),(\d+),(\d+)\]' + coord_match = re.match(coord_pattern, box) + + if coord_match: + x1, y1, x2, y2 = map(int, coord_match.groups()) + + if x1 < x2 and y1 < y2: + # all_boxes.append([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1]) + all_boxes.append([x1, y1, x2, y2]) + + if len(all_boxes) == 0: + return 0 + + target_boxes = target_instances + if len(target_boxes) == 0: + return len(all_boxes) > 0 + + def calculate_average_iou(pred_boxes, target_boxes): + """ + For each target box, take the predicted box with the highest IoU and average these best-match IoUs. + + Args: + pred_boxes (List[List[float]]): predicted boxes, each given as [x1, y1, x2, y2] + target_boxes (List[List[float]]): target boxes, each given as [x1, y1, x2, y2] + + Returns: + float: average IoU of the matched boxes + """ + def compute_iou(box1, box2): + """Compute the IoU between two boxes""" + x1_min, y1_min, x1_max, y1_max = box1 + x2_min, y2_min, x2_max, y2_max = box2 + + inter_x_min = max(x1_min, x2_min) + inter_y_min = max(y1_min, y2_min) + inter_x_max = min(x1_max, x2_max) + inter_y_max = min(y1_max, y2_max) + + inter_width = max(0, inter_x_max - inter_x_min) + inter_height = max(0, inter_y_max - inter_y_min) + inter_area = inter_width * inter_height + + area1 = (x1_max - x1_min) * (y1_max - y1_min) + area2 = (x2_max - x2_min) * (y2_max - y2_min) + + union_area = area1 + area2 - inter_area + + return inter_area / union_area if union_area > 0 else 0.0 + + pred_coords = pred_boxes + target_coords = target_boxes + + total_iou = 0.0 + num_targets = len(target_boxes) + + if num_targets == 0: + return 0.0 + + # For each target box, find the predicted box with the highest IoU + for t_coord in target_coords: + best_iou = 0.0 + for p_coord in pred_coords: + iou = compute_iou(t_coord, p_coord) + if iou > best_iou: + best_iou = iou + total_iou += best_iou + + return total_iou / num_targets + + return calculate_average_iou(all_boxes, target_boxes) + class CVBench(ImageMCQDataset): """CV-Bench, composed of two sub datasets: @@ -1101,7 +1190,6 @@ def evaluate(self, eval_file, **judge_kwargs): nproc = judge_kwargs.pop("nproc", 4) - suffix = eval_file.split(".")[-1] model_name = judge_kwargs.get("model", "extract_matching") if model_name == "exact_matching": @@ -1117,7 +1205,7 @@ def 
evaluate(self, eval_file, **judge_kwargs): ) model = None - result_file = eval_file.replace(f".{suffix}", f"_{model_name}_result.pkl") + result_file = get_intermediate_file_path(eval_file, f"_{model_name}_result", "pkl") data = load(eval_file) data = data.sort_values(by="index") @@ -1136,7 +1224,7 @@ def evaluate(self, eval_file, **judge_kwargs): k in meta_q_map ), f"eval_file should be the same as or a subset of dataset {self.dataset_name}" - score_file = eval_file.replace(f".{suffix}", "_acc.csv") + score_file = get_intermediate_file_path(eval_file, "_acc", "csv") if osp.exists(score_file): acc = load(score_file) @@ -1144,15 +1232,14 @@ def evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval( model, data, meta, nproc, result_file, self.dataset_name ) - dump(data, eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}")) - data = load(eval_file.replace(f".{suffix}", f"_{model}_result.{suffix}")) + dump(data, get_intermediate_file_path(eval_file, f"_{model_name}_result")) + data = load(get_intermediate_file_path(eval_file, f"_{model_name}_result")) if all(data["split"] == "2D"): # 2D acc = self.report_accuracy(data) else: # 3D, use default evaluation strategy acc = report_acc(data) - score_file = eval_file.replace(f".{suffix}", "_acc.csv") dump(acc, score_file) return acc @@ -1198,7 +1285,6 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.hrbench import report_acc_hrbench nproc = judge_kwargs.pop('nproc', 4) - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'extract_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -1216,7 +1302,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1233,18 +1319,17 @@ def evaluate(self, eval_file, **judge_kwargs): f'eval_file should be the same as or a subset of dataset {self.dataset_name}' ) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') if osp.exists(score_file): acc = load(score_file) return acc data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) - dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + dump(data, get_intermediate_file_path(eval_file, f'_{name_str}_result')) + data = load(get_intermediate_file_path(eval_file, f'_{name_str}_result')) acc = report_acc_hrbench(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') dump(acc, score_file) return acc @@ -1323,7 +1408,7 @@ def evaluate(self, eval_file, **judge_kwargs): scores = get_scores(results) print(scores) - score_file = 'NaturalBench_acc.csv' + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score']) dump(df, score_file) @@ -1401,13 +1486,12 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + storage = 
get_intermediate_file_path(eval_file, f'_{name_str}') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage) and model is not None: data = load(eval_file) - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1437,7 +1521,7 @@ def evaluate(self, eval_file, **judge_kwargs): four_dim_scores = wemath_accuracy(eval_file) combine_score = {**accuracy_scores, **four_dim_scores} combine_score = pd.DataFrame(combine_score) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(combine_score, score_pth) return combine_score @@ -1488,15 +1572,14 @@ def build_prompt(self, line): def evaluate(self, eval_file, **judge_kwargs): from .utils.vmcbench import get_mc_score, report_vmc_acc - suffix = eval_file.split('.')[-1] data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] data['hit'] = data.apply(get_mc_score, axis=1) - result_file = eval_file.replace(f'.{suffix}', f'_result.{suffix}') + result_file = get_intermediate_file_path(eval_file, '_result') dump(data, result_file) acc = report_vmc_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc @@ -1638,8 +1721,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + storage = get_intermediate_file_path(eval_file, f'_{name_str}') if osp.exists(storage): accuracy_scores = VisuLogic_acc(storage) @@ -1647,7 +1729,7 @@ def evaluate(self, eval_file, **judge_kwargs): accuracy_scores = VisuLogic_acc(eval_file) combine_score = {**accuracy_scores,} combine_score = pd.DataFrame(combine_score) - score_pth = storage.replace('.xlsx', '_acc.csv') + score_pth = get_intermediate_file_path(storage, '_acc', 'csv') dump(combine_score, score_pth) return combine_score @@ -1698,7 +1780,6 @@ def do_evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import report_acc, mcq_vanilla_eval nproc = judge_kwargs.pop('nproc', 4) - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125', 'gpt-4o-mini'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4', 'gpt-4o-mini': 'gpt4omini'} @@ -1716,7 +1797,7 @@ def do_evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -1736,12 +1817,12 @@ def do_evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # Save evaluation results - judged_result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}') + judged_result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result') dump(data, judged_result_file) acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = 
get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc, judged_result_file @@ -1920,8 +2001,7 @@ def evaluate(self, eval_file, **judge_kwargs): result_df = pd.DataFrame(accuracy_dict) result_df['Overall macro'] = result_df.mean(axis=1) result_df['Overall micro'] = micro_metric['correct'] / micro_metric['total'] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result_df, score_file) return result_df @@ -2062,8 +2142,7 @@ def evaluate(self, eval_file, **judge_kwargs): result_df[f"Sphere macro: {sphere}"] = sum(accs) / len(accs) result_df["Overall macro"] = result_df.mean(axis=1) result_df["Overall micro"] = micro_metric["correct"] / micro_metric["total"] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(result_df, score_file) return result_df @@ -2254,31 +2333,15 @@ def evaluate(self, eval_file, **judge_kwargs): ans = self.extract_content_in_braces(data_item["prediction"]) if ans == data_item["answers"]: task_stats[task]['correct'] += 1 - elif data_item["task"] == "Touching Circles": - if str.lower(data_item["answers"]) in str.lower(data_item["prediction"]): - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Counting Grid - Word Grids": + elif data_item["task"] == "Touchdown Reading": if self.compare_string_with_values(data_item["prediction"], data_item["answers"]): task_stats[task]['correct'] += 1 - elif data_item["task"] == "Counting Grid - Blank Grids": - if self.compare_string_with_values(data_item["prediction"], data_item["answers"]): - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Olympic Counting - Pentagons": - if data_item["answers"] in data_item["prediction"]: - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Olympic Counting - Circles": - if data_item["answers"] in data_item["prediction"]: - task_stats[task]['correct'] += 1 - elif data_item["task"] == "Circled Letter": - ans = self.extract_content_in_braces(data_item["prediction"]) - if ans == data_item["answers"]: - task_stats[task]['correct'] += 1 - - accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in task_stats.items()} - result_df = pd.DataFrame(accuracy_dict) - result_df['overall'] = result_df.mean(axis=1) - return result_df + accuracy_dict = {task: [stats['correct'] / stats['total']] for task, stats in sorted(task_stats.items())} + accuracy_df = pd.DataFrame(accuracy_dict) + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + dump(accuracy_df, score_file) + return accuracy_df class SCAM(ImageMCQDataset): @@ -2330,54 +2393,23 @@ class _3DSRBench(ImageMCQDataset): DATASET_MD5 = {'3DSRBench': '610516a0b4710595545b7613c60524e8'} def evaluate(self, eval_file, **judge_kwargs): - super().evaluate(eval_file, **judge_kwargs) from .utils.multiple_choice import report_acc - dname = osp.dirname(eval_file) - base = osp.basename(eval_file).split('.')[:-1] - base = '.'.join(base) - result_file = ls(dname, match=[base + '_', 'result.xlsx']) - assert len(result_file) == 1, result_file - result_file = result_file[0] - data = load(result_file) - - acc_map = {} - acc_map['vanilla'] = report_acc(data) - # Flip Acc - qid2key = {x: x.replace('-flip', '') for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in 
zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['flip_eval'] = report_acc(main) - # Circ Acc - qid2key = {x: x[:8] if '-flip' not in x else x[:13] for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['circ_eval'] = report_acc(main) - # Flip Circ Acc - qid2key = {x: x[:8] for x in data['qid']} - key_set = set(list(qid2key.values())) - main = cp.deepcopy(data[data['qid'].isin(key_set)]) - hit_map = {x: y for x, y in zip(main['qid'], main['hit'])} - for x, y in zip(data['qid'], data['hit']): - hit_map[qid2key[x]] *= y - main['hit'] = [hit_map[x] for x in main['qid']] - acc_map['flip_circ_eval'] = report_acc(main) - - metrics = [] - for k in acc_map: - acc_map[k].pop('split') - acc_map[k]['setting'] = [k] * len(acc_map[k]) - metrics.append(acc_map[k]) - res_all = pd.concat(metrics) - dump(res_all, eval_file.replace('.xlsx', '_acc_all.csv')) - return res_all + from .utils.sr3d import parse_3dsr_prediction, eval_3dsr + from ..smp import dump, load + from ..utils.dataset_util import TDBench_grounding_eval + from ..dataset import parse_img_path_list + from ..config import VLM_EVAL_WITH_SUBSET + data = load(eval_file) + # parse the model predictions + data = parse_img_path_list(data) + data = parse_3dsr_prediction(data) + # rotate the image and boxes + data['hit'] = eval_3dsr(data) + result_file = get_intermediate_file_path(eval_file, '_acc') + if VLM_EVAL_WITH_SUBSET: + data['subset'] = [x.split('|')[0] for x in data['index']] + dump(data, result_file) + return report_acc(data) class AffordanceDataset(ImageMCQDataset): @@ -2556,57 +2588,14 @@ def build_prompt(self, line): # It returns a dictionary @classmethod def evaluate(self, eval_file, **judge_kwargs): - import ast - from .utils.multiple_choice import extract_characters_regex - from .utils.treebench import get_dimension_rating - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' - FAIL_MSG = 'Failed to obtain answer via API.' 
- tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') - - if not osp.exists(score_file): - - res = {} if not osp.exists(tmp_file) else load(tmp_file) - res = {k: v for k, v in res.items() if FAIL_MSG not in v} - - data = load(eval_file) - cnt_rejected = 0 - data_un = data[~pd.isna(data['prediction'])] - - for idx in data['index']: - ans = data.loc[data['index'] == idx, 'answer'].values[0] - pred = data.loc[data['index'] == idx, 'prediction'].values[0] - - match_cot = re.search(r"(.*?)", pred, re.DOTALL) - cot = match_cot.group(1).strip() if match_cot else pred - - target_instances = ast.literal_eval(data.loc[data['index'] == idx, 'target_instances'].values[0]) - iou = self.evaluate_box_iou(cot, target_instances) - - data.loc[data['index'] == idx, 'iou'] = iou - - match_pred = re.search(r"(.*?)", pred, re.DOTALL) - pred = match_pred.group(1).strip().upper() if match_pred else pred - - extract_pred = extract_characters_regex(pred) - if extract_pred == '': - cnt_rejected += 1 - data.loc[data['index'] == idx, 'score'] = 0 - else: - data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans) - - print( - f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' - f'failed to obtain the score for another {cnt_rejected} questions. ' - f'Those questions will be counted as 0 score in ALL rating.' - ) - - dump(data, score_file) - - rating = get_dimension_rating(score_file) - dump(rating, tgt_file) - return rating + from .utils.treebench import get_acc + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') + try: + res = get_acc(eval_file) + dump(res, score_file) + return res + except: + return 0 def evaluate_box_iou(predict_str: str, target_instances: list) -> float: pattern = r"(.*?)" diff --git a/vlmeval/dataset/image_mt.py b/vlmeval/dataset/image_mt.py index 07658948a..3cd72d726 100644 --- a/vlmeval/dataset/image_mt.py +++ b/vlmeval/dataset/image_mt.py @@ -1,6 +1,7 @@ from .image_base import ImageBaseDataset from .utils.judge_util import build_judge from ..smp import * +from ..smp.file import get_intermediate_file_path from ..utils import track_progress_rich @@ -86,11 +87,10 @@ def calculat_metric(self, ans): return pd.DataFrame([sp1, sp2]) def evaluate(self, eval_file, **judge_kwargs): - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) data = load(eval_file) diff --git a/vlmeval/dataset/image_shortqa.py b/vlmeval/dataset/image_shortqa.py index 3650730cb..0d60ded33 100644 --- a/vlmeval/dataset/image_shortqa.py +++ b/vlmeval/dataset/image_shortqa.py @@ -4,6 +4,7 @@ from .utils.multiple_choice import report_acc, eval_vanilla, eval_circular_group from .utils.shortqa import ShortQA_prompt from ..utils import track_progress_rich +from ..smp.file import get_intermediate_file_path def ShortQA_auxeval(model, line): @@ -89,8 +90,8 @@ def evaluate(self, eval_file, **judge_kwargs): data['prediction'] = [str(x) for x in data['prediction']] data['answer'] = [str(x) for x in data['answer']] - storage = eval_file.replace('.xlsx', '_judge.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = 
get_intermediate_file_path(eval_file, '_judge') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -137,7 +138,7 @@ def evaluate(self, eval_file, **judge_kwargs): data = load(storage) acc = report_acc(data) - score_file = eval_file.replace('.xlsx', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py index ad202d9e8..27800547c 100644 --- a/vlmeval/dataset/image_vqa.py +++ b/vlmeval/dataset/image_vqa.py @@ -9,6 +9,7 @@ from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from ..utils import track_progress_rich @@ -89,8 +90,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): data['eval_match'] = [r['match'] for r in res] data['eval_score'] = [np.mean(r['match']) for r in res] - suffix = eval_file.split('.')[-1] - detailed_result_file = eval_file.replace(f'.{suffix}', '_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_results') dump(data, detailed_result_file) hit = hit_calculate(res, dataset) @@ -118,8 +118,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -146,8 +145,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = scores data['verifier_match'] = [1.0 if score else 0.0 for score in scores] - suffix = eval_file.split('.')[-1] - detailed_result_file = eval_file.replace(f'.{suffix}', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def hit_calculate(result): @@ -177,8 +175,7 @@ def hit_calculate(result): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -194,8 +191,7 @@ class VizWiz(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.vqa_eval import hit_calculate, process_line - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') if not osp.exists(result_file): data = load(eval_file) @@ -217,7 +213,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(ret, result_file) - retz = pd.read_csv(result_file) + retz = load(result_file) return retz @@ -292,7 +288,7 @@ def evaluate(self, eval_file, **judge_kwargs): + final_score_dict['Handwritten Mathematical Expression Recognition']) final_score_dict['Final Score Norm'] = ( float(final_score_dict['Final Score']) / 10) - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -317,9 +313,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): from .utils.mathvista import MathVista_auxeval, MathVista_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', 
f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -357,7 +352,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, storage) score = MathVista_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -383,7 +378,7 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) def MathVista_acc_verifier(result_file): @@ -422,7 +417,7 @@ def MathVista_acc_verifier(result_file): return res score = MathVista_acc_verifier(detailed_result_file) - score_pth = eval_file.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_pth) return score @@ -483,11 +478,10 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') - storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract', 'pkl') + storage_score = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score', 'pkl') nproc = judge_kwargs.pop('nproc', 4) # stage1: extract the answer if not osp.exists(storage_extract): @@ -517,8 +511,8 @@ def evaluate(self, eval_file, **judge_kwargs): ans = load(tmp_file_extract) for k, v in zip(indices, new_results): assert k in ans - assert ans[k]['log_extract'] == v['log_extract'] and ans[ - k]['extract'] == v['extract'] + assert ans[k]['log_extract'] == v['log_extract'] and ans[k][ + 'extract'] == v['extract'] data['extract'] = [ans[idx]['extract'] for idx in data['index']] data['log_extract'] = [ @@ -564,7 +558,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage_score) score = MathVerse_acc(storage_score) - score_pth = storage_score.replace('.xlsx', '.csv') + score_pth = get_intermediate_file_path(storage_score, '', 'csv') dump(score, score_pth) return score @@ -595,9 +589,8 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): model = judge_kwargs['model'] else: model = os.path.basename(os.environ.get('LOCAL_LLM')) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -635,7 +628,7 @@ def evaluate_heuristic(self, eval_file, **judge_kwargs): dump(data, storage) score = MATH_V_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') 
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -662,11 +655,11 @@ def evaluate_verifier(self, eval_file, **judge_kwargs): data['verifier_score'] = verifier_scores data['verifier_match'] = verifier_matches - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') dump(data, detailed_result_file) else: - detailed_result_file = eval_file.replace('.xlsx', '_detailed_results.xlsx') + detailed_result_file = get_intermediate_file_path(eval_file, '_detailed_results') if not osp.exists(detailed_result_file): dump(data, detailed_result_file) @@ -697,7 +690,7 @@ def MathVision_acc_verifier(result_file): return res score = MathVision_acc_verifier(detailed_result_file) - score_pth = eval_file.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_pth) return score @@ -797,9 +790,8 @@ def evaluate(self, eval_file, **judge_kwargs): print(f'Using local model as judge model for PHYSICS: {model}') else: model = judge_kwargs.setdefault('model', 'gpt-4o-mini') - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -839,7 +831,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = PHYSIC_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -962,12 +954,11 @@ def evaluate(self, eval_file, **judge_kwargs): if use_api_judger: from .utils.olympiadbench import Olympiad_auxeval_extract, Olympiad_auxeval_score model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') - result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_result_file = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp', 'pkl') + result_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_result_file = get_intermediate_file_path(eval_file, f'_{model}_score_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) # stage1: extract the answer if not osp.exists(storage_extract): @@ -1046,46 +1037,47 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.olympiadbench import MathJudger, extract_answer judger = MathJudger() - suffix = eval_file.split('.')[-1] - name_str1 = 'judge' - name_str2 = 'score' - result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx') - score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv') + name_str1 = 'judge' + name_str2 = 'score' + result_file = get_intermediate_file_path(eval_file, f'_{name_str1}_result') + score_file = get_intermediate_file_path(eval_file, f'_{name_str2}_result', 'csv') - if not osp.exists(result_file): - data = 
load(eval_file) - scorez = [] + if not osp.exists(result_file): + data = load(eval_file) + scorez = [] - for i in tqdm(data.iterrows()): - line = i[1] - model_answer = line['prediction'] - is_chinese = 'zh' in line['source'] - model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False) - answer_type = line['answer_type'] + for i in tqdm(data.iterrows()): + line = i[1] + model_answer = line['prediction'] + is_chinese = 'zh' in line['source'] + model_answer = extract_answer(is_chinese, + model_answer, + is_deepseek=False) + answer_type = line['answer_type'] - final_answer = line['final_answer'][2:-2] + final_answer = line['final_answer'][2:-2] - if str(answer_type) != 'nan' and 'Tuple' in answer_type: - judge_result = judger.judge(model_answer, final_answer) - else: - if str(line['error']) != 'nan': - if ',' in line['error']: - precisions = line['error'].split(',') - precisions = [ - float(p) if p else 1e-8 for p in precisions - ] - judge_result = judger.judge( - model_answer, final_answer, precisions) - else: - precision = float(line['error']) - judge_result = judger.judge( - model_answer, final_answer, precision) + if str(answer_type) != 'nan' and 'Tuple' in answer_type: + judge_result = judger.judge(model_answer, final_answer) + else: + if str(line['error']) != 'nan': + if ',' in line['error']: + precisions = line['error'].split(',') + precisions = [ + float(p) if p else 1e-8 for p in precisions + ] + judge_result = judger.judge( + model_answer, final_answer, precisions) else: - judge_result = judger.judge(model_answer, final_answer) - scorez.append(judge_result) + precision = float(line['error']) + judge_result = judger.judge( + model_answer, final_answer, precision) + else: + judge_result = judger.judge(model_answer, final_answer) + scorez.append(judge_result) - data['score'] = scorez - dump(data, result_file) + data['score'] = scorez + dump(data, result_file) judge_file = load(result_file) @@ -1153,9 +1145,9 @@ def evaluate(self, eval_file, **judge_kwargs): acc_dict['AVG'] = [acc] acc_pd = pd.DataFrame(acc_dict) - acc_pd.to_csv(score_file, index=False, encoding='gbk') + dump(acc_pd, score_file) - accdz = pd.read_csv(score_file) + accdz = load(score_file) return accdz @@ -1228,9 +1220,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.seephys import extract, eval_acc model = judge_kwargs.pop('model', 'deepseek') - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): data = load(eval_file) @@ -1268,7 +1259,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = eval_acc(storage) - score_pth = storage.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(storage, '_score', 'json') dump(score, score_pth) return score @@ -1312,9 +1303,8 @@ def evaluate(self, eval_file, **judge_kwargs): ) model = None - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{name_str}') + tmp_file = get_intermediate_file_path(eval_file, f'_{name_str}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage) and model is not None: @@ -1354,7 +1344,7 @@ def 
evaluate(self, eval_file, **judge_kwargs): dump(data, storage) if osp.exists(storage): accuracy_scores = evaluate_logicvista(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(accuracy_scores, score_pth) return accuracy_scores @@ -1478,7 +1468,6 @@ class LLaVABench(ImageBaseDataset): } DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'} - # It returns a DataFrame @classmethod def evaluate(self, eval_file, **judge_kwargs): from .utils.llavabench import ( @@ -1487,9 +1476,8 @@ def evaluate(self, eval_file, **judge_kwargs): LLaVABench_score, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.' @@ -1534,9 +1522,8 @@ def evaluate(self, eval_file, **judge_kwargs): LLaVABench_score, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.' @@ -1583,9 +1570,8 @@ def evaluate(self, eval_file, **judge_kwargs): VGRPBench_get_system_prompt, ) - suffix = '.' + eval_file.split('.')[-1] - record_file = eval_file.replace(suffix, '_openai_result' + suffix) - score_file = eval_file.replace(suffix, '_score.csv') + record_file = get_intermediate_file_path(eval_file, '_openai_result') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') nproc = judge_kwargs.pop('nproc', 4) @@ -1649,10 +1635,9 @@ class MMVet(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.mmvet import MMVet_auxeval, MMVet_acc - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): data = load(eval_file) @@ -1687,8 +1672,8 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score, score_fine = MMVet_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') - score_fine_pth = storage.replace('.xlsx', '_score_fine.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') + score_fine_pth = get_intermediate_file_path(storage, '_score_fine', 'csv') dump(score, score_pth) dump(score_fine, score_fine_pth) return score @@ -1727,8 +1712,7 @@ def evaluate(self, eval_file, **judge_kwargs): for category, scores in category_scores.items() } - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.json') + result_file = get_intermediate_file_path(eval_file, '_acc', 'json') dump(category_averages, result_file) return category_averages @@ -1908,6 +1892,8 @@ def evaluate(self, eval_file, **judge_kwargs): for task, metrics in eval_results.items() for metric, 
score in metrics.items() ]) + result_file = get_intermediate_file_path(eval_file, '_acc') + dump(ret_df, result_file) return ret_df # WildDoc adopts a custom prompt for each subset @@ -1979,8 +1965,7 @@ def evaluate(self, eval_file, **judge_kwargs): eval_result['average_scores'].append( split_eval_meta['average_scores']) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc', 'csv') eval_result = pd.DataFrame(eval_result) dump(eval_result, result_file) @@ -2089,7 +2074,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: final_score_dict[category] = None - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2255,9 +2240,8 @@ def evaluate(self, eval_file, **judge_kwargs): # extract using model model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -2353,7 +2337,7 @@ def evaluate(self, eval_file, **judge_kwargs): delta_1_point_5_per_question_type }) - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2477,7 +2461,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: final_score_dict[category] = None - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -2562,12 +2546,9 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mmsci import (get_all_metrics_for_g_eval_score, get_all_metrics_for_reference_based_metrics, merge_rating, fact_score_generate) - refer_based_metrics_output_file = eval_file.replace( - '.xlsx', '_reference_based_metrics.xlsx') - g_eval_metrics_output_file = eval_file.replace('.xlsx', - '_g_eval_metrics.xlsx') - fact_score_metrics_output_file = eval_file.replace( - '.xlsx', '_fact_score.xlsx') + refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics') + g_eval_metrics_output_file = get_intermediate_file_path(eval_file, '_g_eval_metrics') + fact_score_metrics_output_file = get_intermediate_file_path(eval_file, '_fact_score') # calculate reference-based metrics if not osp.exists(refer_based_metrics_output_file): @@ -2592,8 +2573,7 @@ def evaluate(self, eval_file, **judge_kwargs): if isinstance(references[0], str): references = [[r] for r in references] - reference_based_metrics_file = eval_file.replace( - '.xlsx', '_reference_based_metrics.pkl') + reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl') existing_data = get_all_metrics_for_reference_based_metrics( references, candidates, image_id_list, reference_based_metrics_file) @@ -2643,8 +2623,7 @@ def evaluate(self, eval_file, **judge_kwargs): assert judge_model.working(), ( 'Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) - suffix = '.' 
+ eval_file.split('.')[-1] - tmp_file = eval_file.replace(suffix, f'_{model}_G_eval.pkl') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}_G_eval', 'pkl') tmp_result = get_all_metrics_for_g_eval_score( references, @@ -2666,7 +2645,7 @@ def evaluate(self, eval_file, **judge_kwargs): rating = merge_rating(refer_based_metrics_output_file, g_eval_metrics_output_file, fact_score_metrics_output_file) - dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx')) + dump(rating, get_intermediate_file_path(eval_file, '_final_rating')) return rating @@ -2681,7 +2660,7 @@ class BMMR(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.bmmr import get_acc_for_reference_based_metrics, merge_rating - refer_based_metrics_output_file = eval_file.replace('.xlsx', '_reference_based_metrics.xlsx') + refer_based_metrics_output_file = get_intermediate_file_path(eval_file, '_reference_based_metrics') if not osp.exists(refer_based_metrics_output_file): data = load(eval_file) old_candidates = {} @@ -2707,7 +2686,7 @@ def evaluate(self, eval_file, **judge_kwargs): if isinstance(references[0], str): references = [[r] for r in references] - reference_based_metrics_file = eval_file.replace('.xlsx', '_reference_based_metrics.pkl') + reference_based_metrics_file = get_intermediate_file_path(eval_file, '_reference_based_metrics', 'pkl') assert len(references) == len(candidates) == len(image_id_list) == len(task_type_list) existing_data = get_acc_for_reference_based_metrics( references, candidates, image_id_list, task_type_list, reference_based_metrics_file @@ -2720,7 +2699,7 @@ def evaluate(self, eval_file, **judge_kwargs): rating = merge_rating( refer_based_metrics_output_file, ) - dump(rating, eval_file.replace('.xlsx', '_final_rating.xlsx')) + dump(rating, get_intermediate_file_path(eval_file, '_final_rating')) return rating def build_prompt(self, line): @@ -2756,7 +2735,6 @@ class TDBenchGrounding(ImageVQADataset): def evaluate(self, eval_file, **judge_kwargs): from .utils.tdbench import evaluate_bbox, extract_bbox_from_string, rotational_eval - suffix = eval_file.split('.')[-1] method = judge_kwargs.get('model', 'centroid') assert method in ['centroid', 'iou'], '--judge should be either centroid or iou' @@ -2786,16 +2764,16 @@ def evaluate(self, eval_file, **judge_kwargs): data['hit'] = scores data['category'] = 'visual_grounding' - result_file = eval_file.replace(f'.{suffix}', f'_{method}_result.xlsx') - data.to_excel(result_file, index=False) + result_file = get_intermediate_file_path(eval_file, f'_{method}_result') + dump(data, result_file) metric_name = 'Average Centroid Containment' if method == 'centroid' else 'Average IoU' summary_scores = {metric_name: avg_score, 'Total Samples': len(scores)} score_df = pd.DataFrame(list(summary_scores.items()), columns=['Metric', 'Score']) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') - score_df.to_csv(score_file, index=False) + score_file = get_intermediate_file_path(eval_file, '_acc') + dump(score_df, score_file) re_result = rotational_eval(result_file) if method == 'centroid' and re_result is not None and re_result is not False: file_addr = osp.abspath( @@ -2902,7 +2880,11 @@ def evaluate(self, eval_file, **judge_kwargs): if ans in pred: correct_count += 1 accuracy = correct_count / total_count if total_count > 0 else 0 - return {'accuracy': accuracy} + + result = {'accuracy': accuracy * 100} + result_file = get_intermediate_file_path(eval_file, '_acc') + dump(d2df(result), result_file) + return result class 
OCR_Reasoning(ImageBaseDataset): @@ -2919,9 +2901,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.ocr_reasoning import OcrR_auxeval, OcrR_acc model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) nproc = 1 if not osp.exists(storage): @@ -2932,7 +2913,6 @@ def evaluate(self, eval_file, **judge_kwargs): lines = [data.iloc[i] for i in range(lt)] tups = [(model, line) for line in lines] indices = [line['index'] for line in lines] - ans = {} if osp.exists(tmp_file): ans = load(tmp_file) @@ -2961,7 +2941,7 @@ def evaluate(self, eval_file, **judge_kwargs): ] dump(data, storage) score = OcrR_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -3058,8 +3038,7 @@ def evaluate(self, eval_file, **judge_kwargs): # Open ended mode res = pool.map(partial(PhyX_process_line), lines) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_predict.xlsx') + result_file = get_intermediate_file_path(eval_file, '_predict') df = pd.DataFrame(res) df.to_excel(result_file, index=False) @@ -3077,8 +3056,7 @@ def evaluate(self, eval_file, **judge_kwargs): ret = d2df(ret) ret.round(2) - suffix = eval_file.split('.')[-1] - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret @@ -3086,9 +3064,8 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.phyx import PhyX_auxeval, PhyX_acc, PhyX_auxeval_MC model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -3141,7 +3118,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = PhyX_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) return score @@ -3232,9 +3209,9 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.mme_reasoning import MMEReasoning_extract, MMEReasoning_openeval, MMEReasoning_acc, FAIL_MSG, mme_reasoning_eval_functions # noqa model = judge_kwargs.get('model', 'gpt-4o-mini') - suffix = eval_file.split('.')[-1] - storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') - tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') + storage_extract = get_intermediate_file_path(eval_file, f'_{model}_extract') + tmp_file_extract = get_intermediate_file_path(eval_file, f'_{model}_extract_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') nproc = judge_kwargs.pop('nproc', 4) # stage 1: extract answers using LLM @@ -3282,11 +3259,9 @@ def evaluate(self, eval_file, **judge_kwargs): data['log'] = log_list dump(data, storage_extract) - storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file_score = eval_file.replace(f'.{suffix}', 
f'_{model}_score.pkl') - + tmp_file_score = get_intermediate_file_path(eval_file, f'_{model}_score_tmp', 'pkl') # stage 2: evaluate score - if not osp.exists(storage_score): + if not osp.exists(score_file): data = load(storage_extract) data = data.replace({float('nan'): None}) model = build_judge(max_tokens=1024, **judge_kwargs) @@ -3390,10 +3365,10 @@ def evaluate(self, eval_file, **judge_kwargs): data['score'] = [ans[idx]['score'] for idx in data['index']] data['log_score'] = [ans[idx]['log_score'] for idx in data['index']] - dump(data, storage_score) + dump(data, score_file) - score = MMEReasoning_acc(storage_score) - score_pth = storage_score.replace('.xlsx', '.csv') + score = MMEReasoning_acc(score_file) + score_pth = get_intermediate_file_path(score_file, '', 'csv') dump(score, score_pth) return score @@ -3454,14 +3429,12 @@ def report_acc_mmatch(scores, match_types_int): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs['model'] nproc = judge_kwargs.pop('nproc', 4) - - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') - acc_file = eval_file.replace('.xlsx', f'_{judge}_acc.xlsx') - + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') + acc_file = get_intermediate_file_path(eval_file, f'_{judge}_acc') judge_kwargs['temperature'] = 0.0 model = build_judge(**judge_kwargs) @@ -3571,7 +3544,7 @@ def evaluate(self, eval_file, **judge_kwargs): final_score_dict = {**en_scores, **cn_scores} final_score_dict["English Overall Score"] = score_en_overall final_score_dict["Chinese Overall Score"] = score_cn_overall - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) return final_score_dict @@ -3605,10 +3578,8 @@ def evaluate(self, eval_file, **judge_kwargs): model = build_judge(**judge_kwargs) if not model.working(): raise RuntimeError("OPENAI API is not working properly. 
Please check your API key and configuration.") - - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model_name}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model_name}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model_name}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) data = load(eval_file) @@ -3652,6 +3623,6 @@ def evaluate(self, eval_file, **judge_kwargs): ret.round(2) - result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + result_file = get_intermediate_file_path(eval_file, '_acc') dump(ret, result_file) return ret diff --git a/vlmeval/dataset/image_yorn.py b/vlmeval/dataset/image_yorn.py index 63ccd2b24..844ed0227 100644 --- a/vlmeval/dataset/image_yorn.py +++ b/vlmeval/dataset/image_yorn.py @@ -42,8 +42,8 @@ def evaluate(self, eval_file, **judge_kwargs): dataset = self.dataset_name data = load(eval_file) data['prediction'] = [str(x) for x in data['prediction']] - storage = eval_file.replace('.xlsx', '_auxmatch.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = get_intermediate_file_path(eval_file, '_auxmatch') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): @@ -104,6 +104,6 @@ def evaluate(self, eval_file, **judge_kwargs): else: score = default_rating(storage) - score_tgt = eval_file.replace('.xlsx', '_score.csv') + score_tgt = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_tgt) return score diff --git a/vlmeval/dataset/longvideobench.py b/vlmeval/dataset/longvideobench.py index f4e6470d5..ea2ce0de2 100644 --- a/vlmeval/dataset/longvideobench.py +++ b/vlmeval/dataset/longvideobench.py @@ -278,11 +278,11 @@ def build_prompt(self, line, video_llm): def evaluate(self, eval_file, **judge_kwargs): from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.get('model', 'exact_matching') diff --git a/vlmeval/dataset/m4bench.py b/vlmeval/dataset/m4bench.py index 32ded4d44..2695c043c 100644 --- a/vlmeval/dataset/m4bench.py +++ b/vlmeval/dataset/m4bench.py @@ -6,7 +6,7 @@ from os import path as osp from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE -from ..smp import decode_base64_to_image_file, load, dump +from ..smp import decode_base64_to_image_file, load, dump, get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' 
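# NOTE (editorial sketch, not part of the patch): the hunks above and below all lean on three
# helpers that this excerpt never defines -- get_pred_file_format, get_file_extension and
# get_intermediate_file_path -- presumably added to vlmeval/smp/file.py elsewhere in this PR.
# A minimal sketch of the assumed behaviour follows; the PRED_FILE_FORMAT switch and the exact
# signatures are assumptions, not the actual implementation.
import os

def get_pred_file_format():
    # Assumed: output format for prediction files, overridable via an env var, 'xlsx' as default.
    return os.environ.get('PRED_FILE_FORMAT', 'xlsx')

def get_file_extension(path):
    # 'outputs/model_dataset.xlsx' -> 'xlsx'
    return os.path.splitext(path)[1].lstrip('.')

def get_intermediate_file_path(base_file, suffix, target_format=None):
    # Insert a suffix before the extension, optionally forcing a different extension:
    #   ('pred.xlsx', '_gpt-4o_score', 'csv') -> 'pred_gpt-4o_score.csv'
    #   ('pred.json', '_tmp', 'pkl')          -> 'pred_tmp.pkl'
    root, ext = os.path.splitext(base_file)
    return f'{root}{suffix}.{target_format or ext.lstrip(".")}'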
@@ -183,8 +183,7 @@ def extract_options(q): df['score'] = (df['parsed_pred'] == df['response']) # Save detailed results - base_name = os.path.splitext(os.path.abspath(eval_file))[0] - details_file = base_name + '_details.xlsx' + details_file = get_intermediate_file_path(eval_file, '_details') dump(df, details_file) # Calculate and return accuracy diff --git a/vlmeval/dataset/megabench.py b/vlmeval/dataset/megabench.py index cc1cb85c7..7be235cc8 100644 --- a/vlmeval/dataset/megabench.py +++ b/vlmeval/dataset/megabench.py @@ -395,7 +395,7 @@ def process_text_and_media(text, media_list, is_demo=False): return message def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 data = load(eval_file) result = [] @@ -424,7 +424,7 @@ def process_media_path(media_str): # save the result to json output_path = os.path.join(os.path.dirname(eval_file), f'megabench_result_{self.subset_name}.json') result_path = os.path.join(os.path.dirname(eval_file), f'megabench_score_{self.subset_name}.json') - score_path = eval_file.replace('.xlsx','_acc_{self.subset_name}.json') + score_path = get_intermediate_file_path(eval_file, f'_acc_{self.subset_name}', 'json') if not os.path.exists(output_path) or not os.path.exists(result_path): for task_name, group in data.groupby('task_name'): task_dict = { diff --git a/vlmeval/dataset/miabench.py b/vlmeval/dataset/miabench.py index 2e99d39ec..c33f3510b 100644 --- a/vlmeval/dataset/miabench.py +++ b/vlmeval/dataset/miabench.py @@ -114,10 +114,9 @@ def evaluate(self, eval_file, **judge_kwargs): judge_name = judge_kwargs.pop('model', 'gpt-4o') model = build_judge(model=judge_name, **judge_kwargs) - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 - tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + storage = get_intermediate_file_path(eval_file, f'_{judge_name}') # noqa: F841 + tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl') # noqa: F841 nproc = judge_kwargs.pop('nproc', 4) # noqa: F841 if not osp.exists(storage): @@ -160,7 +159,7 @@ def evaluate(self, eval_file, **judge_kwargs): goresult = load(storage) results = get_score_dict(goresult, goresult['score_raw']) - result_pth = storage.replace('.xlsx', '_score.csv') + result_pth = get_intermediate_file_path(storage, '_score', 'csv') results_pd = pd.DataFrame.from_dict(list(results.items())) dump(results_pd, result_pth) diff --git a/vlmeval/dataset/mlvu.py b/vlmeval/dataset/mlvu.py index 6244502d2..bcad3e961 100644 --- a/vlmeval/dataset/mlvu.py +++ b/vlmeval/dataset/mlvu.py @@ -1,6 +1,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path from .video_concat_dataset import ConcatVideoDataset from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE @@ -34,8 +35,7 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc') for key in self.type_data_dict: result.loc[key] = 0.0 for name, item in result.iterrows(): @@ -211,10 +211,10 @@ def build_prompt(self, line, 
video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') @@ -423,9 +423,8 @@ def evaluate(self, eval_file, **judge_kwargs): print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125') judge_kwargs['model'] = 'gpt-4-0125' - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/mmalignbench.py b/vlmeval/dataset/mmalignbench.py index 6d8c6bb0f..fd77deccd 100644 --- a/vlmeval/dataset/mmalignbench.py +++ b/vlmeval/dataset/mmalignbench.py @@ -171,11 +171,10 @@ def gen_eval_base(self, eval_file, b64_map): @classmethod def evaluate(self, eval_file, **judge_kwargs): # We adopt pairwise evaluation (twice for a pair) for this dataset - suffix = eval_file.split('.')[-1] model = judge_kwargs['model'] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): diff --git a/vlmeval/dataset/mmbench_video.py b/vlmeval/dataset/mmbench_video.py index 816ec1db6..f2ada230c 100644 --- a/vlmeval/dataset/mmbench_video.py +++ b/vlmeval/dataset/mmbench_video.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -208,13 +209,13 @@ def load_pack_answers(self, data_raw): def evaluate(self, eval_file, **judge_kwargs): from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs['model'] nproc = judge_kwargs.pop('nproc', 4) - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') model = build_judge(system_prompt=system_prompt, **judge_kwargs) assert 
model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE diff --git a/vlmeval/dataset/mmifeval.py b/vlmeval/dataset/mmifeval.py index 7e68b6b37..6dcfd1f38 100644 --- a/vlmeval/dataset/mmifeval.py +++ b/vlmeval/dataset/mmifeval.py @@ -4,6 +4,7 @@ from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path from ..utils import track_progress_rich from ..dataset.utils.mmif.function_and_compare import * @@ -370,11 +371,10 @@ def build_prompt(self, line): def evaluate(self, eval_file, **judge_kwargs): raw_bench_data = MMIFEval("MM-IFEval").data global aux_data_dict - suffix = eval_file.split(".")[-1] model = judge_kwargs["model"] - storage = eval_file.replace(f".{suffix}", f"_{model}.jsonl") - score_file = eval_file.replace(f".{suffix}", f"_{model}_score.csv") - tmp_file = eval_file.replace(f".{suffix}", f"_{model}_tmp.pkl") + storage = get_intermediate_file_path(eval_file, f"_{model}", "jsonl") + score_file = get_intermediate_file_path(eval_file, f"_{model}_score", "csv") + tmp_file = get_intermediate_file_path(eval_file, f"_{model}_tmp", "pkl") nproc = judge_kwargs.pop("nproc", 4) data_all = load(eval_file).to_dict(orient="records") diff --git a/vlmeval/dataset/mmlongbench.py b/vlmeval/dataset/mmlongbench.py index 2b5dd3619..3379d6af6 100644 --- a/vlmeval/dataset/mmlongbench.py +++ b/vlmeval/dataset/mmlongbench.py @@ -7,6 +7,7 @@ from vlmeval.dataset.utils import build_judge, levenshtein_distance from vlmeval.smp import * from .image_base import ImageBaseDataset +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -538,9 +539,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. 
') @@ -576,7 +576,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = MMLongBench_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/mmmath.py b/vlmeval/dataset/mmmath.py index e70f592be..d71bb1263 100644 --- a/vlmeval/dataset/mmmath.py +++ b/vlmeval/dataset/mmmath.py @@ -11,7 +11,7 @@ from .image_base import ImageBaseDataset from ..utils import track_progress_rich -from ..smp import load, dump +from ..smp import load, dump, get_intermediate_file_path try: import sympy as sp @@ -432,7 +432,7 @@ def evaluate(self, eval_file, **kwargs): data['hit'] = res dump(data, eval_file) - score_file = eval_file.replace('.xlsx', '_score.json') + score_file = get_intermediate_file_path(eval_file, '_score', 'json') score = {} score['overall'] = np.mean(data['hit']) # Results by Difficulty diff --git a/vlmeval/dataset/moat.py b/vlmeval/dataset/moat.py index 928fc587f..123825799 100644 --- a/vlmeval/dataset/moat.py +++ b/vlmeval/dataset/moat.py @@ -4,6 +4,7 @@ from ..utils import track_progress_rich from ..smp import load, dump, decode_base64_to_image from .utils import DEBUG_MESSAGE +from ..smp.file import get_intermediate_file_path import zipfile from random import shuffle, seed @@ -99,8 +100,7 @@ def build_prompt(self, line): @classmethod def evaluate(self, eval_file, **judge_kwargs): model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - result_path = eval_file.replace(f'.{suffix}', f"_{model}.xlsx") + result_path = get_intermediate_file_path(eval_file, f"_{model}") nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(result_path): @@ -164,7 +164,7 @@ def verdict_one(model, line): 'result_path': result_path, 'capability_acc': capability_score_map, } - score_pth = eval_file.replace(f'.{suffix}', "_score.json") + score_pth = get_intermediate_file_path(eval_file, "_score", "json") dump(metrics, score_pth) return metrics diff --git a/vlmeval/dataset/moviechat1k.py b/vlmeval/dataset/moviechat1k.py index 84dba33d6..fed877536 100644 --- a/vlmeval/dataset/moviechat1k.py +++ b/vlmeval/dataset/moviechat1k.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -215,16 +216,16 @@ def load_pack_answers(self, data_raw): def evaluate(self, eval_file, **judge_kwargs): from .utils.moviechat1k import get_dimension_rating, prepare_score_prompt - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs.setdefault('model', 'chatgpt-0125') assert judge in ['chatgpt-0125'], f'Invalid judge model for MovieChat1k: {judge}' nproc = judge_kwargs.pop('nproc', 4) _ = judge_kwargs.pop('verbose', None) _ = judge_kwargs.pop('retry', None) - tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json') - score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, 
f'_{judge}_rating', 'json') + score_file = get_intermediate_file_path(eval_file, f'_{judge}_score') model = build_judge(**judge_kwargs) diff --git a/vlmeval/dataset/mvbench.py b/vlmeval/dataset/mvbench.py index 4f0aa7f03..69a49c0af 100644 --- a/vlmeval/dataset/mvbench.py +++ b/vlmeval/dataset/mvbench.py @@ -362,11 +362,11 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') @@ -609,11 +609,11 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_file = eval_file.replace('.xlsx', '_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') diff --git a/vlmeval/dataset/qbench_video.py b/vlmeval/dataset/qbench_video.py index a208ebaf2..317fa019c 100644 --- a/vlmeval/dataset/qbench_video.py +++ b/vlmeval/dataset/qbench_video.py @@ -2,6 +2,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_concat_dataset import ConcatVideoDataset from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE @@ -31,8 +32,7 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc') result.at['open_ended', 'acc'] /= 2 dump(result, score_file) return result @@ -159,10 +159,10 @@ def build_prompt(self, line, video_llm): @classmethod def evaluate(self, eval_file, **judge_kwargs): - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - score_file = eval_file.replace('.xlsx', '_score.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + score_file = get_intermediate_file_path(eval_file, '_score') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'exact_matching') @@ -318,9 +318,8 @@ def evaluate(self, eval_file, **judge_kwargs): model = judge_kwargs.setdefault('model', 
'gpt-4o-0806') assert model in ['gpt-4o-0806', 'gpt-4o'] - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/sfebench.py b/vlmeval/dataset/sfebench.py index 7d672bb8b..b2aa24bc2 100644 --- a/vlmeval/dataset/sfebench.py +++ b/vlmeval/dataset/sfebench.py @@ -1,5 +1,7 @@ import string from vlmeval import * +from ..smp import * +from ..smp.file import get_intermediate_file_path from .image_vqa import ImageVQADataset from .utils.judge_util import build_judge from ..utils import track_progress_rich @@ -172,8 +174,8 @@ def evaluate(self, eval_file, **judge_kwargs): assert 'answer' in data and 'prediction' in data data['prediction'] = [str(x) for x in data['prediction']] data['answer'] = [str(x) for x in data['answer']] - storage = eval_file.replace('.xlsx', '_judge.xlsx') - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + storage = get_intermediate_file_path(eval_file, '_judge') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(storage): ans_map = {} if not osp.exists(tmp_file) else load(tmp_file) @@ -216,6 +218,6 @@ def evaluate(self, eval_file, **judge_kwargs): data = load(storage) score = report_score(data) - score_file = eval_file.replace('.xlsx', '_score.csv') + score_file = get_intermediate_file_path(eval_file, '_score', 'csv') dump(score, score_file) return score diff --git a/vlmeval/dataset/slidevqa.py b/vlmeval/dataset/slidevqa.py index ae7104d43..c6aa68575 100644 --- a/vlmeval/dataset/slidevqa.py +++ b/vlmeval/dataset/slidevqa.py @@ -6,6 +6,7 @@ from vlmeval.smp import * from .image_base import ImageBaseDataset from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute +from ..smp.file import get_intermediate_file_path FAIL_MSG = 'Failed to obtain answer via API.' @@ -143,9 +144,8 @@ def evaluate(self, eval_file, **judge_kwargs): logger = get_logger('Evaluation') model = judge_kwargs['model'] - suffix = eval_file.split('.')[-1] - storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + storage = get_intermediate_file_path(eval_file, f'_{model}') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') if osp.exists(storage): logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. 
') @@ -181,7 +181,7 @@ def evaluate(self, eval_file, **judge_kwargs): dump(data, storage) score = SlideVQA_acc(storage) - score_pth = storage.replace('.xlsx', '_score.csv') + score_pth = get_intermediate_file_path(storage, '_score', 'csv') dump(score, score_pth) logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}') diff --git a/vlmeval/dataset/spatial457.py b/vlmeval/dataset/spatial457.py index 15475f1c3..4026e4c1c 100644 --- a/vlmeval/dataset/spatial457.py +++ b/vlmeval/dataset/spatial457.py @@ -133,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs): all_results[f"{level}_correct"] / all_results[level] if all_results[level] > 0 else 0 ) - score_pth = eval_file.replace(".xlsx", "_score.json") + score_pth = get_intermediate_file_path(eval_file, "_score", "json") dump(all_results, score_pth) return all_results diff --git a/vlmeval/dataset/tamperbench.py b/vlmeval/dataset/tamperbench.py index 9c90e5e3d..7aebb4813 100644 --- a/vlmeval/dataset/tamperbench.py +++ b/vlmeval/dataset/tamperbench.py @@ -1,6 +1,7 @@ import huggingface_hub from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE import torchvision.transforms as T @@ -11,6 +12,7 @@ import os import glob from .utils.tamperbench import * +import warnings # constants FAIL_MSG = 'Failed to obtain answer via API.' @@ -25,8 +27,6 @@ class MVTamperBench(VideoBaseDataset): 'MVTamperBenchEnd': 'aa2c19dd02e1b006ee2d4be9f6f2b62b', } SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ -the detail and movement of objects, and the action and pose of persons. \ -Based on your observations, select the best option that accurately addresses the question. """ TYPE = 'Video-MCQ' @@ -87,14 +87,14 @@ def prepare_dataset(self, dataset_name='MVTamperBench', repo_id=None): def check_integrity(pth): """ - Verifies the completeness and consistency of the dataset located at the specified path. + Verifies the completeness and consistency of the dataset located at the specified path. - Args: - path_to_dataset (str): The directory path where the dataset is stored. + Args: + path_to_dataset (str): The directory path where the dataset is stored. - Returns: - bool: True if the dataset is intact, False otherwise. - """ + Returns: + bool: True if the dataset is intact, False otherwise. + """ # Construct the full path to the data file data_file = osp.join(pth, f'{dataset_name}.tsv') @@ -436,14 +436,14 @@ def evaluate(self, eval_file, **judge_kwargs): Evaluates the given evaluation file and generates ratings based on different dimensions. Args: - eval_file (str): Path to the evaluation file. The file should be in .xlsx format. + eval_file (str): Path to the evaluation file. The file should be in a supported format (xlsx/json/tsv). **judge_kwargs: Additional keyword arguments for the judge model. Returns: dict: A dictionary containing ratings for task type, tamper type, and task-tamper type. Raises: - AssertionError: If the eval_file does not end with '.xlsx'. + AssertionError: If the eval_file is not a supported format. Warning: If the OPENAI API is not working properly or the API key is not set, exact matching will be used for evaluation. @@ -454,15 +454,15 @@ def evaluate(self, eval_file, **judge_kwargs): - Ratings are generated for different dimensions and saved to respective files. 
""" - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be an supported format (xlsx/json/tsv) file' # noqa: E501 - tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') - tgt_task_type_file = eval_file.replace('.xlsx', '_task_type_rating.json') - tgt_tamper_type_file = eval_file.replace('.xlsx', '_tamper_type_rating.json') - tgt_task_tamper_type_file = eval_file.replace('.xlsx', '_task_tamper_type_rating.json') - score_file = eval_file.replace('.xlsx', '_score.xlsx') - score_metrics_file = eval_file.replace('.xlsx', '_score_f1.xlsx') - action_metrics_file = eval_file.replace('.xlsx', '_action_f1.xlsx') + tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl') + tgt_task_type_file = get_intermediate_file_path(eval_file, '_task_type_rating', 'json') + tgt_tamper_type_file = get_intermediate_file_path(eval_file, '_tamper_type_rating', 'json') + tgt_task_tamper_type_file = get_intermediate_file_path(eval_file, '_task_tamper_type_rating', 'json') + score_file = get_intermediate_file_path(eval_file, '_score') + score_metrics_file = get_intermediate_file_path(eval_file, '_score_f1') + action_metrics_file = get_intermediate_file_path(eval_file, '_action_f1') if not osp.exists(score_file): model = judge_kwargs.setdefault('model', 'chatgpt-0125') diff --git a/vlmeval/dataset/tempcompass.py b/vlmeval/dataset/tempcompass.py index 2cc10429c..6c409334e 100644 --- a/vlmeval/dataset/tempcompass.py +++ b/vlmeval/dataset/tempcompass.py @@ -25,9 +25,8 @@ def supported_datasets(cls): def evaluate(self, eval_file, **judge_kwargs): result = super().evaluate(eval_file=eval_file, **judge_kwargs) - suffix = eval_file.split('.')[-1] result = result.reset_index().rename(columns={'index': 'dim.task_type'}) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') avg_dict = {} for idx, item in result.iterrows(): dim, task_type = item['dim.task_type'].split('. 
') @@ -214,9 +213,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): @@ -412,9 +410,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): @@ -609,9 +606,8 @@ def evaluate(self, eval_file, **judge_kwargs): "presence_penalty": 1, }) - suffix = eval_file.split('.')[-1] - score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') - tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + score_file = get_intermediate_file_path(eval_file, f'_{model}_score') + tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl') nproc = judge_kwargs.pop('nproc', 4) if not osp.exists(score_file): diff --git a/vlmeval/dataset/text_mcq.py b/vlmeval/dataset/text_mcq.py index 9db53893d..2879551a2 100644 --- a/vlmeval/dataset/text_mcq.py +++ b/vlmeval/dataset/text_mcq.py @@ -1,6 +1,7 @@ from .text_base import TextBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * +from ..smp.file import get_intermediate_file_path class TextMCQDataset(TextBaseDataset): @@ -52,8 +53,6 @@ def evaluate(self, eval_file, **judge_kwargs): nproc = judge_kwargs.pop('nproc', 4) circular = False - - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} @@ -71,7 +70,7 @@ def evaluate(self, eval_file, **judge_kwargs): warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result_file = get_intermediate_file_path(eval_file, f'_{name_str}_result', 'pkl') data = load(eval_file) data = data.sort_values(by='index') @@ -94,8 +93,9 @@ def evaluate(self, eval_file, **judge_kwargs): data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) # load split - dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + eval_name_result = get_intermediate_file_path(eval_file, f'_{name_str}_result') + dump(data, eval_name_result) + data = load(eval_name_result) # May have different report acc functions for different datasets if 'MMT' in dataset: @@ -103,7 +103,7 @@ def evaluate(self, eval_file, **judge_kwargs): else: acc = report_acc(data) - score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + score_file = get_intermediate_file_path(eval_file, '_acc', 'csv') dump(acc, score_file) return acc diff --git a/vlmeval/dataset/utils/multiple_choice.py b/vlmeval/dataset/utils/multiple_choice.py index d36c62341..e965808aa 100644 --- a/vlmeval/dataset/utils/multiple_choice.py +++ 
b/vlmeval/dataset/utils/multiple_choice.py @@ -562,7 +562,8 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): if k not in result: result[k] = v - tmp_pth = f'/tmp/{timestr()}.xlsx' + tmp_ext = get_pred_file_format() + tmp_pth = f'/tmp/{timestr()}.{tmp_ext}' dump(data_main, tmp_pth) data_main = load(tmp_pth) indices = data_main['index'] diff --git a/vlmeval/dataset/utils/ocrbench.py b/vlmeval/dataset/utils/ocrbench.py index f88bb246c..dedee4bfc 100644 --- a/vlmeval/dataset/utils/ocrbench.py +++ b/vlmeval/dataset/utils/ocrbench.py @@ -57,7 +57,7 @@ def OCRBench_eval(eval_file): + final_score_dict['Handwritten Mathematical Expression Recognition'] ) final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 - score_pth = eval_file.replace('.xlsx', '_score.json') + score_pth = get_intermediate_file_path(eval_file, '_score', 'json') dump(final_score_dict, score_pth) logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') logger.info('Score: ') diff --git a/vlmeval/dataset/vcr.py b/vlmeval/dataset/vcr.py index c659c60f4..e63fab6fc 100644 --- a/vlmeval/dataset/vcr.py +++ b/vlmeval/dataset/vcr.py @@ -2,6 +2,8 @@ from functools import partial from .image_base import ImageBaseDataset from ..smp import * +from ..smp.file import get_intermediate_file_path + rouge = None nlp_en = None @@ -323,9 +325,7 @@ def evaluate(self, eval_file, **judge_kwargs): 'Jaccard': vcr_score['Jaccard'], 'Predictions': results_out, } - score_pth = eval_file.replace( - '.xlsx', f'{self.language}_{self.difficulty}_score.json' - ) + score_pth = get_intermediate_file_path(eval_file, f'_{self.language}_{self.difficulty}_score', 'json') dump(results_with_metrics, score_pth) logger.info( f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}' ) diff --git a/vlmeval/dataset/vcrbench.py b/vlmeval/dataset/vcrbench.py index 7b35f708a..13efa628a 100644 --- a/vlmeval/dataset/vcrbench.py +++ b/vlmeval/dataset/vcrbench.py @@ -1,5 +1,6 @@ from huggingface_hub import snapshot_download from ..smp import * +from ..smp.file import get_intermediate_file_path, get_file_extension from .video_base import VideoBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..utils import track_progress_rich @@ -141,14 +142,14 @@ def evaluate(self, eval_file, **judge_kwargs): from .utils.vcrbench.eval import precision, recall from .utils.vcrbench.cau_total import calu_pre_recall - assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be a supported format (xlsx/json/tsv) file' # noqa: E501 judge = judge_kwargs.pop('model','gpt-4o-0806') nproc = judge_kwargs.pop('nproc', 4) # step1: extract answer print("running step 1: extracting answer") - tmp_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer_tmp.pkl') - extracted_answer_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer.xlsx') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer_tmp', 'pkl') + extracted_answer_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer') model = build_judge(system_prompt=Answer_Extraction_Prompt_part1, model=judge, **judge_kwargs) if not osp.exists(extracted_answer_file): @@ -179,8 +180,8 @@ def evaluate(self, eval_file, **judge_kwargs): # step2: scoring print("running step 2: acc scoring") - tmp_file = eval_file.replace('.xlsx', f'_{judge}_answer_score_tmp.pkl') - 
diff --git a/vlmeval/dataset/vcrbench.py b/vlmeval/dataset/vcrbench.py
index 7b35f708a..13efa628a 100644
--- a/vlmeval/dataset/vcrbench.py
+++ b/vlmeval/dataset/vcrbench.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -141,14 +142,14 @@ def evaluate(self, eval_file, **judge_kwargs):
         from .utils.vcrbench.eval import precision, recall
         from .utils.vcrbench.cau_total import calu_pre_recall
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
         judge = judge_kwargs.pop('model','gpt-4o-0806')
         nproc = judge_kwargs.pop('nproc', 4)
 
         # step1: extract answer
         print("running step 1: extracting answer")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer_tmp.pkl')
-        extracted_answer_file = eval_file.replace('.xlsx', f'_{judge}_extracted_answer.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer_tmp', 'pkl')
+        extracted_answer_file = get_intermediate_file_path(eval_file, f'_{judge}_extracted_answer')
         model = build_judge(system_prompt=Answer_Extraction_Prompt_part1, model=judge, **judge_kwargs)
 
         if not osp.exists(extracted_answer_file):
@@ -179,8 +180,8 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         # step2: scoring
         print("running step 2: acc scoring")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_answer_score_tmp.pkl')
-        answer_score_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score_tmp', 'pkl')
+        answer_score_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score')
         model = build_judge(system_prompt=Answer_Scoring_Prompt_part1, model=judge, **judge_kwargs)
 
         if not osp.exists(answer_score_file):
@@ -206,15 +207,15 @@ def evaluate(self, eval_file, **judge_kwargs):
         data['answer_scoring'] = [answer_score_map[idx] if idx in answer_score_map else -1 for idx in data['index']]
         dump(data, answer_score_file)
 
-        txt_file = eval_file.replace('.xlsx', f'_{judge}_answer_score.txt')
-        answer_score_json = eval_file.replace('.xlsx', f'_{judge}_answer_score.json')
+        txt_file = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'txt')
+        answer_score_json = get_intermediate_file_path(eval_file, f'_{judge}_answer_score', 'json')
         xlsx2json(answer_score_file, answer_score_json)
         calu_acc_main(answer_score_json, txt_file)
 
         # step3: calulate precision_score
         print("running step 3: calulate precision_score")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_pre_score_tmp.pkl')
-        pre_score_file = eval_file.replace('.xlsx', f'_{judge}_pre_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score_tmp', 'pkl')
+        pre_score_file = get_intermediate_file_path(eval_file, f'_{judge}_pre_score')
         model = build_judge(system_prompt=Precision_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -253,13 +254,13 @@ def evaluate(self, eval_file, **judge_kwargs):
         data = data.loc[valid_indices]
         dump(data, pre_score_file)
 
-        pre_score_json = eval_file.replace('.xlsx', f'_{judge}_pre_score.json')
+        pre_score_json = get_intermediate_file_path(eval_file, f'_{judge}_pre_score', 'json')
         xlsx2json(pre_score_file, pre_score_json)
 
         # step4: calulate recall_score
         print("running step 4: calulate recall_score")
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_recall_score_tmp.pkl')
-        recall_score_file = eval_file.replace('.xlsx', f'_{judge}_recall_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score_tmp', 'pkl')
+        recall_score_file = get_intermediate_file_path(eval_file, f'_{judge}_recall_score')
         model = build_judge(system_prompt=Recall_Evaluation_Prompt, model=judge, **judge_kwargs)
@@ -295,7 +296,7 @@ def evaluate(self, eval_file, **judge_kwargs):
         data = data.loc[valid_indices]
         dump(data, recall_score_file)
 
-        txt_file = eval_file.replace('.xlsx', f'_{judge}_precision_recall_score.txt')
-        recall_score_json = eval_file.replace('.xlsx', f'_{judge}_recall_score.json')
+        txt_file = get_intermediate_file_path(eval_file, f'_{judge}_precision_recall_score', 'txt')
+        recall_score_json = get_intermediate_file_path(eval_file, f'_{judge}_recall_score', 'json')
         xlsx2json(recall_score_file, recall_score_json)
         calu_pre_recall(pre_score_json, recall_score_json, txt_file)
diff --git a/vlmeval/dataset/vdc.py b/vlmeval/dataset/vdc.py
index dce63cb42..75e1051bc 100644
--- a/vlmeval/dataset/vdc.py
+++ b/vlmeval/dataset/vdc.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -346,16 +347,16 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.vdc import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT, SYSTEM_GENER_PRED_PROMPT
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/video_concat_dataset.py b/vlmeval/dataset/video_concat_dataset.py
index dab1ae1d7..fcf3e8227 100644
--- a/vlmeval/dataset/video_concat_dataset.py
+++ b/vlmeval/dataset/video_concat_dataset.py
@@ -1,4 +1,5 @@
 from ..smp import *
+from ..smp.file import get_intermediate_file_path
 from .video_base import VideoBaseDataset
 
 
@@ -59,7 +60,6 @@ def supported_datasets(cls):
         return []  # list(cls.DATASET_SETS)
 
     def evaluate(self, eval_file, **judge_kwargs):
-        suffix = eval_file.split('.')[-1]
         # First, split the eval_file by dataset
         data_all = load(eval_file)
         for dname in self.datasets:
@@ -80,6 +80,6 @@ def evaluate(self, eval_file, **judge_kwargs):
         result = result.T
         for idx, item in result.iterrows():
             result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
-        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
         dump(result, score_file)
         return result
diff --git a/vlmeval/dataset/video_holmes.py b/vlmeval/dataset/video_holmes.py
index c267755e3..3d6ff37b2 100644
--- a/vlmeval/dataset/video_holmes.py
+++ b/vlmeval/dataset/video_holmes.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
@@ -204,11 +205,11 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         from .utils.videoholmes import get_dimension_rating, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
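The `evaluate` entry points above (VCR-Bench, VDC, Video-Holmes) all relax the same precondition: rather than demanding an `.xlsx` prediction file, they only check the extension and let `load`/`dump` deal with the concrete format. A minimal sketch of that guard, assuming the helpers from `vlmeval/smp/file.py`; the wrapper function is illustrative, not part of the patch.

from vlmeval.smp.file import get_file_extension, get_intermediate_file_path

def plan_eval_outputs(eval_file):
    # Any supported prediction format passes the guard.
    assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
        'data file should be in a supported format (xlsx/json/tsv)'
    # Derived artifacts keep fixed formats regardless of the prediction format.
    return {
        'tmp': get_intermediate_file_path(eval_file, '_tmp', 'pkl'),
        'rating': get_intermediate_file_path(eval_file, '_rating', 'json'),
        'score': get_intermediate_file_path(eval_file, '_score'),
    }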
diff --git a/vlmeval/dataset/video_mmlu.py b/vlmeval/dataset/video_mmlu.py
index a229a95bc..977cbff9e 100644
--- a/vlmeval/dataset/video_mmlu.py
+++ b/vlmeval/dataset/video_mmlu.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..utils import track_progress_rich
@@ -276,16 +277,16 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.video_mmlu import get_dimension_rating, prepare_response_prompt, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_CAP, SYSTEM_GENER_PRED_PROMPT
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        response_file = eval_file.replace('.xlsx', f'_{judge}_response.pkl')
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        response_file = get_intermediate_file_path(eval_file, f'_{judge}_response', 'pkl')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         judge_kwargs['temperature'] = 0.0
         model = build_judge(**judge_kwargs)
@@ -564,15 +565,15 @@ def load_pack_answers(self, data_raw):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.video_mmlu import get_dimension_rating, prepare_score_prompt, SYSTEM_CAL_SCORE_PROMPT_QA
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'
         judge = judge_kwargs['model']
         nproc = judge_kwargs.pop('nproc', 4)
         _ = judge_kwargs.pop('verbose', None)
         _ = judge_kwargs.pop('retry', None)
 
-        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
-        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{judge}_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, f'_{judge}_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, f'_{judge}_score')
 
         judge_kwargs['temperature'] = 0.0
         model = build_judge(**judge_kwargs)
diff --git a/vlmeval/dataset/videomme.py b/vlmeval/dataset/videomme.py
index c084ad796..84a20eeb4 100644
--- a/vlmeval/dataset/videomme.py
+++ b/vlmeval/dataset/videomme.py
@@ -1,5 +1,6 @@
 from huggingface_hub import snapshot_download
 from ..smp import *
+from ..smp.file import get_intermediate_file_path, get_file_extension
 from .video_base import VideoBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
@@ -231,11 +232,11 @@ def build_prompt(self, line, video_llm):
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
diff --git a/vlmeval/dataset/visfactor.py b/vlmeval/dataset/visfactor.py
index c3a0f8126..6b8313fbd 100644
--- a/vlmeval/dataset/visfactor.py
+++ b/vlmeval/dataset/visfactor.py
@@ -1,6 +1,7 @@
 import re
 from vlmeval import *
 from .image_base import ImageBaseDataset
+from ..smp.file import get_intermediate_file_path
 
 
 class VisFactor(ImageBaseDataset):
@@ -141,9 +142,11 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         accuracy['ALL'] = sum([accuracy[s] for s in accuracy]) / len([accuracy[s] for s in accuracy])
 
-        data.to_csv(eval_file.replace('.xlsx', '.csv'), index=False)
-        with open(eval_file.replace('.xlsx', '_acc.csv'), 'w') as f:
-            for key in accuracy:
-                f.write(f'{key},{accuracy[key]}\n')
+        verbose_file = get_intermediate_file_path(eval_file, '_verbose')
+        dump(data, verbose_file)
+
+        score_df = d2df(accuracy)
+        score_file = get_intermediate_file_path(eval_file, '_acc')
+        dump(score_df, score_file)
 
         return accuracy
diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py
index d8dad7383..ce8b397a8 100644
--- a/vlmeval/dataset/vl_rewardbench.py
+++ b/vlmeval/dataset/vl_rewardbench.py
@@ -102,11 +102,10 @@ def build_prompt(self, line):
     # It returns a DataFrame
     @classmethod
     def evaluate(self, eval_file, **judge_kwargs):
-        suffix = eval_file.split('.')[-1]
         model = judge_kwargs['model']
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
         nproc = judge_kwargs.pop('nproc', 4)
 
         if not osp.exists(storage):
diff --git a/vlmeval/dataset/vlm2bench.py b/vlmeval/dataset/vlm2bench.py
index 5cd04e283..bbe86d554 100644
--- a/vlmeval/dataset/vlm2bench.py
+++ b/vlmeval/dataset/vlm2bench.py
@@ -8,6 +8,8 @@
     cnt_aggregate_metric,
     grp_aggregate_accuracy,
 )
+from ..smp import *
+from ..smp.file import get_intermediate_file_path
 
 
 class VLM2Bench(ImageBaseDataset):
@@ -69,25 +71,15 @@ def evaluate(cls, eval_file, **judge_kwargs):
         """
         model = judge_kwargs.get("model")
         if model:
-            suffix = eval_file.split('.')[-1]
-            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-            score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+            storage = get_intermediate_file_path(eval_file, f'_{model}')
+            score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+            tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
             if os.path.exists(storage):
-                if storage.lower().endswith(".xlsx"):
-                    data = pd.read_excel(storage)
-                else:
-                    data = pd.read_csv(storage, sep="\t", encoding="latin1", engine="python")
+                data = load(storage)
             else:
-                if eval_file.lower().endswith(".xlsx"):
-                    data = pd.read_excel(eval_file)
-                else:
-                    data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+                data = load(eval_file)
         else:
-            if eval_file.lower().endswith(".xlsx"):
-                data = pd.read_excel(eval_file)
-            else:
-                data = pd.read_csv(eval_file, sep="\t", encoding="latin1", engine="python")
+            data = load(eval_file)
 
         results = data.to_dict(orient="records")
         processed = common_process_results(results)
@@ -117,7 +109,6 @@ def evaluate(cls, eval_file, **judge_kwargs):
         if model:
             final_score_file = score_file
         else:
-            suffix = os.path.splitext(eval_file)[1]
-            final_score_file = eval_file.replace(suffix, "_score.csv")
-        score_df.to_csv(final_score_file, index=False)
+            final_score_file = get_intermediate_file_path(eval_file, "_score", "csv")
+        dump(score_df, final_score_file)
         return score_df
diff --git a/vlmeval/dataset/vlmbias.py b/vlmeval/dataset/vlmbias.py
index b3b42e582..45e0ebb9b 100644
--- a/vlmeval/dataset/vlmbias.py
+++ b/vlmeval/dataset/vlmbias.py
@@ -16,9 +16,8 @@ class VLMBias(ImageVQADataset):
 
     def evaluate(self, eval_file, **judge_kwargs):
         model = judge_kwargs.pop('model', 'gpt-4o')
-        suffix = eval_file.split('.')[-1]
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}_tmp', 'pkl')
         nproc = judge_kwargs.pop('nproc', 16)
 
         if not osp.exists(storage):
@@ -51,6 +50,6 @@ def evaluate(self, eval_file, **judge_kwargs):
 
         data = load(storage)
         acc = report_acc(data)
-        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        score_file = get_intermediate_file_path(eval_file, '_acc')
         dump(acc, score_file)
         return acc
diff --git a/vlmeval/dataset/wildvision.py b/vlmeval/dataset/wildvision.py
index b1ad1fd26..3552a0b8e 100644
--- a/vlmeval/dataset/wildvision.py
+++ b/vlmeval/dataset/wildvision.py
@@ -4,6 +4,7 @@
 from .image_base import ImageBaseDataset
 from .utils import build_judge, DEBUG_MESSAGE
 from ..smp import *
+from ..smp.file import get_intermediate_file_path
 from ..utils import track_progress_rich
 
 
@@ -141,11 +142,10 @@ def gen_eval_base(self, eval_file, b64_map):
 
     @classmethod
     def evaluate(self, eval_file, **judge_kwargs):
         # We adopt pairwise evaluation (twice for a pair) for this dataset
-        suffix = eval_file.split('.')[-1]
         model = judge_kwargs['model']
-        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
-        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
-        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+        storage = get_intermediate_file_path(eval_file, f'_{model}')
+        score_file = get_intermediate_file_path(eval_file, f'_{model}_score', 'csv')
+        tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
         nproc = judge_kwargs.pop('nproc', 4)
 
         if not osp.exists(storage):
diff --git a/vlmeval/dataset/worldsense.py b/vlmeval/dataset/worldsense.py
index 6e51541d9..fe59c65ab 100644
--- a/vlmeval/dataset/worldsense.py
+++ b/vlmeval/dataset/worldsense.py
@@ -283,11 +283,11 @@ def build_prompt(self, line, video_llm):
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.worldsense import get_dimension_rating, extract_characters_regex, extract_option
 
-        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
+        assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], 'data file should be in a supported format (xlsx/json/tsv)'  # noqa: E501
 
-        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
-        tgt_file = eval_file.replace('.xlsx', '_rating.json')
-        score_file = eval_file.replace('.xlsx', '_score.xlsx')
+        tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
+        tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
+        score_file = get_intermediate_file_path(eval_file, '_score')
 
         if not osp.exists(score_file):
             model = judge_kwargs.get('model', 'exact_matching')
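The VLM2Bench hunk earlier in this group leans on the fact that `load` in `vlmeval.smp` already dispatches on the file extension, so the per-format `pd.read_excel` / `pd.read_csv` branches collapse into a single call. A rough before/after sketch; the wrapper names are illustrative.

import pandas as pd
from vlmeval.smp import load

# Before: every call site branched on the extension itself.
def read_predictions_old(path):
    if path.lower().endswith('.xlsx'):
        return pd.read_excel(path)
    return pd.read_csv(path, sep='\t', encoding='latin1', engine='python')

# After: load() picks the reader from the suffix (xlsx/tsv/csv/json/pkl),
# so adding a new prediction format needs no call-site edits.
def read_predictions_new(path):
    return load(path)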
diff --git a/vlmeval/inference.py b/vlmeval/inference.py
index de3bd34c0..ffac276dd 100644
--- a/vlmeval/inference.py
+++ b/vlmeval/inference.py
@@ -47,7 +47,8 @@ def infer_data_api(model, work_dir, model_name, dataset, index_set=None, api_npr
 
     # To reuse records in MMBench_V11
     if dataset_name in ['MMBench', 'MMBench_CN']:
-        v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.xlsx'
+        pred_format = get_pred_file_format()
+        v11_pred = f'{work_dir}/{model_name}_{dataset_name}_V11.{pred_format}'
         if osp.exists(v11_pred):
             try:
                 reuse_inds = load('http://opencompass.openxlab.space/utils/mmb_reuse.pkl')
@@ -184,12 +185,13 @@ def infer_data_job(
 ):
     rank, world_size = get_rank_and_world_size()
     dataset_name = dataset.dataset_name
-    result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+    # The prediction file format is controlled by the PRED_FORMAT environment variable
+    result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
     prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl'
 
     if osp.exists(result_file):
         if rank == 0:
             data = load(result_file)
             results = {k: v for k, v in zip(data['index'], data['prediction'])}
             if not ignore_failed:
                 results = {k: v for k, v in results.items() if FAIL_MSG not in str(v)}
diff --git a/vlmeval/inference_mt.py b/vlmeval/inference_mt.py
index 298f2a208..25c7ce935 100644
--- a/vlmeval/inference_mt.py
+++ b/vlmeval/inference_mt.py
@@ -169,7 +169,7 @@ def infer_data_job_mt(
 ):
     rank, world_size = get_rank_and_world_size()
     dataset_name = dataset.dataset_name
-    result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+    result_file = get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True)
     tmpl = osp.join(work_dir, '{}' + f'{world_size}_{dataset_name}.pkl')
     out_file = tmpl.format(rank)
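With the two hunks above, `infer_data_job` and `infer_data_job_mt` both resolve the prediction file through `get_pred_file_path`, so the on-disk format follows the `PRED_FORMAT` environment variable instead of being hard-coded per code path. A rough sketch of the resulting naming; the work dir, model and dataset names are hypothetical.

import os
from vlmeval.smp.file import get_pred_file_path

os.environ['PRED_FORMAT'] = 'json'
print(get_pred_file_path('outputs/demo', 'my_model', 'MMBench_DEV_EN'))
# outputs/demo/my_model_MMBench_DEV_EN.json

del os.environ['PRED_FORMAT']   # unset -> falls back to the historical .xlsx
print(get_pred_file_path('outputs/demo', 'my_model', 'MMBench_DEV_EN'))
# outputs/demo/my_model_MMBench_DEV_EN.xlsx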
diff --git a/vlmeval/smp/file.py b/vlmeval/smp/file.py
index ecd2edefc..ac658d271 100644
--- a/vlmeval/smp/file.py
+++ b/vlmeval/smp/file.py
@@ -116,9 +116,9 @@ def MMBenchOfficialServer(dataset_name):
 
 class NumpyEncoder(json.JSONEncoder):
     def default(self, obj):
-        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
-                            np.int16, np.int32, np.int64, np.uint8,
-                            np.uint16, np.uint32, np.uint64)):
+        if isinstance(obj,
+                      (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64,
+                       np.uint8, np.uint16, np.uint32, np.uint64)):
             return int(obj)
         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
             return float(obj)
@@ -139,6 +139,10 @@ def dump_pkl(data, pth, **kwargs):
         pickle.dump(data, open(pth, 'wb'))
 
     def dump_json(data, pth, **kwargs):
+        # Handle DataFrame objects
+        if isinstance(data, pd.DataFrame):
+            # Convert to the records format (a list of row dicts)
+            data = data.to_dict('records')
         json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
 
     def dump_jsonl(data, f, **kwargs):
@@ -160,6 +164,65 @@ def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
     return handlers[suffix](data, f, **kwargs)
 
 
+def get_pred_file_format():
+    pred_format = os.getenv('PRED_FORMAT', '').lower()
+    if pred_format in ['tsv', 'xlsx', 'json']:
+        return pred_format
+    return 'xlsx'  # default format
+
+
+def get_eval_file_format():
+    eval_format = os.getenv('EVAL_FORMAT', '').lower()
+    if eval_format in ['csv', 'json']:
+        return eval_format
+    return 'csv'  # default format
+
+
+def get_pred_file_path(work_dir, model_name, dataset_name, use_env_format=True):
+    if use_env_format:
+        file_format = get_pred_file_format()
+        if file_format == 'xlsx':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+        elif file_format == 'tsv':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.tsv')
+        elif file_format == 'json':
+            return osp.join(work_dir, f'{model_name}_{dataset_name}.json')
+    else:
+        # keep the original behavior
+        return osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx')
+
+
+def get_eval_file_path(eval_file, judge_model, use_env_format=True):
+    suffix = eval_file.split('.')[-1]
+    if use_env_format:
+        file_format = get_eval_file_format()
+        if file_format == 'csv':
+            return eval_file.replace(f'.{suffix}', f'_{judge_model}.csv')
+        elif file_format == 'json':
+            return eval_file.replace(f'.{suffix}', f'_{judge_model}.json')
+    else:
+        # keep the original behavior
+        return eval_file.replace(f'.{suffix}', f'_{judge_model}.xlsx')
+
+
+def _should_convert_to_dataframe(data):
+    if not isinstance(data, dict):
+        return False
+    if not data:
+        return False
+    if 'columns' in data and 'data' in data:
+        return True
+    values = list(data.values())
+    if all(not isinstance(v, (list, dict)) for v in values):
+        return False
+    if any(isinstance(v, list) for v in values):
+        lists = [v for v in values if isinstance(v, list)]
+        if lists and all(len(lst) == len(lists[0]) for lst in lists):
+            return True
+
+    return False
+
+
 def load(f, fmt=None):
     def load_pkl(pth):
         return pickle.load(open(pth, 'rb'))
@@ -382,6 +445,26 @@ def fetch_aux_files(eval_file):
     return fs
 
 
+def get_file_extension(file_path):
+    return file_path.split('.')[-1]
+
+
+def get_intermediate_file_path(eval_file, suffix, target_format=None):
+    original_ext = get_file_extension(eval_file)
+
+    if target_format is None:
+        if suffix in ['_tmp', '_response', '_processed']:
+            target_format = 'pkl'
+        elif suffix in ['_rating', '_config', '_meta']:
+            target_format = 'json'
+        elif suffix in ['_acc', '_fine', '_metrics']:
+            target_format = get_eval_file_format()
+        else:
+            target_format = get_pred_file_format()
+
+    return eval_file.replace(f'.{original_ext}', f'{suffix}.{target_format}')
+
+
 def prepare_reuse_files(pred_root_meta, eval_id, model_name, dataset_name, reuse, reuse_aux):
     import shutil
     from .misc import timestr
diff --git a/vlmeval/tools.py b/vlmeval/tools.py
index 126fb76ae..98449f841 100644
--- a/vlmeval/tools.py
+++ b/vlmeval/tools.py
@@ -497,7 +497,8 @@ def SCAN_ONE(root, model, dataset):
     from termcolor import colored
     FAIL_MSG = 'Failed to obtain answer via API.'
     root = osp.join(root, model)
-    fname = f'{model}_{dataset}.xlsx'
+    pred_format = get_pred_file_format()
+    fname = f'{model}_{dataset}.{pred_format}'
     pth = osp.join(root, fname)
     if osp.exists(pth):
         data = load(pth)
@@ -549,7 +550,8 @@ def SCAN(root, models, datasets):
         cur_datasets = []
         if len(datasets) == 0:
             for d in SUPPORTED_DATASETS:
-                if osp.exists(osp.join(root, m, f'{m}_{d}.xlsx')):
+                pred_format = get_pred_file_format()
+                if osp.exists(osp.join(root, m, f'{m}_{d}.{pred_format}')):
                     cur_datasets.append(d)
         else:
             cur_datasets = datasets
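Taken together, the helpers added to vlmeval/smp/file.py centralise the naming rules: bookkeeping suffixes keep fixed formats, score-style suffixes follow EVAL_FORMAT, everything else follows PRED_FORMAT, and an explicit format argument always wins. A quick sanity sketch of those defaults, assuming neither PRED_FORMAT nor EVAL_FORMAT is set and a hypothetical prediction file:

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/demo/model_dataset.xlsx'   # hypothetical

# Fixed formats for bookkeeping suffixes.
assert get_intermediate_file_path(eval_file, '_tmp') == 'outputs/demo/model_dataset_tmp.pkl'
assert get_intermediate_file_path(eval_file, '_rating') == 'outputs/demo/model_dataset_rating.json'

# '_acc' follows EVAL_FORMAT (csv by default).
assert get_intermediate_file_path(eval_file, '_acc') == 'outputs/demo/model_dataset_acc.csv'

# Unknown suffixes follow PRED_FORMAT (xlsx by default); an explicit format wins.
assert get_intermediate_file_path(eval_file, '_gpt-4o_score') == 'outputs/demo/model_dataset_gpt-4o_score.xlsx'
assert get_intermediate_file_path(eval_file, '_gpt-4o', 'pkl') == 'outputs/demo/model_dataset_gpt-4o.pkl'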