diff --git a/run.py b/run.py
index 977cf044b..cba4ed6b3 100644
--- a/run.py
+++ b/run.py
@@ -393,6 +393,8 @@ def main():
             judge_kwargs['model'] = 'llama31-8b'
         elif listinstr(['VideoMMLU_QA', 'VideoMMLU_CAP'], dataset_name):
             judge_kwargs['model'] = 'qwen-72b'
+        elif listinstr(['CAPTURE_real', 'CAPTURE_synthetic'], dataset_name):
+            judge_kwargs['model'] = 'llama31-8b'
 
         if RANK == 0:
             logger.info(judge_kwargs)
diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 80ef1f301..818424063 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -12,7 +12,7 @@ from .image_vqa import (
     ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, VGRPBench,
     MMVet, MTVQADataset, TableVQABench, CustomVQADataset, CRPE, MathVerse,
     OlympiadBench, QSpatial, VizWiz, MMNIAH, LogicVista, MME_CoT,
-    MMSci_Captioning, Physics_yale, TDBenchGrounding
+    MMSci_Captioning, Physics_yale, TDBenchGrounding, CAPTURE
 )
 
 from .image_ccocr import CCOCRDataset
@@ -158,8 +158,8 @@ def evaluate(self, eval_file, **judge_kwargs):
     CreationMMBenchDataset, ImageShortQADataset, MMAlignBench, OmniDocBench, VLM2Bench,
     VMCBenchDataset, EMMADataset, MME_CoT, MOAT, MedXpertQA_MM_test, LEGO,
     MMSci_Captioning, Physics_yale, MMIFEval, Spatial457, VisuLogic, CVBench,
-    CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding
-
+    CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding,
+    CAPTURE
 ]
diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
index 1fae8990a..2c4be47a7 100644
--- a/vlmeval/dataset/image_vqa.py
+++ b/vlmeval/dataset/image_vqa.py
@@ -1937,3 +1937,50 @@ def build_prompt(self, line):
         msgs.extend([dict(type='image', value=p) for p in tgt_path])
         msgs.append(dict(type='text', value=question))
         return msgs
+
+
+class CAPTURE(ImageBaseDataset):
+    TYPE = ''
+    DATASET_URL = {'CAPTURE_real': '',
+                   'CAPTURE_synthetic': ''}
+    DATASET_MD5 = {'CAPTURE_real': None,
+                   'CAPTURE_synthetic': None}
+
+    def create_tsv_from_hf(self):
+        pass
+
+    @classmethod
+    def evaluate(self, eval_file, **judge_kwargs):
+        from .utils.capture import CAPTURE_atomeval, CAPTURE_smape
+
+        model = judge_kwargs['model']
+        suffix = '.' + eval_file.split('.')[-1]
+        record_file = eval_file.replace(suffix, f'_{model}{suffix}')
+        score_file = eval_file.replace(suffix, '_score.csv')
+        nproc = judge_kwargs.pop('nproc', 4)
+        system_prompt = (
+            "You are an answer extractor. When given someone's answer to "
+            "some question, you will only extract their final number answer "
+            "and will respond with just the number. If there is no exact "
+            "number answer, respond with -1"
+        )
+        if not osp.exists(record_file):
+            data = load(eval_file)
+            model = build_judge(**judge_kwargs, system_prompt=system_prompt)
+            lt = len(data)
+            lines = [data.iloc[i] for i in range(lt)]
+            tups = [(model, line) for line in lines]
+
+            extracted_answers = track_progress_rich(
+                CAPTURE_atomeval,
+                tups,
+                nproc=nproc,
+                chunksize=nproc,
+            )
+            data['extracted_answer'] = extracted_answers
+            dump(data, record_file)
+
+        data = load(record_file)
+        score = CAPTURE_smape(data)
+        dump(score, score_file)
+        return score
diff --git a/vlmeval/dataset/utils/capture.py b/vlmeval/dataset/utils/capture.py
new file mode 100644
index 000000000..99b9ffa20
--- /dev/null
+++ b/vlmeval/dataset/utils/capture.py
@@ -0,0 +1,139 @@
+from huggingface_hub import hf_hub_download
+import zipfile
+import os
+import json
+from tqdm import tqdm
+from ...smp import *
+
+
+def create_csv_from_meta(meta_file, object_key, data_dir, out_file):
+    with open(meta_file, "r") as fp:
+        meta = json.load(fp)
+
+    data = []
+    for entry in tqdm(meta):
+        image_file = entry["image_file"]
+        image_path = osp.join(data_dir, image_file)
+        image = encode_image_file_to_base64(image_path)
+        object_name = entry[object_key]
+        question = (
+            f"Count the exact number of {object_name} in the image. "
+            f"Assume the pattern of {object_name} continues behind any "
+            f"black box. Provide the total number of {object_name} as if "
+            f"the black box were not there. Only count {object_name} that "
+            f"are visible within the frame (or would be visible without "
+            f"the occluding box). If {object_name} are partially in the "
+            f"frame (i.e. if any part of {object_name} are visible), "
+            f"count it. If the {object_name} would be partially in the "
+            f"frame without the occluding box, count it."
+        )
+        answer = str(entry["ground_truth"])
+        data.append(
+            dict(
+                image=image,
+                question=question,
+                answer=answer,
+                image_file=image_file,
+            )
+        )
+    df = pd.DataFrame(data).sort_values(by="image_file")
+    df.to_csv(out_file, index=True, index_label="index", sep="\t")
+
+
+def create_tsv_real():
+    data_root = LMUDataRoot()
+    data_dir = osp.join(data_root, "capture")
+    os.makedirs(data_root, exist_ok=True)
+    real_zip = hf_hub_download(
+        repo_id="atinp/CAPTURe",
+        filename="real_dataset.zip",
+        repo_type="dataset",
+    )
+
+    with zipfile.ZipFile(real_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+    # rename the extracted folder (originally called dataset) to real_dataset
+    os.rename(f"{data_dir}/dataset", f"{data_dir}/real_dataset")
+
+    real_meta = hf_hub_download(
+        repo_id="atinp/CAPTURe",
+        filename="real_metadata.json",
+        repo_type="dataset",
+    )
+    out_file = os.path.join(data_root, "CAPTURE_real.tsv")
+    create_csv_from_meta(
+        real_meta, "object", f"{data_dir}/real_dataset", out_file
+    )
+    return out_file
+
+
+def create_tsv_synthetic():
+    syn_zip = hf_hub_download(
+        repo_id="atinp/CAPTURe",
+        filename="synthetic_dataset.zip",
+        repo_type="dataset",
+    )
+    data_root = LMUDataRoot()
+    data_dir = osp.join(data_root, "capture")
+    os.makedirs(data_root, exist_ok=True)
+
+    with zipfile.ZipFile(syn_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+
+    synth_meta = hf_hub_download(
+        repo_id="atinp/CAPTURe",
+        filename="synthetic_metadata.json",
+        repo_type="dataset",
+    )
+    out_file = os.path.join(data_root, "CAPTURE_synthetic.tsv")
+    create_csv_from_meta(
+        synth_meta, "dot_shape", f"{data_dir}/synthetic_dataset", out_file
+    )
+    return out_file
+
+
+def safe_string_to_int(s):
+    try:
+        return int(s)
+    except ValueError:
+        return -1
+
+
+def CAPTURE_atomeval(model, line):
+    ans = model.generate_str(line["prediction"])
+    return safe_string_to_int(ans)
+
+
+def CAPTURE_smape(data):
+    total_percentage_error = 0
+    count = 0
+    skip = 0
+
+    for i in range(len(data)):
+        row = data.iloc[i]
+        ground_truth = int(row["answer"])
+        answer = row["extracted_answer"]
+
+        if answer == -1:
+            skip += 1
+            total_percentage_error += 100
+            count += 1
+            continue
+
+        # Compute sMAPE (Symmetric Mean Absolute Percentage Error)
+        numerator = abs(answer - ground_truth)
+        denominator = abs(answer) + abs(ground_truth)
+        smape = (numerator / denominator) * 100
+
+        # Add to total percentage error
+        total_percentage_error += smape
+        count += 1
+
+    # Average the per-sample sMAPE values
+    mean_smape = total_percentage_error / count if count != 0 else 0
+    return pd.DataFrame([dict(SMAPE=mean_smape, skip=skip)])
+
+
+if __name__ == "__main__":
+    create_tsv_real()
+    create_tsv_synthetic()
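
Toy sanity check (not part of the patch): a minimal sketch, assuming the patched vlmeval package is importable, that exercises CAPTURE_smape on a hand-built frame with the columns the evaluate() path produces ('answer' from the TSV, 'extracted_answer' from the judge). A failed extraction (-1) is counted as 100% error and reported via the skip column.

import pandas as pd
from vlmeval.dataset.utils.capture import CAPTURE_smape

# Two rows shaped like the record file: 'answer' is stored as a string in the TSV,
# 'extracted_answer' is the integer parsed from the judge model's reply.
toy = pd.DataFrame([
    dict(answer='10', extracted_answer=8),   # sMAPE = |8 - 10| / (8 + 10) * 100 ~= 11.1
    dict(answer='5', extracted_answer=-1),   # extraction failure -> counted as 100
])
print(CAPTURE_smape(toy))  # one-row DataFrame: SMAPE ~= 55.6, skip = 1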