From a9171221e3bafecdc75580c3f25347f1c3d18851 Mon Sep 17 00:00:00 2001
From: Jason Dai
Date: Mon, 3 Nov 2025 12:04:38 -0800
Subject: [PATCH] fix: GenAI Client(evals) - Support direct pandas DataFrame
 dataset in evaluate()

PiperOrigin-RevId: 827595694
---
 .../vertexai/genai/replays/test_evaluate.py | 41 +++++++++++++++++++
 vertexai/_genai/evals.py                    | 17 ++++++--
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/tests/unit/vertexai/genai/replays/test_evaluate.py b/tests/unit/vertexai/genai/replays/test_evaluate.py
index 7b497ccd11..d934b7f5d4 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate.py
@@ -54,6 +54,47 @@ def test_evaluation_result(client):
         assert case_result.response_candidate_results is not None
 
 
+def test_evaluation_byor(client):
+    """Tests that evaluate() with BYOR (Bring-Your-Own Response) produces a correctly structured EvaluationResult."""
+    byor_df = pd.DataFrame(
+        {
+            "prompt": [
+                "Write a simple story about a dinosaur",
+                "Generate a poem about Vertex AI",
+            ],
+            "response": [
+                "Once upon a time, there was a T-Rex named Rexy.",
+                "In clouds of code, a mind of silicon born...",
+            ],
+        }
+    )
+
+    metrics_to_run = [
+        types.RubricMetric.GENERAL_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=byor_df,
+        metrics=metrics_to_run,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+        assert summary.mean_score is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index 7b7ad2c023..c21b186ecd 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -970,7 +970,9 @@ def evaluate(
         self,
         *,
         dataset: Union[
-            types.EvaluationDatasetOrDict, list[types.EvaluationDatasetOrDict]
+            pd.DataFrame,
+            types.EvaluationDatasetOrDict,
+            list[types.EvaluationDatasetOrDict],
         ],
         metrics: list[types.MetricOrDict] = None,
         config: Optional[types.EvaluateMethodConfigOrDict] = None,
@@ -979,10 +981,13 @@
         """Evaluates candidate responses in the provided dataset(s) using the specified metrics.
 
         Args:
-            dataset: The dataset(s) to evaluate. Can be a single `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
+            dataset: The dataset(s) to evaluate. Can be a pandas DataFrame, a single
+                `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
             metrics: The list of metrics to use for evaluation.
-            config: Optional configuration for the evaluation. Can be a dictionary or a `types.EvaluateMethodConfig` object.
-                - dataset_schema: Schema to use for the dataset. If not specified, the dataset schema will be inferred from the dataset automatically.
+            config: Optional configuration for the evaluation. Can be a dictionary or a
+                `types.EvaluateMethodConfig` object.
+                - dataset_schema: Schema to use for the dataset. If not specified, the
+                    dataset schema will be inferred from the dataset automatically.
                 - dest: Destination path for storing evaluation results.
             **kwargs: Extra arguments to pass to evaluation, such as `agent_info`.
 
@@ -993,6 +998,10 @@
             config = types.EvaluateMethodConfig()
         if isinstance(config, dict):
             config = types.EvaluateMethodConfig.model_validate(config)
+
+        if isinstance(dataset, pd.DataFrame):
+            dataset = types.EvaluationDataset(eval_dataset_df=dataset)
+
         if isinstance(dataset, list):
            dataset = [
                 (
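
Usage sketch (illustrative only): with this change, a bare pandas DataFrame with `prompt`/`response` columns can be passed straight to `client.evals.evaluate()`, which wraps it in `types.EvaluationDataset(eval_dataset_df=...)` internally, as the new isinstance branch above shows. The client construction, project, and location below are placeholder assumptions; the metric and column names mirror the replay test.

    # Minimal sketch of the new code path; assumes the Vertex AI GenAI client
    # is installed and configured. Project/location values are placeholders.
    import pandas as pd
    import vertexai
    from vertexai import types

    client = vertexai.Client(project="my-project", location="us-central1")

    # Bring-your-own-response data: prompt/response columns, no manual wrapping.
    eval_df = pd.DataFrame(
        {
            "prompt": ["Write a simple story about a dinosaur"],
            "response": ["Once upon a time, there was a T-Rex named Rexy."],
        }
    )

    result = client.evals.evaluate(
        dataset=eval_df,  # previously required types.EvaluationDataset(eval_dataset_df=eval_df)
        metrics=[types.RubricMetric.GENERAL_QUALITY],
    )
    print(result.summary_metrics)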