41 changes: 41 additions & 0 deletions tests/unit/vertexai/genai/replays/test_evaluate.py
@@ -54,6 +54,47 @@ def test_evaluation_result(client):
assert case_result.response_candidate_results is not None


def test_evaluation_byor(client):
"""Tests that evaluate() with BYOR (Bring-Your-Own Response) produces a correctly structured EvaluationResult."""
byor_df = pd.DataFrame(
{
"prompt": [
"Write a simple story about a dinosaur",
"Generate a poem about Vertex AI",
],
"response": [
"Once upon a time, there was a T-Rex named Rexy.",
"In clouds of code, a mind of silicon born...",
],
}
)

metrics_to_run = [
types.RubricMetric.GENERAL_QUALITY,
]

evaluation_result = client.evals.evaluate(
dataset=byor_df,
metrics=metrics_to_run,
)

assert isinstance(evaluation_result, types.EvaluationResult)

assert evaluation_result.summary_metrics is not None
assert len(evaluation_result.summary_metrics) > 0
for summary in evaluation_result.summary_metrics:
assert isinstance(summary, types.AggregatedMetricResult)
assert summary.metric_name is not None
assert summary.mean_score is not None

assert evaluation_result.eval_case_results is not None
assert len(evaluation_result.eval_case_results) > 0
for case_result in evaluation_result.eval_case_results:
assert isinstance(case_result, types.EvalCaseResult)
assert case_result.eval_case_index is not None
assert case_result.response_candidate_results is not None


pytestmark = pytest_helper.setup(
file=__file__,
globals_for_file=globals(),
17 changes: 13 additions & 4 deletions vertexai/_genai/evals.py
@@ -970,7 +970,9 @@ def evaluate(
self,
*,
dataset: Union[
types.EvaluationDatasetOrDict, list[types.EvaluationDatasetOrDict]
pd.DataFrame,
types.EvaluationDatasetOrDict,
list[types.EvaluationDatasetOrDict],
],
metrics: list[types.MetricOrDict] = None,
config: Optional[types.EvaluateMethodConfigOrDict] = None,
@@ -979,10 +981,13 @@
"""Evaluates candidate responses in the provided dataset(s) using the specified metrics.

Args:
dataset: The dataset(s) to evaluate. Can be a single `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
dataset: The dataset(s) to evaluate. Can be a pandas DataFrame, a single
`types.EvaluationDataset` or a list of `types.EvaluationDataset`.
metrics: The list of metrics to use for evaluation.
config: Optional configuration for the evaluation. Can be a dictionary or a `types.EvaluateMethodConfig` object.
- dataset_schema: Schema to use for the dataset. If not specified, the dataset schema will be inferred from the dataset automatically.
config: Optional configuration for the evaluation. Can be a dictionary or a
`types.EvaluateMethodConfig` object.
- dataset_schema: Schema to use for the dataset. If not specified, the
dataset schema will be inferred from the dataset automatically.
- dest: Destination path for storing evaluation results.
**kwargs: Extra arguments to pass to evaluation, such as `agent_info`.

@@ -993,6 +998,10 @@
config = types.EvaluateMethodConfig()
if isinstance(config, dict):
config = types.EvaluateMethodConfig.model_validate(config)

if isinstance(dataset, pd.DataFrame):
dataset = types.EvaluationDataset(eval_dataset_df=dataset)

if isinstance(dataset, list):
dataset = [
(
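
With the evals.py change above, `evaluate()` now wraps a raw pandas DataFrame into `types.EvaluationDataset(eval_dataset_df=...)` before dispatching, so callers no longer have to build the dataset object themselves. Below is a minimal usage sketch of that call path, mirroring the BYOR test added in this PR; the client construction and import paths are assumptions, since they are not shown in this diff.

import pandas as pd
import vertexai
from vertexai._genai import types  # import path assumed; the test imports `types` from the same package

# Client construction is assumed here; project and location are placeholders.
client = vertexai.Client(project="my-project", location="us-central1")

# BYOR (Bring-Your-Own-Response): prompts paired with pre-generated responses,
# so no model inference is needed before evaluation.
byor_df = pd.DataFrame(
    {
        "prompt": ["Write a simple story about a dinosaur"],
        "response": ["Once upon a time, there was a T-Rex named Rexy."],
    }
)

# With this change, the DataFrame is converted internally to
# types.EvaluationDataset(eval_dataset_df=byor_df) by the new isinstance check.
result = client.evals.evaluate(
    dataset=byor_df,
    metrics=[types.RubricMetric.GENERAL_QUALITY],
)

# summary_metrics carries one AggregatedMetricResult per metric,
# matching the assertions in test_evaluation_byor.
for summary in result.summary_metrics:
    print(summary.metric_name, summary.mean_score)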