From d62afc32db85a103c25878dd82a338feb86f53fe Mon Sep 17 00:00:00 2001
From: A Vertex SDK engineer <vertex-sdk-bot@google.com>
Date: Fri, 24 Oct 2025 14:31:00 -0700
Subject: [PATCH] feat: GenAI Client(evals) - Add agent data to EvaluationRun
 `show` in Vertex AI GenAI SDK evals

PiperOrigin-RevId: 823667266
---
 tests/unit/vertexai/genai/replays/conftest.py |  12 +-
 .../genai/replays/test_get_evaluation_run.py  | 241 ++++++++----------
 vertexai/_genai/_evals_common.py              | 104 ++++++--
 vertexai/_genai/_evals_constant.py            |  14 +
 vertexai/_genai/_gcs_utils.py                 |   7 +-
 vertexai/_genai/evals.py                      | 121 +++++++--
 6 files changed, 313 insertions(+), 186 deletions(-)

diff --git a/tests/unit/vertexai/genai/replays/conftest.py b/tests/unit/vertexai/genai/replays/conftest.py
index eafc155e13..b6319cf1ee 100644
--- a/tests/unit/vertexai/genai/replays/conftest.py
+++ b/tests/unit/vertexai/genai/replays/conftest.py
@@ -133,10 +133,14 @@ def _get_replay_id(use_vertex: bool, replays_prefix: str) -> str:
 )
 EVAL_ITEM_REQUEST_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/request_"
 EVAL_ITEM_RESULT_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/result_"
+EVAL_ITEM_REQUEST_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/request_"
+EVAL_ITEM_RESULT_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/result_"
 EVAL_GCS_URI_ITEMS = {
     EVAL_CONFIG_GCS_URI: "test_resources/mock_eval_config.yaml",
     EVAL_ITEM_REQUEST_GCS_URI: "test_resources/request_4813679498589372416.json",
     EVAL_ITEM_RESULT_GCS_URI: "test_resources/result_1486082323915997184.json",
+    EVAL_ITEM_REQUEST_GCS_URI_2: "test_resources/request_4813679498589372416.json",
+    EVAL_ITEM_RESULT_GCS_URI_2: "test_resources/result_1486082323915997184.json",
 }
 
 
@@ -148,11 +152,15 @@ def _mock_read_file_contents_side_effect(uri: str):
     current_dir = os.path.dirname(__file__)
     if uri in EVAL_GCS_URI_ITEMS:
         local_mock_file_path = os.path.join(current_dir, EVAL_GCS_URI_ITEMS[uri])
-    elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI):
+    elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI) or uri.startswith(
+        EVAL_ITEM_REQUEST_GCS_URI_2
+    ):
         local_mock_file_path = os.path.join(
             current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_REQUEST_GCS_URI]
         )
-    elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI):
+    elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI) or uri.startswith(
+        EVAL_ITEM_RESULT_GCS_URI_2
+    ):
         local_mock_file_path = os.path.join(
             current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_RESULT_GCS_URI]
         )
diff --git a/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py
index 5236c0ab7d..3db3bea517 100644
--- a/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py
+++ b/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py
@@ -16,31 +16,34 @@
 
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
+from google.genai import types as genai_types
 import datetime
 import pytest
 
 
 def test_get_eval_run(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
     evaluation_run_name = (
-        "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
+        "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
     )
     evaluation_run = client.evals.get_evaluation_run(
         name=evaluation_run_name, include_evaluation_items=True
     )
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
-    check_run_1957799200510967808_evaluation_item_results(
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480_evaluation_item_results(
         client, evaluation_run, evaluation_run_name
     )
 
 
 def test_get_eval_run_include_evaluation_items_false(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
     evaluation_run_name = (
-        "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
+        "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
     )
     evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
     assert evaluation_run.evaluation_item_results is None
 
 
@@ -99,158 +102,142 @@ def test_get_eval_run_eval_set_source(client):
 @pytest.mark.asyncio
 async def test_get_eval_run_async(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
-    eval_run_id = "1957799200510967808"
+    client._api_client._http_options.api_version = "v1beta1"
+    eval_run_id = "5133048044039700480"
     evaluation_run_name = (
         f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
     )
     evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
     assert evaluation_run.evaluation_item_results is None
 
 
-def check_run_1957799200510967808(
+def check_run_5133048044039700480(
     client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
 ):
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.name == evaluation_run_name
-    assert evaluation_run.display_name == "test2"
-    assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
+    assert evaluation_run.display_name == "sdk-test-1"
+    assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
     assert evaluation_run.create_time == datetime.datetime(
-        2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
+        2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
     )
     assert evaluation_run.completion_time == datetime.datetime(
-        2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
+        2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
     )
     assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
     assert evaluation_run.evaluation_set_snapshot == (
-        "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
+        "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
     )
-    assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
-        uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
-        prompt_column="request",
-        candidate_response_columns={
-            "baseline_model_response": "baseline_model_response",
-            "checkpoint_1": "checkpoint_1",
-            "checkpoint_2": "checkpoint_2",
-        },
+    assert (
+        evaluation_run.data_source.evaluation_set
+        == "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
     )
     assert evaluation_run.evaluation_run_results.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
+        "projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
     )
     assert evaluation_run.inference_configs == {
-        "checkpoint_1": types.EvaluationRunInferenceConfig(
-            model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
-        ),
-        "checkpoint_2": types.EvaluationRunInferenceConfig(
-            model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
+        "gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
+            agent_config=types.EvaluationRunAgentConfig(
+                developer_instruction={
+                    "parts": [{"text": "example agent developer instruction"}]
+                },
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="check_chime",
+                                description="Check chime.",
+                                parameters={
+                                    "type": "OBJECT",
+                                    "properties": {
+                                        "nums": {
+                                            "type": "STRING",
+                                            "description": "List of numbers to be verified.",
+                                        }
+                                    },
+                                    "required": ["nums"],
+                                },
+                            ),
+                        ],
+                    )
+                ],
+            )
         ),
     }
     assert evaluation_run.evaluation_run_results.summary_metrics == (
         types.SummaryMetric(
             metrics={
-                "checkpoint_1/user_defined/MODE": 5,
-                "checkpoint_2/universal/P90": 1,
-                "gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
-                "gemini-2.0-flash-001@default/user_defined/P90": 5,
-                "gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889,
-                "gemini-2.0-flash-001@default/user_defined/P95": 5,
-                "checkpoint_1/universal/MINIMUM": 0.8571428656578064,
-                "checkpoint_1/universal/VARIANCE": 0.0015452162403157982,
-                "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
-                "checkpoint_2/user_defined/P95": 5,
-                "checkpoint_2/universal/MODE": 1,
-                "checkpoint_2/user_defined/P90": 5,
-                "checkpoint_2/universal/P99": 1,
+                "gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
+                "gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
+                "gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
+                "gemini-2.0-flash-001@default/universal/P90": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
+                "gemini-2.0-flash-001@default/universal/P95": 1,
+                "gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
+                "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
+                "gemini-2.0-flash-001@default/universal/MEDIAN": 1,
+                "gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
+                "gemini-2.0-flash-001@default/universal/MODE": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MODE": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
+                "gemini-2.0-flash-001@default/safety_v1/P90": 1,
+                "gemini-2.0-flash-001@default/safety_v1/P95": 1,
+                "gemini-2.0-flash-001@default/universal/P99": 1,
+                "gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
+                "gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
                 "gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
-                "checkpoint_2/universal/P95": 1,
-                "checkpoint_2/user_defined/P99": 5,
-                "checkpoint_2/universal/MINIMUM": 0.7777777910232544,
-                "gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255,
-                "checkpoint_1/universal/AVERAGE": 0.986633250587865,
-                "checkpoint_1/universal/MAXIMUM": 1,
-                "checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
-                "gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645,
-                "gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5,
-                "gemini-2.0-flash-001@default/user_defined/MINIMUM": 3,
-                "gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886,
-                "checkpoint_2/user_defined/MAXIMUM": 5,
-                "checkpoint_1/universal/MEDIAN": 1,
-                "gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128,
-                "gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158,
-                "gemini-2.0-flash-001@default/user_defined/MEDIAN": 5,
-                "checkpoint_2/user_defined/AVERAGE": 5,
-                "checkpoint_2/user_defined/MEDIAN": 5,
-                "checkpoint_2/user_defined/STANDARD_DEVIATION": 0,
-                "checkpoint_2/universal/MAXIMUM": 1,
-                "checkpoint_1/universal/MODE": 1,
-                "checkpoint_2/user_defined/MINIMUM": 5,
-                "checkpoint_1/user_defined/VARIANCE": 0,
-                "checkpoint_2/universal/VARIANCE": 0.005771725970062436,
-                "checkpoint_2/universal/AVERAGE": 0.9438178790243048,
-                "checkpoint_1/user_defined/MINIMUM": 5,
-                "gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929,
-                "gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096,
-                "checkpoint_2/user_defined/VARIANCE": 0,
-                "checkpoint_1/user_defined/MEDIAN": 5,
-                "checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561,
-                "checkpoint_1/user_defined/AVERAGE": 5,
-                "checkpoint_1/user_defined/MAXIMUM": 5,
-                "gemini-2.0-flash-001@default/user_defined/MODE": 5,
-                "checkpoint_1/user_defined/P95": 5,
-                "checkpoint_1/universal/P99": 1,
-                "checkpoint_1/user_defined/P90": 5,
-                "checkpoint_2/universal/MEDIAN": 1,
-                "checkpoint_1/universal/P95": 1,
-                "checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
-                "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245,
-                "checkpoint_1/user_defined/P99": 5,
-                "gemini-2.0-flash-001@default/universal/MODE": [
-                    0.75,
-                    0.8571428656578064,
-                ],
-                "checkpoint_2/user_defined/MODE": 5,
-                "checkpoint_1/universal/P90": 1,
-                "gemini-2.0-flash-001@default/user_defined/P99": 5,
+                "gemini-2.0-flash-001@default/safety_v1/P99": 1,
             },
-            total_items=19,
+            total_items=3,
         )
     )
     assert evaluation_run.error is None
 
 
-def check_run_1957799200510967808_evaluation_item_results(
+def check_run_5133048044039700480_evaluation_item_results(
     client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
 ):
     eval_result = evaluation_run.evaluation_item_results
     assert isinstance(eval_result, types.EvaluationResult)
     assert eval_result.summary_metrics == [
         types.AggregatedMetricResult(
-            metric_name="checkpoint_1/universal",
-            mean_score=0.986633250587865,
-            stdev_score=0.0393092386127714,
+            metric_name="safety_v1",
+            mean_score=0.7888888915379842,
+            stdev_score=0.2991758188061675,
         ),
         types.AggregatedMetricResult(
-            metric_name="checkpoint_2/universal",
-            mean_score=0.9438178790243048,
-            stdev_score=0.07597187617837561,
-        ),
-        types.AggregatedMetricResult(
-            metric_name="gemini-2.0-flash-001@default/universal",
-            mean_score=0.6943817985685249,
-            stdev_score=0.17738341388587855,
-        ),
-        types.AggregatedMetricResult(
-            metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0
-        ),
-        types.AggregatedMetricResult(
-            metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0
-        ),
-        types.AggregatedMetricResult(
-            metric_name="gemini-2.0-flash-001@default/user_defined",
-            mean_score=4.736842105263158,
-            stdev_score=0.6359497880839245,
+            metric_name="universal",
+            mean_score=0.7888888915379842,
+            stdev_score=0.2991758188061675,
         ),
     ]
+    # Check the agent info.
+    assert eval_result.agent_info == types.evals.AgentInfo(
+        name="gemini-2.0-flash-001@default",
+        instruction="example agent developer instruction",
+        description=None,
+        tool_declarations=[
+            genai_types.Tool(
+                function_declarations=[
+                    genai_types.FunctionDeclaration(
+                        name="check_chime",
+                        description="Check chime.",
+                        parameters={
+                            "type": "OBJECT",
+                            "properties": {
+                                "nums": {
+                                    "type": "STRING",
+                                    "description": "List of numbers to be verified.",
+                                }
+                            },
+                            "required": ["nums"],
+                        },
+                    ),
+                ],
+            )
+        ],
+    )
     # Check the first eval case result.
     eval_case_result = eval_result.eval_case_results[0]
     assert isinstance(eval_case_result, types.EvalCaseResult)
@@ -264,26 +251,24 @@ def check_run_1957799200510967808_evaluation_item_results(
     assert universal_metric_result.explanation is None
     # Check the first rubric verdict.
     rubric_verdict_0 = universal_metric_result.rubric_verdicts[0]
-    assert rubric_verdict_0 == (
-        types.RubricVerdict(
-            evaluated_rubric=types.Rubric(
-                content=types.RubricContent(
-                    property=types.RubricContentProperty(
-                        description="The response is in English."
-                    )
-                ),
-                importance="HIGH",
-                type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
-            ),
-            reasoning=("The entire response is written in the English language."),
-            verdict=True,
-        )
+    assert isinstance(rubric_verdict_0, types.RubricVerdict)
+    assert rubric_verdict_0.evaluated_rubric == types.Rubric(
+        content=types.RubricContent(
+            property=types.RubricContentProperty(
+                description="The response is in English."
+            )
+        ),
+        importance="HIGH",
+        type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
     )
+    assert rubric_verdict_0.reasoning is not None
+    assert rubric_verdict_0.verdict is True
     # Check the first evaluation dataset.
     eval_dataset = eval_result.evaluation_dataset[0]
     assert isinstance(eval_dataset, types.EvaluationDataset)
     assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
-    assert eval_dataset.eval_dataset_df.shape == (19, 3)
+    assert eval_dataset.eval_dataset_df.shape[0] == 3
+    assert eval_dataset.eval_dataset_df.shape[1] > 3
 
 
 pytestmark = pytest_helper.setup(
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 89357d9a1d..8388784272 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -14,6 +14,7 @@
 #
 """Common utilities for evals."""
 import asyncio
+import base64
 import collections
 import concurrent.futures
 import datetime
@@ -1114,7 +1115,7 @@ def _execute_evaluation(
             validated_agent_info = agent_info
         else:
             raise TypeError(
-                f"agent_info values must be of type types.AgentInfo or dict, but got {type(agent_info)}'"
+                f"agent_info values must be of type types.evals.AgentInfo or dict, but got {type(agent_info)}'"
             )
 
     processed_eval_dataset, num_response_candidates = _resolve_dataset_inputs(
@@ -1395,7 +1396,7 @@ def _get_aggregated_metrics(
 
     return [
         types.AggregatedMetricResult(
-            metric_name=name,
+            metric_name=name.split("/")[-1],
             mean_score=values.get("AVERAGE"),
             stdev_score=values.get("STANDARD_DEVIATION"),
         )
@@ -1434,10 +1435,23 @@ def _convert_request_to_dataset_row(
 ) -> dict[str, Any]:
     """Converts an EvaluationItemRequest to a dictionary."""
     dict_row = {}
-    dict_row["prompt"] = request.prompt.text if request.prompt.text else None
-    dict_row["reference"] = request.golden_response
-    for candidate in request.candidate_responses:
-        dict_row[candidate.candidate] = candidate.text if candidate.text else None
+    dict_row[_evals_constant.PROMPT] = (
+        request.prompt.text if request.prompt.text else None
+    )
+    dict_row[_evals_constant.REFERENCE] = request.golden_response
+    intermediate_events = []
+    if request.candidate_responses:
+        for candidate in request.candidate_responses:
+            dict_row[candidate.candidate] = candidate.text if candidate.text else None
+            if candidate.events:
+                for event in candidate.events:
+                    content_dict = {"parts": event.parts, "role": event.role}
+                    int_events_dict = {
+                        "event_id": candidate.candidate,
+                        "content": content_dict,
+                    }
+                    intermediate_events.append(int_events_dict)
+    dict_row[_evals_constant.INTERMEDIATE_EVENTS] = intermediate_events
     return dict_row
 
 
@@ -1451,15 +1465,14 @@ def _transform_dataframe(rows: list[dict[str, Any]]) -> list[types.EvaluationDat
       A list of EvaluationDatasets, one for each candidate.
     """
     df = pd.DataFrame(rows)
-    exclude_cols = ["prompt", "reference"]
-    candidates = [col for col in df.columns if col not in exclude_cols]
+    candidates = [
+        col for col in df.columns if col not in _evals_constant.COMMON_DATASET_COLUMNS
+    ]
 
     eval_dfs = [
         types.EvaluationDataset(
             candidate_name=candidate,
-            eval_dataset_df=df[["prompt", "reference", candidate]].rename(
-                columns={candidate: "response"}
-            ),
+            eval_dataset_df=df.rename(columns={candidate: _evals_constant.RESPONSE}),
         )
         for candidate in candidates
     ]
@@ -1487,7 +1500,6 @@ def _get_eval_cases_eval_dfs_from_eval_items(
             eval_item
             and eval_item.evaluation_response
             and eval_item.evaluation_response.request
-            and eval_item.evaluation_response.candidate_results
         ):
             eval_case_results.append(
                 _get_eval_case_result_from_eval_item(index, eval_item)
@@ -1499,9 +1511,37 @@ def _get_eval_cases_eval_dfs_from_eval_items(
     return eval_case_results, eval_dfs
 
 
+def _get_agent_info_from_inference_configs(
+    candidate_names: list[str],
+    inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
+) -> Optional[types.evals.AgentInfo]:
+    """Builds an AgentInfo for the first candidate, or None without agent config."""
+    # TODO(lakeyk): Support multiple agents.
+    if not inference_configs or not candidate_names:
+        return None
+    first_candidate = candidate_names[0]
+    if first_candidate not in inference_configs:
+        return None
+    agent_config = inference_configs[first_candidate].agent_config
+    if not agent_config:
+        return None
+    if len(inference_configs) > 1:
+        logger.warning(
+            "Multiple agents are not supported yet. Displaying the first agent."
+        )
+    dev_instruction = agent_config.developer_instruction
+    has_text = bool(dev_instruction and dev_instruction.parts)
+    return types.evals.AgentInfo(
+        name=first_candidate,
+        instruction=(dev_instruction.parts[0].text or None) if has_text else None,
+        tool_declarations=agent_config.tools,
+    )
+
+
 def _get_eval_result_from_eval_items(
     results: types.EvaluationRunResults,
     eval_items: list[types.EvaluationItem],
+    inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
 ) -> types.EvaluationResult:
     """Retrieves an EvaluationResult from the EvaluationRunResults.
 
@@ -1525,6 +1565,9 @@ def _get_eval_result_from_eval_items(
         metadata=types.EvaluationRunMetadata(
             candidate_names=candidate_names,
         ),
+        agent_info=_get_agent_info_from_inference_configs(
+            candidate_names, inference_configs
+        ),
     )
     return eval_result
 
@@ -1532,6 +1575,7 @@ def _get_eval_result_from_eval_items(
 def _convert_evaluation_run_results(
     api_client: BaseApiClient,
     evaluation_run_results: types.EvaluationRunResults,
+    inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
 ) -> list[types.EvaluationItem]:
     """Retrieves an EvaluationItem from the EvaluationRunResults."""
     if not evaluation_run_results or not evaluation_run_results.evaluation_set:
@@ -1548,12 +1592,15 @@ def _convert_evaluation_run_results(
             evals_module.get_evaluation_item(name=item_name)
             for item_name in eval_set.evaluation_items
         ]
-    return _get_eval_result_from_eval_items(evaluation_run_results, eval_items)
+    return _get_eval_result_from_eval_items(
+        evaluation_run_results, eval_items, inference_configs
+    )
 
 
 async def _convert_evaluation_run_results_async(
     api_client: BaseApiClient,
     evaluation_run_results: types.EvaluationRunResults,
+    inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
 ) -> list[types.EvaluationItem]:
     """Retrieves an EvaluationItem from the EvaluationRunResults."""
     if not evaluation_run_results or not evaluation_run_results.evaluation_set:
@@ -1571,7 +1618,9 @@ async def _convert_evaluation_run_results_async(
             for eval_item in eval_set.evaluation_items
         ]
         eval_items = await asyncio.gather(*tasks)
-    return _get_eval_result_from_eval_items(evaluation_run_results, eval_items)
+    return _get_eval_result_from_eval_items(
+        evaluation_run_results, eval_items, inference_configs
+    )
 
 
 def _object_to_dict(obj) -> dict[str, Any]:
@@ -1587,6 +1636,8 @@ def _object_to_dict(obj) -> dict[str, Any]:
             result[key] = value
         elif isinstance(value, (list, tuple)):
             result[key] = [_object_to_dict(item) for item in value]
+        elif isinstance(value, bytes):
+            result[key] = base64.b64encode(value).decode("utf-8")
         elif hasattr(value, "__dict__"):  # Nested object
             result[key] = _object_to_dict(value)
         else:
@@ -1604,29 +1655,30 @@ def _create_evaluation_set_from_dataframe(
     eval_item_requests = []
     for _, row in eval_df.iterrows():
         intermediate_events = []
-        if "intermediate_events" in row:
-            for event in row["intermediate_events"]:
-                intermediate_events.append(
-                    genai_types.Content(
-                        parts=event["content"]["parts"], role=event["content"]["role"]
-                    )
-                )
+        if (
+            _evals_constant.INTERMEDIATE_EVENTS in row
+            and isinstance(row[_evals_constant.INTERMEDIATE_EVENTS], list)
+            and len(row[_evals_constant.INTERMEDIATE_EVENTS]) > 0
+        ):
+            for event in row[_evals_constant.INTERMEDIATE_EVENTS]:
+                if "content" in event:
+                    intermediate_events.append(event["content"])
         eval_item_requests.append(
             types.EvaluationItemRequest(
                 prompt=(
-                    types.EvaluationPrompt(text=row["prompt"])
-                    if "prompt" in row
+                    types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
+                    if _evals_constant.PROMPT in row
                     else None
                 ),
                 golden_response=(
-                    types.CandidateResponse(text=row["reference"])
-                    if "reference" in row
+                    types.CandidateResponse(text=row[_evals_constant.REFERENCE])
+                    if _evals_constant.REFERENCE in row
                     else None
                 ),
                 candidate_responses=[
                     types.CandidateResponse(
                         candidate=candidate_name or "Candidate 1",
-                        text=row.get("response", None),
+                        text=row.get(_evals_constant.RESPONSE, None),
                         events=(
                             intermediate_events
                             if len(intermediate_events) > 0
diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py
index d82970d981..a92c8a70dc 100644
--- a/vertexai/_genai/_evals_constant.py
+++ b/vertexai/_genai/_evals_constant.py
@@ -46,3 +46,17 @@
 )
 INTERMEDIATE_EVENTS = "intermediate_events"
 RESPONSE = "response"
+PROMPT = "prompt"
+REFERENCE = "reference"
+SESSION_INPUT = "session_inputs"
+CONTEXT = "context"
+
+COMMON_DATASET_COLUMNS = frozenset(
+    {
+        INTERMEDIATE_EVENTS,
+        PROMPT,
+        REFERENCE,
+        SESSION_INPUT,
+        CONTEXT,
+    }
+)
diff --git a/vertexai/_genai/_gcs_utils.py b/vertexai/_genai/_gcs_utils.py
index 8e8363dfc8..021d9d0051 100644
--- a/vertexai/_genai/_gcs_utils.py
+++ b/vertexai/_genai/_gcs_utils.py
@@ -16,12 +16,12 @@
 import io
 import json
 import logging
-import time
 from typing import Any, Union
 
 from google.cloud import storage  # type: ignore[attr-defined]
 from google.genai._api_client import BaseApiClient
 import pandas as pd
+import uuid
 
 
 logger = logging.getLogger(__name__)
@@ -125,7 +125,7 @@ def upload_json_to_prefix(
         gcs_dest_prefix: str,
         filename_prefix: str = "data",
     ) -> str:
-        """Uploads a dictionary to a GCS prefix with a timestamped JSON filename.
+        """Uploads a dictionary to a GCS prefix with a UUID JSON filename.
 
         Args:
           data: The dictionary to upload.
@@ -151,8 +151,7 @@ def upload_json_to_prefix(
         if user_prefix_path and not user_prefix_path.endswith("/"):
             user_prefix_path += "/"
 
-        timestamp = time.strftime("%Y%m%d-%H%M%S")
-        filename = f"{filename_prefix}_{timestamp}.json"
+        filename = f"{filename_prefix}_{uuid.uuid4()}.json"
 
         blob_name = f"{user_prefix_path}{filename}"
 
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index d3c5827468..765c28a41b 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -19,6 +19,7 @@
 import logging
 from typing import Any, Callable, Optional, Union
 from urllib.parse import urlencode
+import uuid
 
 from google.genai import _api_module
 from google.genai import _common
@@ -1295,7 +1296,20 @@ def get_evaluation_run(
         include_evaluation_items: bool = False,
         config: Optional[types.GetEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
-        """Retrieves an EvaluationRun from the resource name."""
+        """Retrieves an EvaluationRun from the resource name.
+
+        Args:
+          name: The resource name of the EvaluationRun. Format:
+            `projects/{project}/locations/{location}/evaluationRuns/{evaluation_run}`
+          include_evaluation_items: Whether to include the evaluation items in the response.
+          config: Optional request configuration (`types.GetEvaluationRunConfig` or a dict).
+
+        Returns:
+          The evaluation run.
+
+        Raises:
+          ValueError: If the name is empty or invalid.
+        """
         if not name:
             raise ValueError("name cannot be empty.")
         if name.startswith("projects/"):
@@ -1304,7 +1318,9 @@ def get_evaluation_run(
         if include_evaluation_items:
             result.evaluation_item_results = (
                 _evals_common._convert_evaluation_run_results(
-                    self._api_client, result.evaluation_run_results
+                    self._api_client,
+                    result.evaluation_run_results,
+                    result.inference_configs,
                 )
             )
         return result
@@ -1316,18 +1332,32 @@ def get_evaluation_run(
     def create_evaluation_run(
         self,
         *,
-        name: str,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        name: Optional[str] = None,
         display_name: Optional[str] = None,
         metrics: Optional[
             list[types.EvaluationRunMetricOrDict]
         ] = None,  # TODO: Make required unified metrics available in prod.
-        agent_info: Optional[types.evals.AgentInfo] = None,
+        agent_info: Optional[types.evals.AgentInfoOrDict] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
-        """Creates an EvaluationRun."""
+        """Creates an EvaluationRun.
+
+        Args:
+          dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
+          dest: The GCS URI prefix to write the evaluation results to.
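+          name: The name of the evaluation run. If not provided, a name of the form `evaluation_run_<uuid4>` is generated.
+          display_name: The display name of the evaluation run. Defaults to `name` if not provided.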
+          metrics: The list of metrics to evaluate.
+          agent_info: The agent info to evaluate.
+          labels: The labels to apply to the evaluation run.
+          config: The configuration for the evaluation run.
+
+        Returns:
+            The created evaluation run.
+        """
         if type(dataset).__name__ == "EvaluationDataset":
             logger.warning(
                 "EvaluationDataset input is experimental and may change in future versions."
@@ -1336,6 +1366,8 @@ def create_evaluation_run(
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
+            if dataset.candidate_name is None and agent_info:
+                dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
             )
@@ -1354,6 +1386,15 @@ def create_evaluation_run(
             logger.warning(
                 "The agent_info field is experimental and may change in future versions."
             )
+            if isinstance(agent_info, dict):
+                agent_info = types.evals.AgentInfo.model_validate(agent_info)
+            if (
+                not agent_info.agent
+                or len(agent_info.agent.split("reasoningEngines/")) != 2
+            ):
+                raise ValueError(
+                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
+                )
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(
@@ -1362,21 +1403,16 @@ def create_evaluation_run(
                     tools=agent_info.tool_declarations,
                 )
             )
-            if (
-                not agent_info.agent
-                or len(agent_info.agent.split("reasoningEngines/")) != 2
-            ):
-                raise ValueError(
-                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
-                )
             labels = labels or {}
             labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split(
                 "reasoningEngines/"
             )[-1]
+        if not name:
+            name = f"evaluation_run_{uuid.uuid4()}"
 
         return self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
-            display_name=display_name,
+            display_name=display_name or name,
             data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,
@@ -2112,8 +2148,19 @@ async def get_evaluation_run(
         include_evaluation_items: bool = False,
         config: Optional[types.GetEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
-        """
-        Retrieves an EvaluationRun from the resource name.
+        """Retrieves the EvaluationRun from the resource name.
+        Args:
+          name: The resource name of the EvaluationRun. Format:
+            `projects/{project}/locations/{location}/evaluationRuns/{evaluation_run}`
+          include_evaluation_items: Whether to include the evaluation items in the
+            response.
+          config: The optional configuration for the evaluation run. Either a
+            `types.GetEvaluationRunConfig` or an equivalent dict.
+
+        Returns:
+          The evaluation run.
+        Raises:
+          ValueError: If the name is empty or invalid.
         """
         if not name:
             raise ValueError("name cannot be empty.")
@@ -2123,7 +2170,9 @@ async def get_evaluation_run(
         if include_evaluation_items:
             result.evaluation_item_results = (
                 await _evals_common._convert_evaluation_run_results_async(
-                    self._api_client, result.evaluation_run_results
+                    self._api_client,
+                    result.evaluation_run_results,
+                    result.inference_configs,
                 )
             )
 
@@ -2136,9 +2185,9 @@ async def get_evaluation_run(
     async def create_evaluation_run(
         self,
         *,
-        name: str,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        name: Optional[str] = None,
         display_name: Optional[str] = None,
         metrics: Optional[
             list[types.EvaluationRunMetricOrDict]
@@ -2147,7 +2196,21 @@ async def create_evaluation_run(
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
-        """Creates an EvaluationRun."""
+        """Creates an EvaluationRun.
+
+        Args:
+          dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
+          dest: The GCS URI prefix to write the evaluation results to.
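+          name: The name of the evaluation run. If not provided, a name of the form `evaluation_run_<uuid4>` is generated.
+          display_name: The display name of the evaluation run. Defaults to `name` if not provided.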
+          metrics: The list of metrics to evaluate.
+          agent_info: The agent info to evaluate.
+          labels: The labels to apply to the evaluation run.
+          config: The configuration for the evaluation run.
+
+        Returns:
+            The created evaluation run.
+        """
         if type(dataset).__name__ == "EvaluationDataset":
             logger.warning(
                 "EvaluationDataset input is experimental and may change in future versions."
@@ -2156,6 +2219,8 @@ async def create_evaluation_run(
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
+            if dataset.candidate_name is None and agent_info:
+                dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
             )
@@ -2174,6 +2239,15 @@ async def create_evaluation_run(
             logger.warning(
                 "The agent_info field is experimental and may change in future versions."
             )
+            if isinstance(agent_info, dict):
+                agent_info = types.evals.AgentInfo.model_validate(agent_info)
+            if (
+                not agent_info.agent
+                or len(agent_info.agent.split("reasoningEngines/")) != 2
+            ):
+                raise ValueError(
+                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
+                )
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(
@@ -2182,21 +2256,16 @@ async def create_evaluation_run(
                     tools=agent_info.tool_declarations,
                 )
             )
-            if (
-                not agent_info.agent
-                or len(agent_info.agent.split("reasoningEngines/")) != 2
-            ):
-                raise ValueError(
-                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
-                )
             labels = labels or {}
             labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split(
                 "reasoningEngines/"
             )[-1]
+        if not name:
+            name = f"evaluation_run_{uuid.uuid4()}"
 
         result = await self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
-            display_name=display_name,
+            display_name=display_name or name,
             data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,