From 741c6ad6bf860ec10653f86ad816a62465cdaaf5 Mon Sep 17 00:00:00 2001
From: A Vertex SDK engineer
Date: Thu, 30 Oct 2025 09:36:31 -0700
Subject: [PATCH] fix: GenAI Client(evals) - Support EvaluationDataset output
from run_inference as input `dataset` in `create_evaluation_run` in Vertex AI
GenAI SDK evals
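
With this change, the EvaluationDataset returned by run_inference can be passed
directly as the `dataset` argument of `create_evaluation_run`, and `metrics` is
now a required argument. A minimal usage sketch follows; the imports, project,
location, model, bucket, prompt data, and metric choice are illustrative
placeholders and not part of this change:

    import pandas as pd
    from vertexai import Client, types

    client = Client(project="my-project", location="us-central1")
    prompts_df = pd.DataFrame({"prompt": ["Why is the sky blue?"]})

    # run_inference returns an EvaluationDataset.
    inference_result = client.evals.run_inference(
        model="gemini-2.0-flash",
        src=prompts_df,
    )

    # The EvaluationDataset can now be passed straight to create_evaluation_run.
    evaluation_run = client.evals.create_evaluation_run(
        display_name="my-eval-run",
        dataset=inference_result,
        dest="gs://my-bucket/eval-output",
        metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],
    )
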
PiperOrigin-RevId: 826069078
---
.../replays/test_create_evaluation_run.py | 136 +++++++++---------
.../genai/replays/test_evaluate_instances.py | 2 +-
tests/unit/vertexai/genai/test_evals.py | 4 +-
vertexai/_genai/_evals_common.py | 1 -
vertexai/_genai/_evals_data_converters.py | 4 -
vertexai/_genai/_evals_visualization.py | 35 ++++-
vertexai/_genai/evals.py | 48 ++++---
vertexai/_genai/types/common.py | 6 +-
8 files changed, 135 insertions(+), 101 deletions(-)
diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
index 9b6cc31735..da9a537d23 100644
--- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
+++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
@@ -48,70 +48,69 @@
)
-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-# client._api_client._http_options.api_version = "v1beta1"
-# tool = genai_types.Tool(
-# function_declarations=[
-# genai_types.FunctionDeclaration(
-# name="get_weather",
-# description="Get weather in a location",
-# parameters={
-# "type": "object",
-# "properties": {"location": {"type": "string"}},
-# },
-# )
-# ]
-# )
-# evaluation_run = client.evals.create_evaluation_run(
-# name="test4",
-# display_name="test4",
-# dataset=types.EvaluationRunDataSource(
-# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-# ),
-# dest=GCS_DEST,
-# metrics=[
-# UNIVERSAL_AR_METRIC,
-# types.RubricMetric.FINAL_RESPONSE_QUALITY,
-# LLM_METRIC
-# ],
-# agent_info=types.AgentInfo(
-# agent="project/123/locations/us-central1/reasoningEngines/456",
-# name="agent-1",
-# instruction="agent-1 instruction",
-# tool_declarations=[tool],
-# ),
-# labels={"label1": "value1"},
-# )
-# assert isinstance(evaluation_run, types.EvaluationRun)
-# assert evaluation_run.display_name == "test4"
-# assert evaluation_run.state == types.EvaluationRunState.PENDING
-# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-# assert evaluation_run.data_source.evaluation_set == (
-# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-# )
-# assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-# output_config=genai_types.OutputConfig(
-# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-# ),
-# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-# )
-# assert evaluation_run.inference_configs[
-# "agent-1"
-# ] == types.EvaluationRunInferenceConfig(
-# agent_config=types.EvaluationRunAgentConfig(
-# developer_instruction=genai_types.Content(
-# parts=[genai_types.Part(text="agent-1 instruction")]
-# ),
-# tools=[tool],
-# )
-# )
-# assert evaluation_run.labels == {
-# "vertex-ai-evaluation-agent-engine-id": "456",
-# "label1": "value1",
-# }
-# assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+ """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+ client._api_client._http_options.api_version = "v1beta1"
+ tool = genai_types.Tool(
+ function_declarations=[
+ genai_types.FunctionDeclaration(
+ name="get_weather",
+ description="Get weather in a location",
+ parameters={
+ "type": "object",
+ "properties": {"location": {"type": "string"}},
+ },
+ )
+ ]
+ )
+ evaluation_run = client.evals.create_evaluation_run(
+ name="test4",
+ display_name="test4",
+ dataset=types.EvaluationRunDataSource(
+ evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+ ),
+ dest=GCS_DEST,
+ metrics=[
+ UNIVERSAL_AR_METRIC,
+ types.RubricMetric.FINAL_RESPONSE_QUALITY,
+ LLM_METRIC,
+ ],
+ agent_info=types.evals.AgentInfo(
+ agent="project/123/locations/us-central1/reasoningEngines/456",
+ name="agent-1",
+ instruction="agent-1 instruction",
+ tool_declarations=[tool],
+ ),
+ labels={"label1": "value1"},
+ )
+ assert isinstance(evaluation_run, types.EvaluationRun)
+ assert evaluation_run.display_name == "test4"
+ assert evaluation_run.state == types.EvaluationRunState.PENDING
+ assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+ assert evaluation_run.data_source.evaluation_set == (
+ "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+ )
+ assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+ output_config=genai_types.OutputConfig(
+ gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+ ),
+ metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+ )
+ assert evaluation_run.inference_configs[
+ "agent-1"
+ ] == types.EvaluationRunInferenceConfig(
+ agent_config=types.EvaluationRunAgentConfig(
+ developer_instruction=genai_types.Content(
+ parts=[genai_types.Part(text="agent-1 instruction")]
+ ),
+ tools=[tool],
+ )
+ )
+ assert evaluation_run.labels == {
+ "vertex-ai-evaluation-agent-engine-id": "456",
+ "label1": "value1",
+ }
+ assert evaluation_run.error is None
def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
),
labels={"label1": "value1"},
dest=GCS_DEST,
+ metrics=[UNIVERSAL_AR_METRIC],
)
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
output_config=genai_types.OutputConfig(
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
),
+ metrics=[UNIVERSAL_AR_METRIC],
)
assert evaluation_run.inference_configs is None
assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
assert evaluation_run.error is None
-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
# def test_create_eval_run_data_source_evaluation_dataset(client):
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
# input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
# candidate_name="candidate_1",
# eval_dataset_df=input_df,
# ),
-# dest="gs://lakeyk-limited-bucket/eval_run_output",
+# dest=GCS_DEST,
+# metrics=[UNIVERSAL_AR_METRIC],
# )
# assert isinstance(evaluation_run, types.EvaluationRun)
# assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
)
),
dest=GCS_DEST,
+ metrics=[UNIVERSAL_AR_METRIC],
)
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
output_config=genai_types.OutputConfig(
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
),
+ metrics=[UNIVERSAL_AR_METRIC],
)
assert evaluation_run.error is None
assert evaluation_run.inference_configs is None
diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
index 76c04d16af..cf51bedf41 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
@@ -246,7 +246,7 @@ def test_run_inference_with_agent(client):
agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
src=test_df,
)
- assert inference_result.candidate_name == "agent"
+ assert inference_result.candidate_name is None
assert inference_result.gcs_source is None
diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index d72b48a2fc..b6da92e69c 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
}
),
)
- assert inference_result.candidate_name == "agent"
+ assert inference_result.candidate_name is None
assert inference_result.gcs_source is None
@mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader")
@@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
}
),
)
- assert inference_result.candidate_name == "agent"
+ assert inference_result.candidate_name is None
assert inference_result.gcs_source is None
@mock.patch.object(_evals_utils, "EvalDatasetLoader")
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 341ae88a7f..bd43229bd9 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -798,7 +798,6 @@ def _execute_inference(
evaluation_dataset = types.EvaluationDataset(
eval_dataset_df=results_df,
- candidate_name="agent",
)
else:
raise ValueError("Either model or agent_engine must be provided.")
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index 340bf72965..337abaaae8 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
intermediate_events: Optional[list[types.Event]] = None
if intermediate_events_data:
- logger.warning(
- "intermediate_events attribute is experimental and may change in "
- "future versions."
- )
if isinstance(intermediate_events_data, list):
intermediate_events = []
for event in intermediate_events_data:
diff --git a/vertexai/_genai/_evals_visualization.py b/vertexai/_genai/_evals_visualization.py
index 0436d893ae..7b7e6174c7 100644
--- a/vertexai/_genai/_evals_visualization.py
+++ b/vertexai/_genai/_evals_visualization.py
@@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str:
// If we have agent info, render as trace
if(agentInfo) {{
- let traceHtml = ``;
+ let traceHtml = ``;
eventsArray.forEach(event => {{
if (event.content && event.content.parts && event.content.parts.length > 0) {{
event.content.parts.forEach(part => {{
@@ -1073,3 +1073,36 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> Non
dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
html_content = _get_inference_html(dataframe_json_string)
display.display(display.HTML(html_content))
+
+
+def _get_status_html(status: str, error_message: Optional[str] = None) -> str:
+    """Returns a simple HTML string for displaying a status and optional error."""
+    error_html = ""
+    if error_message:
+        error_html = f"""
+        <div style="margin-top: 8px;">
+            <strong>Error:</strong>
+            <pre style="white-space: pre-wrap;">{error_message}</pre>
+        </div>
+        """
+
+    return f"""
+    <div style="font-family: sans-serif;">
+        <h3>Status: {status}</h3>
+        {error_html}
+    </div>
+    """
+
+
+def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None:
+ """Displays the status of an evaluation run in an IPython environment."""
+    if not _is_ipython_env():
+        logger.warning("Skipping display: not in an IPython environment.")
+        return
+
+    from IPython import display
+
+ status = eval_run_obj.state.name if eval_run_obj.state else "UNKNOWN"
+ error_message = str(eval_run_obj.error) if eval_run_obj.error else None
+ html_content = _get_status_html(status, error_message)
+ display.display(display.HTML(html_content))
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index 765c28a41b..bc60977a82 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -1334,11 +1334,9 @@ def create_evaluation_run(
*,
dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
dest: str,
+ metrics: list[types.EvaluationRunMetricOrDict],
name: Optional[str] = None,
display_name: Optional[str] = None,
- metrics: Optional[
- list[types.EvaluationRunMetricOrDict]
- ] = None, # TODO: Make required unified metrics available in prod.
agent_info: Optional[types.evals.AgentInfoOrDict] = None,
labels: Optional[dict[str, str]] = None,
config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -1348,9 +1346,9 @@ def create_evaluation_run(
Args:
dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to use for the evaluation.
name: The name of the evaluation run.
display_name: The display name of the evaluation run.
- metrics: The list of metrics to evaluate.
agent_info: The agent info to evaluate.
labels: The labels to apply to the evaluation run.
config: The configuration for the evaluation run.
@@ -1358,15 +1356,22 @@ def create_evaluation_run(
Returns:
The created evaluation run.
"""
+ if agent_info and isinstance(agent_info, dict):
+ agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
- logger.warning(
- "EvaluationDataset input is experimental and may change in future versions."
- )
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
- if dataset.candidate_name is None and agent_info:
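+            # Warn if candidate_name and agent_info.name are both set but
+            # disagree; otherwise fall back to agent_info.name when unset.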
+ if (
+ dataset.candidate_name
+                and agent_info
+                and agent_info.name
+ and dataset.candidate_name != agent_info.name
+ ):
+ logger.warning(
+ "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+ )
+ elif dataset.candidate_name is None and agent_info:
dataset.candidate_name = agent_info.name
eval_set = _evals_common._create_evaluation_set_from_dataframe(
self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -1383,9 +1388,6 @@ def create_evaluation_run(
)
inference_configs = {}
if agent_info:
- logger.warning(
- "The agent_info field is experimental and may change in future versions."
- )
if isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if (
@@ -2187,11 +2189,9 @@ async def create_evaluation_run(
*,
dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
dest: str,
+ metrics: list[types.EvaluationRunMetricOrDict],
name: Optional[str] = None,
display_name: Optional[str] = None,
- metrics: Optional[
- list[types.EvaluationRunMetricOrDict]
- ] = None, # TODO: Make required unified metrics available in prod.
agent_info: Optional[types.evals.AgentInfo] = None,
labels: Optional[dict[str, str]] = None,
config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -2201,9 +2201,9 @@ async def create_evaluation_run(
Args:
dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to use for the evaluation.
name: The name of the evaluation run.
display_name: The display name of the evaluation run.
- metrics: The list of metrics to evaluate.
agent_info: The agent info to evaluate.
labels: The labels to apply to the evaluation run.
config: The configuration for the evaluation run.
@@ -2211,15 +2211,22 @@ async def create_evaluation_run(
Returns:
The created evaluation run.
"""
+ if agent_info and isinstance(agent_info, dict):
+ agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
- logger.warning(
- "EvaluationDataset input is experimental and may change in future versions."
- )
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
- if dataset.candidate_name is None and agent_info:
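+            # Warn if candidate_name and agent_info.name are both set but
+            # disagree; otherwise fall back to agent_info.name when unset.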
+ if (
+ dataset.candidate_name
+                and agent_info
+                and agent_info.name
+ and dataset.candidate_name != agent_info.name
+ ):
+ logger.warning(
+ "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+ )
+ elif dataset.candidate_name is None and agent_info:
dataset.candidate_name = agent_info.name
eval_set = _evals_common._create_evaluation_set_from_dataframe(
self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -2236,9 +2243,6 @@ async def create_evaluation_run(
)
inference_configs = {}
if agent_info:
- logger.warning(
- "The agent_info field is experimental and may change in future versions."
- )
if isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if (
diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py
index 62384bbf68..3e6622acf4 100644
--- a/vertexai/_genai/types/common.py
+++ b/vertexai/_genai/types/common.py
@@ -1918,19 +1918,17 @@ def show(self) -> None:
"""Shows the evaluation result."""
from .. import _evals_visualization
- logger.warning(f"Evaluation Run state: {self.state}.")
- if self.error:
- logger.warning(f"Evaluation Run error: {self.error.message}")
if self.state == "SUCCEEDED":
if self.evaluation_item_results is not None:
_evals_visualization.display_evaluation_result(
self.evaluation_item_results, None
)
else:
- logger.warning(f"Evaluation Run state: {self.state}.")
logger.warning(
"Evaluation Run succeeded but no evaluation item results found. To display results, please set include_evaluation_items to True when calling get_evaluation_run()."
)
+ else:
+ _evals_visualization.display_evaluation_run_status(self)
class EvaluationRunDict(TypedDict, total=False):