From 741c6ad6bf860ec10653f86ad816a62465cdaaf5 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Thu, 30 Oct 2025 09:36:31 -0700 Subject: [PATCH] fix: GenAI Client(evals) - Support EvaluationDataset output from run_inference as input `dataset` in `create_evaluation_run` in Vertex AI GenAI SDK evals PiperOrigin-RevId: 826069078 --- .../replays/test_create_evaluation_run.py | 136 +++++++++--------- .../genai/replays/test_evaluate_instances.py | 2 +- tests/unit/vertexai/genai/test_evals.py | 4 +- vertexai/_genai/_evals_common.py | 1 - vertexai/_genai/_evals_data_converters.py | 4 - vertexai/_genai/_evals_visualization.py | 35 ++++- vertexai/_genai/evals.py | 48 ++++--- vertexai/_genai/types/common.py | 6 +- 8 files changed, 135 insertions(+), 101 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index 9b6cc31735..da9a537d23 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -48,70 +48,69 @@ ) -# TODO(b/431231205): Re-enable once Unified Metrics are in prod. -# def test_create_eval_run_data_source_evaluation_set(client): -# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" -# client._api_client._http_options.api_version = "v1beta1" -# tool = genai_types.Tool( -# function_declarations=[ -# genai_types.FunctionDeclaration( -# name="get_weather", -# description="Get weather in a location", -# parameters={ -# "type": "object", -# "properties": {"location": {"type": "string"}}, -# }, -# ) -# ] -# ) -# evaluation_run = client.evals.create_evaluation_run( -# name="test4", -# display_name="test4", -# dataset=types.EvaluationRunDataSource( -# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" -# ), -# dest=GCS_DEST, -# metrics=[ -# UNIVERSAL_AR_METRIC, -# types.RubricMetric.FINAL_RESPONSE_QUALITY, -# LLM_METRIC -# ], -# agent_info=types.AgentInfo( -# agent="project/123/locations/us-central1/reasoningEngines/456", -# name="agent-1", -# instruction="agent-1 instruction", -# tool_declarations=[tool], -# ), -# labels={"label1": "value1"}, -# ) -# assert isinstance(evaluation_run, types.EvaluationRun) -# assert evaluation_run.display_name == "test4" -# assert evaluation_run.state == types.EvaluationRunState.PENDING -# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) -# assert evaluation_run.data_source.evaluation_set == ( -# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" -# ) -# assert evaluation_run.evaluation_config == types.EvaluationRunConfig( -# output_config=genai_types.OutputConfig( -# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) -# ), -# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC], -# ) -# assert evaluation_run.inference_configs[ -# "agent-1" -# ] == types.EvaluationRunInferenceConfig( -# agent_config=types.EvaluationRunAgentConfig( -# developer_instruction=genai_types.Content( -# parts=[genai_types.Part(text="agent-1 instruction")] -# ), -# tools=[tool], -# ) -# ) -# assert evaluation_run.labels == { -# "vertex-ai-evaluation-agent-engine-id": "456", -# "label1": "value1", -# } -# assert evaluation_run.error is None +def test_create_eval_run_data_source_evaluation_set(client): + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" + 
client._api_client._http_options.api_version = "v1beta1" + tool = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="get_weather", + description="Get weather in a location", + parameters={ + "type": "object", + "properties": {"location": {"type": "string"}}, + }, + ) + ] + ) + evaluation_run = client.evals.create_evaluation_run( + name="test4", + display_name="test4", + dataset=types.EvaluationRunDataSource( + evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" + ), + dest=GCS_DEST, + metrics=[ + UNIVERSAL_AR_METRIC, + types.RubricMetric.FINAL_RESPONSE_QUALITY, + LLM_METRIC, + ], + agent_info=types.evals.AgentInfo( + agent="project/123/locations/us-central1/reasoningEngines/456", + name="agent-1", + instruction="agent-1 instruction", + tool_declarations=[tool], + ), + labels={"label1": "value1"}, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test4" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.data_source.evaluation_set == ( + "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" + ) + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC], + ) + assert evaluation_run.inference_configs[ + "agent-1" + ] == types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction=genai_types.Content( + parts=[genai_types.Part(text="agent-1 instruction")] + ), + tools=[tool], + ) + ) + assert evaluation_run.labels == { + "vertex-ai-evaluation-agent-engine-id": "456", + "label1": "value1", + } + assert evaluation_run.error is None def test_create_eval_run_data_source_bigquery_request_set(client): @@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client): ), labels={"label1": "value1"}, dest=GCS_DEST, + metrics=[UNIVERSAL_AR_METRIC], ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test5" @@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client): output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) ), + metrics=[UNIVERSAL_AR_METRIC], ) assert evaluation_run.inference_configs is None assert evaluation_run.labels == { @@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client): assert evaluation_run.error is None -# Test fails in replay mode because of the timestamp issue +# Test fails in replay mode because of UUID generation mismatch. 
# def test_create_eval_run_data_source_evaluation_dataset(client): # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" # input_df = pd.DataFrame( @@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client): # candidate_name="candidate_1", # eval_dataset_df=input_df, # ), -# dest="gs://lakeyk-limited-bucket/eval_run_output", +# dest=GCS_DEST, +# metrics=[UNIVERSAL_AR_METRIC], # ) # assert isinstance(evaluation_run, types.EvaluationRun) # assert evaluation_run.display_name == "test6" @@ -276,6 +278,7 @@ async def test_create_eval_run_async(client): ) ), dest=GCS_DEST, + metrics=[UNIVERSAL_AR_METRIC], ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test8" @@ -292,6 +295,7 @@ async def test_create_eval_run_async(client): output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) ), + metrics=[UNIVERSAL_AR_METRIC], ) assert evaluation_run.error is None assert evaluation_run.inference_configs is None diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py index 76c04d16af..cf51bedf41 100644 --- a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py +++ b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py @@ -246,7 +246,7 @@ def test_run_inference_with_agent(client): agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864", src=test_df, ) - assert inference_result.candidate_name == "agent" + assert inference_result.candidate_name is None assert inference_result.gcs_source is None diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py index d72b48a2fc..b6da92e69c 100644 --- a/tests/unit/vertexai/genai/test_evals.py +++ b/tests/unit/vertexai/genai/test_evals.py @@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict( } ), ) - assert inference_result.candidate_name == "agent" + assert inference_result.candidate_name is None assert inference_result.gcs_source is None @mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader") @@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string( } ), ) - assert inference_result.candidate_name == "agent" + assert inference_result.candidate_name is None assert inference_result.gcs_source is None @mock.patch.object(_evals_utils, "EvalDatasetLoader") diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 341ae88a7f..bd43229bd9 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -798,7 +798,6 @@ def _execute_inference( evaluation_dataset = types.EvaluationDataset( eval_dataset_df=results_df, - candidate_name="agent", ) else: raise ValueError("Either model or agent_engine must be provided.") diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py index 340bf72965..337abaaae8 100644 --- a/vertexai/_genai/_evals_data_converters.py +++ b/vertexai/_genai/_evals_data_converters.py @@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: intermediate_events: Optional[list[types.Event]] = None if intermediate_events_data: - logger.warning( - "intermediate_events attribute is experimental and may change in " - "future versions." 
- ) if isinstance(intermediate_events_data, list): intermediate_events = [] for event in intermediate_events_data: diff --git a/vertexai/_genai/_evals_visualization.py b/vertexai/_genai/_evals_visualization.py index 0436d893ae..7b7e6174c7 100644 --- a/vertexai/_genai/_evals_visualization.py +++ b/vertexai/_genai/_evals_visualization.py @@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str: // If we have agent info, render as trace if(agentInfo) {{ - let traceHtml = `
<div>🏃agent_run</div>`;
+                        let traceHtml = `<div>🤖agent_run</div>
`; eventsArray.forEach(event => {{ if (event.content && event.content.parts && event.content.parts.length > 0) {{ event.content.parts.forEach(part => {{ @@ -1073,3 +1073,36 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> Non dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str) html_content = _get_inference_html(dataframe_json_string) display.display(display.HTML(html_content)) + + +def _get_status_html(status: str, error_message: Optional[str] = None) -> str: + """Returns a simple HTML string for displaying a status and optional error.""" + error_html = "" + if error_message: + error_html = f""" +

<div>
+            <span>Error:</span>
+            <div>{error_message}</div>
+        </div>
+        """
+
+    return f"""
+    <div>
+        <div>Status: {status}</div>
+        {error_html}
+    </div>
+ """ + + +def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None: + """Displays the status of an evaluation run in an IPython environment.""" + if not _is_ipython_env(): + logger.warning("Skipping display: not in an IPython environment.") + return + else: + from IPython import display + + status = eval_run_obj.state.name if eval_run_obj.state else "UNKNOWN" + error_message = str(eval_run_obj.error) if eval_run_obj.error else None + html_content = _get_status_html(status, error_message) + display.display(display.HTML(html_content)) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 765c28a41b..bc60977a82 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -1334,11 +1334,9 @@ def create_evaluation_run( *, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + metrics: list[types.EvaluationRunMetricOrDict], name: Optional[str] = None, display_name: Optional[str] = None, - metrics: Optional[ - list[types.EvaluationRunMetricOrDict] - ] = None, # TODO: Make required unified metrics available in prod. agent_info: Optional[types.evals.AgentInfoOrDict] = None, labels: Optional[dict[str, str]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, @@ -1348,9 +1346,9 @@ def create_evaluation_run( Args: dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset. dest: The GCS URI prefix to write the evaluation results to. + metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - metrics: The list of metrics to evaluate. agent_info: The agent info to evaluate. labels: The labels to apply to the evaluation run. config: The configuration for the evaluation run. @@ -1358,15 +1356,22 @@ def create_evaluation_run( Returns: The created evaluation run. """ + if agent_info and isinstance(agent_info, dict): + agent_info = types.evals.AgentInfo.model_validate(agent_info) if type(dataset).__name__ == "EvaluationDataset": - logger.warning( - "EvaluationDataset input is experimental and may change in future versions." - ) if dataset.eval_dataset_df is None: raise ValueError( "EvaluationDataset must have eval_dataset_df populated." ) - if dataset.candidate_name is None and agent_info: + if ( + dataset.candidate_name + and agent_info.name + and dataset.candidate_name != agent_info.name + ): + logger.warning( + "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." + ) + elif dataset.candidate_name is None and agent_info: dataset.candidate_name = agent_info.name eval_set = _evals_common._create_evaluation_set_from_dataframe( self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name @@ -1383,9 +1388,6 @@ def create_evaluation_run( ) inference_configs = {} if agent_info: - logger.warning( - "The agent_info field is experimental and may change in future versions." - ) if isinstance(agent_info, dict): agent_info = types.evals.AgentInfo.model_validate(agent_info) if ( @@ -2187,11 +2189,9 @@ async def create_evaluation_run( *, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + metrics: list[types.EvaluationRunMetricOrDict], name: Optional[str] = None, display_name: Optional[str] = None, - metrics: Optional[ - list[types.EvaluationRunMetricOrDict] - ] = None, # TODO: Make required unified metrics available in prod. 
agent_info: Optional[types.evals.AgentInfo] = None, labels: Optional[dict[str, str]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, @@ -2201,9 +2201,9 @@ async def create_evaluation_run( Args: dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset. dest: The GCS URI prefix to write the evaluation results to. + metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - metrics: The list of metrics to evaluate. agent_info: The agent info to evaluate. labels: The labels to apply to the evaluation run. config: The configuration for the evaluation run. @@ -2211,15 +2211,22 @@ async def create_evaluation_run( Returns: The created evaluation run. """ + if agent_info and isinstance(agent_info, dict): + agent_info = types.evals.AgentInfo.model_validate(agent_info) if type(dataset).__name__ == "EvaluationDataset": - logger.warning( - "EvaluationDataset input is experimental and may change in future versions." - ) if dataset.eval_dataset_df is None: raise ValueError( "EvaluationDataset must have eval_dataset_df populated." ) - if dataset.candidate_name is None and agent_info: + if ( + dataset.candidate_name + and agent_info.name + and dataset.candidate_name != agent_info.name + ): + logger.warning( + "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." + ) + elif dataset.candidate_name is None and agent_info: dataset.candidate_name = agent_info.name eval_set = _evals_common._create_evaluation_set_from_dataframe( self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name @@ -2236,9 +2243,6 @@ async def create_evaluation_run( ) inference_configs = {} if agent_info: - logger.warning( - "The agent_info field is experimental and may change in future versions." - ) if isinstance(agent_info, dict): agent_info = types.evals.AgentInfo.model_validate(agent_info) if ( diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 62384bbf68..3e6622acf4 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -1918,19 +1918,17 @@ def show(self) -> None: """Shows the evaluation result.""" from .. import _evals_visualization - logger.warning(f"Evaluation Run state: {self.state}.") - if self.error: - logger.warning(f"Evaluation Run error: {self.error.message}") if self.state == "SUCCEEDED": if self.evaluation_item_results is not None: _evals_visualization.display_evaluation_result( self.evaluation_item_results, None ) else: - logger.warning(f"Evaluation Run state: {self.state}.") logger.warning( "Evaluation Run succeeded but no evaluation item results found. To display results, please set include_evaluation_items to True when calling get_evaluation_run()." ) + else: + _evals_visualization.display_evaluation_run_status(self) class EvaluationRunDict(TypedDict, total=False):
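
Usage sketch (assumptions labeled, not part of the patch itself): the snippet below illustrates the workflow this change enables, passing the EvaluationDataset returned by run_inference directly as the `dataset` argument of create_evaluation_run, with `metrics` now a required argument. The project, location, bucket, model name, and the `from vertexai import types` import path are placeholder assumptions, not values taken from the patch.

# Sketch only: project, location, bucket, and model names are placeholders.
import pandas as pd

import vertexai
from vertexai import types

client = vertexai.Client(project="my-project", location="us-central1")

prompts_df = pd.DataFrame({"prompt": ["What is the weather in Boston?"]})

# run_inference returns a types.EvaluationDataset wrapping the generated
# responses (per this patch, candidate_name is no longer hard-coded to
# "agent" for agent inference runs).
inference_result = client.evals.run_inference(
    model="gemini-2.0-flash",
    src=prompts_df,
)

# With this change, that EvaluationDataset can be passed straight to
# create_evaluation_run as the dataset argument; metrics is now required.
evaluation_run = client.evals.create_evaluation_run(
    display_name="my-eval-run",
    dataset=inference_result,
    dest="gs://my-bucket/eval_run_output",
    metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],
)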