diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
index 9b6cc31735..da9a537d23 100644
--- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
+++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
@@ -48,70 +48,69 @@
 )
 
 
-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
 
 
 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None
 
 
-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 #             candidate_name="candidate_1",
 #             eval_dataset_df=input_df,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
index 76c04d16af..cf51bedf41 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
@@ -246,7 +246,7 @@ def test_run_inference_with_agent(client):
         agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
         src=test_df,
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None
 
 
diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index d72b48a2fc..b6da92e69c 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
                 }
             ),
         )
-        assert inference_result.candidate_name == "agent"
+        assert inference_result.candidate_name is None
         assert inference_result.gcs_source is None
 
     @mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader")
@@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
                 }
             ),
         )
-        assert inference_result.candidate_name == "agent"
+        assert inference_result.candidate_name is None
         assert inference_result.gcs_source is None
 
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 341ae88a7f..bd43229bd9 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -798,7 +798,6 @@ def _execute_inference(
 
         evaluation_dataset = types.EvaluationDataset(
             eval_dataset_df=results_df,
-            candidate_name="agent",
         )
     else:
         raise ValueError("Either model or agent_engine must be provided.")
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index 340bf72965..337abaaae8 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
 
         intermediate_events: Optional[list[types.Event]] = None
         if intermediate_events_data:
-            logger.warning(
-                "intermediate_events attribute is experimental and may change in "
-                "future versions."
-            )
             if isinstance(intermediate_events_data, list):
                 intermediate_events = []
                 for event in intermediate_events_data:
diff --git a/vertexai/_genai/_evals_visualization.py b/vertexai/_genai/_evals_visualization.py
index 0436d893ae..7b7e6174c7 100644
--- a/vertexai/_genai/_evals_visualization.py
+++ b/vertexai/_genai/_evals_visualization.py
@@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str:
 
             // If we have agent info, render as trace
             if(agentInfo) {{
-                let traceHtml = `
+            Error:
+            {error_message}
+
+        """
+
+    return f"""
+        Status: {status}
+        {error_html}
+