From d62afc32db85a103c25878dd82a338feb86f53fe Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Fri, 24 Oct 2025 14:31:00 -0700 Subject: [PATCH] feat: GenAI Client(evals) - Add agent data to EvaluationRun `show` in Vertex AI GenAI SDK evals PiperOrigin-RevId: 823667266 --- tests/unit/vertexai/genai/replays/conftest.py | 12 +- .../genai/replays/test_get_evaluation_run.py | 241 ++++++++---------- vertexai/_genai/_evals_common.py | 104 ++++++-- vertexai/_genai/_evals_constant.py | 14 + vertexai/_genai/_gcs_utils.py | 7 +- vertexai/_genai/evals.py | 121 +++++++-- 6 files changed, 313 insertions(+), 186 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/conftest.py b/tests/unit/vertexai/genai/replays/conftest.py index eafc155e13..b6319cf1ee 100644 --- a/tests/unit/vertexai/genai/replays/conftest.py +++ b/tests/unit/vertexai/genai/replays/conftest.py @@ -133,10 +133,14 @@ def _get_replay_id(use_vertex: bool, replays_prefix: str) -> str: ) EVAL_ITEM_REQUEST_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/request_" EVAL_ITEM_RESULT_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/result_" +EVAL_ITEM_REQUEST_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/request_" +EVAL_ITEM_RESULT_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/result_" EVAL_GCS_URI_ITEMS = { EVAL_CONFIG_GCS_URI: "test_resources/mock_eval_config.yaml", EVAL_ITEM_REQUEST_GCS_URI: "test_resources/request_4813679498589372416.json", EVAL_ITEM_RESULT_GCS_URI: "test_resources/result_1486082323915997184.json", + EVAL_ITEM_REQUEST_GCS_URI_2: "test_resources/request_4813679498589372416.json", + EVAL_ITEM_RESULT_GCS_URI_2: "test_resources/result_1486082323915997184.json", } @@ -148,11 +152,15 @@ def _mock_read_file_contents_side_effect(uri: str): current_dir = os.path.dirname(__file__) if uri in EVAL_GCS_URI_ITEMS: local_mock_file_path = os.path.join(current_dir, EVAL_GCS_URI_ITEMS[uri]) - elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI): + elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI) or uri.startswith( + EVAL_ITEM_REQUEST_GCS_URI_2 + ): local_mock_file_path = os.path.join( current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_REQUEST_GCS_URI] ) - elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI): + elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI) or uri.startswith( + EVAL_ITEM_RESULT_GCS_URI_2 + ): local_mock_file_path = os.path.join( current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_RESULT_GCS_URI] ) diff --git a/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py index 5236c0ab7d..3db3bea517 100644 --- a/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_get_evaluation_run.py @@ -16,31 +16,34 @@ from tests.unit.vertexai.genai.replays import pytest_helper from vertexai import types +from google.genai import types as genai_types import datetime import pytest def test_get_eval_run(client): """Tests that get_evaluation_run() returns a correctly structured EvaluationRun.""" + client._api_client._http_options.api_version = "v1beta1" evaluation_run_name = ( - "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808" + "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480" ) evaluation_run = client.evals.get_evaluation_run( name=evaluation_run_name, include_evaluation_items=True ) - check_run_1957799200510967808(client, evaluation_run, evaluation_run_name) - check_run_1957799200510967808_evaluation_item_results( + check_run_5133048044039700480(client, evaluation_run, evaluation_run_name) + check_run_5133048044039700480_evaluation_item_results( client, evaluation_run, evaluation_run_name ) def test_get_eval_run_include_evaluation_items_false(client): """Tests that get_evaluation_run() returns a correctly structured EvaluationRun.""" + client._api_client._http_options.api_version = "v1beta1" evaluation_run_name = ( - "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808" + "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480" ) evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name) - check_run_1957799200510967808(client, evaluation_run, evaluation_run_name) + check_run_5133048044039700480(client, evaluation_run, evaluation_run_name) assert evaluation_run.evaluation_item_results is None @@ -99,158 +102,142 @@ def test_get_eval_run_eval_set_source(client): @pytest.mark.asyncio async def test_get_eval_run_async(client): """Tests that get_evaluation_run() returns a correctly structured EvaluationRun.""" - eval_run_id = "1957799200510967808" + client._api_client._http_options.api_version = "v1beta1" + eval_run_id = "5133048044039700480" evaluation_run_name = ( f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}" ) evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id) - check_run_1957799200510967808(client, evaluation_run, evaluation_run_name) + check_run_5133048044039700480(client, evaluation_run, evaluation_run_name) assert evaluation_run.evaluation_item_results is None -def check_run_1957799200510967808( +def check_run_5133048044039700480( client, evaluation_run: types.EvaluationRun, evaluation_run_name: str ): assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.name == evaluation_run_name - assert evaluation_run.display_name == "test2" - assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"} + assert evaluation_run.display_name == "sdk-test-1" + assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"} assert evaluation_run.create_time == datetime.datetime( - 2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc + 2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc ) assert evaluation_run.completion_time == datetime.datetime( - 2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc + 2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc ) assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED assert evaluation_run.evaluation_set_snapshot == ( - "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200" + "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184" ) - assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet( - uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b", - prompt_column="request", - candidate_response_columns={ - "baseline_model_response": "baseline_model_response", - "checkpoint_1": "checkpoint_1", - "checkpoint_2": "checkpoint_2", - }, + assert ( + evaluation_run.data_source.evaluation_set + == "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184" ) assert evaluation_run.evaluation_run_results.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120" + "projects/503583131166/locations/us-central1/evaluationSets/129513673658990592" ) assert evaluation_run.inference_configs == { - "checkpoint_1": types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624" - ), - "checkpoint_2": types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760" + "gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction={ + "parts": [{"text": "example agent developer instruction"}] + }, + tools=[ + genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="check_chime", + description="Check chime.", + parameters={ + "type": "OBJECT", + "properties": { + "nums": { + "type": "STRING", + "description": "List of numbers to be verified.", + } + }, + "required": ["nums"], + }, + ), + ], + ) + ], + ) ), } assert evaluation_run.evaluation_run_results.summary_metrics == ( types.SummaryMetric( metrics={ - "checkpoint_1/user_defined/MODE": 5, - "checkpoint_2/universal/P90": 1, - "gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249, - "gemini-2.0-flash-001@default/user_defined/P90": 5, - "gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889, - "gemini-2.0-flash-001@default/user_defined/P95": 5, - "checkpoint_1/universal/MINIMUM": 0.8571428656578064, - "checkpoint_1/universal/VARIANCE": 0.0015452162403157982, - "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855, - "checkpoint_2/user_defined/P95": 5, - "checkpoint_2/universal/MODE": 1, - "checkpoint_2/user_defined/P90": 5, - "checkpoint_2/universal/P99": 1, + "gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077, + "gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1, + "gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842, + "gemini-2.0-flash-001@default/universal/P90": 1, + "gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1, + "gemini-2.0-flash-001@default/universal/P95": 1, + "gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077, + "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675, + "gemini-2.0-flash-001@default/universal/MEDIAN": 1, + "gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675, + "gemini-2.0-flash-001@default/universal/MODE": 1, + "gemini-2.0-flash-001@default/safety_v1/MODE": 1, + "gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408, + "gemini-2.0-flash-001@default/safety_v1/P90": 1, + "gemini-2.0-flash-001@default/safety_v1/P95": 1, + "gemini-2.0-flash-001@default/universal/P99": 1, + "gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842, + "gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408, "gemini-2.0-flash-001@default/universal/MAXIMUM": 1, - "checkpoint_2/universal/P95": 1, - "checkpoint_2/user_defined/P99": 5, - "checkpoint_2/universal/MINIMUM": 0.7777777910232544, - "gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255, - "checkpoint_1/universal/AVERAGE": 0.986633250587865, - "checkpoint_1/universal/MAXIMUM": 1, - "checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714, - "gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645, - "gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5, - "gemini-2.0-flash-001@default/user_defined/MINIMUM": 3, - "gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886, - "checkpoint_2/user_defined/MAXIMUM": 5, - "checkpoint_1/universal/MEDIAN": 1, - "gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128, - "gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158, - "gemini-2.0-flash-001@default/user_defined/MEDIAN": 5, - "checkpoint_2/user_defined/AVERAGE": 5, - "checkpoint_2/user_defined/MEDIAN": 5, - "checkpoint_2/user_defined/STANDARD_DEVIATION": 0, - "checkpoint_2/universal/MAXIMUM": 1, - "checkpoint_1/universal/MODE": 1, - "checkpoint_2/user_defined/MINIMUM": 5, - "checkpoint_1/user_defined/VARIANCE": 0, - "checkpoint_2/universal/VARIANCE": 0.005771725970062436, - "checkpoint_2/universal/AVERAGE": 0.9438178790243048, - "checkpoint_1/user_defined/MINIMUM": 5, - "gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929, - "gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096, - "checkpoint_2/user_defined/VARIANCE": 0, - "checkpoint_1/user_defined/MEDIAN": 5, - "checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561, - "checkpoint_1/user_defined/AVERAGE": 5, - "checkpoint_1/user_defined/MAXIMUM": 5, - "gemini-2.0-flash-001@default/user_defined/MODE": 5, - "checkpoint_1/user_defined/P95": 5, - "checkpoint_1/universal/P99": 1, - "checkpoint_1/user_defined/P90": 5, - "checkpoint_2/universal/MEDIAN": 1, - "checkpoint_1/universal/P95": 1, - "checkpoint_1/user_defined/STANDARD_DEVIATION": 0, - "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245, - "checkpoint_1/user_defined/P99": 5, - "gemini-2.0-flash-001@default/universal/MODE": [ - 0.75, - 0.8571428656578064, - ], - "checkpoint_2/user_defined/MODE": 5, - "checkpoint_1/universal/P90": 1, - "gemini-2.0-flash-001@default/user_defined/P99": 5, + "gemini-2.0-flash-001@default/safety_v1/P99": 1, }, - total_items=19, + total_items=3, ) ) assert evaluation_run.error is None -def check_run_1957799200510967808_evaluation_item_results( +def check_run_5133048044039700480_evaluation_item_results( client, evaluation_run: types.EvaluationRun, evaluation_run_name: str ): eval_result = evaluation_run.evaluation_item_results assert isinstance(eval_result, types.EvaluationResult) assert eval_result.summary_metrics == [ types.AggregatedMetricResult( - metric_name="checkpoint_1/universal", - mean_score=0.986633250587865, - stdev_score=0.0393092386127714, + metric_name="safety_v1", + mean_score=0.7888888915379842, + stdev_score=0.2991758188061675, ), types.AggregatedMetricResult( - metric_name="checkpoint_2/universal", - mean_score=0.9438178790243048, - stdev_score=0.07597187617837561, - ), - types.AggregatedMetricResult( - metric_name="gemini-2.0-flash-001@default/universal", - mean_score=0.6943817985685249, - stdev_score=0.17738341388587855, - ), - types.AggregatedMetricResult( - metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0 - ), - types.AggregatedMetricResult( - metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0 - ), - types.AggregatedMetricResult( - metric_name="gemini-2.0-flash-001@default/user_defined", - mean_score=4.736842105263158, - stdev_score=0.6359497880839245, + metric_name="universal", + mean_score=0.7888888915379842, + stdev_score=0.2991758188061675, ), ] + # Check the agent info. + assert eval_result.agent_info == types.evals.AgentInfo( + name="gemini-2.0-flash-001@default", + instruction="example agent developer instruction", + description=None, + tool_declarations=[ + genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="check_chime", + description="Check chime.", + parameters={ + "type": "OBJECT", + "properties": { + "nums": { + "type": "STRING", + "description": "List of numbers to be verified.", + } + }, + "required": ["nums"], + }, + ), + ], + ) + ], + ) # Check the first eval case result. eval_case_result = eval_result.eval_case_results[0] assert isinstance(eval_case_result, types.EvalCaseResult) @@ -264,26 +251,24 @@ def check_run_1957799200510967808_evaluation_item_results( assert universal_metric_result.explanation is None # Check the first rubric verdict. rubric_verdict_0 = universal_metric_result.rubric_verdicts[0] - assert rubric_verdict_0 == ( - types.RubricVerdict( - evaluated_rubric=types.Rubric( - content=types.RubricContent( - property=types.RubricContentProperty( - description="The response is in English." - ) - ), - importance="HIGH", - type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE", - ), - reasoning=("The entire response is written in the English language."), - verdict=True, - ) + assert isinstance(rubric_verdict_0, types.RubricVerdict) + assert rubric_verdict_0.evaluated_rubric == types.Rubric( + content=types.RubricContent( + property=types.RubricContentProperty( + description="The response is in English." + ) + ), + importance="HIGH", + type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE", ) + assert rubric_verdict_0.reasoning is not None + assert rubric_verdict_0.verdict is True # Check the first evaluation dataset. eval_dataset = eval_result.evaluation_dataset[0] assert isinstance(eval_dataset, types.EvaluationDataset) assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default" - assert eval_dataset.eval_dataset_df.shape == (19, 3) + assert eval_dataset.eval_dataset_df.shape[0] == 3 + assert eval_dataset.eval_dataset_df.shape[1] > 3 pytestmark = pytest_helper.setup( diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 89357d9a1d..8388784272 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -14,6 +14,7 @@ # """Common utilities for evals.""" import asyncio +import base64 import collections import concurrent.futures import datetime @@ -1114,7 +1115,7 @@ def _execute_evaluation( validated_agent_info = agent_info else: raise TypeError( - f"agent_info values must be of type types.AgentInfo or dict, but got {type(agent_info)}'" + f"agent_info values must be of type types.evals.AgentInfo or dict, but got {type(agent_info)}'" ) processed_eval_dataset, num_response_candidates = _resolve_dataset_inputs( @@ -1395,7 +1396,7 @@ def _get_aggregated_metrics( return [ types.AggregatedMetricResult( - metric_name=name, + metric_name=name.split("/")[-1], mean_score=values.get("AVERAGE"), stdev_score=values.get("STANDARD_DEVIATION"), ) @@ -1434,10 +1435,23 @@ def _convert_request_to_dataset_row( ) -> dict[str, Any]: """Converts an EvaluationItemRequest to a dictionary.""" dict_row = {} - dict_row["prompt"] = request.prompt.text if request.prompt.text else None - dict_row["reference"] = request.golden_response - for candidate in request.candidate_responses: - dict_row[candidate.candidate] = candidate.text if candidate.text else None + dict_row[_evals_constant.PROMPT] = ( + request.prompt.text if request.prompt.text else None + ) + dict_row[_evals_constant.REFERENCE] = request.golden_response + intermediate_events = [] + if request.candidate_responses: + for candidate in request.candidate_responses: + dict_row[candidate.candidate] = candidate.text if candidate.text else None + if candidate.events: + for event in candidate.events: + content_dict = {"parts": event.parts, "role": event.role} + int_events_dict = { + "event_id": candidate.candidate, + "content": content_dict, + } + intermediate_events.append(int_events_dict) + dict_row[_evals_constant.INTERMEDIATE_EVENTS] = intermediate_events return dict_row @@ -1451,15 +1465,14 @@ def _transform_dataframe(rows: list[dict[str, Any]]) -> list[types.EvaluationDat A list of EvaluationDatasets, one for each candidate. """ df = pd.DataFrame(rows) - exclude_cols = ["prompt", "reference"] - candidates = [col for col in df.columns if col not in exclude_cols] + candidates = [ + col for col in df.columns if col not in _evals_constant.COMMON_DATASET_COLUMNS + ] eval_dfs = [ types.EvaluationDataset( candidate_name=candidate, - eval_dataset_df=df[["prompt", "reference", candidate]].rename( - columns={candidate: "response"} - ), + eval_dataset_df=df.rename(columns={candidate: _evals_constant.RESPONSE}), ) for candidate in candidates ] @@ -1487,7 +1500,6 @@ def _get_eval_cases_eval_dfs_from_eval_items( eval_item and eval_item.evaluation_response and eval_item.evaluation_response.request - and eval_item.evaluation_response.candidate_results ): eval_case_results.append( _get_eval_case_result_from_eval_item(index, eval_item) @@ -1499,9 +1511,37 @@ def _get_eval_cases_eval_dfs_from_eval_items( return eval_case_results, eval_dfs +def _get_agent_info_from_inference_configs( + candidate_names: list[str], + inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None, +) -> Optional[types.evals.AgentInfo]: + """Retrieves an AgentInfo from the inference configs.""" + # TODO(lakeyk): Support multiple agents. + if not ( + inference_configs + and candidate_names + and candidate_names[0] in inference_configs + and inference_configs[candidate_names[0]].agent_config + ): + return None + if len(inference_configs.keys()) > 1: + logger.warning( + "Multiple agents are not supported yet. Displaying the first agent." + ) + agent_config = inference_configs[candidate_names[0]].agent_config + di = agent_config.developer_instruction + instruction = di.parts[0].text if di and di.parts and di.parts[0].text else None + return types.evals.AgentInfo( + name=candidate_names[0], + instruction=instruction, + tool_declarations=agent_config.tools, + ) + + def _get_eval_result_from_eval_items( results: types.EvaluationRunResults, eval_items: list[types.EvaluationItem], + inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None, ) -> types.EvaluationResult: """Retrieves an EvaluationResult from the EvaluationRunResults. @@ -1525,6 +1565,9 @@ def _get_eval_result_from_eval_items( metadata=types.EvaluationRunMetadata( candidate_names=candidate_names, ), + agent_info=_get_agent_info_from_inference_configs( + candidate_names, inference_configs + ), ) return eval_result @@ -1532,6 +1575,7 @@ def _get_eval_result_from_eval_items( def _convert_evaluation_run_results( api_client: BaseApiClient, evaluation_run_results: types.EvaluationRunResults, + inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None, ) -> list[types.EvaluationItem]: """Retrieves an EvaluationItem from the EvaluationRunResults.""" if not evaluation_run_results or not evaluation_run_results.evaluation_set: @@ -1548,12 +1592,15 @@ def _convert_evaluation_run_results( evals_module.get_evaluation_item(name=item_name) for item_name in eval_set.evaluation_items ] - return _get_eval_result_from_eval_items(evaluation_run_results, eval_items) + return _get_eval_result_from_eval_items( + evaluation_run_results, eval_items, inference_configs + ) async def _convert_evaluation_run_results_async( api_client: BaseApiClient, evaluation_run_results: types.EvaluationRunResults, + inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None, ) -> list[types.EvaluationItem]: """Retrieves an EvaluationItem from the EvaluationRunResults.""" if not evaluation_run_results or not evaluation_run_results.evaluation_set: @@ -1571,7 +1618,9 @@ async def _convert_evaluation_run_results_async( for eval_item in eval_set.evaluation_items ] eval_items = await asyncio.gather(*tasks) - return _get_eval_result_from_eval_items(evaluation_run_results, eval_items) + return _get_eval_result_from_eval_items( + evaluation_run_results, eval_items, inference_configs + ) def _object_to_dict(obj) -> dict[str, Any]: @@ -1587,6 +1636,8 @@ def _object_to_dict(obj) -> dict[str, Any]: result[key] = value elif isinstance(value, (list, tuple)): result[key] = [_object_to_dict(item) for item in value] + elif isinstance(value, bytes): + result[key] = base64.b64encode(value).decode("utf-8") elif hasattr(value, "__dict__"): # Nested object result[key] = _object_to_dict(value) else: @@ -1604,29 +1655,30 @@ def _create_evaluation_set_from_dataframe( eval_item_requests = [] for _, row in eval_df.iterrows(): intermediate_events = [] - if "intermediate_events" in row: - for event in row["intermediate_events"]: - intermediate_events.append( - genai_types.Content( - parts=event["content"]["parts"], role=event["content"]["role"] - ) - ) + if ( + _evals_constant.INTERMEDIATE_EVENTS in row + and isinstance(row[_evals_constant.INTERMEDIATE_EVENTS], list) + and len(row[_evals_constant.INTERMEDIATE_EVENTS]) > 0 + ): + for event in row[_evals_constant.INTERMEDIATE_EVENTS]: + if "content" in event: + intermediate_events.append(event["content"]) eval_item_requests.append( types.EvaluationItemRequest( prompt=( - types.EvaluationPrompt(text=row["prompt"]) - if "prompt" in row + types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) + if _evals_constant.PROMPT in row else None ), golden_response=( - types.CandidateResponse(text=row["reference"]) - if "reference" in row + types.CandidateResponse(text=row[_evals_constant.REFERENCE]) + if _evals_constant.REFERENCE in row else None ), candidate_responses=[ types.CandidateResponse( candidate=candidate_name or "Candidate 1", - text=row.get("response", None), + text=row.get(_evals_constant.RESPONSE, None), events=( intermediate_events if len(intermediate_events) > 0 diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py index d82970d981..a92c8a70dc 100644 --- a/vertexai/_genai/_evals_constant.py +++ b/vertexai/_genai/_evals_constant.py @@ -46,3 +46,17 @@ ) INTERMEDIATE_EVENTS = "intermediate_events" RESPONSE = "response" +PROMPT = "prompt" +REFERENCE = "reference" +SESSION_INPUT = "session_inputs" +CONTEXT = "context" + +COMMON_DATASET_COLUMNS = frozenset( + { + INTERMEDIATE_EVENTS, + PROMPT, + REFERENCE, + SESSION_INPUT, + CONTEXT, + } +) diff --git a/vertexai/_genai/_gcs_utils.py b/vertexai/_genai/_gcs_utils.py index 8e8363dfc8..021d9d0051 100644 --- a/vertexai/_genai/_gcs_utils.py +++ b/vertexai/_genai/_gcs_utils.py @@ -16,12 +16,12 @@ import io import json import logging -import time from typing import Any, Union from google.cloud import storage # type: ignore[attr-defined] from google.genai._api_client import BaseApiClient import pandas as pd +import uuid logger = logging.getLogger(__name__) @@ -125,7 +125,7 @@ def upload_json_to_prefix( gcs_dest_prefix: str, filename_prefix: str = "data", ) -> str: - """Uploads a dictionary to a GCS prefix with a timestamped JSON filename. + """Uploads a dictionary to a GCS prefix with a UUID JSON filename. Args: data: The dictionary to upload. @@ -151,8 +151,7 @@ def upload_json_to_prefix( if user_prefix_path and not user_prefix_path.endswith("/"): user_prefix_path += "/" - timestamp = time.strftime("%Y%m%d-%H%M%S") - filename = f"{filename_prefix}_{timestamp}.json" + filename = f"{filename_prefix}_{uuid.uuid4()}.json" blob_name = f"{user_prefix_path}{filename}" diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index d3c5827468..765c28a41b 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -19,6 +19,7 @@ import logging from typing import Any, Callable, Optional, Union from urllib.parse import urlencode +import uuid from google.genai import _api_module from google.genai import _common @@ -1295,7 +1296,20 @@ def get_evaluation_run( include_evaluation_items: bool = False, config: Optional[types.GetEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: - """Retrieves an EvaluationRun from the resource name.""" + """Retrieves an EvaluationRun from the resource name. + Args: + name: The resource name of the EvaluationRun. Format: + `projects/{project}/locations/{location}/evaluationRuns/{evaluation_run}` + include_evaluation_items: Whether to include the evaluation items in the + response. + config: The optional configuration for the evaluation run. Must be a dict or + `types.GetEvaluationRunConfigOrDict` type. + + Returns: + The evaluation run. + Raises: + ValueError: If the name is empty or invalid. + """ if not name: raise ValueError("name cannot be empty.") if name.startswith("projects/"): @@ -1304,7 +1318,9 @@ def get_evaluation_run( if include_evaluation_items: result.evaluation_item_results = ( _evals_common._convert_evaluation_run_results( - self._api_client, result.evaluation_run_results + self._api_client, + result.evaluation_run_results, + result.inference_configs, ) ) return result @@ -1316,18 +1332,32 @@ def get_evaluation_run( def create_evaluation_run( self, *, - name: str, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + name: Optional[str] = None, display_name: Optional[str] = None, metrics: Optional[ list[types.EvaluationRunMetricOrDict] ] = None, # TODO: Make required unified metrics available in prod. - agent_info: Optional[types.evals.AgentInfo] = None, + agent_info: Optional[types.evals.AgentInfoOrDict] = None, labels: Optional[dict[str, str]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: - """Creates an EvaluationRun.""" + """Creates an EvaluationRun. + + Args: + dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset. + dest: The GCS URI prefix to write the evaluation results to. + name: The name of the evaluation run. + display_name: The display name of the evaluation run. + metrics: The list of metrics to evaluate. + agent_info: The agent info to evaluate. + labels: The labels to apply to the evaluation run. + config: The configuration for the evaluation run. + + Returns: + The created evaluation run. + """ if type(dataset).__name__ == "EvaluationDataset": logger.warning( "EvaluationDataset input is experimental and may change in future versions." @@ -1336,6 +1366,8 @@ def create_evaluation_run( raise ValueError( "EvaluationDataset must have eval_dataset_df populated." ) + if dataset.candidate_name is None and agent_info: + dataset.candidate_name = agent_info.name eval_set = _evals_common._create_evaluation_set_from_dataframe( self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name ) @@ -1354,6 +1386,15 @@ def create_evaluation_run( logger.warning( "The agent_info field is experimental and may change in future versions." ) + if isinstance(agent_info, dict): + agent_info = types.evals.AgentInfo.model_validate(agent_info) + if ( + not agent_info.agent + or len(agent_info.agent.split("reasoningEngines/")) != 2 + ): + raise ValueError( + "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}." + ) inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig( agent_config=types.EvaluationRunAgentConfig( developer_instruction=genai_types.Content( @@ -1362,21 +1403,16 @@ def create_evaluation_run( tools=agent_info.tool_declarations, ) ) - if ( - not agent_info.agent - or len(agent_info.agent.split("reasoningEngines/")) != 2 - ): - raise ValueError( - "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}." - ) labels = labels or {} labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split( "reasoningEngines/" )[-1] + if not name: + name = f"evaluation_run_{uuid.uuid4()}" return self._create_evaluation_run( # type: ignore[no-any-return] name=name, - display_name=display_name, + display_name=display_name or name, data_source=dataset, evaluation_config=evaluation_config, inference_configs=inference_configs, @@ -2112,8 +2148,19 @@ async def get_evaluation_run( include_evaluation_items: bool = False, config: Optional[types.GetEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: - """ - Retrieves an EvaluationRun from the resource name. + """Retrieves the EvaluationRun from the resource name. + Args: + name: The resource name of the EvaluationRun. Format: + `projects/{project}/locations/{location}/evaluationRuns/{evaluation_run}` + include_evaluation_items: Whether to include the evaluation items in the + response. + config: The optional configuration for the evaluation run. Must be a dict or + `types.GetEvaluationRunConfigOrDict` type. + + Returns: + The evaluation run. + Raises: + ValueError: If the name is empty or invalid. """ if not name: raise ValueError("name cannot be empty.") @@ -2123,7 +2170,9 @@ async def get_evaluation_run( if include_evaluation_items: result.evaluation_item_results = ( await _evals_common._convert_evaluation_run_results_async( - self._api_client, result.evaluation_run_results + self._api_client, + result.evaluation_run_results, + result.inference_configs, ) ) @@ -2136,9 +2185,9 @@ async def get_evaluation_run( async def create_evaluation_run( self, *, - name: str, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + name: Optional[str] = None, display_name: Optional[str] = None, metrics: Optional[ list[types.EvaluationRunMetricOrDict] @@ -2147,7 +2196,21 @@ async def create_evaluation_run( labels: Optional[dict[str, str]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: - """Creates an EvaluationRun.""" + """Creates an EvaluationRun. + + Args: + dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset. + dest: The GCS URI prefix to write the evaluation results to. + name: The name of the evaluation run. + display_name: The display name of the evaluation run. + metrics: The list of metrics to evaluate. + agent_info: The agent info to evaluate. + labels: The labels to apply to the evaluation run. + config: The configuration for the evaluation run. + + Returns: + The created evaluation run. + """ if type(dataset).__name__ == "EvaluationDataset": logger.warning( "EvaluationDataset input is experimental and may change in future versions." @@ -2156,6 +2219,8 @@ async def create_evaluation_run( raise ValueError( "EvaluationDataset must have eval_dataset_df populated." ) + if dataset.candidate_name is None and agent_info: + dataset.candidate_name = agent_info.name eval_set = _evals_common._create_evaluation_set_from_dataframe( self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name ) @@ -2174,6 +2239,15 @@ async def create_evaluation_run( logger.warning( "The agent_info field is experimental and may change in future versions." ) + if isinstance(agent_info, dict): + agent_info = types.evals.AgentInfo.model_validate(agent_info) + if ( + not agent_info.agent + or len(agent_info.agent.split("reasoningEngines/")) != 2 + ): + raise ValueError( + "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}." + ) inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig( agent_config=types.EvaluationRunAgentConfig( developer_instruction=genai_types.Content( @@ -2182,21 +2256,16 @@ async def create_evaluation_run( tools=agent_info.tool_declarations, ) ) - if ( - not agent_info.agent - or len(agent_info.agent.split("reasoningEngines/")) != 2 - ): - raise ValueError( - "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}." - ) labels = labels or {} labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split( "reasoningEngines/" )[-1] + if not name: + name = f"evaluation_run_{uuid.uuid4()}" result = await self._create_evaluation_run( # type: ignore[no-any-return] name=name, - display_name=display_name, + display_name=display_name or name, data_source=dataset, evaluation_config=evaluation_config, inference_configs=inference_configs,