From 9a46e67a8c341673b14bece88bc635b455314711 Mon Sep 17 00:00:00 2001
From: A Vertex SDK engineer
Date: Sat, 1 Nov 2025 18:35:27 -0700
Subject: [PATCH] feat: GenAI Client(evals) - Add retry to predefined metric

PiperOrigin-RevId: 826984457
---
 tests/unit/vertexai/genai/test_evals.py   | 105 ++++++++++++++++++++++
 vertexai/_genai/_evals_metric_handlers.py |  30 ++++++-
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index dec2bb447d..bcfd8cc888 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -33,6 +33,7 @@
 from vertexai._genai import evals
 from vertexai._genai import types as vertexai_genai_types
 from google.genai import client
+from google.genai import errors as genai_errors
 from google.genai import types as genai_types
 import pandas as pd
 import pytest
@@ -4861,6 +4862,110 @@ def test_execute_evaluation_adds_creation_timestamp(
         assert result.metadata is not None
         assert result.metadata.creation_timestamp == mock_now
 
+    @mock.patch(
+        "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
+        frozenset(["summarization_quality"]),
+    )
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
+    def test_predefined_metric_retry_on_resource_exhausted(
+        self,
+        mock_private_evaluate_instances,
+        mock_sleep,
+        mock_api_client_fixture,
+    ):
+        dataset_df = pd.DataFrame(
+            [{"prompt": "Test prompt", "response": "Test response"}]
+        )
+        input_dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=dataset_df
+        )
+        metric = vertexai_genai_types.Metric(name="summarization_quality")
+        metric_result = vertexai_genai_types.MetricResult(
+            score=0.9,
+            explanation="Mocked predefined explanation",
+            rubric_verdicts=[],
+            error=None,
+        )
+        error_response_json = {
+            "error": {
+                "code": 429,
+                "message": ("Judge model resource exhausted. Please try again later."),
+                "status": "RESOURCE_EXHAUSTED",
+            }
+        }
+        mock_private_evaluate_instances.side_effect = [
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            vertexai_genai_types.EvaluateInstancesResponse(
+                metric_results=[metric_result]
+            ),
+        ]
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=input_dataset,
+            metrics=[metric],
+        )
+
+        assert mock_private_evaluate_instances.call_count == 3
+        assert mock_sleep.call_count == 2
+        assert len(result.summary_metrics) == 1
+        summary_metric = result.summary_metrics[0]
+        assert summary_metric.metric_name == "summarization_quality"
+        assert summary_metric.mean_score == 0.9
+
+    @mock.patch(
+        "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
+        frozenset(["summarization_quality"]),
+    )
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
+    def test_predefined_metric_retry_fail_on_resource_exhausted(
+        self,
+        mock_private_evaluate_instances,
+        mock_sleep,
+        mock_api_client_fixture,
+    ):
+        dataset_df = pd.DataFrame(
+            [{"prompt": "Test prompt", "response": "Test response"}]
+        )
+        input_dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=dataset_df
+        )
+        error_response_json = {
+            "error": {
+                "code": 429,
+                "message": ("Judge model resource exhausted. Please try again later."),
+                "status": "RESOURCE_EXHAUSTED",
+            }
+        }
+        metric = vertexai_genai_types.Metric(name="summarization_quality")
+        mock_private_evaluate_instances.side_effect = [
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+        ]
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=input_dataset,
+            metrics=[metric],
+        )
+
+        assert mock_private_evaluate_instances.call_count == 3
+        assert mock_sleep.call_count == 2
+        assert len(result.summary_metrics) == 1
+        summary_metric = result.summary_metrics[0]
+        assert summary_metric.metric_name == "summarization_quality"
+        assert summary_metric.mean_score is None
+        assert summary_metric.num_cases_error == 1
+        assert (
+            "Judge model resource exhausted after 3 retries"
+        ) in result.eval_case_results[0].response_candidate_results[0].metric_results[
+            "summarization_quality"
+        ].error_message
+
 
 class TestEvaluationDataset:
     """Contains set of tests for the EvaluationDataset class methods."""

diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 9f68bc353d..eec98cfaa0 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -20,8 +20,10 @@
 import json
 import logging
 import statistics
+import time
 from typing import Any, Callable, Optional, TypeVar, Union
 
+from google.genai import errors as genai_errors
 from google.genai import _common
 from google.genai import types as genai_types
 from tqdm import tqdm
@@ -34,6 +36,7 @@
 
 logger = logging.getLogger(__name__)
 
+_MAX_RETRIES = 3
 
 
 def _extract_text_from_content(
@@ -964,9 +967,30 @@ def get_metric_result(
         metric_name = self.metric.name
         try:
             payload = self._build_request_payload(eval_case, response_index)
-            api_response = self.module._evaluate_instances(
-                metrics=[self.metric], instance=payload.get("instance")
-            )
+            for attempt in range(_MAX_RETRIES):
+                try:
+                    api_response = self.module._evaluate_instances(
+                        metrics=[self.metric], instance=payload.get("instance")
+                    )
+                    break
+                except genai_errors.ClientError as e:
+                    if e.code == 429:
+                        logger.warning(
+                            "Resource Exhausted error on attempt %d/%d: %s. Retrying"
+                            " in %s seconds...",
+                            attempt + 1,
+                            _MAX_RETRIES,
+                            e,
+                            2**attempt,
+                        )
+                        if attempt == _MAX_RETRIES - 1:
+                            return types.EvalCaseMetricResult(
+                                metric_name=metric_name,
+                                error_message=f"Judge model resource exhausted after {_MAX_RETRIES} retries: {e}",
+                            )
+                        time.sleep(2**attempt)
+                    else:
+                        raise e
             if (
                 api_response
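
Reviewer note, not part of the patch: the sketch below is a minimal, self-contained model of the retry loop this diff adds around `Evals._evaluate_instances` in `get_metric_result`, under stated assumptions. `QuotaError` and `call_judge` are hypothetical stand-ins for `genai_errors.ClientError` and the judge-model call; the real handler returns a `types.EvalCaseMetricResult` rather than a dict.

```python
# Reviewer sketch of the exponential-backoff retry the patch implements.
# QuotaError and call_judge are hypothetical stand-ins, not SDK APIs.
import logging
import time

logger = logging.getLogger(__name__)

_MAX_RETRIES = 3  # same constant the patch adds to _evals_metric_handlers.py


class QuotaError(Exception):
    """Stand-in for a client error carrying an HTTP status code."""

    def __init__(self, code: int, message: str) -> None:
        super().__init__(message)
        self.code = code


def evaluate_with_retry(call_judge):
    """Runs call_judge, retrying 429s with exponential backoff (1s, then 2s).

    After the final failed attempt it returns an error marker instead of
    raising, so one throttled metric fails in isolation rather than
    aborting the whole evaluation run.
    """
    for attempt in range(_MAX_RETRIES):
        try:
            return call_judge()
        except QuotaError as e:
            if e.code != 429:
                raise  # non-quota client errors are not retried
            logger.warning(
                "Resource exhausted on attempt %d/%d: %s",
                attempt + 1,
                _MAX_RETRIES,
                e,
            )
            if attempt == _MAX_RETRIES - 1:
                # Final attempt failed: surface an error result, not an exception.
                return {
                    "error_message": (
                        f"resource exhausted after {_MAX_RETRIES} retries: {e}"
                    )
                }
            time.sleep(2**attempt)  # back off 1s, then 2s


if __name__ == "__main__":
    # Two throttled calls followed by a success, mirroring the first unit test.
    responses = iter(
        [QuotaError(429, "try later"), QuotaError(429, "try later"), {"score": 0.9}]
    )

    def flaky_judge():
        item = next(responses)
        if isinstance(item, Exception):
            raise item
        return item

    print(evaluate_with_retry(flaky_judge))  # {'score': 0.9} after ~3s of backoff
```

With `_MAX_RETRIES = 3` the loop makes at most three calls and sleeps `2**attempt` seconds between them, which is exactly what the tests assert (`call_count == 3`, `sleep.call_count == 2`). Returning an error result on the final failure is the design choice that lets a persistently throttled judge call show up as `num_cases_error == 1` in the summary while the rest of the evaluation completes.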