105 changes: 105 additions & 0 deletions tests/unit/vertexai/genai/test_evals.py
@@ -33,6 +33,7 @@
from vertexai._genai import evals
from vertexai._genai import types as vertexai_genai_types
from google.genai import client
from google.genai import errors as genai_errors
from google.genai import types as genai_types
import pandas as pd
import pytest
@@ -4861,6 +4862,110 @@ def test_execute_evaluation_adds_creation_timestamp(
assert result.metadata is not None
assert result.metadata.creation_timestamp == mock_now

@mock.patch(
"vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
frozenset(["summarization_quality"]),
)
@mock.patch("time.sleep", return_value=None)
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
def test_predefined_metric_retry_on_resource_exhausted(
self,
mock_private_evaluate_instances,
mock_sleep,
mock_api_client_fixture,
):
dataset_df = pd.DataFrame(
[{"prompt": "Test prompt", "response": "Test response"}]
)
input_dataset = vertexai_genai_types.EvaluationDataset(
eval_dataset_df=dataset_df
)
metric = vertexai_genai_types.Metric(name="summarization_quality")
metric_result = vertexai_genai_types.MetricResult(
score=0.9,
explanation="Mocked predefined explanation",
rubric_verdicts=[],
error=None,
)
error_response_json = {
"error": {
"code": 429,
"message": ("Judge model resource exhausted. Please try again later."),
"status": "RESOURCE_EXHAUSTED",
}
}
mock_private_evaluate_instances.side_effect = [
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
vertexai_genai_types.EvaluateInstancesResponse(
metric_results=[metric_result]
),
]

result = _evals_common._execute_evaluation(
api_client=mock_api_client_fixture,
dataset=input_dataset,
metrics=[metric],
)

assert mock_private_evaluate_instances.call_count == 3
assert mock_sleep.call_count == 2
assert len(result.summary_metrics) == 1
summary_metric = result.summary_metrics[0]
assert summary_metric.metric_name == "summarization_quality"
assert summary_metric.mean_score == 0.9

@mock.patch(
"vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
frozenset(["summarization_quality"]),
)
@mock.patch("time.sleep", return_value=None)
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
def test_predefined_metric_retry_fail_on_resource_exhausted(
self,
mock_private_evaluate_instances,
mock_sleep,
mock_api_client_fixture,
):
dataset_df = pd.DataFrame(
[{"prompt": "Test prompt", "response": "Test response"}]
)
input_dataset = vertexai_genai_types.EvaluationDataset(
eval_dataset_df=dataset_df
)
error_response_json = {
"error": {
"code": 429,
"message": ("Judge model resource exhausted. Please try again later."),
"status": "RESOURCE_EXHAUSTED",
}
}
metric = vertexai_genai_types.Metric(name="summarization_quality")
mock_private_evaluate_instances.side_effect = [
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
]

result = _evals_common._execute_evaluation(
api_client=mock_api_client_fixture,
dataset=input_dataset,
metrics=[metric],
)

assert mock_private_evaluate_instances.call_count == 3
assert mock_sleep.call_count == 2
assert len(result.summary_metrics) == 1
summary_metric = result.summary_metrics[0]
assert summary_metric.metric_name == "summarization_quality"
assert summary_metric.mean_score is None
assert summary_metric.num_cases_error == 1
assert (
"Judge model resource exhausted after 3 retries"
) in result.eval_case_results[0].response_candidate_results[0].metric_results[
"summarization_quality"
].error_message


class TestEvaluationDataset:
"""Contains set of tests for the EvaluationDataset class methods."""
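An illustrative aside (not part of the PR's diff): the two tests above pin down the retry arithmetic. With `_MAX_RETRIES = 3`, two 429 responses followed by a success yield three calls to `_evaluate_instances` and two sleeps; three consecutive 429s yield three calls, two sleeps, and an error result instead of an exception. A minimal sketch of the backoff schedule, assuming the `2**attempt` delay used in the handler:

```python
# Illustrative sketch only: reproduces the backoff arithmetic the tests assert.
def backoff_schedule(max_retries: int = 3) -> list[int]:
    """Seconds slept between consecutive failed attempts."""
    # The final attempt returns an error result instead of sleeping,
    # so only max_retries - 1 delays are ever observed.
    return [2**attempt for attempt in range(max_retries - 1)]


assert backoff_schedule() == [1, 2]  # matches mock_sleep.call_count == 2
```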
30 changes: 27 additions & 3 deletions vertexai/_genai/_evals_metric_handlers.py
@@ -20,8 +20,10 @@
import json
import logging
import statistics
import time
from typing import Any, Callable, Optional, TypeVar, Union

from google.genai import errors as genai_errors
from google.genai import _common
from google.genai import types as genai_types
from tqdm import tqdm
@@ -34,6 +36,7 @@


logger = logging.getLogger(__name__)
_MAX_RETRIES = 3


def _extract_text_from_content(
@@ -964,9 +967,30 @@ def get_metric_result(
metric_name = self.metric.name
try:
payload = self._build_request_payload(eval_case, response_index)
api_response = self.module._evaluate_instances(
metrics=[self.metric], instance=payload.get("instance")
)
for attempt in range(_MAX_RETRIES):
try:
api_response = self.module._evaluate_instances(
metrics=[self.metric], instance=payload.get("instance")
)
break
except genai_errors.ClientError as e:
if e.code == 429:
logger.warning(
"Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
" seconds...",
attempt + 1,
_MAX_RETRIES,
e,
2**attempt,
)
if attempt == _MAX_RETRIES - 1:
return types.EvalCaseMetricResult(
metric_name=metric_name,
error_message=f"Judge model resource exhausted after {_MAX_RETRIES} retries: {e}",
)
time.sleep(2**attempt)
else:
raise e

if (
api_response
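Stripped of the handler context, the hunk above implements a bounded retry with exponential backoff that converts a persistent HTTP 429 into an error result rather than a raised exception. A self-contained sketch of the same pattern, with the hypothetical `call_judge` standing in for `Evals._evaluate_instances`:

```python
import time

_MAX_RETRIES = 3


class ClientError(Exception):
    """Stand-in for google.genai.errors.ClientError, carrying an HTTP code."""

    def __init__(self, code: int, message: str = ""):
        super().__init__(message)
        self.code = code


def evaluate_with_retry(call_judge):
    """Call `call_judge` with up to _MAX_RETRIES attempts on HTTP 429.

    `call_judge` is a hypothetical zero-argument callable; non-429 client
    errors are re-raised immediately, mirroring the handler's behavior.
    """
    for attempt in range(_MAX_RETRIES):
        try:
            return call_judge()
        except ClientError as e:
            if e.code != 429:
                raise
            if attempt == _MAX_RETRIES - 1:
                # Last attempt: surface an error payload instead of raising,
                # as get_metric_result does with EvalCaseMetricResult.
                return {
                    "error_message": (
                        f"Judge model resource exhausted after"
                        f" {_MAX_RETRIES} retries: {e}"
                    )
                }
            time.sleep(2**attempt)  # exponential backoff: 1s, then 2s
```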