12 changes: 10 additions & 2 deletions tests/unit/vertexai/genai/replays/conftest.py
@@ -133,10 +133,14 @@ def _get_replay_id(use_vertex: bool, replays_prefix: str) -> str:
)
EVAL_ITEM_REQUEST_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/request_"
EVAL_ITEM_RESULT_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/result_"
EVAL_ITEM_REQUEST_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/request_"
EVAL_ITEM_RESULT_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/result_"
EVAL_GCS_URI_ITEMS = {
EVAL_CONFIG_GCS_URI: "test_resources/mock_eval_config.yaml",
EVAL_ITEM_REQUEST_GCS_URI: "test_resources/request_4813679498589372416.json",
EVAL_ITEM_RESULT_GCS_URI: "test_resources/result_1486082323915997184.json",
EVAL_ITEM_REQUEST_GCS_URI_2: "test_resources/request_4813679498589372416.json",
EVAL_ITEM_RESULT_GCS_URI_2: "test_resources/result_1486082323915997184.json",
}


@@ -148,11 +152,15 @@ def _mock_read_file_contents_side_effect(uri: str):
current_dir = os.path.dirname(__file__)
if uri in EVAL_GCS_URI_ITEMS:
local_mock_file_path = os.path.join(current_dir, EVAL_GCS_URI_ITEMS[uri])
elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI):
elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI) or uri.startswith(
EVAL_ITEM_REQUEST_GCS_URI_2
):
local_mock_file_path = os.path.join(
current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_REQUEST_GCS_URI]
)
elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI):
elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI) or uri.startswith(
EVAL_ITEM_RESULT_GCS_URI_2
):
local_mock_file_path = os.path.join(
current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_RESULT_GCS_URI]
)
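
In the updated side effect, any URI beginning with either request prefix resolves to the same local request fixture, and likewise for results. A minimal standalone sketch of that dispatch pattern, with hypothetical prefixes and fixture paths standing in for the conftest constants. Note that str.startswith also accepts a tuple of prefixes, which would collapse the chained or into a single call:

import os

# Hypothetical stand-ins for the conftest constants.
REQUEST_PREFIXES = (
    "gs://example-bucket/agora_eval/request_",
    "gs://example-bucket/eval-data/request_",
)
REQUEST_FIXTURE = "test_resources/request.json"


def read_file_contents_side_effect(uri: str) -> str:
    """Resolve a GCS URI to a local mock file by prefix matching."""
    if uri.startswith(REQUEST_PREFIXES):  # tuple form of str.startswith
        local_path = os.path.join(os.path.dirname(__file__), REQUEST_FIXTURE)
    else:
        raise ValueError(f"no mock registered for {uri!r}")
    with open(local_path) as f:
        return f.read()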
241 changes: 113 additions & 128 deletions tests/unit/vertexai/genai/replays/test_get_evaluation_run.py
@@ -16,31 +16,34 @@

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
from google.genai import types as genai_types
import datetime
import pytest


def test_get_eval_run(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
evaluation_run_name = (
"projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
)
evaluation_run = client.evals.get_evaluation_run(
name=evaluation_run_name, include_evaluation_items=True
)
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
check_run_1957799200510967808_evaluation_item_results(
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
check_run_5133048044039700480_evaluation_item_results(
client, evaluation_run, evaluation_run_name
)


def test_get_eval_run_include_evaluation_items_false(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
evaluation_run_name = (
"projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
)
evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
assert evaluation_run.evaluation_item_results is None


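The two tests above differ only in the include_evaluation_items flag. If the replay framework tolerates it, they could be folded into one parametrized test; a sketch, assuming recorded replays can be shared across parameter values (replay-based suites often pin one replay file per test function, so this may not apply):

import pytest


@pytest.mark.parametrize("include_items", [True, False])
def test_get_eval_run_variants(client, include_items):
    client._api_client._http_options.api_version = "v1beta1"
    name = "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
    run = client.evals.get_evaluation_run(
        name=name, include_evaluation_items=include_items
    )
    check_run_5133048044039700480(client, run, name)
    if include_items:
        check_run_5133048044039700480_evaluation_item_results(client, run, name)
    else:
        assert run.evaluation_item_results is None
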
@@ -99,158 +102,142 @@ def test_get_eval_run_eval_set_source(client):
@pytest.mark.asyncio
async def test_get_eval_run_async(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
eval_run_id = "1957799200510967808"
client._api_client._http_options.api_version = "v1beta1"
eval_run_id = "5133048044039700480"
evaluation_run_name = (
f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
)
evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
assert evaluation_run.evaluation_item_results is None


def check_run_1957799200510967808(
def check_run_5133048044039700480(
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.name == evaluation_run_name
assert evaluation_run.display_name == "test2"
assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
assert evaluation_run.display_name == "sdk-test-1"
assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
assert evaluation_run.create_time == datetime.datetime(
2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
)
assert evaluation_run.completion_time == datetime.datetime(
2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
)
assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
assert evaluation_run.evaluation_set_snapshot == (
"projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
"projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
)
assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
prompt_column="request",
candidate_response_columns={
"baseline_model_response": "baseline_model_response",
"checkpoint_1": "checkpoint_1",
"checkpoint_2": "checkpoint_2",
},
assert (
evaluation_run.data_source.evaluation_set
== "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
)
assert evaluation_run.evaluation_run_results.evaluation_set == (
"projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
"projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
)
assert evaluation_run.inference_configs == {
"checkpoint_1": types.EvaluationRunInferenceConfig(
model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
),
"checkpoint_2": types.EvaluationRunInferenceConfig(
model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
"gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction={
"parts": [{"text": "example agent developer instruction"}]
},
tools=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="check_chime",
description="Check chime.",
parameters={
"type": "OBJECT",
"properties": {
"nums": {
"type": "STRING",
"description": "List of numbers to be verified.",
}
},
"required": ["nums"],
},
),
],
)
],
)
),
}
assert evaluation_run.evaluation_run_results.summary_metrics == (
types.SummaryMetric(
metrics={
"checkpoint_1/user_defined/MODE": 5,
"checkpoint_2/universal/P90": 1,
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
"gemini-2.0-flash-001@default/user_defined/P90": 5,
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889,
"gemini-2.0-flash-001@default/user_defined/P95": 5,
"checkpoint_1/universal/MINIMUM": 0.8571428656578064,
"checkpoint_1/universal/VARIANCE": 0.0015452162403157982,
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
"checkpoint_2/user_defined/P95": 5,
"checkpoint_2/universal/MODE": 1,
"checkpoint_2/user_defined/P90": 5,
"checkpoint_2/universal/P99": 1,
"gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
"gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
"gemini-2.0-flash-001@default/universal/P90": 1,
"gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
"gemini-2.0-flash-001@default/universal/P95": 1,
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
"gemini-2.0-flash-001@default/universal/MEDIAN": 1,
"gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
"gemini-2.0-flash-001@default/universal/MODE": 1,
"gemini-2.0-flash-001@default/safety_v1/MODE": 1,
"gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
"gemini-2.0-flash-001@default/safety_v1/P90": 1,
"gemini-2.0-flash-001@default/safety_v1/P95": 1,
"gemini-2.0-flash-001@default/universal/P99": 1,
"gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
"gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
"checkpoint_2/universal/P95": 1,
"checkpoint_2/user_defined/P99": 5,
"checkpoint_2/universal/MINIMUM": 0.7777777910232544,
"gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255,
"checkpoint_1/universal/AVERAGE": 0.986633250587865,
"checkpoint_1/universal/MAXIMUM": 1,
"checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
"gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645,
"gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5,
"gemini-2.0-flash-001@default/user_defined/MINIMUM": 3,
"gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886,
"checkpoint_2/user_defined/MAXIMUM": 5,
"checkpoint_1/universal/MEDIAN": 1,
"gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128,
"gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158,
"gemini-2.0-flash-001@default/user_defined/MEDIAN": 5,
"checkpoint_2/user_defined/AVERAGE": 5,
"checkpoint_2/user_defined/MEDIAN": 5,
"checkpoint_2/user_defined/STANDARD_DEVIATION": 0,
"checkpoint_2/universal/MAXIMUM": 1,
"checkpoint_1/universal/MODE": 1,
"checkpoint_2/user_defined/MINIMUM": 5,
"checkpoint_1/user_defined/VARIANCE": 0,
"checkpoint_2/universal/VARIANCE": 0.005771725970062436,
"checkpoint_2/universal/AVERAGE": 0.9438178790243048,
"checkpoint_1/user_defined/MINIMUM": 5,
"gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929,
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096,
"checkpoint_2/user_defined/VARIANCE": 0,
"checkpoint_1/user_defined/MEDIAN": 5,
"checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561,
"checkpoint_1/user_defined/AVERAGE": 5,
"checkpoint_1/user_defined/MAXIMUM": 5,
"gemini-2.0-flash-001@default/user_defined/MODE": 5,
"checkpoint_1/user_defined/P95": 5,
"checkpoint_1/universal/P99": 1,
"checkpoint_1/user_defined/P90": 5,
"checkpoint_2/universal/MEDIAN": 1,
"checkpoint_1/universal/P95": 1,
"checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
"gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245,
"checkpoint_1/user_defined/P99": 5,
"gemini-2.0-flash-001@default/universal/MODE": [
0.75,
0.8571428656578064,
],
"checkpoint_2/user_defined/MODE": 5,
"checkpoint_1/universal/P90": 1,
"gemini-2.0-flash-001@default/user_defined/P99": 5,
"gemini-2.0-flash-001@default/safety_v1/P99": 1,
},
total_items=19,
total_items=3,
)
)
assert evaluation_run.error is None


def check_run_1957799200510967808_evaluation_item_results(
def check_run_5133048044039700480_evaluation_item_results(
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
eval_result = evaluation_run.evaluation_item_results
assert isinstance(eval_result, types.EvaluationResult)
assert eval_result.summary_metrics == [
types.AggregatedMetricResult(
metric_name="checkpoint_1/universal",
mean_score=0.986633250587865,
stdev_score=0.0393092386127714,
metric_name="safety_v1",
mean_score=0.7888888915379842,
stdev_score=0.2991758188061675,
),
types.AggregatedMetricResult(
metric_name="checkpoint_2/universal",
mean_score=0.9438178790243048,
stdev_score=0.07597187617837561,
),
types.AggregatedMetricResult(
metric_name="gemini-2.0-flash-001@default/universal",
mean_score=0.6943817985685249,
stdev_score=0.17738341388587855,
),
types.AggregatedMetricResult(
metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0
),
types.AggregatedMetricResult(
metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0
),
types.AggregatedMetricResult(
metric_name="gemini-2.0-flash-001@default/user_defined",
mean_score=4.736842105263158,
stdev_score=0.6359497880839245,
metric_name="universal",
mean_score=0.7888888915379842,
stdev_score=0.2991758188061675,
),
]
# Check the agent info.
assert eval_result.agent_info == types.evals.AgentInfo(
name="gemini-2.0-flash-001@default",
instruction="example agent developer instruction",
description=None,
tool_declarations=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="check_chime",
description="Check chime.",
parameters={
"type": "OBJECT",
"properties": {
"nums": {
"type": "STRING",
"description": "List of numbers to be verified.",
}
},
"required": ["nums"],
},
),
],
)
],
)
# Check the first eval case result.
eval_case_result = eval_result.eval_case_results[0]
assert isinstance(eval_case_result, types.EvalCaseResult)
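
The summary-metric keys asserted above follow a flat "<candidate>/<metric>/<STATISTIC>" naming scheme (for example "gemini-2.0-flash-001@default/universal/AVERAGE"). A small helper for reading one statistic back out of that dict; the key layout is inferred from these assertions rather than from a documented contract:

def summary_stat(metrics: dict, candidate: str, metric: str, stat: str) -> float:
    """Fetch one statistic from the flat summary-metrics mapping."""
    return metrics[f"{candidate}/{metric}/{stat}"]


# e.g., against the run checked above:
# summary_stat(
#     evaluation_run.evaluation_run_results.summary_metrics.metrics,
#     "gemini-2.0-flash-001@default", "universal", "AVERAGE",
# )  # -> 0.7888888915379842
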
@@ -264,26 +251,24 @@ def check_run_1957799200510967808_evaluation_item_results(
assert universal_metric_result.explanation is None
# Check the first rubric verdict.
rubric_verdict_0 = universal_metric_result.rubric_verdicts[0]
assert rubric_verdict_0 == (
types.RubricVerdict(
evaluated_rubric=types.Rubric(
content=types.RubricContent(
property=types.RubricContentProperty(
description="The response is in English."
)
),
importance="HIGH",
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
),
reasoning=("The entire response is written in the English language."),
verdict=True,
)
assert isinstance(rubric_verdict_0, types.RubricVerdict)
assert rubric_verdict_0.evaluated_rubric == types.Rubric(
content=types.RubricContent(
property=types.RubricContentProperty(
description="The response is in English."
)
),
importance="HIGH",
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
)
assert rubric_verdict_0.reasoning is not None
assert rubric_verdict_0.verdict is True
# Check the first evaluation dataset.
eval_dataset = eval_result.evaluation_dataset[0]
assert isinstance(eval_dataset, types.EvaluationDataset)
assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
assert eval_dataset.eval_dataset_df.shape == (19, 3)
assert eval_dataset.eval_dataset_df.shape[0] == 3
assert eval_dataset.eval_dataset_df.shape[1] > 3


pytestmark = pytest_helper.setup(