136 changes: 70 additions & 66 deletions tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
@@ -48,70 +48,69 @@
)


# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
# def test_create_eval_run_data_source_evaluation_set(client):
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
# client._api_client._http_options.api_version = "v1beta1"
# tool = genai_types.Tool(
# function_declarations=[
# genai_types.FunctionDeclaration(
# name="get_weather",
# description="Get weather in a location",
# parameters={
# "type": "object",
# "properties": {"location": {"type": "string"}},
# },
# )
# ]
# )
# evaluation_run = client.evals.create_evaluation_run(
# name="test4",
# display_name="test4",
# dataset=types.EvaluationRunDataSource(
# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
# ),
# dest=GCS_DEST,
# metrics=[
# UNIVERSAL_AR_METRIC,
# types.RubricMetric.FINAL_RESPONSE_QUALITY,
# LLM_METRIC
# ],
# agent_info=types.AgentInfo(
# agent="project/123/locations/us-central1/reasoningEngines/456",
# name="agent-1",
# instruction="agent-1 instruction",
# tool_declarations=[tool],
# ),
# labels={"label1": "value1"},
# )
# assert isinstance(evaluation_run, types.EvaluationRun)
# assert evaluation_run.display_name == "test4"
# assert evaluation_run.state == types.EvaluationRunState.PENDING
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
# assert evaluation_run.data_source.evaluation_set == (
# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
# )
# assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
# output_config=genai_types.OutputConfig(
# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
# ),
# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
# )
# assert evaluation_run.inference_configs[
# "agent-1"
# ] == types.EvaluationRunInferenceConfig(
# agent_config=types.EvaluationRunAgentConfig(
# developer_instruction=genai_types.Content(
# parts=[genai_types.Part(text="agent-1 instruction")]
# ),
# tools=[tool],
# )
# )
# assert evaluation_run.labels == {
# "vertex-ai-evaluation-agent-engine-id": "456",
# "label1": "value1",
# }
# assert evaluation_run.error is None
def test_create_eval_run_data_source_evaluation_set(client):
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
tool = genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="get_weather",
description="Get weather in a location",
parameters={
"type": "object",
"properties": {"location": {"type": "string"}},
},
)
]
)
evaluation_run = client.evals.create_evaluation_run(
name="test4",
display_name="test4",
dataset=types.EvaluationRunDataSource(
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
),
dest=GCS_DEST,
metrics=[
UNIVERSAL_AR_METRIC,
types.RubricMetric.FINAL_RESPONSE_QUALITY,
LLM_METRIC,
],
agent_info=types.evals.AgentInfo(
agent="project/123/locations/us-central1/reasoningEngines/456",
name="agent-1",
instruction="agent-1 instruction",
tool_declarations=[tool],
),
labels={"label1": "value1"},
)
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.display_name == "test4"
assert evaluation_run.state == types.EvaluationRunState.PENDING
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
assert evaluation_run.data_source.evaluation_set == (
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
)
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
output_config=genai_types.OutputConfig(
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
),
metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
)
assert evaluation_run.inference_configs[
"agent-1"
] == types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction=genai_types.Content(
parts=[genai_types.Part(text="agent-1 instruction")]
),
tools=[tool],
)
)
assert evaluation_run.labels == {
"vertex-ai-evaluation-agent-engine-id": "456",
"label1": "value1",
}
assert evaluation_run.error is None


@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
),
labels={"label1": "value1"},
dest=GCS_DEST,
metrics=[UNIVERSAL_AR_METRIC],
)
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
output_config=genai_types.OutputConfig(
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
),
metrics=[UNIVERSAL_AR_METRIC],
)
assert evaluation_run.inference_configs is None
assert evaluation_run.labels == {
Expand All @@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
assert evaluation_run.error is None


# Test fails in replay mode because of the timestamp issue
# Test fails in replay mode because of UUID generation mismatch.
# def test_create_eval_run_data_source_evaluation_dataset(client):
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
# input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
# candidate_name="candidate_1",
# eval_dataset_df=input_df,
# ),
# dest="gs://lakeyk-limited-bucket/eval_run_output",
# dest=GCS_DEST,
# metrics=[UNIVERSAL_AR_METRIC],
# )
# assert isinstance(evaluation_run, types.EvaluationRun)
# assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
)
),
dest=GCS_DEST,
metrics=[UNIVERSAL_AR_METRIC],
)
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.display_name == "test8"
Expand All @@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
output_config=genai_types.OutputConfig(
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
),
metrics=[UNIVERSAL_AR_METRIC],
)
assert evaluation_run.error is None
assert evaluation_run.inference_configs is None
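For quick reference, here is a minimal sketch of the call shape these replay tests now exercise, with metrics passed explicitly. This is not part of the diff: the constants GCS_DEST and UNIVERSAL_AR_METRIC are assumed to come from the test module's setup, the import path is inferred from the package layout shown below, and the evaluation set name is a placeholder.

# Sketch only: mirrors the updated tests rather than adding new behavior.
# GCS_DEST and UNIVERSAL_AR_METRIC come from the test module's setup block,
# which is outside this diff; the evaluation set name is a placeholder.
from vertexai._genai import types


def sketch_create_run(client, gcs_dest, universal_ar_metric):
    return client.evals.create_evaluation_run(
        name="example-run",
        display_name="example-run",
        dataset=types.EvaluationRunDataSource(
            evaluation_set=(
                "projects/PROJECT/locations/us-central1/evaluationSets/SET_ID"
            )
        ),
        dest=gcs_dest,
        metrics=[universal_ar_metric],  # required argument after this change
    )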
@@ -246,7 +246,7 @@ def test_run_inference_with_agent(client):
agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
src=test_df,
)
assert inference_result.candidate_name == "agent"
assert inference_result.candidate_name is None
assert inference_result.gcs_source is None


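A short sketch of the behavioral change these inference tests pin down, assuming the client.evals.run_inference() entry point the test names refer to; the reasoning-engine resource name and the prompt DataFrame are placeholders. After this PR, agent-based inference results leave candidate_name unset instead of defaulting it to "agent".

# Sketch only: restates the assertion change above, not new functionality.
# The reasoning-engine resource name and the prompt DataFrame are placeholders.
import pandas as pd


def sketch_agent_inference(client):
    test_df = pd.DataFrame({"prompt": ["What is the weather in Boston?"]})
    result = client.evals.run_inference(
        agent="projects/PROJECT/locations/us-central1/reasoningEngines/ENGINE_ID",
        src=test_df,
    )
    # Previously this defaulted to "agent"; now it is left unset.
    assert result.candidate_name is None
    return result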
4 changes: 2 additions & 2 deletions tests/unit/vertexai/genai/test_evals.py
@@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
}
),
)
assert inference_result.candidate_name == "agent"
assert inference_result.candidate_name is None
assert inference_result.gcs_source is None

@mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader")
@@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
}
),
)
assert inference_result.candidate_name == "agent"
assert inference_result.candidate_name is None
assert inference_result.gcs_source is None

@mock.patch.object(_evals_utils, "EvalDatasetLoader")
1 change: 0 additions & 1 deletion vertexai/_genai/_evals_common.py
@@ -798,7 +798,6 @@ def _execute_inference(

evaluation_dataset = types.EvaluationDataset(
eval_dataset_df=results_df,
candidate_name="agent",
)
else:
raise ValueError("Either model or agent_engine must be provided.")
4 changes: 0 additions & 4 deletions vertexai/_genai/_evals_data_converters.py
@@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:

intermediate_events: Optional[list[types.Event]] = None
if intermediate_events_data:
logger.warning(
"intermediate_events attribute is experimental and may change in "
"future versions."
)
if isinstance(intermediate_events_data, list):
intermediate_events = []
for event in intermediate_events_data:
35 changes: 34 additions & 1 deletion vertexai/_genai/_evals_visualization.py
@@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str:

// If we have agent info, render as trace
if(agentInfo) {{
let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🏃</span>agent_run</div></div>`;
let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🤖</span>agent_run</div></div>`;
eventsArray.forEach(event => {{
if (event.content && event.content.parts && event.content.parts.length > 0) {{
event.content.parts.forEach(part => {{
@@ -1073,3 +1073,36 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> None:
dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
html_content = _get_inference_html(dataframe_json_string)
display.display(display.HTML(html_content))


def _get_status_html(status: str, error_message: Optional[str] = None) -> str:
"""Returns a simple HTML string for displaying a status and optional error."""
error_html = ""
if error_message:
error_html = f"""
<p>
<b>Error:</b>
<pre style="white-space: pre-wrap; word-wrap: break-word;">{error_message}</pre>
</p>
"""

return f"""
<div>
<p><b>Status:</b> {status}</p>
{error_html}
</div>
"""


def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None:
"""Displays the status of an evaluation run in an IPython environment."""
if not _is_ipython_env():
logger.warning("Skipping display: not in an IPython environment.")
return
else:
from IPython import display

status = eval_run_obj.state.name if eval_run_obj.state else "UNKNOWN"
error_message = str(eval_run_obj.error) if eval_run_obj.error else None
html_content = _get_status_html(status, error_message)
display.display(display.HTML(html_content))
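A hedged usage sketch for the new status helper, not part of the diff: it assumes the caller already has a types.EvaluationRun, for example the return value of client.evals.create_evaluation_run(); outside an IPython environment the helper only logs a warning and renders nothing.

# Sketch only: drives the new status helper from a notebook cell.
# `client`, `data_source`, `gcs_dest`, and `metric` are placeholders standing
# in for objects built as in the tests above, not names defined by this PR.
from vertexai._genai import _evals_visualization


def show_run_status(client, data_source, gcs_dest, metric) -> None:
    run = client.evals.create_evaluation_run(
        dataset=data_source,
        dest=gcs_dest,
        metrics=[metric],
    )
    # Renders "Status: PENDING" (plus any error payload) as HTML in IPython;
    # elsewhere display_evaluation_run_status logs a warning and returns.
    _evals_visualization.display_evaluation_run_status(run)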
48 changes: 26 additions & 22 deletions vertexai/_genai/evals.py
@@ -1334,11 +1334,9 @@ def create_evaluation_run(
*,
dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
dest: str,
metrics: list[types.EvaluationRunMetricOrDict],
name: Optional[str] = None,
display_name: Optional[str] = None,
metrics: Optional[
list[types.EvaluationRunMetricOrDict]
] = None, # TODO: Make required unified metrics available in prod.
agent_info: Optional[types.evals.AgentInfoOrDict] = None,
labels: Optional[dict[str, str]] = None,
config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
Expand All @@ -1348,25 +1346,32 @@ def create_evaluation_run(
Args:
dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
dest: The GCS URI prefix to write the evaluation results to.
metrics: The list of metrics to evaluate.
name: The name of the evaluation run.
display_name: The display name of the evaluation run.
metrics: The list of metrics to evaluate.
agent_info: The agent info to evaluate.
labels: The labels to apply to the evaluation run.
config: The configuration for the evaluation run.

Returns:
The created evaluation run.
"""
if agent_info and isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
logger.warning(
"EvaluationDataset input is experimental and may change in future versions."
)
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
if dataset.candidate_name is None and agent_info:
if (
dataset.candidate_name
and agent_info.name
and dataset.candidate_name != agent_info.name
):
logger.warning(
"Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
)
elif dataset.candidate_name is None and agent_info:
dataset.candidate_name = agent_info.name
eval_set = _evals_common._create_evaluation_set_from_dataframe(
self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
Expand All @@ -1383,9 +1388,6 @@ def create_evaluation_run(
)
inference_configs = {}
if agent_info:
logger.warning(
"The agent_info field is experimental and may change in future versions."
)
if isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if (
@@ -2187,11 +2189,9 @@ async def create_evaluation_run(
*,
dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
dest: str,
metrics: list[types.EvaluationRunMetricOrDict],
name: Optional[str] = None,
display_name: Optional[str] = None,
metrics: Optional[
list[types.EvaluationRunMetricOrDict]
] = None, # TODO: Make required unified metrics available in prod.
agent_info: Optional[types.evals.AgentInfo] = None,
labels: Optional[dict[str, str]] = None,
config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
Expand All @@ -2201,25 +2201,32 @@ async def create_evaluation_run(
Args:
dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
dest: The GCS URI prefix to write the evaluation results to.
metrics: The list of metrics to evaluate.
name: The name of the evaluation run.
display_name: The display name of the evaluation run.
metrics: The list of metrics to evaluate.
agent_info: The agent info to evaluate.
labels: The labels to apply to the evaluation run.
config: The configuration for the evaluation run.

Returns:
The created evaluation run.
"""
if agent_info and isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
logger.warning(
"EvaluationDataset input is experimental and may change in future versions."
)
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
if dataset.candidate_name is None and agent_info:
if (
dataset.candidate_name
and agent_info.name
and dataset.candidate_name != agent_info.name
):
logger.warning(
"Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
)
elif dataset.candidate_name is None and agent_info:
dataset.candidate_name = agent_info.name
eval_set = _evals_common._create_evaluation_set_from_dataframe(
self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
Expand All @@ -2236,9 +2243,6 @@ async def create_evaluation_run(
)
inference_configs = {}
if agent_info:
logger.warning(
"The agent_info field is experimental and may change in future versions."
)
if isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if (
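To summarize the control flow the evals.py changes introduce for the EvaluationDataset path, here is a condensed, standalone restatement. It is not a drop-in copy of the method: the helper name is invented for illustration, and an extra guard on agent_info is added in this sketch for safety.

# Sketch only: condenses the candidate_name handling added in
# create_evaluation_run; the function name is illustrative, not from the PR.
import logging

logger = logging.getLogger(__name__)


def _reconcile_candidate_name(dataset, agent_info) -> None:
    """Warn on a candidate_name/agent_info.name mismatch, else backfill it."""
    if (
        dataset.candidate_name
        and agent_info is not None  # guard added in this sketch for safety
        and agent_info.name
        and dataset.candidate_name != agent_info.name
    ):
        logger.warning(
            "Evaluation dataset candidate_name and agent_info.name are "
            "different. Please make sure this is intended."
        )
    elif dataset.candidate_name is None and agent_info:
        dataset.candidate_name = agent_info.name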