41 changes: 41 additions & 0 deletions tests/unit/vertexai/genai/replays/test_evaluate.py
@@ -54,6 +54,47 @@ def test_evaluation_result(client):
assert case_result.response_candidate_results is not None


def test_evaluation_byor(client):
"""Tests that evaluate() with BYOR (Bring-Your-Own Response) produces a correctly structured EvaluationResult."""
byor_df = pd.DataFrame(
{
"prompt": [
"Write a simple story about a dinosaur",
"Generate a poem about Vertex AI",
],
"response": [
"Once upon a time, there was a T-Rex named Rexy.",
"In clouds of code, a mind of silicon born...",
],
}
)

metrics_to_run = [
types.RubricMetric.GENERAL_QUALITY,
]

evaluation_result = client.evals.evaluate(
dataset=byor_df,
metrics=metrics_to_run,
)

assert isinstance(evaluation_result, types.EvaluationResult)

assert evaluation_result.summary_metrics is not None
assert len(evaluation_result.summary_metrics) > 0
for summary in evaluation_result.summary_metrics:
assert isinstance(summary, types.AggregatedMetricResult)
assert summary.metric_name is not None
assert summary.mean_score is not None

assert evaluation_result.eval_case_results is not None
assert len(evaluation_result.eval_case_results) > 0
for case_result in evaluation_result.eval_case_results:
assert isinstance(case_result, types.EvalCaseResult)
assert case_result.eval_case_index is not None
assert case_result.response_candidate_results is not None


pytestmark = pytest_helper.setup(
file=__file__,
globals_for_file=globals(),
17 changes: 13 additions & 4 deletions vertexai/_genai/evals.py
@@ -970,7 +970,9 @@ def evaluate(
self,
*,
dataset: Union[
types.EvaluationDatasetOrDict, list[types.EvaluationDatasetOrDict]
pd.DataFrame,
types.EvaluationDatasetOrDict,
list[types.EvaluationDatasetOrDict],
],
metrics: list[types.MetricOrDict] = None,
config: Optional[types.EvaluateMethodConfigOrDict] = None,
@@ -979,10 +981,13 @@
"""Evaluates candidate responses in the provided dataset(s) using the specified metrics.

Args:
dataset: The dataset(s) to evaluate. Can be a single `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
dataset: The dataset(s) to evaluate. Can be a pandas DataFrame, a single
`types.EvaluationDataset` or a list of `types.EvaluationDataset`.
metrics: The list of metrics to use for evaluation.
config: Optional configuration for the evaluation. Can be a dictionary or a `types.EvaluateMethodConfig` object.
- dataset_schema: Schema to use for the dataset. If not specified, the dataset schema will be inferred from the dataset automatically.
config: Optional configuration for the evaluation. Can be a dictionary or a
`types.EvaluateMethodConfig` object.
- dataset_schema: Schema to use for the dataset. If not specified, the
dataset schema will be inferred from the dataset automatically.
- dest: Destination path for storing evaluation results.
**kwargs: Extra arguments to pass to evaluation, such as `agent_info`.

@@ -993,6 +998,10 @@
config = types.EvaluateMethodConfig()
if isinstance(config, dict):
config = types.EvaluateMethodConfig.model_validate(config)

if isinstance(dataset, pd.DataFrame):
dataset = types.EvaluationDataset(eval_dataset_df=dataset)

if isinstance(dataset, list):
dataset = [
(
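
With the evals.py change above, `evaluate()` now wraps a raw pandas DataFrame into `types.EvaluationDataset(eval_dataset_df=...)` before dispatching, so callers no longer have to build the dataset object themselves. Below is a minimal usage sketch of that call path, mirroring the BYOR test added in this PR; the client construction and import paths are assumptions, since they are not shown in this diff.

import pandas as pd
import vertexai
from vertexai._genai import types  # import path assumed; the test imports `types` from the same package

# Client construction is assumed here; project and location are placeholders.
client = vertexai.Client(project="my-project", location="us-central1")

# BYOR (Bring-Your-Own-Response): prompts paired with pre-generated responses,
# so no model inference is needed before evaluation.
byor_df = pd.DataFrame(
    {
        "prompt": ["Write a simple story about a dinosaur"],
        "response": ["Once upon a time, there was a T-Rex named Rexy."],
    }
)

# With this change, the DataFrame is converted internally to
# types.EvaluationDataset(eval_dataset_df=byor_df) by the new isinstance check.
result = client.evals.evaluate(
    dataset=byor_df,
    metrics=[types.RubricMetric.GENERAL_QUALITY],
)

# summary_metrics carries one AggregatedMetricResult per metric,
# matching the assertions in test_evaluation_byor.
for summary in result.summary_metrics:
    print(summary.metric_name, summary.mean_score)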