diff --git a/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py b/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py
new file mode 100644
index 0000000000..d3ec9ab8db
--- /dev/null
+++ b/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py
@@ -0,0 +1,105 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# pylint: disable=protected-access,bad-continuation,missing-function-docstring
+
+from tests.unit.vertexai.genai.replays import pytest_helper
+from vertexai._genai import types
+import pandas as pd
+
+
+def test_custom_code_execution(client):
+    """Tests that the custom code execution metric produces a correctly structured EvaluationResult."""
+
+    code_snippet = """
+def evaluate(instance):
+    if instance['response'] == instance['reference']:
+        return 1.0
+    return 0.0
+"""
+
+    custom_metric = types.Metric(
+        name="my_custom_code_metric",
+        remote_custom_function=code_snippet,
+    )
+
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": ["What is 2+2?", "What is 3+3?"],
+            "response": ["4", "5"],
+            "reference": ["4", "6"],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="test_model",
+    )
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=[custom_metric],
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+
+    assert evaluation_result.summary_metrics is not None
+    assert evaluation_result.summary_metrics
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name == "my_custom_code_metric"
+
+    assert evaluation_result.eval_case_results is not None
+    assert evaluation_result.eval_case_results
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
+def test_custom_code_execution_batch_evaluate(client):
+    """Tests that batch_evaluate() works with the custom code execution metric."""
+
+    code_snippet = """
+def evaluate(instance):
+    if instance['response'] == instance['reference']:
+        return 1.0
+    return 0.0
+"""
+
+    custom_metric = types.Metric(
+        name="my_custom_code_metric",
+        remote_custom_function=code_snippet,
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        gcs_source=types.GcsSource(
+            uris=["gs://genai-eval-sdk-replay-test/test_data/inference_results.jsonl"]
+        ),
+    )
+
+    evaluation_result = client.evals.batch_evaluate(
+        dataset=eval_dataset,
+        metrics=[custom_metric],
+        dest="gs://genai-eval-sdk-replay-test/test_data/batch_eval_output",
+    )
+
+    assert evaluation_result is not None
+
+
+pytestmark = pytest_helper.setup(
+    file=__file__,
+    globals_for_file=globals(),
+    test_method="evals.evaluate",
+)
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 59b4d3f3f5..9475846fdb 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -685,10 +685,9 @@ def get_metric_result(
             )
         except Exception as e:  # pylint: disable=broad-exception-caught
             logger.error(
-                "Error processing metric %s for case %s: %s",
+                "Error processing metric %s for case %s.",
                 metric_name,
                 eval_case.eval_case_id,
-                e,
                 exc_info=True,
             )
             return types.EvalCaseMetricResult(
@@ -1099,7 +1098,147 @@ def aggregate(
         )
 
 
+class CustomCodeExecutionMetricHandler(MetricHandler):
+    """Metric handler for custom code execution metrics."""
+
+    def __init__(self, module: "evals.Evals", metric: types.Metric):
+        super().__init__(module=module, metric=metric)
+
+        if not self.metric.remote_custom_function:
+            raise ValueError(
+                f"CustomCodeExecutionMetricHandler for '{self.metric.name}' needs "
+                "Metric.remote_custom_function to be set."
+            )
+
+    def _build_request_payload(
+        self, eval_case: types.EvalCase, response_index: int
+    ) -> dict[str, Any]:
+        """Builds the request parameters for an evaluate instances request."""
+        if not eval_case.responses or response_index >= len(eval_case.responses):
+            raise IndexError(f"response_index {response_index} is out of bounds.")
+
+        response_content = eval_case.responses[response_index].response
+        if not response_content:
+            raise ValueError(
+                f"Response content missing for candidate {response_index}."
+            )
+
+        reference_instance_data = None
+        if eval_case.reference:
+            reference_instance_data = PredefinedMetricHandler._content_to_instance_data(
+                eval_case.reference.response
+            )
+
+        prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
+            eval_case.prompt
+        )
+
+        instance_payload = types.EvaluationInstance(
+            prompt=prompt_instance_data,
+            response=PredefinedMetricHandler._content_to_instance_data(
+                response_content
+            ),
+            reference=reference_instance_data,
+        )
+
+        return {
+            "instance": instance_payload,
+        }
+
+    @override
+    def get_metric_result(
+        self, eval_case: types.EvalCase, response_index: int
+    ) -> types.EvalCaseMetricResult:
+        """Processes a single evaluation case for a specific custom code execution metric."""
+        metric_name = self.metric.name
+        try:
+            payload = self._build_request_payload(eval_case, response_index)
+            for attempt in range(_MAX_RETRIES):
+                try:
+                    api_response = self.module._evaluate_instances(
+                        metrics=[self.metric],
+                        instance=payload.get("instance"),
+                    )
+                    break
+                except genai_errors.ClientError as e:
+                    if e.code == 429:
+                        logger.warning(
+                            "Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
+                            " seconds...",
+                            attempt + 1,
+                            _MAX_RETRIES,
+                            e,
+                            2**attempt,
+                        )
+                        if attempt == _MAX_RETRIES - 1:
+                            return types.EvalCaseMetricResult(
+                                metric_name=metric_name,
+                                error_message=f"Resource exhausted after {_MAX_RETRIES} retries: {e}",
+                            )
+                        time.sleep(2**attempt)
+                    else:
+                        raise e
+
+            if (
+                api_response
+                and hasattr(api_response, "metric_results")
+                and api_response.metric_results
+            ):
+                result_data = api_response.metric_results[0]
+
+                error_message = None
+                if result_data.error and getattr(result_data.error, "code", None):
+                    error_message = f"Error in metric result: {result_data.error}"
+                return types.EvalCaseMetricResult(
+                    metric_name=metric_name,
+                    score=result_data.score,
+                    explanation=result_data.explanation,
+                    error_message=error_message,
+                )
+            else:
+                logger.error(
+                    "Metric results missing in API response for metric '%s'."
+ " API response: %s", + metric_name, + ( + api_response.model_dump_json(exclude_none=True) + if api_response + else "None" + ), + ) + return types.EvalCaseMetricResult( + metric_name=metric_name, + error_message="Metric results missing in API response.", + ) + except Exception as e: # pylint: disable=broad-exception-caught + logger.error( + "Error processing metric %s for case %s", + metric_name, + eval_case.eval_case_id, + exc_info=True, + ) + return types.EvalCaseMetricResult( + metric_name=metric_name, error_message=str(e) + ) + + @override + def aggregate( + self, eval_case_metric_results: list[types.EvalCaseMetricResult] + ) -> types.AggregatedMetricResult: + """Aggregates the metric results for a custom code execution metric.""" + logger.debug( + "Aggregating results for custom code execution metric: %s", self.metric.name + ) + return _default_aggregate_scores( + self.metric.name, eval_case_metric_results, calculate_pass_rate=True + ) + + _METRIC_HANDLER_MAPPING = [ + ( + lambda m: hasattr(m, "remote_custom_function") and m.remote_custom_function, + CustomCodeExecutionMetricHandler, + ), ( lambda m: m.custom_function and isinstance(m.custom_function, Callable), CustomMetricHandler, @@ -1125,6 +1264,7 @@ def aggregate( TranslationMetricHandler, LLMMetricHandler, CustomMetricHandler, + CustomCodeExecutionMetricHandler, PredefinedMetricHandler, ) diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py index 02356a98be..ab0c3f1f31 100644 --- a/vertexai/_genai/_transformers.py +++ b/vertexai/_genai/_transformers.py @@ -60,6 +60,13 @@ def t_metrics( "metric_spec_name": metric_name, "metric_spec_parameters": metric.metric_spec_parameters, } + # Custom Code Execution Metric + elif ( + hasattr(metric, "remote_custom_function") and metric.remote_custom_function + ): + metric_payload_item["custom_code_execution_spec"] = { + "evaluation_function": metric.remote_custom_function + } # Pointwise metrics elif hasattr(metric, "prompt_template") and metric.prompt_template: pointwise_spec = {"metric_prompt_template": metric.prompt_template} diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 9f88aab784..2a67707e1b 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -81,7 +81,13 @@ def _CreateEvaluationRunParameters_to_vertex( setv(to_object, ["dataSource"], getv(from_object, ["data_source"])) if getv(from_object, ["evaluation_config"]) is not None: - setv(to_object, ["evaluationConfig"], getv(from_object, ["evaluation_config"])) + setv( + to_object, + ["evaluationConfig"], + _EvaluationRunConfig_to_vertex( + getv(from_object, ["evaluation_config"]), to_object + ), + ) if getv(from_object, ["labels"]) is not None: setv(to_object, ["labels"], getv(from_object, ["labels"])) @@ -112,6 +118,36 @@ def _CreateEvaluationSetParameters_to_vertex( return to_object +def _CustomCodeExecutionSpec_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["evaluation_function"]) is not None: + setv( + to_object, + ["remote_custom_function"], + getv(from_object, ["evaluation_function"]), + ) + + return to_object + + +def _CustomCodeExecutionSpec_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["remote_custom_function"]) is not None: + setv( + to_object, + ["evaluation_function"], + 
+            getv(from_object, ["remote_custom_function"]),
+        )
+
+    return to_object
+
+
 def _EvaluateInstancesRequestParameters_to_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
@@ -196,6 +232,90 @@ def _EvaluateInstancesRequestParameters_to_vertex(
     return to_object
 
 
+def _EvaluationRunConfig_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metrics"]) is not None:
+        setv(
+            to_object,
+            ["metrics"],
+            [
+                _EvaluationRunMetric_from_vertex(item, to_object)
+                for item in getv(from_object, ["metrics"])
+            ],
+        )
+
+    if getv(from_object, ["outputConfig"]) is not None:
+        setv(to_object, ["output_config"], getv(from_object, ["outputConfig"]))
+
+    if getv(from_object, ["autoraterConfig"]) is not None:
+        setv(to_object, ["autorater_config"], getv(from_object, ["autoraterConfig"]))
+
+    return to_object
+
+
+def _EvaluationRunConfig_to_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metrics"]) is not None:
+        setv(
+            to_object,
+            ["metrics"],
+            [
+                _EvaluationRunMetric_to_vertex(item, to_object)
+                for item in getv(from_object, ["metrics"])
+            ],
+        )
+
+    if getv(from_object, ["output_config"]) is not None:
+        setv(to_object, ["outputConfig"], getv(from_object, ["output_config"]))
+
+    if getv(from_object, ["autorater_config"]) is not None:
+        setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"]))
+
+    return to_object
+
+
+def _EvaluationRunMetric_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metric"]) is not None:
+        setv(to_object, ["metric"], getv(from_object, ["metric"]))
+
+    if getv(from_object, ["metricConfig"]) is not None:
+        setv(
+            to_object,
+            ["metric_config"],
+            _UnifiedMetric_from_vertex(getv(from_object, ["metricConfig"]), to_object),
+        )
+
+    return to_object
+
+
+def _EvaluationRunMetric_to_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metric"]) is not None:
+        setv(to_object, ["metric"], getv(from_object, ["metric"]))
+
+    if getv(from_object, ["metric_config"]) is not None:
+        setv(
+            to_object,
+            ["metricConfig"],
+            _UnifiedMetric_to_vertex(getv(from_object, ["metric_config"]), to_object),
+        )
+
+    return to_object
+
+
 def _EvaluationRun_from_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
@@ -240,7 +360,13 @@ def _EvaluationRun_from_vertex(
     )
 
     if getv(from_object, ["evaluationConfig"]) is not None:
-        setv(to_object, ["evaluation_config"], getv(from_object, ["evaluationConfig"]))
+        setv(
+            to_object,
+            ["evaluation_config"],
+            _EvaluationRunConfig_from_vertex(
+                getv(from_object, ["evaluationConfig"]), to_object
+            ),
+        )
 
     if getv(from_object, ["inferenceConfigs"]) is not None:
         setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"]))
@@ -410,6 +536,94 @@ def _RubricGenerationSpec_to_vertex(
     return to_object
 
 
+def _UnifiedMetric_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
["bleuSpec"]) is not None: + setv(to_object, ["bleu_spec"], getv(from_object, ["bleuSpec"])) + + if getv(from_object, ["rougeSpec"]) is not None: + setv(to_object, ["rouge_spec"], getv(from_object, ["rougeSpec"])) + + if getv(from_object, ["pointwiseMetricSpec"]) is not None: + setv( + to_object, + ["pointwise_metric_spec"], + getv(from_object, ["pointwiseMetricSpec"]), + ) + + if getv(from_object, ["llmBasedMetricSpec"]) is not None: + setv( + to_object, + ["llm_based_metric_spec"], + getv(from_object, ["llmBasedMetricSpec"]), + ) + + if getv(from_object, ["customCodeExecutionSpec"]) is not None: + setv( + to_object, + ["custom_code_execution_spec"], + _CustomCodeExecutionSpec_from_vertex( + getv(from_object, ["customCodeExecutionSpec"]), to_object + ), + ) + + if getv(from_object, ["predefinedMetricSpec"]) is not None: + setv( + to_object, + ["predefined_metric_spec"], + getv(from_object, ["predefinedMetricSpec"]), + ) + + return to_object + + +def _UnifiedMetric_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["bleu_spec"]) is not None: + setv(to_object, ["bleuSpec"], getv(from_object, ["bleu_spec"])) + + if getv(from_object, ["rouge_spec"]) is not None: + setv(to_object, ["rougeSpec"], getv(from_object, ["rouge_spec"])) + + if getv(from_object, ["pointwise_metric_spec"]) is not None: + setv( + to_object, + ["pointwiseMetricSpec"], + getv(from_object, ["pointwise_metric_spec"]), + ) + + if getv(from_object, ["llm_based_metric_spec"]) is not None: + setv( + to_object, + ["llmBasedMetricSpec"], + getv(from_object, ["llm_based_metric_spec"]), + ) + + if getv(from_object, ["custom_code_execution_spec"]) is not None: + setv( + to_object, + ["customCodeExecutionSpec"], + _CustomCodeExecutionSpec_to_vertex( + getv(from_object, ["custom_code_execution_spec"]), to_object + ), + ) + + if getv(from_object, ["predefined_metric_spec"]) is not None: + setv( + to_object, + ["predefinedMetricSpec"], + getv(from_object, ["predefined_metric_spec"]), + ) + + return to_object + + class Evals(_api_module.BaseModule): def _create_evaluation_item( diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index 6ff808a52e..a1f8da0079 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -218,6 +218,9 @@ from .common import CreatePromptVersionConfig from .common import CreatePromptVersionConfigDict from .common import CreatePromptVersionConfigOrDict +from .common import CustomCodeExecutionSpec +from .common import CustomCodeExecutionSpecDict +from .common import CustomCodeExecutionSpecOrDict from .common import CustomJob from .common import CustomJobDict from .common import CustomJobOrDict @@ -1059,6 +1062,9 @@ "LLMBasedMetricSpec", "LLMBasedMetricSpecDict", "LLMBasedMetricSpecOrDict", + "CustomCodeExecutionSpec", + "CustomCodeExecutionSpecDict", + "CustomCodeExecutionSpecOrDict", "UnifiedMetric", "UnifiedMetricDict", "UnifiedMetricOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index b1d0dd88be..e8efba4216 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -905,6 +905,37 @@ class LLMBasedMetricSpecDict(TypedDict, total=False): LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict] +class CustomCodeExecutionSpec(_common.BaseModel): + """Specificies a metric that is computed by running user-defined Python functions 
+    """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+    remote_custom_function: Optional[str] = Field(
+        default=None,
+        description="""A string representing a user-defined function for evaluation.
+      The user is expected to define a function with the following signature, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      The instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name].""",
+    )
+
+
+class CustomCodeExecutionSpecDict(TypedDict, total=False):
+    """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+    remote_custom_function: Optional[str]
+    """A string representing a user-defined function for evaluation.
+      The user is expected to define a function with the following signature, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      The instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name]."""
+
+
+CustomCodeExecutionSpecOrDict = Union[
+    CustomCodeExecutionSpec, CustomCodeExecutionSpecDict
+]
+
+
 class UnifiedMetric(_common.BaseModel):
     """The unified metric used for evaluation."""
 
@@ -920,6 +951,9 @@ class UnifiedMetric(_common.BaseModel):
     llm_based_metric_spec: Optional[LLMBasedMetricSpec] = Field(
         default=None, description="""The spec for an LLM based metric."""
     )
+    custom_code_execution_spec: Optional[CustomCodeExecutionSpec] = Field(
+        default=None, description="""The spec for a custom code execution metric."""
+    )
     predefined_metric_spec: Optional[PredefinedMetricSpec] = Field(
         default=None, description="""The spec for a pre-defined metric."""
     )
@@ -940,6 +974,9 @@ class UnifiedMetricDict(TypedDict, total=False):
     llm_based_metric_spec: Optional[LLMBasedMetricSpecDict]
     """The spec for an LLM based metric."""
 
+    custom_code_execution_spec: Optional[CustomCodeExecutionSpecDict]
+    """The spec for a custom code execution metric."""
+
     predefined_metric_spec: Optional[PredefinedMetricSpecDict]
     """The spec for a pre-defined metric."""
 
@@ -2616,6 +2653,10 @@ class Metric(_common.BaseModel):
         default=None,
         description="""The custom function that defines the end-to-end logic for metric computation.""",
     )
+    remote_custom_function: Optional[str] = Field(
+        default=None,
+        description="""The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service.""",
+    )
     prompt_template: Optional[str] = Field(
         default=None, description="""The prompt template for the metric."""
    )
@@ -2823,6 +2864,9 @@ class MetricDict(TypedDict, total=False):
     custom_function: Optional[Callable[..., Any]]
     """The custom function that defines the end-to-end logic for metric computation."""
 
+    remote_custom_function: Optional[str]
+    """The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service."""
+
     prompt_template: Optional[str]
     """The prompt template for the metric."""
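
For context, a minimal usage sketch of the `remote_custom_function` field introduced by this change, closely mirroring the replay test above. The `vertexai.Client` project and location values are placeholders, and the metric name, dataset columns, and scoring logic are illustrative only, not part of this change:

import pandas as pd

import vertexai
from vertexai._genai import types

# Placeholder project/location; substitute real values.
client = vertexai.Client(project="my-project", location="us-central1")

# The remote code must define `evaluate(instance) -> float`; any column of the
# evaluation dataset is available to it as instance[column_name].
code_snippet = """
def evaluate(instance):
    return 1.0 if instance['response'] == instance['reference'] else 0.0
"""

exact_match_metric = types.Metric(
    name="my_custom_code_metric",
    remote_custom_function=code_snippet,
)

eval_dataset = types.EvaluationDataset(
    eval_dataset_df=pd.DataFrame(
        {
            "prompt": ["What is 2+2?"],
            "response": ["4"],
            "reference": ["4"],
        }
    ),
)

result = client.evals.evaluate(dataset=eval_dataset, metrics=[exact_match_metric])
print(result.summary_metrics)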