diff --git a/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py b/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py
new file mode 100644
index 0000000000..d3ec9ab8db
--- /dev/null
+++ b/tests/unit/vertexai/genai/replays/test_custom_code_execution_metric.py
@@ -0,0 +1,105 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# pylint: disable=protected-access,bad-continuation,missing-function-docstring
+
+from tests.unit.vertexai.genai.replays import pytest_helper
+from vertexai._genai import types
+import pandas as pd
+
+
+def test_custom_code_execution(client):
+    """Tests that the custom code execution metric produces a correctly structured EvaluationResult."""
+
+    code_snippet = """
+def evaluate(instance):
+    if instance['response'] == instance['reference']:
+        return 1.0
+    return 0.0
+"""
+
+    custom_metric = types.Metric(
+        name="my_custom_code_metric",
+        remote_custom_function=code_snippet,
+    )
+
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": ["What is 2+2?", "What is 3+3?"],
+            "response": ["4", "5"],
+            "reference": ["4", "6"],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="test_model",
+    )
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=[custom_metric],
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+
+    assert evaluation_result.summary_metrics is not None
+    assert evaluation_result.summary_metrics
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name == "my_custom_code_metric"
+
+    assert evaluation_result.eval_case_results is not None
+    assert evaluation_result.eval_case_results
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
+def test_custom_code_execution_batch_evaluate(client):
+    """Tests that batch_evaluate() works with the custom code execution metric."""
+
+    code_snippet = """
+def evaluate(instance):
+    if instance['response'] == instance['reference']:
+        return 1.0
+    return 0.0
+"""
+
+    custom_metric = types.Metric(
+        name="my_custom_code_metric",
+        remote_custom_function=code_snippet,
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        gcs_source=types.GcsSource(
+            uris=["gs://genai-eval-sdk-replay-test/test_data/inference_results.jsonl"]
+        ),
+    )
+
+    evaluation_result = client.evals.batch_evaluate(
+        dataset=eval_dataset,
+        metrics=[custom_metric],
+        dest="gs://genai-eval-sdk-replay-test/test_data/batch_eval_output",
+    )
+
+    assert evaluation_result is not None
+
+
+pytestmark = pytest_helper.setup(
+    file=__file__,
+    globals_for_file=globals(),
+    test_method="evals.evaluate",
+)
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 59b4d3f3f5..9475846fdb 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -685,10 +685,9 @@ def get_metric_result(
             )
         except Exception as e:  # pylint: disable=broad-exception-caught
             logger.error(
-                "Error processing metric %s for case %s: %s",
+                "Error processing metric %s for case %s.",
                 metric_name,
                 eval_case.eval_case_id,
-                e,
                 exc_info=True,
             )
             return types.EvalCaseMetricResult(
@@ -1099,7 +1098,147 @@ def aggregate(
         )
 
 
+class CustomCodeExecutionMetricHandler(MetricHandler):
+    """Metric handler for custom code execution metrics."""
+
+    def __init__(self, module: "evals.Evals", metric: types.Metric):
+        super().__init__(module=module, metric=metric)
+
+        if not self.metric.remote_custom_function:
+            raise ValueError(
+                f"CustomCodeExecutionMetricHandler for '{self.metric.name}' needs "
+                "Metric.remote_custom_function to be set."
+            )
+
+    def _build_request_payload(
+        self, eval_case: types.EvalCase, response_index: int
+    ) -> dict[str, Any]:
+        """Builds the request parameters for an evaluate instances request."""
+        if not eval_case.responses or response_index >= len(eval_case.responses):
+            raise IndexError(f"response_index {response_index} is out of bounds.")
+
+        response_content = eval_case.responses[response_index].response
+        if not response_content:
+            raise ValueError(
+                f"Response content missing for candidate {response_index}."
+            )
+
+        reference_instance_data = None
+        if eval_case.reference:
+            reference_instance_data = PredefinedMetricHandler._content_to_instance_data(
+                eval_case.reference.response
+            )
+
+        prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
+            eval_case.prompt
+        )
+
+        instance_payload = types.EvaluationInstance(
+            prompt=prompt_instance_data,
+            response=PredefinedMetricHandler._content_to_instance_data(
+                response_content
+            ),
+            reference=reference_instance_data,
+        )
+
+        return {
+            "instance": instance_payload,
+        }
+
+    @override
+    def get_metric_result(
+        self, eval_case: types.EvalCase, response_index: int
+    ) -> types.EvalCaseMetricResult:
+        """Processes a single evaluation case for a specific custom code execution metric."""
+        metric_name = self.metric.name
+        try:
+            payload = self._build_request_payload(eval_case, response_index)
+            for attempt in range(_MAX_RETRIES):
+                try:
+                    api_response = self.module._evaluate_instances(
+                        metrics=[self.metric],
+                        instance=payload.get("instance"),
+                    )
+                    break
+                except genai_errors.ClientError as e:
+                    if e.code == 429:
+                        logger.warning(
+                            "Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
+                            " seconds...",
+                            attempt + 1,
+                            _MAX_RETRIES,
+                            e,
+                            2**attempt,
+                        )
+                        if attempt == _MAX_RETRIES - 1:
+                            return types.EvalCaseMetricResult(
+                                metric_name=metric_name,
+                                error_message=f"Resource exhausted after {_MAX_RETRIES} retries: {e}",
+                            )
+                        time.sleep(2**attempt)
+                    else:
+                        raise e
+
+            if (
+                api_response
+                and hasattr(api_response, "metric_results")
+                and api_response.metric_results
+            ):
+                result_data = api_response.metric_results[0]
+
+                error_message = None
+                if result_data.error and getattr(result_data.error, "code", None):
+                    error_message = f"Error in metric result: {result_data.error}"
+                return types.EvalCaseMetricResult(
+                    metric_name=metric_name,
+                    score=result_data.score,
+                    explanation=result_data.explanation,
+                    error_message=error_message,
+                )
+            else:
+                logger.error(
+                    "Metric results missing in API response for metric '%s'."
+ " API response: %s", + metric_name, + ( + api_response.model_dump_json(exclude_none=True) + if api_response + else "None" + ), + ) + return types.EvalCaseMetricResult( + metric_name=metric_name, + error_message="Metric results missing in API response.", + ) + except Exception as e: # pylint: disable=broad-exception-caught + logger.error( + "Error processing metric %s for case %s", + metric_name, + eval_case.eval_case_id, + exc_info=True, + ) + return types.EvalCaseMetricResult( + metric_name=metric_name, error_message=str(e) + ) + + @override + def aggregate( + self, eval_case_metric_results: list[types.EvalCaseMetricResult] + ) -> types.AggregatedMetricResult: + """Aggregates the metric results for a custom code execution metric.""" + logger.debug( + "Aggregating results for custom code execution metric: %s", self.metric.name + ) + return _default_aggregate_scores( + self.metric.name, eval_case_metric_results, calculate_pass_rate=True + ) + + _METRIC_HANDLER_MAPPING = [ + ( + lambda m: hasattr(m, "remote_custom_function") and m.remote_custom_function, + CustomCodeExecutionMetricHandler, + ), ( lambda m: m.custom_function and isinstance(m.custom_function, Callable), CustomMetricHandler, @@ -1125,6 +1264,7 @@ def aggregate( TranslationMetricHandler, LLMMetricHandler, CustomMetricHandler, + CustomCodeExecutionMetricHandler, PredefinedMetricHandler, ) diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py index 02356a98be..ab0c3f1f31 100644 --- a/vertexai/_genai/_transformers.py +++ b/vertexai/_genai/_transformers.py @@ -60,6 +60,13 @@ def t_metrics( "metric_spec_name": metric_name, "metric_spec_parameters": metric.metric_spec_parameters, } + # Custom Code Execution Metric + elif ( + hasattr(metric, "remote_custom_function") and metric.remote_custom_function + ): + metric_payload_item["custom_code_execution_spec"] = { + "evaluation_function": metric.remote_custom_function + } # Pointwise metrics elif hasattr(metric, "prompt_template") and metric.prompt_template: pointwise_spec = {"metric_prompt_template": metric.prompt_template} diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 9f88aab784..2a67707e1b 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -81,7 +81,13 @@ def _CreateEvaluationRunParameters_to_vertex( setv(to_object, ["dataSource"], getv(from_object, ["data_source"])) if getv(from_object, ["evaluation_config"]) is not None: - setv(to_object, ["evaluationConfig"], getv(from_object, ["evaluation_config"])) + setv( + to_object, + ["evaluationConfig"], + _EvaluationRunConfig_to_vertex( + getv(from_object, ["evaluation_config"]), to_object + ), + ) if getv(from_object, ["labels"]) is not None: setv(to_object, ["labels"], getv(from_object, ["labels"])) @@ -112,6 +118,36 @@ def _CreateEvaluationSetParameters_to_vertex( return to_object +def _CustomCodeExecutionSpec_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["evaluation_function"]) is not None: + setv( + to_object, + ["remote_custom_function"], + getv(from_object, ["evaluation_function"]), + ) + + return to_object + + +def _CustomCodeExecutionSpec_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["remote_custom_function"]) is not None: + setv( + to_object, + ["evaluation_function"], + 
+            getv(from_object, ["remote_custom_function"]),
+        )
+
+    return to_object
+
+
 def _EvaluateInstancesRequestParameters_to_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
@@ -196,6 +232,90 @@ def _EvaluateInstancesRequestParameters_to_vertex(
     return to_object
 
 
+def _EvaluationRunConfig_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metrics"]) is not None:
+        setv(
+            to_object,
+            ["metrics"],
+            [
+                _EvaluationRunMetric_from_vertex(item, to_object)
+                for item in getv(from_object, ["metrics"])
+            ],
+        )
+
+    if getv(from_object, ["outputConfig"]) is not None:
+        setv(to_object, ["output_config"], getv(from_object, ["outputConfig"]))
+
+    if getv(from_object, ["autoraterConfig"]) is not None:
+        setv(to_object, ["autorater_config"], getv(from_object, ["autoraterConfig"]))
+
+    return to_object
+
+
+def _EvaluationRunConfig_to_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metrics"]) is not None:
+        setv(
+            to_object,
+            ["metrics"],
+            [
+                _EvaluationRunMetric_to_vertex(item, to_object)
+                for item in getv(from_object, ["metrics"])
+            ],
+        )
+
+    if getv(from_object, ["output_config"]) is not None:
+        setv(to_object, ["outputConfig"], getv(from_object, ["output_config"]))
+
+    if getv(from_object, ["autorater_config"]) is not None:
+        setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"]))
+
+    return to_object
+
+
+def _EvaluationRunMetric_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metric"]) is not None:
+        setv(to_object, ["metric"], getv(from_object, ["metric"]))
+
+    if getv(from_object, ["metricConfig"]) is not None:
+        setv(
+            to_object,
+            ["metric_config"],
+            _UnifiedMetric_from_vertex(getv(from_object, ["metricConfig"]), to_object),
+        )
+
+    return to_object
+
+
+def _EvaluationRunMetric_to_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metric"]) is not None:
+        setv(to_object, ["metric"], getv(from_object, ["metric"]))
+
+    if getv(from_object, ["metric_config"]) is not None:
+        setv(
+            to_object,
+            ["metricConfig"],
+            _UnifiedMetric_to_vertex(getv(from_object, ["metric_config"]), to_object),
+        )
+
+    return to_object
+
+
 def _EvaluationRun_from_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
@@ -240,7 +360,13 @@ def _EvaluationRun_from_vertex(
     )
 
     if getv(from_object, ["evaluationConfig"]) is not None:
-        setv(to_object, ["evaluation_config"], getv(from_object, ["evaluationConfig"]))
+        setv(
+            to_object,
+            ["evaluation_config"],
+            _EvaluationRunConfig_from_vertex(
+                getv(from_object, ["evaluationConfig"]), to_object
+            ),
+        )
 
     if getv(from_object, ["inferenceConfigs"]) is not None:
         setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"]))
@@ -410,6 +536,94 @@ def _RubricGenerationSpec_to_vertex(
     return to_object
 
 
+def _UnifiedMetric_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
["bleuSpec"]) is not None: + setv(to_object, ["bleu_spec"], getv(from_object, ["bleuSpec"])) + + if getv(from_object, ["rougeSpec"]) is not None: + setv(to_object, ["rouge_spec"], getv(from_object, ["rougeSpec"])) + + if getv(from_object, ["pointwiseMetricSpec"]) is not None: + setv( + to_object, + ["pointwise_metric_spec"], + getv(from_object, ["pointwiseMetricSpec"]), + ) + + if getv(from_object, ["llmBasedMetricSpec"]) is not None: + setv( + to_object, + ["llm_based_metric_spec"], + getv(from_object, ["llmBasedMetricSpec"]), + ) + + if getv(from_object, ["customCodeExecutionSpec"]) is not None: + setv( + to_object, + ["custom_code_execution_spec"], + _CustomCodeExecutionSpec_from_vertex( + getv(from_object, ["customCodeExecutionSpec"]), to_object + ), + ) + + if getv(from_object, ["predefinedMetricSpec"]) is not None: + setv( + to_object, + ["predefined_metric_spec"], + getv(from_object, ["predefinedMetricSpec"]), + ) + + return to_object + + +def _UnifiedMetric_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["bleu_spec"]) is not None: + setv(to_object, ["bleuSpec"], getv(from_object, ["bleu_spec"])) + + if getv(from_object, ["rouge_spec"]) is not None: + setv(to_object, ["rougeSpec"], getv(from_object, ["rouge_spec"])) + + if getv(from_object, ["pointwise_metric_spec"]) is not None: + setv( + to_object, + ["pointwiseMetricSpec"], + getv(from_object, ["pointwise_metric_spec"]), + ) + + if getv(from_object, ["llm_based_metric_spec"]) is not None: + setv( + to_object, + ["llmBasedMetricSpec"], + getv(from_object, ["llm_based_metric_spec"]), + ) + + if getv(from_object, ["custom_code_execution_spec"]) is not None: + setv( + to_object, + ["customCodeExecutionSpec"], + _CustomCodeExecutionSpec_to_vertex( + getv(from_object, ["custom_code_execution_spec"]), to_object + ), + ) + + if getv(from_object, ["predefined_metric_spec"]) is not None: + setv( + to_object, + ["predefinedMetricSpec"], + getv(from_object, ["predefined_metric_spec"]), + ) + + return to_object + + class Evals(_api_module.BaseModule): def _create_evaluation_item( diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index 6ff808a52e..a1f8da0079 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -218,6 +218,9 @@ from .common import CreatePromptVersionConfig from .common import CreatePromptVersionConfigDict from .common import CreatePromptVersionConfigOrDict +from .common import CustomCodeExecutionSpec +from .common import CustomCodeExecutionSpecDict +from .common import CustomCodeExecutionSpecOrDict from .common import CustomJob from .common import CustomJobDict from .common import CustomJobOrDict @@ -1059,6 +1062,9 @@ "LLMBasedMetricSpec", "LLMBasedMetricSpecDict", "LLMBasedMetricSpecOrDict", + "CustomCodeExecutionSpec", + "CustomCodeExecutionSpecDict", + "CustomCodeExecutionSpecOrDict", "UnifiedMetric", "UnifiedMetricDict", "UnifiedMetricOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index b1d0dd88be..e8efba4216 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -905,6 +905,37 @@ class LLMBasedMetricSpecDict(TypedDict, total=False): LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict] +class CustomCodeExecutionSpec(_common.BaseModel): + """Specificies a metric that is computed by running user-defined Python functions 
+    """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+    remote_custom_function: Optional[str] = Field(
+        default=None,
+        description="""A string representing a user-defined function for evaluation.
+      The user is expected to define a function with the following signature, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      The instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name].""",
+    )
+
+
+class CustomCodeExecutionSpecDict(TypedDict, total=False):
+    """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+    remote_custom_function: Optional[str]
+    """A string representing a user-defined function for evaluation.
+      The user is expected to define a function with the following signature, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      The instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name]."""
+
+
+CustomCodeExecutionSpecOrDict = Union[
+    CustomCodeExecutionSpec, CustomCodeExecutionSpecDict
+]
+
+
 class UnifiedMetric(_common.BaseModel):
     """The unified metric used for evaluation."""
 
@@ -920,6 +951,9 @@ class UnifiedMetric(_common.BaseModel):
     llm_based_metric_spec: Optional[LLMBasedMetricSpec] = Field(
         default=None, description="""The spec for an LLM based metric."""
     )
+    custom_code_execution_spec: Optional[CustomCodeExecutionSpec] = Field(
+        default=None, description="""The spec for a custom code execution metric."""
+    )
     predefined_metric_spec: Optional[PredefinedMetricSpec] = Field(
         default=None, description="""The spec for a pre-defined metric."""
     )
@@ -940,6 +974,9 @@ class UnifiedMetricDict(TypedDict, total=False):
     llm_based_metric_spec: Optional[LLMBasedMetricSpecDict]
     """The spec for an LLM based metric."""
 
+    custom_code_execution_spec: Optional[CustomCodeExecutionSpecDict]
+    """The spec for a custom code execution metric."""
+
     predefined_metric_spec: Optional[PredefinedMetricSpecDict]
     """The spec for a pre-defined metric."""
 
@@ -2616,6 +2653,10 @@ class Metric(_common.BaseModel):
         default=None,
         description="""The custom function that defines the end-to-end logic for metric computation.""",
     )
+    remote_custom_function: Optional[str] = Field(
+        default=None,
+        description="""The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service.""",
+    )
     prompt_template: Optional[str] = Field(
         default=None, description="""The prompt template for the metric."""
    )
@@ -2823,6 +2864,9 @@ class MetricDict(TypedDict, total=False):
     custom_function: Optional[Callable[..., Any]]
     """The custom function that defines the end-to-end logic for metric computation."""
 
+    remote_custom_function: Optional[str]
+    """The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service."""
+
     prompt_template: Optional[str]
     """The prompt template for the metric."""
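
For context, a minimal usage sketch of the `remote_custom_function` field introduced by this change, closely mirroring the replay test above. The `vertexai.Client` project and location values are placeholders, and the metric name, dataset columns, and scoring logic are illustrative only, not part of this change:

import pandas as pd

import vertexai
from vertexai._genai import types

# Placeholder project/location; substitute real values.
client = vertexai.Client(project="my-project", location="us-central1")

# The remote code must define `evaluate(instance) -> float`; any column of the
# evaluation dataset is available to it as instance[column_name].
code_snippet = """
def evaluate(instance):
    return 1.0 if instance['response'] == instance['reference'] else 0.0
"""

exact_match_metric = types.Metric(
    name="my_custom_code_metric",
    remote_custom_function=code_snippet,
)

eval_dataset = types.EvaluationDataset(
    eval_dataset_df=pd.DataFrame(
        {
            "prompt": ["What is 2+2?"],
            "response": ["4"],
            "reference": ["4"],
        }
    ),
)

result = client.evals.evaluate(dataset=eval_dataset, metrics=[exact_match_metric])
print(result.summary_metrics)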