From ce35218a96d393713b3d85e7ca14e2b54c05ce59 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Tue, 21 Oct 2025 13:16:50 -0700 Subject: [PATCH] feat: GenAI Client(evals) - Add `metrics` to `create_evaluation_run` method in Vertex AI GenAI SDK evals PiperOrigin-RevId: 822249192 --- .../replays/test_create_evaluation_run.py | 148 +++++--- vertexai/_genai/_evals_common.py | 67 ++++ vertexai/_genai/evals.py | 31 +- vertexai/_genai/types.py | 329 +++++++++++++----- 4 files changed, 440 insertions(+), 135 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index 6ad54cd348..e33b48b69b 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -19,53 +19,96 @@ from google.genai import types as genai_types import pytest - -def test_create_eval_run_data_source_evaluation_set(client): - """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" - client._api_client._http_options.api_version = "v1beta1" - tool = genai_types.Tool( - function_declarations=[ - genai_types.FunctionDeclaration( - name="get_weather", - description="Get weather in a location", - parameters={ - "type": "object", - "properties": {"location": {"type": "string"}}, - }, +GCS_DEST = "gs://lakeyk-test-limited/eval_run_output" +UNIVERSAL_AR_METRIC = types.EvaluationRunMetric( + metric="universal_ar_v1", + metric_config=types.UnifiedMetric( + predefined_metric_spec=types.PredefinedMetricSpec( + metric_spec_name="universal_ar_v1", + ) + ), +) +FINAL_RESPONSE_QUALITY_METRIC = types.EvaluationRunMetric( + metric="final_response_quality_v1", + metric_config=types.UnifiedMetric( + predefined_metric_spec=types.PredefinedMetricSpec( + metric_spec_name="final_response_quality_v1", + ) + ), +) +LLM_METRIC = types.EvaluationRunMetric( + metric="llm_metric", + metric_config=types.UnifiedMetric( + llm_based_metric_spec=types.LLMBasedMetricSpec( + metric_prompt_template=( + "\nEvaluate the fluency of the response. Provide a score from 1-5." ) - ] - ) - evaluation_run = client.evals.create_evaluation_run( - name="test4", - display_name="test4", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), - agent_info=types.AgentInfo( - name="agent-1", - instruction="agent-1 instruction", - tool_declarations=[tool], - ), - dest="gs://lakeyk-limited-bucket/eval_run_output", - ) - assert isinstance(evaluation_run, types.EvaluationRun) - assert evaluation_run.display_name == "test4" - assert evaluation_run.state == types.EvaluationRunState.PENDING - assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) - assert evaluation_run.inference_configs[ - "agent-1" - ] == types.EvaluationRunInferenceConfig( - agent_config=types.EvaluationRunAgentConfig( - developer_instruction=genai_types.Content( - parts=[genai_types.Part(text="agent-1 instruction")] - ), - tools=[tool], ) - ) - assert evaluation_run.error is None + ), +) + + +# TODO(b/431231205): Re-enable once Unified Metrics are in prod. 
+# def test_create_eval_run_data_source_evaluation_set(client): +# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" +# client._api_client._http_options.base_url = ( +# "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/" +# ) +# client._api_client._http_options.api_version = "v1beta1" +# tool = genai_types.Tool( +# function_declarations=[ +# genai_types.FunctionDeclaration( +# name="get_weather", +# description="Get weather in a location", +# parameters={ +# "type": "object", +# "properties": {"location": {"type": "string"}}, +# }, +# ) +# ] +# ) +# evaluation_run = client.evals.create_evaluation_run( +# name="test4", +# display_name="test4", +# dataset=types.EvaluationRunDataSource( +# evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" +# ), +# dest=GCS_DEST, +# metrics=[ +# UNIVERSAL_AR_METRIC, +# types.RubricMetric.FINAL_RESPONSE_QUALITY, +# LLM_METRIC +# ], +# agent_info=types.AgentInfo( +# name="agent-1", +# instruction="agent-1 instruction", +# tool_declarations=[tool], +# ), +# ) +# assert isinstance(evaluation_run, types.EvaluationRun) +# assert evaluation_run.display_name == "test4" +# assert evaluation_run.state == types.EvaluationRunState.PENDING +# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) +# assert evaluation_run.data_source.evaluation_set == ( +# "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" +# ) +# assert evaluation_run.evaluation_config == types.EvaluationRunConfig( +# output_config=genai_types.OutputConfig( +# gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) +# ), +# metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC], +# ) +# assert evaluation_run.inference_configs[ +# "agent-1" +# ] == types.EvaluationRunInferenceConfig( +# agent_config=types.EvaluationRunAgentConfig( +# developer_instruction=genai_types.Content( +# parts=[genai_types.Part(text="agent-1 instruction")] +# ), +# tools=[tool], +# ) +# ) +# assert evaluation_run.error is None def test_create_eval_run_data_source_bigquery_request_set(client): @@ -84,7 +127,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client): }, ) ), - dest="gs://lakeyk-limited-bucket/eval_run_output", + dest=GCS_DEST, ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test5" @@ -101,6 +144,11 @@ def test_create_eval_run_data_source_bigquery_request_set(client): }, ) ) + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + ) assert evaluation_run.inference_configs is None assert evaluation_run.error is None @@ -220,7 +268,7 @@ async def test_create_eval_run_async(client): }, ) ), - dest="gs://lakeyk-limited-bucket/eval_run_output", + dest=GCS_DEST, ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test8" @@ -233,6 +281,12 @@ async def test_create_eval_run_async(client): "checkpoint_2": "checkpoint_2", }, ) + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + ) + assert evaluation_run.error is None assert evaluation_run.inference_configs is None assert evaluation_run.error is None diff --git a/vertexai/_genai/_evals_common.py 
b/vertexai/_genai/_evals_common.py index 711d833c4f..716ab1be04 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -933,6 +933,73 @@ def _resolve_dataset_inputs( return processed_eval_dataset, num_response_candidates +def _resolve_evaluation_run_metrics( + metrics: list[types.EvaluationRunMetric], api_client: Any +) -> list[types.EvaluationRunMetric]: + """Resolves a list of evaluation run metric instances, loading RubricMetric if necessary.""" + if not metrics: + return [] + resolved_metrics_list = [] + for metric_instance in metrics: + if isinstance(metric_instance, types.EvaluationRunMetric): + resolved_metrics_list.append(metric_instance) + elif isinstance(metric_instance, _evals_utils.LazyLoadedPrebuiltMetric): + try: + resolved_metric = metric_instance.resolve(api_client=api_client) + if resolved_metric.name: + resolved_metrics_list.append( + types.EvaluationRunMetric( + metric=resolved_metric.name, + metric_config=types.UnifiedMetric( + predefined_metric_spec=types.PredefinedMetricSpec( + metric_spec_name=resolved_metric.name, + ) + ), + ) + ) + except Exception as e: + logger.error( + "Failed to resolve RubricMetric %s@%s: %s", + metric_instance.name, + metric_instance.version, + e, + ) + raise + else: + try: + metric_name_str = str(metric_instance) + lazy_metric_instance = getattr( + _evals_utils.RubricMetric, metric_name_str.upper() + ) + if isinstance( + lazy_metric_instance, _evals_utils.LazyLoadedPrebuiltMetric + ): + resolved_metric = lazy_metric_instance.resolve( + api_client=api_client + ) + if resolved_metric.name: + resolved_metrics_list.append( + types.EvaluationRunMetric( + metric=resolved_metric.name, + metric_config=types.UnifiedMetric( + predefined_metric_spec=types.PredefinedMetricSpec( + metric_spec_name=resolved_metric.name, + ) + ), + ) + ) + else: + raise TypeError( + f"RubricMetric.{metric_name_str.upper()} cannot be resolved." + ) + except AttributeError as exc: + raise TypeError( + "Unsupported metric type or invalid RubricMetric name:" + f" {metric_instance}" + ) from exc + return resolved_metrics_list + + def _resolve_metrics( metrics: list[types.Metric], api_client: Any ) -> list[types.Metric]: diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index a4e30577e7..74a681c152 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -230,6 +230,9 @@ def _EvaluationRun_from_vertex( getv(from_object, ["evaluationResults"]), ) + if getv(from_object, ["evaluationConfig"]) is not None: + setv(to_object, ["evaluation_config"], getv(from_object, ["evaluationConfig"])) + if getv(from_object, ["inferenceConfigs"]) is not None: setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"])) @@ -460,7 +463,7 @@ def _create_evaluation_run( name: Optional[str] = None, display_name: Optional[str] = None, data_source: types.EvaluationRunDataSourceOrDict, - evaluation_config: genai_types.EvaluationConfigOrDict, + evaluation_config: types.EvaluationRunConfigOrDict, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] @@ -1306,9 +1309,12 @@ def create_evaluation_run( self, *, name: str, - display_name: Optional[str] = None, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + display_name: Optional[str] = None, + metrics: Optional[ + list[types.EvaluationRunMetricOrDict] + ] = None, # TODO: Make required unified metrics available in prod. 
agent_info: Optional[types.AgentInfo] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: @@ -1328,7 +1334,12 @@ def create_evaluation_run( output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) ) - evaluation_config = genai_types.EvaluationConfig(output_config=output_config) + resolved_metrics = _evals_common._resolve_evaluation_run_metrics( + metrics, self._api_client + ) + evaluation_config = types.EvaluationRunConfig( + output_config=output_config, metrics=resolved_metrics + ) inference_configs = {} if agent_info: logger.warning( @@ -1554,7 +1565,7 @@ async def _create_evaluation_run( name: Optional[str] = None, display_name: Optional[str] = None, data_source: types.EvaluationRunDataSourceOrDict, - evaluation_config: genai_types.EvaluationConfigOrDict, + evaluation_config: types.EvaluationRunConfigOrDict, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] @@ -2103,9 +2114,12 @@ async def create_evaluation_run( self, *, name: str, - display_name: Optional[str] = None, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, + display_name: Optional[str] = None, + metrics: Optional[ + list[types.EvaluationRunMetricOrDict] + ] = None, # TODO: Make required unified metrics available in prod. agent_info: Optional[types.AgentInfo] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: @@ -2125,7 +2139,12 @@ async def create_evaluation_run( output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) ) - evaluation_config = genai_types.EvaluationConfig(output_config=output_config) + resolved_metrics = _evals_common._resolve_evaluation_run_metrics( + metrics, self._api_client + ) + evaluation_config = types.EvaluationRunConfig( + output_config=output_config, metrics=resolved_metrics + ) inference_configs = {} if agent_info: logger.warning( diff --git a/vertexai/_genai/types.py b/vertexai/_genai/types.py index 361c31e2c5..15e232c586 100644 --- a/vertexai/_genai/types.py +++ b/vertexai/_genai/types.py @@ -299,6 +299,17 @@ class SamplingMethod(_common.CaseInSensitiveEnum): """Sampling method is random.""" +class RubricContentType(_common.CaseInSensitiveEnum): + """Specifies the type of rubric content to generate.""" + + PROPERTY = "PROPERTY" + """Generate rubrics based on properties.""" + NL_QUESTION_ANSWER = "NL_QUESTION_ANSWER" + """Generate rubrics in an NL question answer format.""" + PYTHON_CODE_ASSERTION = "PYTHON_CODE_ASSERTION" + """Generate rubrics in a unit test format.""" + + class EvaluationRunState(_common.CaseInSensitiveEnum): """Represents the state of an evaluation run.""" @@ -320,17 +331,6 @@ class EvaluationRunState(_common.CaseInSensitiveEnum): """Evaluation run is performing rubric generation.""" -class RubricContentType(_common.CaseInSensitiveEnum): - """Specifies the type of rubric content to generate.""" - - PROPERTY = "PROPERTY" - """Generate rubrics based on properties.""" - NL_QUESTION_ANSWER = "NL_QUESTION_ANSWER" - """Generate rubrics in an NL question answer format.""" - PYTHON_CODE_ASSERTION = "PYTHON_CODE_ASSERTION" - """Generate rubrics in a unit test format.""" - - class Importance(_common.CaseInSensitiveEnum): """Importance level of the rubric.""" @@ -961,6 +961,234 @@ class EvaluationRunDataSourceDict(TypedDict, total=False): ] +class 
PredefinedMetricSpec(_common.BaseModel): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] = Field( + default=None, + description="""The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""", + ) + metric_spec_parameters: Optional[dict] = Field( + default=None, + description="""The parameters needed to run the pre-defined metric.""", + ) + + +class PredefinedMetricSpecDict(TypedDict, total=False): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] + """The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""" + + metric_spec_parameters: Optional[dict] + """The parameters needed to run the pre-defined metric.""" + + +PredefinedMetricSpecOrDict = Union[PredefinedMetricSpec, PredefinedMetricSpecDict] + + +class RubricGenerationSpec(_common.BaseModel): + """Spec for generating rubrics.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Template for the prompt used to generate rubrics. + The details should be updated based on the most-recent recipe requirements.""", + ) + generator_model_config: Optional[genai_types.AutoraterConfig] = Field( + default=None, + description="""Configuration for the model used in rubric generation. + Configs including sampling count and base model can be specified here. + Flipping is not supported for rubric generation.""", + ) + rubric_content_type: Optional[RubricContentType] = Field( + default=None, description="""The type of rubric content to be generated.""" + ) + rubric_type_ontology: Optional[list[str]] = Field( + default=None, + description="""An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""", + ) + + +class RubricGenerationSpecDict(TypedDict, total=False): + """Spec for generating rubrics.""" + + prompt_template: Optional[str] + """Template for the prompt used to generate rubrics. + The details should be updated based on the most-recent recipe requirements.""" + + generator_model_config: Optional[genai_types.AutoraterConfigDict] + """Configuration for the model used in rubric generation. + Configs including sampling count and base model can be specified here. + Flipping is not supported for rubric generation.""" + + rubric_content_type: Optional[RubricContentType] + """The type of rubric content to be generated.""" + + rubric_type_ontology: Optional[list[str]] + """An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""" + + +RubricGenerationSpecOrDict = Union[RubricGenerationSpec, RubricGenerationSpecDict] + + +class LLMBasedMetricSpec(_common.BaseModel): + """Specification for an LLM based metric.""" + + metric_prompt_template: Optional[str] = Field( + default=None, description="""Template for the prompt sent to the judge model.""" + ) + system_instruction: Optional[str] = Field( + default=None, description="""System instruction for the judge model.""" + ) + judge_autorater_config: Optional[genai_types.AutoraterConfig] = Field( + default=None, + description="""Optional configuration for the judge LLM (Autorater).""", + ) + rubric_group_key: Optional[str] = Field( + default=None, + description="""Use a pre-defined group of rubrics associated with the input. 
+ Refers to a key in the rubric_groups map of EvaluationInstance.""", + ) + predefined_rubric_generation_spec: Optional[PredefinedMetricSpec] = Field( + default=None, + description="""Dynamically generate rubrics using a predefined spec.""", + ) + rubric_generation_spec: Optional[RubricGenerationSpec] = Field( + default=None, + description="""Dynamically generate rubrics using this specification.""", + ) + + +class LLMBasedMetricSpecDict(TypedDict, total=False): + """Specification for an LLM based metric.""" + + metric_prompt_template: Optional[str] + """Template for the prompt sent to the judge model.""" + + system_instruction: Optional[str] + """System instruction for the judge model.""" + + judge_autorater_config: Optional[genai_types.AutoraterConfigDict] + """Optional configuration for the judge LLM (Autorater).""" + + rubric_group_key: Optional[str] + """Use a pre-defined group of rubrics associated with the input. + Refers to a key in the rubric_groups map of EvaluationInstance.""" + + predefined_rubric_generation_spec: Optional[PredefinedMetricSpecDict] + """Dynamically generate rubrics using a predefined spec.""" + + rubric_generation_spec: Optional[RubricGenerationSpecDict] + """Dynamically generate rubrics using this specification.""" + + +LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict] + + +class UnifiedMetric(_common.BaseModel): + """The unified metric used for evaluation.""" + + bleu_spec: Optional[genai_types.BleuSpec] = Field( + default=None, description="""The Bleu metric spec.""" + ) + rouge_spec: Optional[genai_types.RougeSpec] = Field( + default=None, description="""The rouge metric spec.""" + ) + pointwise_metric_spec: Optional[genai_types.PointwiseMetricSpec] = Field( + default=None, description="""The pointwise metric spec.""" + ) + llm_based_metric_spec: Optional[LLMBasedMetricSpec] = Field( + default=None, description="""The spec for an LLM based metric.""" + ) + predefined_metric_spec: Optional[PredefinedMetricSpec] = Field( + default=None, description="""The spec for a pre-defined metric.""" + ) + + +class UnifiedMetricDict(TypedDict, total=False): + """The unified metric used for evaluation.""" + + bleu_spec: Optional[genai_types.BleuSpecDict] + """The Bleu metric spec.""" + + rouge_spec: Optional[genai_types.RougeSpecDict] + """The rouge metric spec.""" + + pointwise_metric_spec: Optional[genai_types.PointwiseMetricSpecDict] + """The pointwise metric spec.""" + + llm_based_metric_spec: Optional[LLMBasedMetricSpecDict] + """The spec for an LLM based metric.""" + + predefined_metric_spec: Optional[PredefinedMetricSpecDict] + """The spec for a pre-defined metric.""" + + +UnifiedMetricOrDict = Union[UnifiedMetric, UnifiedMetricDict] + + +class EvaluationRunMetric(_common.BaseModel): + """The metric used for evaluation run.""" + + metric: Optional[str] = Field( + default=None, description="""The name of the metric.""" + ) + metric_config: Optional[UnifiedMetric] = Field( + default=None, description="""The unified metric used for evaluation run.""" + ) + + +class EvaluationRunMetricDict(TypedDict, total=False): + """The metric used for evaluation run.""" + + metric: Optional[str] + """The name of the metric.""" + + metric_config: Optional[UnifiedMetricDict] + """The unified metric used for evaluation run.""" + + +EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict] + + +class EvaluationRunConfig(_common.BaseModel): + """The evaluation configuration used for the evaluation run.""" + + metrics: 
Optional[list[EvaluationRunMetric]] = Field( + default=None, + description="""The metrics to be calculated in the evaluation run.""", + ) + output_config: Optional[genai_types.OutputConfig] = Field( + default=None, description="""The output config for the evaluation run.""" + ) + autorater_config: Optional[genai_types.AutoraterConfig] = Field( + default=None, description="""The autorater config for the evaluation run.""" + ) + + +class EvaluationRunConfigDict(TypedDict, total=False): + """The evaluation configuration used for the evaluation run.""" + + metrics: Optional[list[EvaluationRunMetricDict]] + """The metrics to be calculated in the evaluation run.""" + + output_config: Optional[genai_types.OutputConfigDict] + """The output config for the evaluation run.""" + + autorater_config: Optional[genai_types.AutoraterConfigDict] + """The autorater config for the evaluation run.""" + + +EvaluationRunConfigOrDict = Union[EvaluationRunConfig, EvaluationRunConfigDict] + + class CreateEvaluationRunConfig(_common.BaseModel): """Config to create an evaluation run.""" @@ -989,7 +1217,7 @@ class _CreateEvaluationRunParameters(_common.BaseModel): data_source: Optional[EvaluationRunDataSource] = Field( default=None, description="""""" ) - evaluation_config: Optional[genai_types.EvaluationConfig] = Field( + evaluation_config: Optional[EvaluationRunConfig] = Field( default=None, description="""""" ) config: Optional[CreateEvaluationRunConfig] = Field( @@ -1012,7 +1240,7 @@ class _CreateEvaluationRunParametersDict(TypedDict, total=False): data_source: Optional[EvaluationRunDataSourceDict] """""" - evaluation_config: Optional[genai_types.EvaluationConfigDict] + evaluation_config: Optional[EvaluationRunConfigDict] """""" config: Optional[CreateEvaluationRunConfigDict] @@ -1684,6 +1912,9 @@ class EvaluationRun(_common.BaseModel): default=None, description="""The parsed EvaluationItem results for the evaluation run. This is only populated when include_evaluation_items is set to True.""", ) + evaluation_config: Optional[EvaluationRunConfig] = Field( + default=None, description="""The evaluation config for the evaluation run.""" + ) inference_configs: Optional[dict[str, "EvaluationRunInferenceConfig"]] = Field( default=None, description="""This field is experimental and may change in future versions. The inference configs for the evaluation run.""", @@ -1766,6 +1997,9 @@ class EvaluationRunDict(TypedDict, total=False): evaluation_item_results: Optional[EvaluationResultDict] """The parsed EvaluationItem results for the evaluation run. This is only populated when include_evaluation_items is set to True.""" + evaluation_config: Optional[EvaluationRunConfigDict] + """The evaluation config for the evaluation run.""" + inference_configs: Optional[dict[str, "EvaluationRunInferenceConfigDict"]] """This field is experimental and may change in future versions. The inference configs for the evaluation run.""" @@ -2643,55 +2877,6 @@ class EvaluateInstancesConfigDict(TypedDict, total=False): ] -class RubricGenerationSpec(_common.BaseModel): - """Spec for generating rubrics.""" - - prompt_template: Optional[str] = Field( - default=None, - description="""Template for the prompt used to generate rubrics. - The details should be updated based on the most-recent recipe requirements.""", - ) - generator_model_config: Optional[genai_types.AutoraterConfig] = Field( - default=None, - description="""Configuration for the model used in rubric generation. - Configs including sampling count and base model can be specified here. 
- Flipping is not supported for rubric generation.""", - ) - rubric_content_type: Optional[RubricContentType] = Field( - default=None, description="""The type of rubric content to be generated.""" - ) - rubric_type_ontology: Optional[list[str]] = Field( - default=None, - description="""An optional, pre-defined list of allowed types for generated rubrics. - If this field is provided, it implies `include_rubric_type` should be true, - and the generated rubric types should be chosen from this ontology.""", - ) - - -class RubricGenerationSpecDict(TypedDict, total=False): - """Spec for generating rubrics.""" - - prompt_template: Optional[str] - """Template for the prompt used to generate rubrics. - The details should be updated based on the most-recent recipe requirements.""" - - generator_model_config: Optional[genai_types.AutoraterConfigDict] - """Configuration for the model used in rubric generation. - Configs including sampling count and base model can be specified here. - Flipping is not supported for rubric generation.""" - - rubric_content_type: Optional[RubricContentType] - """The type of rubric content to be generated.""" - - rubric_type_ontology: Optional[list[str]] - """An optional, pre-defined list of allowed types for generated rubrics. - If this field is provided, it implies `include_rubric_type` should be true, - and the generated rubric types should be chosen from this ontology.""" - - -RubricGenerationSpecOrDict = Union[RubricGenerationSpec, RubricGenerationSpecDict] - - class RubricBasedMetricSpec(_common.BaseModel): """Specification for a metric that is based on rubrics.""" @@ -3762,26 +3947,6 @@ class EvaluateInstancesResponseDict(TypedDict, total=False): ] -class PredefinedMetricSpec(_common.BaseModel): - """Spec for predefined metric.""" - - metric_spec_name: Optional[str] = Field(default=None, description="""""") - metric_spec_parameters: Optional[dict] = Field(default=None, description="""""") - - -class PredefinedMetricSpecDict(TypedDict, total=False): - """Spec for predefined metric.""" - - metric_spec_name: Optional[str] - """""" - - metric_spec_parameters: Optional[dict] - """""" - - -PredefinedMetricSpecOrDict = Union[PredefinedMetricSpec, PredefinedMetricSpecDict] - - class RubricGenerationConfig(_common.BaseModel): """Config for generating rubrics."""
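
Usage sketch (illustrative only, not applied by this patch): the call below mirrors the disabled replay test above and shows how the new `metrics` argument could be passed to `create_evaluation_run`. The project, location, evaluation set resource name, and GCS bucket are placeholders; the `vertexai.Client(...)` construction and the import path are assumptions based on this SDK's module layout; and, per the TODO in the test file, the unified-metric backend may not yet be generally available.

import vertexai
from vertexai._genai import types

# Placeholder project/location; real credentials are required to run this.
client = vertexai.Client(project="my-project", location="us-central1")

# A pre-defined ("unified") metric, referenced by its metric spec name.
predefined_metric = types.EvaluationRunMetric(
    metric="universal_ar_v1",
    metric_config=types.UnifiedMetric(
        predefined_metric_spec=types.PredefinedMetricSpec(
            metric_spec_name="universal_ar_v1",
        )
    ),
)

# A custom LLM-judged metric driven by a prompt template.
fluency_metric = types.EvaluationRunMetric(
    metric="fluency",
    metric_config=types.UnifiedMetric(
        llm_based_metric_spec=types.LLMBasedMetricSpec(
            metric_prompt_template=(
                "Evaluate the fluency of the response. Provide a score from 1-5."
            )
        )
    ),
)

evaluation_run = client.evals.create_evaluation_run(
    name="my-eval-run",
    display_name="my-eval-run",
    dataset=types.EvaluationRunDataSource(
        # Placeholder resource name.
        evaluation_set="projects/PROJECT_NUMBER/locations/us-central1/evaluationSets/EVAL_SET_ID"
    ),
    dest="gs://my-bucket/eval_run_output",
    # Prebuilt RubricMetric constants are also accepted; they are resolved to
    # predefined metric specs by _resolve_evaluation_run_metrics.
    metrics=[
        predefined_metric,
        types.RubricMetric.FINAL_RESPONSE_QUALITY,
        fluency_metric,
    ],
)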