
Commit c69dcf8

ankursharmas authored and copybara-github committed
feat: Added a new FastAPI endpoint to serve eval metric info
This endpoint could be used by ADK Web to dynamically know:

- what eval metrics are available in an app,
- a description of those metrics, and
- the value range supported by those metrics.

We also update the metric registry to make it mandatory to supply these details. The goal is to improve the usability and interpretability of the eval metrics.

PiperOrigin-RevId: 787277695
1 parent ec7d9b0 commit c69dcf8

16 files changed: +393 -53 lines changed
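For context, here is a minimal sketch of how a client such as ADK Web might call the new endpoint. The base URL, app name, and use of the requests library are illustrative assumptions, not part of this commit:

# Sketch only: query the new eval-metrics endpoint of a locally running ADK
# web server. BASE_URL and APP_NAME are assumptions for illustration.
import requests

BASE_URL = "http://localhost:8000"  # assumed local ADK web server address
APP_NAME = "my_agent_app"           # hypothetical app name

resp = requests.get(f"{BASE_URL}/apps/{APP_NAME}/eval_metrics", timeout=10)
resp.raise_for_status()

for metric in resp.json():
    # Keys are camelCase because MetricInfo uses a to_camel alias generator.
    interval = (metric.get("metricValueInfo") or {}).get("interval") or {}
    print(
        metric["metricName"],
        "range:",
        interval.get("minValue"),
        "to",
        interval.get("maxValue"),
    )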

src/google/adk/cli/adk_web_server.py

Lines changed: 19 additions & 0 deletions
@@ -64,6 +64,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import MetricInfo
 from ..evaluation.eval_result import EvalSetResult
 from ..evaluation.eval_set_results_manager import EvalSetResultsManager
 from ..evaluation.eval_sets_manager import EvalSetsManager
@@ -697,6 +698,24 @@ def list_eval_results(app_name: str) -> list[str]:
       """Lists all eval results for the given app."""
       return self.eval_set_results_manager.list_eval_set_results(app_name)

+    @app.get(
+        "/apps/{app_name}/eval_metrics",
+        response_model_exclude_none=True,
+    )
+    def list_eval_metrics(app_name: str) -> list[MetricInfo]:
+      """Lists all eval metrics for the given app."""
+      try:
+        from ..evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
+
+        # Right now we ignore the app_name as eval metrics are not tied to the
+        # app_name, but they could be moving forward.
+        return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
+      except ModuleNotFoundError as e:
+        logger.exception("%s\n%s", MISSING_EVAL_DEPENDENCIES_MESSAGE, e)
+        raise HTTPException(
+            status_code=400, detail=MISSING_EVAL_DEPENDENCIES_MESSAGE
+        ) from e
+
     @app.delete("/apps/{app_name}/users/{user_id}/sessions/{session_id}")
     async def delete_session(app_name: str, user_id: str, session_id: str):
       await self.session_service.delete_session(

src/google/adk/evaluation/eval_metrics.py

Lines changed: 93 additions & 16 deletions
@@ -49,16 +49,22 @@ class JudgeModelOptions(BaseModel):

   judge_model: str = Field(
       default="gemini-2.5-flash",
-      description="""The judge model to use for evaluation. It can be a model name.""",
+      description=(
+          "The judge model to use for evaluation. It can be a model name."
+      ),
   )

   judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
-      default=None, description="""The configuration for the judge model."""
+      default=None,
+      description="The configuration for the judge model.",
   )

   num_samples: Optional[int] = Field(
       default=None,
-      description="""The number of times to sample the model for each invocation evaluation.""",
+      description=(
+          "The number of times to sample the model for each invocation"
+          " evaluation."
+      ),
   )


@@ -70,15 +76,20 @@ class EvalMetric(BaseModel):
       populate_by_name=True,
   )

-  metric_name: str
-  """The name of the metric."""
+  metric_name: str = Field(
+      description="The name of the metric.",
+  )

-  threshold: float
-  """A threshold value. Each metric decides how to interpret this threshold."""
+  threshold: float = Field(
+      description=(
+          "A threshold value. Each metric decides how to interpret this"
+          " threshold."
+      ),
+  )

   judge_model_options: Optional[JudgeModelOptions] = Field(
       default=None,
-      description="""Options for the judge model.""",
+      description="Options for the judge model.",
   )


@@ -90,8 +101,14 @@ class EvalMetricResult(EvalMetric):
       populate_by_name=True,
   )

-  score: Optional[float] = None
-  eval_status: EvalStatus
+  score: Optional[float] = Field(
+      default=None,
+      description=(
+          "Score obtained after evaluating the metric. Optional, as evaluation"
+          " might not have happened."
+      ),
+  )
+  eval_status: EvalStatus = Field(description="The status of this evaluation.")


 class EvalMetricResultPerInvocation(BaseModel):
@@ -102,11 +119,71 @@ class EvalMetricResultPerInvocation(BaseModel):
       populate_by_name=True,
   )

-  actual_invocation: Invocation
-  """The actual invocation, usually obtained by inferencing the agent."""
+  actual_invocation: Invocation = Field(
+      description=(
+          "The actual invocation, usually obtained by inferencing the agent."
+      )
+  )
+
+  expected_invocation: Invocation = Field(
+      description=(
+          "The expected invocation, usually the reference or golden invocation."
+      )
+  )

-  expected_invocation: Invocation
-  """The expected invocation, usually the reference or golden invocation."""
+  eval_metric_results: list[EvalMetricResult] = Field(
+      default=[],
+      description="Eval resutls for each applicable metric.",
+  )
+
+
+class Interval(BaseModel):
+  """Represents a range of numeric values, e.g. [0 ,1] or (2,3) or [-1, 6)."""
+
+  min_value: float = Field(description="The smaller end of the interval.")
+
+  open_at_min: bool = Field(
+      default=False,
+      description=(
+          "The interval is Open on the min end. The default value is False,"
+          " which means that we assume that the interval is Closed."
+      ),
+  )
+
+  max_value: float = Field(description="The larger end of the interval.")
+
+  open_at_max: bool = Field(
+      default=False,
+      description=(
+          "The interval is Open on the max end. The default value is False,"
+          " which means that we assume that the interval is Closed."
+      ),
+  )

-  eval_metric_results: list[EvalMetricResult] = []
-  """Eval resutls for each applicable metric."""
+
+class MetricValueInfo(BaseModel):
+  """Information about the type of metric value."""
+
+  interval: Optional[Interval] = Field(
+      default=None,
+      description="The values represented by the metric are of type interval.",
+  )
+
+
+class MetricInfo(BaseModel):
+  """Information about the metric that are used for Evals."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str = Field(description="The name of the metric.")
+
+  description: str = Field(
+      default=None, description="A 2 to 3 line description of the metric."
+  )
+
+  metric_value_info: MetricValueInfo = Field(
+      description="Information on the nature of values supported by the metric."
+  )
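A minimal sketch of how the new models compose, assuming they are importable from google.adk.evaluation.eval_metrics as added above; the metric name and description are made up for illustration:

# Sketch only: build a MetricInfo for a hypothetical metric whose values lie in
# the closed interval [0, 1], then serialize it with the camelCase aliases.
from google.adk.evaluation.eval_metrics import (
    Interval,
    MetricInfo,
    MetricValueInfo,
)

info = MetricInfo(
    metric_name="example_score",  # hypothetical metric name
    description="Illustrative metric; values closer to 1 are more desirable.",
    metric_value_info=MetricValueInfo(
        interval=Interval(min_value=0.0, max_value=1.0)  # closed on both ends
    ),
)

# populate_by_name allows snake_case construction above; the to_camel alias
# generator makes the serialized form camelCase, which is what the web
# endpoint returns.
print(info.model_dump_json(by_alias=True, exclude_none=True))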

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 22 additions & 1 deletion
@@ -22,18 +22,39 @@

 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult


 class RougeEvaluator(Evaluator):
-  """Calculates the ROUGE-1 metric to compare responses."""
+  """Evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric.
+
+  Value range for this metric is [0,1], with values closer to 1 more desirable.
+  """

   def __init__(self, eval_metric: EvalMetric):
     self._eval_metric = eval_metric

+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+        description=(
+            "This metric evaluates if the agent's final response matches a"
+            " golden/expected final response using Rouge_1 metric. Value range"
+            " for this metric is [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
   @override
   def evaluate_invocations(
       self,
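A small usage sketch of the new static method, assuming RougeEvaluator is importable from google.adk.evaluation.final_response_match_v1 as in this diff:

# Sketch only: inspect the metric metadata that RougeEvaluator now advertises.
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator

info = RougeEvaluator.get_metric_info()
print(info.metric_name)  # the PrebuiltMetrics.RESPONSE_MATCH_SCORE value
print(info.description)  # short human-readable description of the metric
interval = info.metric_value_info.interval
print(interval.min_value, interval.max_value)  # 0.0 1.0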

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 19 additions & 2 deletions
@@ -24,6 +24,10 @@
 from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
@@ -146,6 +150,20 @@ def __init__(
     if self._eval_metric.judge_model_options.num_samples is None:
       self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES

+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+        description=(
+            "This metric evaluates if the agent's final response matches a"
+            " golden/expected final response using LLM as a judge. Value range"
+            " for this metric is [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
   @override
   def format_auto_rater_prompt(
       self, actual_invocation: Invocation, expected_invocation: Invocation
@@ -185,8 +203,7 @@ def aggregate_per_invocation_samples(
     tie, consider the result to be invalid.

     Args:
-      per_invocation_samples: Samples of per-invocation results to
-        aggregate.
+      per_invocation_samples: Samples of per-invocation results to aggregate.

     Returns:
       If there is a majority of valid results, return the first valid result.
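The docstring above describes the aggregation rule in prose; the standalone sketch below illustrates that rule, where Sample and is_valid are simplified stand-ins rather than the ADK types used by FinalResponseMatchV2Evaluator:

# Sketch only: majority-vote aggregation as described in the docstring. If a
# strict majority of samples is valid, return the first valid sample; on a tie
# or a majority of invalid samples, report no result.
from typing import Callable, Optional, TypeVar

Sample = TypeVar("Sample")


def aggregate_samples(
    samples: list[Sample], is_valid: Callable[[Sample], bool]
) -> Optional[Sample]:
    valid = [s for s in samples if is_valid(s)]
    invalid_count = len(samples) - len(valid)
    if len(valid) > invalid_count:  # strict majority of valid samples
        return valid[0]             # first valid result wins
    return None                     # tie or mostly invalid -> treated as invalid


# Example: None marks an invalid sample.
print(aggregate_samples([0.7, None, 0.9], is_valid=lambda s: s is not None))  # 0.7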

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 30 additions & 10 deletions
@@ -17,7 +17,9 @@
 import logging

 from ..errors.not_found_error import NotFoundError
+from ..utils.feature_decorator import experimental
 from .eval_metrics import EvalMetric
+from .eval_metrics import MetricInfo
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
@@ -29,10 +31,11 @@
 logger = logging.getLogger("google_adk." + __name__)


+@experimental
 class MetricEvaluatorRegistry:
   """A registry for metric Evaluators."""

-  _registry: dict[str, type[Evaluator]] = {}
+  _registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}

   def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     """Returns an Evaluator for the given metric.
@@ -48,15 +51,18 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     if eval_metric.metric_name not in self._registry:
       raise NotFoundError(f"{eval_metric.metric_name} not found in registry.")

-    return self._registry[eval_metric.metric_name](eval_metric=eval_metric)
+    return self._registry[eval_metric.metric_name][0](eval_metric=eval_metric)

   def register_evaluator(
-      self, metric_name: MetricName, evaluator: type[Evaluator]
+      self,
+      metric_info: MetricInfo,
+      evaluator: type[Evaluator],
   ):
-    """Registers an evaluator given the metric name.
+    """Registers an evaluator given the metric info.

     If a mapping already exist, then it is updated.
     """
+    metric_name = metric_info.metric_name
     if metric_name in self._registry:
       logger.info(
           "Updating Evaluator class for %s from %s to %s",
@@ -65,31 +71,45 @@
           evaluator,
       )

-    self._registry[str(metric_name)] = evaluator
+    self._registry[str(metric_name)] = (evaluator, metric_info)
+
+  def get_registered_metrics(
+      self,
+  ) -> list[MetricInfo]:
+    """Returns a list of MetricInfo about the metrics registered so far."""
+    return [
+        evaluator_and_metric_info[1].model_copy(deep=True)
+        for _, evaluator_and_metric_info in self._registry.items()
+    ]


 def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   """Returns an instance of MetricEvaluatorRegistry with standard metrics already registered in it."""
   metric_evaluator_registry = MetricEvaluatorRegistry()

   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      metric_info=TrajectoryEvaluator.get_metric_info(),
       evaluator=TrajectoryEvaluator,
   )
+
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      metric_info=ResponseEvaluator.get_metric_info(
+          PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value
+      ),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      metric_info=ResponseEvaluator.get_metric_info(
+          PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
+      ),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      metric_info=SafetyEvaluatorV1.get_metric_info(),
       evaluator=SafetyEvaluatorV1,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
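A minimal sketch of the updated registration flow with a hypothetical custom evaluator; the CustomEvaluator class and its metric name are invented for illustration, and the import paths assume the modules touched in this commit:

# Sketch only: register a hypothetical evaluator together with its MetricInfo,
# then list what the registry (and hence the new endpoint) would report.
from google.adk.evaluation.eval_metrics import Interval, MetricInfo, MetricValueInfo
from google.adk.evaluation.evaluator import Evaluator
from google.adk.evaluation.metric_evaluator_registry import (
    DEFAULT_METRIC_EVALUATOR_REGISTRY,
)


class CustomEvaluator(Evaluator):
    """Hypothetical evaluator; a real one would also override evaluate_invocations."""

    def __init__(self, eval_metric):
        self._eval_metric = eval_metric


DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
    metric_info=MetricInfo(
        metric_name="custom_score",  # hypothetical metric name
        description="Illustrative custom metric with values in [0, 1].",
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    ),
    evaluator=CustomEvaluator,
)

for metric_info in DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics():
    print(metric_info.metric_name)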
