
Commit b48935b

fix: add Failed state for eval runs
1 parent 7fdfea3 commit b48935b

5 files changed: +148 additions, -11 deletions

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ class EvaluationStatus(IntEnum):
     PENDING = 0
     IN_PROGRESS = 1
     COMPLETED = 2
+    FAILED = 3


 def _discriminate_eval_set(
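
For reference, the enum after this change reads as below. This is a minimal sketch reconstructed from the diff context; the rest of the module is omitted.

from enum import IntEnum


class EvaluationStatus(IntEnum):
    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3  # new terminal state for eval runs whose agent execution errored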

src/uipath/_cli/_evals/_progress_reporter.py

Lines changed: 22 additions & 11 deletions
@@ -323,6 +323,7 @@ async def update_eval_run(
                 eval_run_id=sw_progress_item.eval_run_id,
                 execution_time=sw_progress_item.agent_execution_time,
                 actual_output=sw_progress_item.agent_output,
+                success=sw_progress_item.success,
                 is_coded=is_coded,
             )
         else:
@@ -332,6 +333,7 @@ async def update_eval_run(
                 eval_run_id=sw_progress_item.eval_run_id,
                 execution_time=sw_progress_item.agent_execution_time,
                 actual_output=sw_progress_item.agent_output,
+                success=sw_progress_item.success,
                 is_coded=is_coded,
             )

@@ -350,10 +352,11 @@ async def update_eval_set_run(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
         is_coded: bool = False,
+        success: bool = True,
     ):
         """Update the evaluation set run status to complete."""
         spec = self._update_eval_set_run_spec(
-            eval_set_run_id, evaluator_scores, is_coded
+            eval_set_run_id, evaluator_scores, is_coded, success
         )
         await self._client.request_async(
             method=spec.method,
@@ -498,9 +501,11 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
                 eval_set_run_id,
                 payload.evaluator_scores,
                 is_coded=is_coded,
+                success=payload.success,
             )
+            status_str = "completed" if payload.success else "failed"
             logger.debug(
-                f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
+                f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded}, status={status_str})"
             )
         else:
             logger.warning(
@@ -704,17 +709,19 @@ def _update_eval_run_spec(
         eval_run_id: str,
         actual_output: dict[str, Any],
         execution_time: float,
+        success: bool,
         is_coded: bool = False,
     ) -> RequestSpec:
         # For legacy evaluations, endpoint is without /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "result": {
                 "output": dict(actual_output),
                 "evaluatorScores": evaluator_scores,
@@ -739,18 +746,20 @@ def _update_coded_eval_run_spec(
         eval_run_id: str,
         actual_output: dict[str, Any],
         execution_time: float,
+        success: bool,
         is_coded: bool = False,
     ) -> RequestSpec:
         """Create update spec for coded evaluators."""
         # For coded evaluations, endpoint has /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "result": {
                 "output": dict(actual_output),
                 "scores": evaluator_scores,
@@ -870,6 +879,7 @@ def _update_eval_set_run_spec(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
         is_coded: bool = False,
+        success: bool = True,
     ) -> RequestSpec:
         # Legacy API expects evaluatorId as GUID, coded accepts string
         evaluator_scores_list = []
@@ -894,12 +904,13 @@ def _update_eval_set_run_spec(
         # For legacy evaluations, endpoint is without /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalSetRunId": eval_set_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "evaluatorScores": evaluator_scores_list,
         }
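
Each spec builder above repeats the same pattern: pick COMPLETED or FAILED from the new success flag, then serialize it as an integer for coded endpoints and as a string for legacy endpoints. Below is a minimal sketch of that selection logic. serialize_status is a hypothetical free function used only for illustration, and _status_to_string is a stand-in for the reporter's helper, whose full mapping is not shown in this diff; the tests only confirm the "Completed" and "Failed" strings.

from enum import IntEnum
from typing import Union


class EvaluationStatus(IntEnum):
    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3


def _status_to_string(status: EvaluationStatus) -> str:
    # Hypothetical stand-in; only "Completed" and "Failed" are confirmed by the tests.
    return {
        EvaluationStatus.PENDING: "Pending",
        EvaluationStatus.IN_PROGRESS: "InProgress",
        EvaluationStatus.COMPLETED: "Completed",
        EvaluationStatus.FAILED: "Failed",
    }[status]


def serialize_status(success: bool, is_coded: bool) -> Union[int, str]:
    # Coded endpoints take the integer enum value; legacy endpoints take the string name.
    status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
    return status.value if is_coded else _status_to_string(status)


# serialize_status(True, True) == 2, serialize_status(False, True) == 3
# serialize_status(False, False) == "Failed"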

src/uipath/_cli/_evals/_runtime.py

Lines changed: 10 additions & 0 deletions
@@ -260,7 +260,16 @@ async def execute(self) -> UiPathRuntimeResult:
         evaluator_averages: dict[str, float] = defaultdict(float)
         evaluator_count: dict[str, int] = defaultdict(int)

+        # Check if any eval runs failed
+        any_failed = False
         for eval_run_result in results.evaluation_set_results:
+            # Check if the agent execution had an error
+            if (
+                eval_run_result.agent_execution_output
+                and eval_run_result.agent_execution_output.result.error
+            ):
+                any_failed = True
+
             for result_dto in eval_run_result.evaluation_run_results:
                 evaluator_averages[result_dto.evaluator_id] += result_dto.result.score
                 evaluator_count[result_dto.evaluator_id] += 1
@@ -274,6 +283,7 @@ async def execute(self) -> UiPathRuntimeResult:
                 EvalSetRunUpdatedEvent(
                     execution_id=self.execution_id,
                     evaluator_scores=evaluator_averages,
+                    success=not any_failed,
                 ),
                 wait_for_completion=False,
             )
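
In other words, a single errored agent execution marks the whole eval set run as failed. The sketch below mirrors that aggregation using hypothetical stand-in result objects rather than the real runtime types, which are not shown in this diff.

from dataclasses import dataclass, field
from typing import Optional


# Hypothetical stand-ins for the runtime's result objects.
@dataclass
class ExecutionResult:
    error: Optional[str] = None


@dataclass
class AgentExecutionOutput:
    result: ExecutionResult = field(default_factory=ExecutionResult)


@dataclass
class EvalRunResult:
    agent_execution_output: Optional[AgentExecutionOutput] = None


def any_run_failed(eval_run_results: list[EvalRunResult]) -> bool:
    # Mirrors the check added above: the set run fails if any agent execution reported an error.
    return any(
        r.agent_execution_output and r.agent_execution_output.result.error
        for r in eval_run_results
    )


runs = [
    EvalRunResult(AgentExecutionOutput(ExecutionResult(error=None))),
    EvalRunResult(AgentExecutionOutput(ExecutionResult(error="timeout"))),
]
assert any_run_failed(runs) is True  # so the event is emitted with success=False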

src/uipath/_events/_events.py

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ def validate_exception_details(self):
 class EvalSetRunUpdatedEvent(BaseModel):
     execution_id: str
     evaluator_scores: dict[str, float]
+    success: bool = True


 ProgressEvent = Union[
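
Because success defaults to True, existing emitters of EvalSetRunUpdatedEvent keep their current behavior; only the eval runtime needs to pass the flag explicitly. A quick illustration with made-up IDs and scores (the model itself is taken from the diff above):

from pydantic import BaseModel


class EvalSetRunUpdatedEvent(BaseModel):
    execution_id: str
    evaluator_scores: dict[str, float]
    success: bool = True


# Call sites that omit `success` still report a completed run.
ok = EvalSetRunUpdatedEvent(execution_id="exec-1", evaluator_scores={"eval-1": 0.9})
assert ok.success is True

# The runtime now passes success=not any_failed when an agent execution errored.
failed = EvalSetRunUpdatedEvent(
    execution_id="exec-1",
    evaluator_scores={"eval-1": 0.0},
    success=False,
)
assert failed.success is False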

tests/cli/eval/test_progress_reporter.py

Lines changed: 114 additions & 0 deletions
@@ -9,6 +9,7 @@
 """

 import json
+from typing import Any
 from unittest.mock import AsyncMock, Mock, patch

 import pytest
@@ -282,6 +283,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
             eval_run_id="test-run-id",
             actual_output={"result": "success"},
             execution_time=5.5,
+            success=True,
             is_coded=True,
         )

@@ -291,6 +293,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
         assert spec.json["evaluatorRuns"] == evaluator_runs
         assert spec.json["result"]["scores"] == evaluator_scores
         assert spec.json["completionMetrics"]["duration"] == 5
+        assert spec.json["status"] == 2  # COMPLETED

     def test_update_legacy_eval_run_spec(self, progress_reporter):
         """Test updating eval run spec for legacy evaluators."""
@@ -305,6 +308,7 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
             eval_run_id="test-run-id",
             actual_output={"result": "success"},
             execution_time=5.5,
+            success=True,
             is_coded=False,
         )

@@ -314,6 +318,47 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
         assert spec.json["assertionRuns"] == assertion_runs
         assert spec.json["result"]["evaluatorScores"] == evaluator_scores
         assert spec.json["completionMetrics"]["duration"] == 5
+        assert spec.json["status"] == "Completed"  # String format for legacy
+
+    def test_update_coded_eval_run_spec_with_failure(self, progress_reporter):
+        """Test updating eval run spec for coded evaluators with failure."""
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        spec = progress_reporter._update_coded_eval_run_spec(
+            evaluator_runs=evaluator_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id="test-run-id",
+            actual_output={},
+            execution_time=0.0,
+            success=False,
+            is_coded=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalRunId"] == "test-run-id"
+        assert spec.json["status"] == 3  # FAILED
+
+    def test_update_legacy_eval_run_spec_with_failure(self, progress_reporter):
+        """Test updating eval run spec for legacy evaluators with failure."""
+        assertion_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        spec = progress_reporter._update_eval_run_spec(
+            assertion_runs=assertion_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id="test-run-id",
+            actual_output={},
+            execution_time=0.0,
+            success=False,
+            is_coded=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalRunId"] == "test-run-id"
+        assert spec.json["status"] == "Failed"  # String format for legacy


 # Tests for custom eval set run ID handling
@@ -421,3 +466,72 @@ def test_eval_set_run_created_event_initialization_without_custom_run_id(self):

         # Assert
         assert event.eval_set_run_id is None
+
+
+# Tests for eval set run status updates
+class TestEvalSetRunStatusUpdates:
+    """Tests for handling eval set run status updates (completed vs failed)."""
+
+    def test_update_eval_set_run_spec_with_success_coded(self, progress_reporter):
+        """Test updating eval set run spec for coded evaluators with success=True."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=True,
+            success=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == 2  # COMPLETED = 2
+
+    def test_update_eval_set_run_spec_with_failure_coded(self, progress_reporter):
+        """Test updating eval set run spec for coded evaluators with success=False."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=True,
+            success=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == 3  # FAILED = 3
+
+    def test_update_eval_set_run_spec_with_success_legacy(self, progress_reporter):
+        """Test updating eval set run spec for legacy evaluators with success=True."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=False,
+            success=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == "Completed"  # String format for legacy
+
+    def test_update_eval_set_run_spec_with_failure_legacy(self, progress_reporter):
+        """Test updating eval set run spec for legacy evaluators with success=False."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=False,
+            success=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == "Failed"  # String format for legacy
