
Commit b48935b

fix: add Failed state for eval runs
1 parent 7fdfea3 commit b48935b

5 files changed: +148 additions, -11 deletions

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ class EvaluationStatus(IntEnum):
     PENDING = 0
     IN_PROGRESS = 1
     COMPLETED = 2
+    FAILED = 3


 def _discriminate_eval_set(
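
For reference, the enum after this change reads as below. This is a minimal sketch reconstructed from the diff context; the rest of the module is omitted.

from enum import IntEnum


class EvaluationStatus(IntEnum):
    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3  # new terminal state for eval runs whose agent execution errored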

src/uipath/_cli/_evals/_progress_reporter.py

Lines changed: 22 additions & 11 deletions
@@ -323,6 +323,7 @@ async def update_eval_run(
                 eval_run_id=sw_progress_item.eval_run_id,
                 execution_time=sw_progress_item.agent_execution_time,
                 actual_output=sw_progress_item.agent_output,
+                success=sw_progress_item.success,
                 is_coded=is_coded,
             )
         else:
@@ -332,6 +333,7 @@ async def update_eval_run(
                 eval_run_id=sw_progress_item.eval_run_id,
                 execution_time=sw_progress_item.agent_execution_time,
                 actual_output=sw_progress_item.agent_output,
+                success=sw_progress_item.success,
                 is_coded=is_coded,
             )

@@ -350,10 +352,11 @@ async def update_eval_set_run(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
         is_coded: bool = False,
+        success: bool = True,
     ):
         """Update the evaluation set run status to complete."""
         spec = self._update_eval_set_run_spec(
-            eval_set_run_id, evaluator_scores, is_coded
+            eval_set_run_id, evaluator_scores, is_coded, success
         )
         await self._client.request_async(
             method=spec.method,
@@ -498,9 +501,11 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
                 eval_set_run_id,
                 payload.evaluator_scores,
                 is_coded=is_coded,
+                success=payload.success,
             )
+            status_str = "completed" if payload.success else "failed"
             logger.debug(
-                f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
+                f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded}, status={status_str})"
             )
         else:
             logger.warning(
@@ -704,17 +709,19 @@ def _update_eval_run_spec(
         eval_run_id: str,
         actual_output: dict[str, Any],
         execution_time: float,
+        success: bool,
         is_coded: bool = False,
     ) -> RequestSpec:
         # For legacy evaluations, endpoint is without /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "result": {
                 "output": dict(actual_output),
                 "evaluatorScores": evaluator_scores,
@@ -739,18 +746,20 @@ def _update_coded_eval_run_spec(
         eval_run_id: str,
         actual_output: dict[str, Any],
         execution_time: float,
+        success: bool,
         is_coded: bool = False,
     ) -> RequestSpec:
         """Create update spec for coded evaluators."""
         # For coded evaluations, endpoint has /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "result": {
                 "output": dict(actual_output),
                 "scores": evaluator_scores,
@@ -870,6 +879,7 @@ def _update_eval_set_run_spec(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
         is_coded: bool = False,
+        success: bool = True,
     ) -> RequestSpec:
         # Legacy API expects evaluatorId as GUID, coded accepts string
         evaluator_scores_list = []
@@ -894,12 +904,13 @@ def _update_eval_set_run_spec(
         # For legacy evaluations, endpoint is without /coded
         endpoint_suffix = "coded/" if is_coded else ""

+        # Determine status based on success
+        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+
         payload: dict[str, Any] = {
             "evalSetRunId": eval_set_run_id,
             # For coded evaluations, use integer status; for legacy, use string
-            "status": EvaluationStatus.COMPLETED.value
-            if is_coded
-            else self._status_to_string(EvaluationStatus.COMPLETED),
+            "status": status.value if is_coded else self._status_to_string(status),
             "evaluatorScores": evaluator_scores_list,
         }
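
Each spec builder above repeats the same pattern: pick COMPLETED or FAILED from the new success flag, then serialize it as an integer for coded endpoints and as a string for legacy endpoints. Below is a minimal sketch of that selection logic. serialize_status is a hypothetical free function used only for illustration, and _status_to_string is a stand-in for the reporter's helper, whose full mapping is not shown in this diff; the tests only confirm the "Completed" and "Failed" strings.

from enum import IntEnum
from typing import Union


class EvaluationStatus(IntEnum):
    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3


def _status_to_string(status: EvaluationStatus) -> str:
    # Hypothetical stand-in; only "Completed" and "Failed" are confirmed by the tests.
    return {
        EvaluationStatus.PENDING: "Pending",
        EvaluationStatus.IN_PROGRESS: "InProgress",
        EvaluationStatus.COMPLETED: "Completed",
        EvaluationStatus.FAILED: "Failed",
    }[status]


def serialize_status(success: bool, is_coded: bool) -> Union[int, str]:
    # Coded endpoints take the integer enum value; legacy endpoints take the string name.
    status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
    return status.value if is_coded else _status_to_string(status)


# serialize_status(True, True) == 2, serialize_status(False, True) == 3
# serialize_status(False, False) == "Failed"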

src/uipath/_cli/_evals/_runtime.py

Lines changed: 10 additions & 0 deletions
@@ -260,7 +260,16 @@ async def execute(self) -> UiPathRuntimeResult:
         evaluator_averages: dict[str, float] = defaultdict(float)
         evaluator_count: dict[str, int] = defaultdict(int)

+        # Check if any eval runs failed
+        any_failed = False
         for eval_run_result in results.evaluation_set_results:
+            # Check if the agent execution had an error
+            if (
+                eval_run_result.agent_execution_output
+                and eval_run_result.agent_execution_output.result.error
+            ):
+                any_failed = True
+
             for result_dto in eval_run_result.evaluation_run_results:
                 evaluator_averages[result_dto.evaluator_id] += result_dto.result.score
                 evaluator_count[result_dto.evaluator_id] += 1
@@ -274,6 +283,7 @@ async def execute(self) -> UiPathRuntimeResult:
                 EvalSetRunUpdatedEvent(
                     execution_id=self.execution_id,
                     evaluator_scores=evaluator_averages,
+                    success=not any_failed,
                 ),
                 wait_for_completion=False,
             )
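
In other words, a single errored agent execution marks the whole eval set run as failed. The sketch below mirrors that aggregation using hypothetical stand-in result objects rather than the real runtime types, which are not shown in this diff.

from dataclasses import dataclass, field
from typing import Optional


# Hypothetical stand-ins for the runtime's result objects.
@dataclass
class ExecutionResult:
    error: Optional[str] = None


@dataclass
class AgentExecutionOutput:
    result: ExecutionResult = field(default_factory=ExecutionResult)


@dataclass
class EvalRunResult:
    agent_execution_output: Optional[AgentExecutionOutput] = None


def any_run_failed(eval_run_results: list[EvalRunResult]) -> bool:
    # Mirrors the check added above: the set run fails if any agent execution reported an error.
    return any(
        r.agent_execution_output and r.agent_execution_output.result.error
        for r in eval_run_results
    )


runs = [
    EvalRunResult(AgentExecutionOutput(ExecutionResult(error=None))),
    EvalRunResult(AgentExecutionOutput(ExecutionResult(error="timeout"))),
]
assert any_run_failed(runs) is True  # so the event is emitted with success=False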

src/uipath/_events/_events.py

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ def validate_exception_details(self):
 class EvalSetRunUpdatedEvent(BaseModel):
     execution_id: str
     evaluator_scores: dict[str, float]
+    success: bool = True


 ProgressEvent = Union[
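
Because success defaults to True, existing emitters of EvalSetRunUpdatedEvent keep their current behavior; only the eval runtime needs to pass the flag explicitly. A quick illustration with made-up IDs and scores (the model itself is taken from the diff above):

from pydantic import BaseModel


class EvalSetRunUpdatedEvent(BaseModel):
    execution_id: str
    evaluator_scores: dict[str, float]
    success: bool = True


# Call sites that omit `success` still report a completed run.
ok = EvalSetRunUpdatedEvent(execution_id="exec-1", evaluator_scores={"eval-1": 0.9})
assert ok.success is True

# The runtime now passes success=not any_failed when an agent execution errored.
failed = EvalSetRunUpdatedEvent(
    execution_id="exec-1",
    evaluator_scores={"eval-1": 0.0},
    success=False,
)
assert failed.success is False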

tests/cli/eval/test_progress_reporter.py

Lines changed: 114 additions & 0 deletions
@@ -9,6 +9,7 @@
 """

 import json
+from typing import Any
 from unittest.mock import AsyncMock, Mock, patch

 import pytest
@@ -282,6 +283,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
             eval_run_id="test-run-id",
             actual_output={"result": "success"},
             execution_time=5.5,
+            success=True,
             is_coded=True,
         )

@@ -291,6 +293,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
         assert spec.json["evaluatorRuns"] == evaluator_runs
         assert spec.json["result"]["scores"] == evaluator_scores
         assert spec.json["completionMetrics"]["duration"] == 5
+        assert spec.json["status"] == 2  # COMPLETED

     def test_update_legacy_eval_run_spec(self, progress_reporter):
         """Test updating eval run spec for legacy evaluators."""
@@ -305,6 +308,7 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
             eval_run_id="test-run-id",
             actual_output={"result": "success"},
             execution_time=5.5,
+            success=True,
             is_coded=False,
         )

@@ -314,6 +318,47 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
         assert spec.json["assertionRuns"] == assertion_runs
         assert spec.json["result"]["evaluatorScores"] == evaluator_scores
         assert spec.json["completionMetrics"]["duration"] == 5
+        assert spec.json["status"] == "Completed"  # String format for legacy
+
+    def test_update_coded_eval_run_spec_with_failure(self, progress_reporter):
+        """Test updating eval run spec for coded evaluators with failure."""
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        spec = progress_reporter._update_coded_eval_run_spec(
+            evaluator_runs=evaluator_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id="test-run-id",
+            actual_output={},
+            execution_time=0.0,
+            success=False,
+            is_coded=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalRunId"] == "test-run-id"
+        assert spec.json["status"] == 3  # FAILED
+
+    def test_update_legacy_eval_run_spec_with_failure(self, progress_reporter):
+        """Test updating eval run spec for legacy evaluators with failure."""
+        assertion_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        spec = progress_reporter._update_eval_run_spec(
+            assertion_runs=assertion_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id="test-run-id",
+            actual_output={},
+            execution_time=0.0,
+            success=False,
+            is_coded=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalRunId"] == "test-run-id"
+        assert spec.json["status"] == "Failed"  # String format for legacy


 # Tests for custom eval set run ID handling
@@ -421,3 +466,72 @@ def test_eval_set_run_created_event_initialization_without_custom_run_id(self):

         # Assert
         assert event.eval_set_run_id is None
+
+
+# Tests for eval set run status updates
+class TestEvalSetRunStatusUpdates:
+    """Tests for handling eval set run status updates (completed vs failed)."""
+
+    def test_update_eval_set_run_spec_with_success_coded(self, progress_reporter):
+        """Test updating eval set run spec for coded evaluators with success=True."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=True,
+            success=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == 2  # COMPLETED = 2
+
+    def test_update_eval_set_run_spec_with_failure_coded(self, progress_reporter):
+        """Test updating eval set run spec for coded evaluators with success=False."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=True,
+            success=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == 3  # FAILED = 3
+
+    def test_update_eval_set_run_spec_with_success_legacy(self, progress_reporter):
+        """Test updating eval set run spec for legacy evaluators with success=True."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=False,
+            success=True,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == "Completed"  # String format for legacy
+
+    def test_update_eval_set_run_spec_with_failure_legacy(self, progress_reporter):
+        """Test updating eval set run spec for legacy evaluators with success=False."""
+        evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}
+
+        spec = progress_reporter._update_eval_set_run_spec(
+            eval_set_run_id="test-run-id",
+            evaluator_scores=evaluator_scores,
+            is_coded=False,
+            success=False,
+        )
+
+        assert spec.method == "PUT"
+        assert "coded/" not in spec.endpoint
+        assert spec.json["evalSetRunId"] == "test-run-id"
+        assert spec.json["status"] == "Failed"  # String format for legacy
