1 change: 1 addition & 0 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -261,6 +261,7 @@ class EvaluationStatus(IntEnum):
PENDING = 0
IN_PROGRESS = 1
COMPLETED = 2
FAILED = 3


def _discriminate_eval_set(
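Not part of the diff: a minimal, self-contained sketch of the extended status enum and of how a boolean run outcome might map onto it. The `status_for_run` helper below is hypothetical and only mirrors the `FAILED = 3` addition above; the real serialization lives in `_progress_reporter.py`.

```python
from enum import IntEnum


class EvaluationStatus(IntEnum):
    """Mirror of the SDK enum after this PR; FAILED = 3 is the new member."""

    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3


def status_for_run(success: bool) -> EvaluationStatus:
    """Hypothetical helper: pick the terminal status from a boolean outcome."""
    return EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED


assert status_for_run(True) is EvaluationStatus.COMPLETED
assert status_for_run(False).value == 3
```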
33 changes: 22 additions & 11 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -323,6 +323,7 @@ async def update_eval_run(
eval_run_id=sw_progress_item.eval_run_id,
execution_time=sw_progress_item.agent_execution_time,
actual_output=sw_progress_item.agent_output,
success=sw_progress_item.success,
is_coded=is_coded,
)
else:
@@ -332,6 +333,7 @@
eval_run_id=sw_progress_item.eval_run_id,
execution_time=sw_progress_item.agent_execution_time,
actual_output=sw_progress_item.agent_output,
success=sw_progress_item.success,
is_coded=is_coded,
)

@@ -350,10 +352,11 @@ async def update_eval_set_run(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
success: bool = True,
):
"""Update the evaluation set run status to complete."""
spec = self._update_eval_set_run_spec(
eval_set_run_id, evaluator_scores, is_coded
eval_set_run_id, evaluator_scores, is_coded, success
)
await self._client.request_async(
method=spec.method,
@@ -498,9 +501,11 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None:
eval_set_run_id,
payload.evaluator_scores,
is_coded=is_coded,
success=payload.success,
)
status_str = "completed" if payload.success else "failed"
logger.debug(
f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded}, status={status_str})"
)
else:
logger.warning(
@@ -704,17 +709,19 @@ def _update_eval_run_spec(
eval_run_id: str,
actual_output: dict[str, Any],
execution_time: float,
success: bool,
is_coded: bool = False,
) -> RequestSpec:
# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Determine status based on success
status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED

payload: dict[str, Any] = {
"evalRunId": eval_run_id,
# For coded evaluations, use integer status; for legacy, use string
"status": EvaluationStatus.COMPLETED.value
if is_coded
else self._status_to_string(EvaluationStatus.COMPLETED),
"status": status.value if is_coded else self._status_to_string(status),
"result": {
"output": dict(actual_output),
"evaluatorScores": evaluator_scores,
@@ -739,18 +746,20 @@ def _update_coded_eval_run_spec(
eval_run_id: str,
actual_output: dict[str, Any],
execution_time: float,
success: bool,
is_coded: bool = False,
) -> RequestSpec:
"""Create update spec for coded evaluators."""
# For coded evaluations, endpoint has /coded
endpoint_suffix = "coded/" if is_coded else ""

# Determine status based on success
status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED

payload: dict[str, Any] = {
"evalRunId": eval_run_id,
# For coded evaluations, use integer status; for legacy, use string
"status": EvaluationStatus.COMPLETED.value
if is_coded
else self._status_to_string(EvaluationStatus.COMPLETED),
"status": status.value if is_coded else self._status_to_string(status),
"result": {
"output": dict(actual_output),
"scores": evaluator_scores,
@@ -870,6 +879,7 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
success: bool = True,
) -> RequestSpec:
# Legacy API expects evaluatorId as GUID, coded accepts string
evaluator_scores_list = []
@@ -894,12 +904,13 @@
# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Determine status based on success
status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED

payload: dict[str, Any] = {
"evalSetRunId": eval_set_run_id,
# For coded evaluations, use integer status; for legacy, use string
"status": EvaluationStatus.COMPLETED.value
if is_coded
else self._status_to_string(EvaluationStatus.COMPLETED),
"status": status.value if is_coded else self._status_to_string(status),
"evaluatorScores": evaluator_scores_list,
}

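All three spec builders above share the same pattern: derive the terminal status from `success`, then serialize it as an integer enum value for coded evaluations or as a string for legacy ones. A standalone sketch of that pattern follows; `to_legacy_string` and `build_status_field` are illustrative names, not the reporter's actual `_status_to_string` helper, but the "Completed"/"Failed" strings match what the tests below assert.

```python
from enum import IntEnum
from typing import Any, Union


class EvaluationStatus(IntEnum):
    PENDING = 0
    IN_PROGRESS = 1
    COMPLETED = 2
    FAILED = 3


def to_legacy_string(status: EvaluationStatus) -> str:
    # Illustrative stand-in for the reporter's _status_to_string helper.
    return {EvaluationStatus.COMPLETED: "Completed", EvaluationStatus.FAILED: "Failed"}[status]


def build_status_field(success: bool, is_coded: bool) -> Union[int, str]:
    """Sketch of the status logic shared by the *_spec builders above."""
    status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
    # Coded evaluations send the integer enum value; legacy evaluations send a string.
    return status.value if is_coded else to_legacy_string(status)


assert build_status_field(success=False, is_coded=True) == 3
assert build_status_field(success=False, is_coded=False) == "Failed"
```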
10 changes: 10 additions & 0 deletions src/uipath/_cli/_evals/_runtime.py
@@ -260,7 +260,16 @@ async def execute(self) -> UiPathRuntimeResult:
evaluator_averages: dict[str, float] = defaultdict(float)
evaluator_count: dict[str, int] = defaultdict(int)

# Check if any eval runs failed
any_failed = False
for eval_run_result in results.evaluation_set_results:
# Check if the agent execution had an error
if (
eval_run_result.agent_execution_output
and eval_run_result.agent_execution_output.result.error
):
any_failed = True

for result_dto in eval_run_result.evaluation_run_results:
evaluator_averages[result_dto.evaluator_id] += result_dto.result.score
evaluator_count[result_dto.evaluator_id] += 1
@@ -274,6 +283,7 @@ async def execute(self) -> UiPathRuntimeResult:
EvalSetRunUpdatedEvent(
execution_id=self.execution_id,
evaluator_scores=evaluator_averages,
success=not any_failed,
),
wait_for_completion=False,
)
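A standalone sketch of the failure scan added to `execute()`: if any evaluation run's agent execution ended with an error, the whole set is reported as failed. The dataclasses here are simplified stand-ins for the SDK's result models, not the real types.

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class _ExecutionResult:
    error: Optional[str] = None


@dataclass
class _AgentExecutionOutput:
    result: _ExecutionResult = field(default_factory=_ExecutionResult)


@dataclass
class _EvalRunResult:
    agent_execution_output: Optional[_AgentExecutionOutput] = None


def any_run_failed(eval_run_results: List[_EvalRunResult]) -> bool:
    """Same shape as the loop above: one errored agent execution fails the whole set."""
    return any(
        r.agent_execution_output is not None and bool(r.agent_execution_output.result.error)
        for r in eval_run_results
    )


runs = [
    _EvalRunResult(),
    _EvalRunResult(_AgentExecutionOutput(_ExecutionResult(error="agent raised an exception"))),
]
assert any_run_failed(runs) is True
```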
1 change: 1 addition & 0 deletions src/uipath/_events/_events.py
@@ -62,6 +62,7 @@ def validate_exception_details(self):
class EvalSetRunUpdatedEvent(BaseModel):
execution_id: str
evaluator_scores: dict[str, float]
success: bool = True


ProgressEvent = Union[
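The event change is deliberately backward compatible: `success` defaults to `True`, so existing emitters keep working and only the new failure path has to set it. A small sketch mirroring the model above:

```python
from pydantic import BaseModel


class EvalSetRunUpdatedEvent(BaseModel):
    """Mirror of the event above; success defaults to True for back-compat."""

    execution_id: str
    evaluator_scores: dict[str, float]
    success: bool = True


# Existing call sites are unchanged; the runtime now also passes success explicitly.
ok = EvalSetRunUpdatedEvent(execution_id="exec-1", evaluator_scores={"eval-1": 0.9})
failed = EvalSetRunUpdatedEvent(execution_id="exec-2", evaluator_scores={}, success=False)
assert ok.success and not failed.success
```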
4 changes: 4 additions & 0 deletions src/uipath/tracing/_otel_exporters.py
@@ -115,6 +115,10 @@ def __init__(

def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
"""Export spans to UiPath LLM Ops."""
if len(spans) == 0:
logger.warning("No spans to export")
return SpanExportResult.SUCCESS

logger.debug(
f"Exporting {len(spans)} spans to {self.base_url}/llmopstenant_/api/Traces/spans"
)
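The exporter change is a simple guard: an empty batch is reported as success without issuing an HTTP call. A minimal sketch of the same pattern, with the `SpanExportResult` enum stubbed in place of the real OpenTelemetry type and a comment standing in for the actual request:

```python
import logging
from enum import Enum
from typing import Sequence

logger = logging.getLogger(__name__)


class SpanExportResult(Enum):
    # Stand-in for opentelemetry.sdk.trace.export.SpanExportResult.
    SUCCESS = 0
    FAILURE = 1


def export(spans: Sequence[object]) -> SpanExportResult:
    """Sketch of the guard added above: skip the network call for empty batches."""
    if len(spans) == 0:
        logger.warning("No spans to export")
        return SpanExportResult.SUCCESS

    # ... the real exporter would serialize and POST the spans here ...
    return SpanExportResult.SUCCESS


assert export([]) is SpanExportResult.SUCCESS
```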
114 changes: 114 additions & 0 deletions tests/cli/eval/test_progress_reporter.py
@@ -9,6 +9,7 @@
"""

import json
from typing import Any
from unittest.mock import AsyncMock, Mock, patch

import pytest
@@ -282,6 +283,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
eval_run_id="test-run-id",
actual_output={"result": "success"},
execution_time=5.5,
success=True,
is_coded=True,
)

@@ -291,6 +293,7 @@ def test_update_coded_eval_run_spec(self, progress_reporter):
assert spec.json["evaluatorRuns"] == evaluator_runs
assert spec.json["result"]["scores"] == evaluator_scores
assert spec.json["completionMetrics"]["duration"] == 5
assert spec.json["status"] == 2 # COMPLETED

def test_update_legacy_eval_run_spec(self, progress_reporter):
"""Test updating eval run spec for legacy evaluators."""
Expand All @@ -305,6 +308,7 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
eval_run_id="test-run-id",
actual_output={"result": "success"},
execution_time=5.5,
success=True,
is_coded=False,
)

@@ -314,6 +318,47 @@ def test_update_legacy_eval_run_spec(self, progress_reporter):
assert spec.json["assertionRuns"] == assertion_runs
assert spec.json["result"]["evaluatorScores"] == evaluator_scores
assert spec.json["completionMetrics"]["duration"] == 5
assert spec.json["status"] == "Completed" # String format for legacy

def test_update_coded_eval_run_spec_with_failure(self, progress_reporter):
"""Test updating eval run spec for coded evaluators with failure."""
evaluator_runs: list[dict[str, Any]] = []
evaluator_scores: list[dict[str, Any]] = []

spec = progress_reporter._update_coded_eval_run_spec(
evaluator_runs=evaluator_runs,
evaluator_scores=evaluator_scores,
eval_run_id="test-run-id",
actual_output={},
execution_time=0.0,
success=False,
is_coded=True,
)

assert spec.method == "PUT"
assert "coded/" in spec.endpoint
assert spec.json["evalRunId"] == "test-run-id"
assert spec.json["status"] == 3 # FAILED

def test_update_legacy_eval_run_spec_with_failure(self, progress_reporter):
"""Test updating eval run spec for legacy evaluators with failure."""
assertion_runs: list[dict[str, Any]] = []
evaluator_scores: list[dict[str, Any]] = []

spec = progress_reporter._update_eval_run_spec(
assertion_runs=assertion_runs,
evaluator_scores=evaluator_scores,
eval_run_id="test-run-id",
actual_output={},
execution_time=0.0,
success=False,
is_coded=False,
)

assert spec.method == "PUT"
assert "coded/" not in spec.endpoint
assert spec.json["evalRunId"] == "test-run-id"
assert spec.json["status"] == "Failed" # String format for legacy


# Tests for custom eval set run ID handling
@@ -421,3 +466,72 @@ def test_eval_set_run_created_event_initialization_without_custom_run_id(self):

# Assert
assert event.eval_set_run_id is None


# Tests for eval set run status updates
class TestEvalSetRunStatusUpdates:
"""Tests for handling eval set run status updates (completed vs failed)."""

def test_update_eval_set_run_spec_with_success_coded(self, progress_reporter):
"""Test updating eval set run spec for coded evaluators with success=True."""
evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}

spec = progress_reporter._update_eval_set_run_spec(
eval_set_run_id="test-run-id",
evaluator_scores=evaluator_scores,
is_coded=True,
success=True,
)

assert spec.method == "PUT"
assert "coded/" in spec.endpoint
assert spec.json["evalSetRunId"] == "test-run-id"
assert spec.json["status"] == 2 # COMPLETED = 2

def test_update_eval_set_run_spec_with_failure_coded(self, progress_reporter):
"""Test updating eval set run spec for coded evaluators with success=False."""
evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}

spec = progress_reporter._update_eval_set_run_spec(
eval_set_run_id="test-run-id",
evaluator_scores=evaluator_scores,
is_coded=True,
success=False,
)

assert spec.method == "PUT"
assert "coded/" in spec.endpoint
assert spec.json["evalSetRunId"] == "test-run-id"
assert spec.json["status"] == 3 # FAILED = 3

def test_update_eval_set_run_spec_with_success_legacy(self, progress_reporter):
"""Test updating eval set run spec for legacy evaluators with success=True."""
evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}

spec = progress_reporter._update_eval_set_run_spec(
eval_set_run_id="test-run-id",
evaluator_scores=evaluator_scores,
is_coded=False,
success=True,
)

assert spec.method == "PUT"
assert "coded/" not in spec.endpoint
assert spec.json["evalSetRunId"] == "test-run-id"
assert spec.json["status"] == "Completed" # String format for legacy

def test_update_eval_set_run_spec_with_failure_legacy(self, progress_reporter):
"""Test updating eval set run spec for legacy evaluators with success=False."""
evaluator_scores = {"eval-1": 0.9, "eval-2": 0.85}

spec = progress_reporter._update_eval_set_run_spec(
eval_set_run_id="test-run-id",
evaluator_scores=evaluator_scores,
is_coded=False,
success=False,
)

assert spec.method == "PUT"
assert "coded/" not in spec.endpoint
assert spec.json["evalSetRunId"] == "test-run-id"
assert spec.json["status"] == "Failed" # String format for legacy
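
The four eval-set-run status tests above differ only in their inputs; if the suite grows, they could be collapsed into a single parametrized test. A hedged sketch of that alternative, using the same `progress_reporter` fixture and assertions as the tests in this file:

```python
import pytest


@pytest.mark.parametrize(
    "is_coded, success, expected_status",
    [
        (True, True, 2),  # coded, COMPLETED
        (True, False, 3),  # coded, FAILED
        (False, True, "Completed"),  # legacy string format
        (False, False, "Failed"),  # legacy string format
    ],
)
def test_update_eval_set_run_spec_status(progress_reporter, is_coded, success, expected_status):
    spec = progress_reporter._update_eval_set_run_spec(
        eval_set_run_id="test-run-id",
        evaluator_scores={"eval-1": 0.9, "eval-2": 0.85},
        is_coded=is_coded,
        success=success,
    )

    assert spec.method == "PUT"
    assert ("coded/" in spec.endpoint) == is_coded
    assert spec.json["evalSetRunId"] == "test-run-id"
    assert spec.json["status"] == expected_status
```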