
Commit 7f019c7
Merge branch 'master' of github.com:robusta-dev/holmesgpt into buildcheck
2 parents: b1b5017 + 5002166

5 files changed: +21 -12 lines

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -101,7 +101,6 @@ markers = [
     "synthetic: Tests using synthetic data",
     "network: Tests requiring network connectivity",
     "runbooks: Tests involving runbook functionality",
-    "kafka: Tests involving Kafka functionality",
     "misleading-history: Tests with misleading historical data",
     "chain-of-causation: Tests involving chain-of-causation analysis",
     "slackbot: Tests involving Slack bot functionality",
@@ -119,8 +118,9 @@ markers = [
     "toolset-limitation: Tests that cannot be solved no matter how smart the model, unless we improve the underlying toolsets themselves",
     "ask-for-clarification: Tests where Holmes should ask the user for clarification",
     "database: Tests involving database interactions",
-    "datadog: DataDog toolset",
-    "traces: Tests where the ai is expected to find the solution using the traces"
+    "traces: Tests where the ai is expected to find the solution using the traces",
+    "answer-given-wrong-place: LLM gave the answer in intermediate steps, not final answer, so its harder for the user to see the answer",
+    "datadog: DataDog toolset"
 ]

 addopts = [

tests/llm/fixtures/test_ask_holmes/03_what_is_the_command_to_port_forward/test_case.yaml

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@ expected_output:
   - Should NOT just say "find grafana in your cluster" or give generic instructions
 tags:
   - kubernetes
-  - easy
+  - medium
+  - answer-given-wrong-place # LLM gave the answer in intermediate steps, not final answer, so harder for user to see
 before_test: |
   kubectl create namespace app-03
   cat <<EOF | kubectl apply -f -

tests/llm/utils/commands.py

Lines changed: 12 additions & 5 deletions
@@ -54,9 +54,10 @@ def exit_info(self) -> str:
        )


-def _invoke_command(command: str, cwd: str) -> str:
+def _invoke_command(command: str, cwd: str, timeout: Optional[int] = None) -> str:
     try:
-        logging.debug(f"Running `{command}` in {cwd}")
+        actual_timeout = timeout if timeout is not None else EVAL_SETUP_TIMEOUT
+        logging.debug(f"Running `{command}` in {cwd} with timeout {actual_timeout}s")
         result = subprocess.run(
             command,
             shell=True,
@@ -65,7 +66,7 @@ def _invoke_command(command: str, cwd: str) -> str:
             check=True,
             stdin=subprocess.DEVNULL,
             cwd=cwd,
-            timeout=EVAL_SETUP_TIMEOUT,
+            timeout=actual_timeout,
         )

         output = f"{result.stdout}\n{result.stderr}"
@@ -109,7 +110,13 @@ def run_commands(

     try:
         # Execute the entire commands string as a single bash script
-        _invoke_command(command=script, cwd=test_case.folder)
+        # Use per-test timeout if specified, otherwise use default
+        timeout = (
+            test_case.setup_timeout
+            if hasattr(test_case, "setup_timeout") and test_case.setup_timeout
+            else None
+        )
+        _invoke_command(command=script, cwd=test_case.folder, timeout=timeout)

         elapsed_time = time.time() - start_time
         return CommandResult(
@@ -133,7 +140,7 @@ def run_commands(
         )
     except subprocess.TimeoutExpired as e:
         elapsed_time = time.time() - start_time
-        error_details = f"TIMEOUT after {e.timeout}s\n\nYou can increase timeout with environment variable EVAL_SETUP_TIMEOUT=<seconds>\n\nScript that timed out:\n$ {_truncate_script(script)}"
+        error_details = f"TIMEOUT after {e.timeout}s\n\nYou can increase timeout with environment variable EVAL_SETUP_TIMEOUT=<seconds> or by setting 'setup_timeout' in test_case.yaml\n\nScript that timed out:\n$ {_truncate_script(script)}"

         return CommandResult(
             command=f"{operation.capitalize()} timeout: {e.cmd}",
tests/llm/utils/reporting/terminal_reporter.py

Lines changed: 3 additions & 3 deletions
@@ -910,9 +910,9 @@ def _print_model_comparison_table(sorted_results: List[dict], console: Console)
     # Calculate and print total evaluation cost
     total_cost = sum(model_costs.values()) if model_costs else 0
     if total_cost > 0:
-        # Count unique test cases across all models
-        unique_tests = len(set(r["test_case_name"] for r in sorted_results))
-        avg_cost_per_test = total_cost / unique_tests if unique_tests else 0
+        # Count total number of test runs (not unique test cases)
+        total_test_runs = len(sorted_results)
+        avg_cost_per_test = total_cost / total_test_runs if total_test_runs else 0
         console.print(
             f"[cyan]Total evaluation cost: ${total_cost:.4f}, Average per test: ${avg_cost_per_test:.6f}[/cyan]"
         )
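
The change matters once the comparison table spans several models: the old code divided the total cost by the number of distinct test cases, folding every model's runs of the same test into one divisor, while the new code divides by every (model, test) run in sorted_results. A rough illustration with made-up numbers:

# Hypothetical data: 2 models, each run against the same 3 test cases.
sorted_results = [
    {"test_case_name": f"test_{i}", "model": model}
    for model in ("model-a", "model-b")
    for i in range(3)
]
total_cost = 0.12  # assumed total spend in dollars

unique_tests = len({r["test_case_name"] for r in sorted_results})  # 3 (old divisor)
total_test_runs = len(sorted_results)                              # 6 (new divisor)

print(total_cost / unique_tests)     # 0.04 -> old "average per test"
print(total_cost / total_test_runs)  # 0.02 -> new "average per test"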

tests/llm/utils/test_case_utils.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ class HolmesTestCase(BaseModel):
     evaluation: LLMEvaluations = LLMEvaluations()
     before_test: Optional[str] = None
     after_test: Optional[str] = None
+    setup_timeout: Optional[int] = None  # Override default setup timeout in seconds
     conversation_history: Optional[list[dict]] = None
     test_env_vars: Optional[Dict[str, str]] = (
         None  # Environment variables to set during test execution
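
With setup_timeout on HolmesTestCase, a slow fixture can raise its own setup budget in test_case.yaml instead of relying on the EVAL_SETUP_TIMEOUT environment variable, which is exactly the knob the updated timeout message points at. A sketch of how a fixture might use it (keys mirror the fixture edited earlier in this commit; the 600-second value is just an example):

tags:
  - kubernetes
  - medium
setup_timeout: 600  # seconds; overrides the default setup timeout for this test only
before_test: |
  kubectl create namespace app-03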
