
Commit 7f019c7
Merge branch 'master' of github.com:robusta-dev/holmesgpt into buildcheck
2 parents: b1b5017 + 5002166

5 files changed: +21 -12 lines

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -101,7 +101,6 @@ markers = [
     "synthetic: Tests using synthetic data",
     "network: Tests requiring network connectivity",
     "runbooks: Tests involving runbook functionality",
-    "kafka: Tests involving Kafka functionality",
     "misleading-history: Tests with misleading historical data",
     "chain-of-causation: Tests involving chain-of-causation analysis",
     "slackbot: Tests involving Slack bot functionality",
@@ -119,8 +118,9 @@ markers = [
     "toolset-limitation: Tests that cannot be solved no matter how smart the model, unless we improve the underlying toolsets themselves",
     "ask-for-clarification: Tests where Holmes should ask the user for clarification",
     "database: Tests involving database interactions",
-    "datadog: DataDog toolset",
-    "traces: Tests where the ai is expected to find the solution using the traces"
+    "traces: Tests where the ai is expected to find the solution using the traces",
+    "answer-given-wrong-place: LLM gave the answer in intermediate steps, not final answer, so its harder for the user to see the answer",
+    "datadog: DataDog toolset"
 ]

 addopts = [

tests/llm/fixtures/test_ask_holmes/03_what_is_the_command_to_port_forward/test_case.yaml

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@ expected_output:
   - Should NOT just say "find grafana in your cluster" or give generic instructions
 tags:
   - kubernetes
-  - easy
+  - medium
+  - answer-given-wrong-place # LLM gave the answer in intermediate steps, not final answer, so harder for user to see
 before_test: |
   kubectl create namespace app-03
   cat <<EOF | kubectl apply -f -

tests/llm/utils/commands.py

Lines changed: 12 additions & 5 deletions
@@ -54,9 +54,10 @@ def exit_info(self) -> str:
        )


-def _invoke_command(command: str, cwd: str) -> str:
+def _invoke_command(command: str, cwd: str, timeout: Optional[int] = None) -> str:
     try:
-        logging.debug(f"Running `{command}` in {cwd}")
+        actual_timeout = timeout if timeout is not None else EVAL_SETUP_TIMEOUT
+        logging.debug(f"Running `{command}` in {cwd} with timeout {actual_timeout}s")
         result = subprocess.run(
             command,
             shell=True,
@@ -65,7 +66,7 @@ def _invoke_command(command: str, cwd: str) -> str:
             check=True,
             stdin=subprocess.DEVNULL,
             cwd=cwd,
-            timeout=EVAL_SETUP_TIMEOUT,
+            timeout=actual_timeout,
         )

         output = f"{result.stdout}\n{result.stderr}"
@@ -109,7 +110,13 @@ def run_commands(

     try:
         # Execute the entire commands string as a single bash script
-        _invoke_command(command=script, cwd=test_case.folder)
+        # Use per-test timeout if specified, otherwise use default
+        timeout = (
+            test_case.setup_timeout
+            if hasattr(test_case, "setup_timeout") and test_case.setup_timeout
+            else None
+        )
+        _invoke_command(command=script, cwd=test_case.folder, timeout=timeout)

         elapsed_time = time.time() - start_time
         return CommandResult(
@@ -133,7 +140,7 @@ def run_commands(
         )
     except subprocess.TimeoutExpired as e:
         elapsed_time = time.time() - start_time
-        error_details = f"TIMEOUT after {e.timeout}s\n\nYou can increase timeout with environment variable EVAL_SETUP_TIMEOUT=<seconds>\n\nScript that timed out:\n$ {_truncate_script(script)}"
+        error_details = f"TIMEOUT after {e.timeout}s\n\nYou can increase timeout with environment variable EVAL_SETUP_TIMEOUT=<seconds> or by setting 'setup_timeout' in test_case.yaml\n\nScript that timed out:\n$ {_truncate_script(script)}"

         return CommandResult(
             command=f"{operation.capitalize()} timeout: {e.cmd}",
tests/llm/utils/reporting/terminal_reporter.py

Lines changed: 3 additions & 3 deletions
@@ -910,9 +910,9 @@ def _print_model_comparison_table(sorted_results: List[dict], console: Console)
     # Calculate and print total evaluation cost
     total_cost = sum(model_costs.values()) if model_costs else 0
     if total_cost > 0:
-        # Count unique test cases across all models
-        unique_tests = len(set(r["test_case_name"] for r in sorted_results))
-        avg_cost_per_test = total_cost / unique_tests if unique_tests else 0
+        # Count total number of test runs (not unique test cases)
+        total_test_runs = len(sorted_results)
+        avg_cost_per_test = total_cost / total_test_runs if total_test_runs else 0
         console.print(
             f"[cyan]Total evaluation cost: ${total_cost:.4f}, Average per test: ${avg_cost_per_test:.6f}[/cyan]"
         )
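
The change matters once the comparison table spans several models: the old code divided the total cost by the number of distinct test cases, folding every model's runs of the same test into one divisor, while the new code divides by every (model, test) run in sorted_results. A rough illustration with made-up numbers:

# Hypothetical data: 2 models, each run against the same 3 test cases.
sorted_results = [
    {"test_case_name": f"test_{i}", "model": model}
    for model in ("model-a", "model-b")
    for i in range(3)
]
total_cost = 0.12  # assumed total spend in dollars

unique_tests = len({r["test_case_name"] for r in sorted_results})  # 3 (old divisor)
total_test_runs = len(sorted_results)                              # 6 (new divisor)

print(total_cost / unique_tests)     # 0.04 -> old "average per test"
print(total_cost / total_test_runs)  # 0.02 -> new "average per test"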

tests/llm/utils/test_case_utils.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ class HolmesTestCase(BaseModel):
     evaluation: LLMEvaluations = LLMEvaluations()
     before_test: Optional[str] = None
     after_test: Optional[str] = None
+    setup_timeout: Optional[int] = None  # Override default setup timeout in seconds
     conversation_history: Optional[list[dict]] = None
     test_env_vars: Optional[Dict[str, str]] = (
         None  # Environment variables to set during test execution
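
With setup_timeout on HolmesTestCase, a slow fixture can raise its own setup budget in test_case.yaml instead of relying on the EVAL_SETUP_TIMEOUT environment variable, which is exactly the knob the updated timeout message points at. A sketch of how a fixture might use it (keys mirror the fixture edited earlier in this commit; the 600-second value is just an example):

tags:
  - kubernetes
  - medium
setup_timeout: 600  # seconds; overrides the default setup timeout for this test only
before_test: |
  kubectl create namespace app-03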
