diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
index e03b6a09bd..2966f6789a 100644
--- a/.github/workflows/integration-runner.yml
+++ b/.github/workflows/integration-runner.yml
@@ -13,6 +13,10 @@ on:
         description: Reason for manual trigger
         required: true
         default: ''
+      test_type:
+        description: Select which tests to run (all, integration, behavior)
+        required: false
+        default: all
   schedule:
     - cron: 30 22 * * * # Runs at 10:30pm UTC every day
@@ -21,25 +25,42 @@ env:
 jobs:
   post-initial-comment:
-    if: github.event_name == 'pull_request_target' && github.event.label.name == 'integration-test'
+    if: >
+      github.event_name == 'pull_request_target' && (
+        github.event.label.name == 'integration-test' ||
+        github.event.label.name == 'behavior-test'
+      )
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
     steps:
-      - name: Comment on PR
+      - name: Comment on PR (integration tests)
+        if: github.event.label.name == 'integration-test'
         uses: KeisukeYamashita/create-comment@v1
         with:
           unique: false
          comment: |
             Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+      - name: Comment on PR (behavior tests)
+        if: github.event.label.name == 'behavior-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.

   run-integration-tests:
-    # Security: Only run when 'integration-test' label is present, via workflow_dispatch, or on schedule
+    # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
     # This prevents automatic execution on fork PRs without maintainer approval
     # Note: uses always() to run even when post-initial-comment is skipped (e.g., for workflow_dispatch)
     if: |
       always() && (
-        github.event.label.name == 'integration-test' ||
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
         github.event_name == 'workflow_dispatch' ||
         github.event_name == 'schedule'
       )
@@ -63,6 +84,7 @@ jobs:
           llm-config:
             model: litellm_proxy/gpt-5.1-codex-max
             temperature: 1.0
+            reasoning_summary: detailed
         - name: Deepseek Chat
           run-suffix: deepseek_run
           llm-config:
@@ -101,7 +123,36 @@ jobs:
           uv sync --dev
           uv pip install pytest

-      # Run integration test evaluation
+      # Run integration test evaluation
+      - name: Determine test selection
+        run: |
+          TEST_TYPE_ARGS=""
+          if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
+            TEST_TYPE_ARGS="--test-type behavior"
+            echo "behavior-test label detected; running behavior tests only."
+          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            test_type="${{ github.event.inputs.test_type }}"
+            case "$test_type" in
+              behavior)
+                TEST_TYPE_ARGS="--test-type behavior"
+                echo "workflow_dispatch requested behavior tests only."
+                ;;
+              integration)
+                TEST_TYPE_ARGS="--test-type integration"
+                echo "workflow_dispatch requested integration tests only."
+                ;;
+              ""|all)
+                echo "workflow_dispatch requested full integration suite."
+                ;;
+              *)
+                echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
+                ;;
+            esac
+          else
+            echo "Running full integration test suite."
+          fi
+          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
+
       - name: Run integration test evaluation for ${{ matrix.job-config.name }}
         env:
           LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
@@ -113,10 +164,13 @@ jobs:
           AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
           EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"

+          echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS'"
+
           uv run python tests/integration/run_infer.py \
             --llm-config "$LLM_CONFIG" \
             --num-workers $N_PROCESSES \
-            --eval-note "$EVAL_NOTE"
+            --eval-note "$EVAL_NOTE" \
+            $TEST_TYPE_ARGS

           # get integration tests JSON results
           RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
@@ -169,7 +223,17 @@ jobs:
   consolidate-results:
     needs: run-integration-tests
-    if: always() && (github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule')
+    if: |
+      always() && (
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
+        github.event_name == 'workflow_dispatch' ||
+        github.event_name == 'schedule'
+      )
     runs-on: blacksmith-2vcpu-ubuntu-2404
     permissions:
       contents: read
diff --git a/.gitignore b/.gitignore
index 5dc044af1b..e693220d2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,5 @@ openapi.json
 .worktrees/
 agent-sdk.workspace.code-workspace

+# Integration test outputs
+tests/integration/outputs/
diff --git a/openhands-sdk/openhands/sdk/agent/base.py b/openhands-sdk/openhands/sdk/agent/base.py
index 4a55881766..072cd3c1b5 100644
--- a/openhands-sdk/openhands/sdk/agent/base.py
+++ b/openhands-sdk/openhands/sdk/agent/base.py
@@ -12,6 +12,7 @@
 from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser
 from openhands.sdk.context.prompts.prompt import render_template
 from openhands.sdk.llm import LLM
+from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp import create_mcp_tools
 from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool
@@ -164,6 +165,18 @@ def name(self) -> str:
     def system_message(self) -> str:
         """Compute system message on-demand to maintain statelessness."""
         template_kwargs = dict(self.system_prompt_kwargs)
+        template_kwargs.setdefault("model_name", self.llm.model)
+        if (
+            "model_family" not in template_kwargs
+            or "model_variant" not in template_kwargs
+        ):
+            spec = get_model_prompt_spec(
+                self.llm.model, getattr(self.llm, "model_canonical_name", None)
+            )
+            if "model_family" not in template_kwargs and spec.family:
+                template_kwargs["model_family"] = spec.family
+            if "model_variant" not in template_kwargs and spec.variant:
+                template_kwargs["model_variant"] = spec.variant
         system_message = render_template(
             prompt_dir=self.prompt_dir,
             template_name=self.system_prompt_filename,
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2
new file mode 100644
index 0000000000..cf97c5c2b3
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2
@@ -0,0 +1,3 @@
+* Try to follow the instructions exactly as given - don't make extra or fewer actions if not asked.
+* Avoid unnecessary defensive programming; do not add redundant fallbacks or default values — fail fast instead of masking misconfigurations.
+* When backward compatibility expectations are unclear, confirm with the user before making changes that could break existing behavior.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2
new file mode 100644
index 0000000000..a7ae45ffa6
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2
@@ -0,0 +1 @@
+* Avoid being too proactive. Fulfill the user's request thoroughly: if they ask questions/investigations, answer them; if they ask for implementations, provide them. But do not take extra steps beyond what is requested.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2
new file mode 100644
index 0000000000..2e25a7de18
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2
@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2
new file mode 100644
index 0000000000..2e25a7de18
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2
@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2 b/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
index 29871c0b6c..3100de178c 100644
--- a/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
+++ b/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
@@ -106,3 +106,20 @@ You are OpenHands agent, a helpful AI assistant that can interact with a computer

 - Prefer using `ps aux` to find the exact process ID (PID) first, then kill that specific PID
 - When possible, use more targeted approaches like finding the PID from a pidfile or using application-specific shutdown commands
+
+{%- set _imp -%}
+{%- if model_family -%}
+{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}
+{%- if model_variant -%}
+{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}
+{%- endif -%}
+{%- endif -%}
+{%- endset -%}
+
+{%- set _imp_trimmed = _imp | trim -%}
+{%- if _imp_trimmed %}
+
+
+{{ _imp_trimmed }}
+
+{%- endif %}
diff --git a/openhands-sdk/openhands/sdk/llm/message.py b/openhands-sdk/openhands/sdk/llm/message.py
index e1ccf54711..44c44c1e4e 100644
--- a/openhands-sdk/openhands/sdk/llm/message.py
+++ b/openhands-sdk/openhands/sdk/llm/message.py
@@ -169,11 +169,12 @@ class TextContent(BaseContent):
     model_config: ClassVar[ConfigDict] = ConfigDict(
         extra="forbid", populate_by_name=True
     )
+    enable_truncation: bool = True

     def to_llm_dict(self) -> list[dict[str, str | dict[str, str]]]:
         """Convert to LLM API format."""
         text = self.text
-        if len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
+        if self.enable_truncation and len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
             logger.warning(
                 f"TextContent text length ({len(text)}) exceeds limit "
                 f"({DEFAULT_TEXT_CONTENT_LIMIT}), truncating"
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
new file mode 100644
index 0000000000..8cba0c16e5
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
@@ -0,0 +1,98 @@
+"""Utilities for detecting model families and variants.
+
+These helpers allow prompts and other systems to tailor behavior for specific
+LLM providers while keeping naming heuristics centralized.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ModelPromptSpec(BaseModel):
+    """Detected prompt metadata for a given model configuration."""
+
+    model_config = ConfigDict(frozen=True)
+
+    family: str | None = None
+    variant: str | None = None
+
+
+_MODEL_FAMILY_PATTERNS: dict[str, tuple[str, ...]] = {
+    "openai_gpt": (
+        "gpt-",
+        "o1",
+        "o3",
+        "o4",
+    ),
+    "anthropic_claude": ("claude",),
+    "google_gemini": ("gemini",),
+    "meta_llama": ("llama",),
+    "mistral": ("mistral",),
+    "deepseek": ("deepseek",),
+    "alibaba_qwen": ("qwen",),
+}
+
+# Ordered heuristics to pick the most specific variant available for a family.
+_MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = {
+    "openai_gpt": (
+        ("gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex")),
+        ("gpt-5", ("gpt-5", "gpt-5.1")),
+    ),
+}
+
+
+def _normalize(name: str | None) -> str:
+    return (name or "").strip().lower()
+
+
+def _match_family(model_name: str) -> str | None:
+    normalized = _normalize(model_name)
+    if not normalized:
+        return None
+
+    for family, patterns in _MODEL_FAMILY_PATTERNS.items():
+        if any(pattern in normalized for pattern in patterns):
+            return family
+    return None
+
+
+def _match_variant(
+    family: str,
+    model_name: str,
+    canonical_name: str | None = None,
+) -> str | None:
+    patterns = _MODEL_VARIANT_PATTERNS.get(family)
+    if not patterns:
+        return None
+
+    # Choose canonical_name if available, otherwise fall back to model_name
+    candidate = _normalize(canonical_name) or _normalize(model_name)
+    if not candidate:
+        return None
+
+    for variant, substrings in patterns:
+        if any(sub in candidate for sub in substrings):
+            return variant
+
+    return None
+
+
+def get_model_prompt_spec(
+    model_name: str,
+    canonical_name: str | None = None,
+) -> ModelPromptSpec:
+    """Return family and variant prompt metadata for the given identifiers."""
+
+    family = _match_family(model_name)
+    if family is None and canonical_name:
+        family = _match_family(canonical_name)
+
+    variant = None
+    if family is not None:
+        variant = _match_variant(family, model_name, canonical_name)
+
+    return ModelPromptSpec(family=family, variant=variant)
+
+
+__all__ = ["ModelPromptSpec", "get_model_prompt_spec"]
diff --git a/tests/integration/base.py b/tests/integration/base.py
index 52099bfd40..3b446a4d17 100644
--- a/tests/integration/base.py
+++ b/tests/integration/base.py
@@ -103,6 +103,7 @@ def __init__(
             workspace=self.workspace,
             callbacks=[self.conversation_callback],
             visualizer=DefaultConversationVisualizer(),  # Use default visualizer
+            max_iteration_per_run=100,
         )

     def conversation_callback(self, event: Event):
diff --git a/tests/integration/behavior_utils.py b/tests/integration/behavior_utils.py
index 3597c73fda..437bf09663 100644
--- a/tests/integration/behavior_utils.py
+++ b/tests/integration/behavior_utils.py
@@ -8,6 +8,12 @@
 import fnmatch

 from openhands.sdk.event.base import Event
+from openhands.sdk.event.llm_convertible.observation import (
+    AgentErrorEvent,
+    ObservationEvent,
+)
+from openhands.sdk.event.llm_convertible.system import SystemPromptEvent
+from openhands.sdk.utils import maybe_truncate


 def find_tool_calls(collected_events: list[Event], tool_name: str) -> list[Event]:
@@ -118,38 +124,56 @@ def check_bash_command_used(


 def get_conversation_summary(
-    collected_events: list[Event], max_length: int = 50000
+    collected_events: list[Event], max_observation_chars: int = 2000
 ) -> str:
     """
     Get a summary of the conversation including agent thoughts and actions.

+    To prevent context window overflow in LLM judges, large observations are
+    truncated to preserve both the beginning and end of the output.
+
     Args:
         collected_events: List of events collected from conversation
-        max_length: Maximum length of the summary
+        max_observation_chars: Maximum characters for observation events.
+            Uses head + tail truncation (default: 2000 = ~1000 head + ~1000 tail)

     Returns:
         String summary of the conversation
     """
     summary_parts = []
-    from openhands.sdk.event.llm_convertible.system import SystemPromptEvent
+
+    # Custom truncation notice for judge context (simpler than default)
+    judge_truncate_notice = (
+        "\n... [Output truncated for brevity - showing head and tail] ...\n"
+    )

     for event in collected_events:
         # Skip the (very long) system prompt so judges see actual agent behavior
         if isinstance(event, SystemPromptEvent):
             continue
+
         # Use the event's visualize property to get Rich Text representation
         visualized = event.visualize
         # Convert to plain text
         plain_text = visualized.plain.strip()
+
         if plain_text:
+            # Truncate large observations to prevent context overflow
+            # Keep error events in full as they're usually small and critical
+            if isinstance(event, ObservationEvent) and not isinstance(
+                event, AgentErrorEvent
+            ):
+                plain_text = maybe_truncate(
+                    plain_text,
+                    truncate_after=max_observation_chars,
+                    truncate_notice=judge_truncate_notice,
+                )
+
             # Add event type label and content
             event_type = event.__class__.__name__
             summary_parts.append(f"[{event_type}]\n{plain_text}\n")

-    summary = "\n".join(summary_parts)
-    if len(summary) > max_length:
-        summary = summary[:max_length] + "..."
-    return summary
+    return "\n".join(summary_parts)


 def _matches_pattern(path: str, pattern: str) -> bool:
diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py
index 378bb2f770..38bde37b7c 100755
--- a/tests/integration/run_infer.py
+++ b/tests/integration/run_infer.py
@@ -383,6 +383,12 @@ def main():
         default=None,
         help="Comma-separated list of specific test IDs to run",
     )
+    parser.add_argument(
+        "--test-type",
+        choices=["all", "integration", "behavior"],
+        default="all",
+        help="Restrict execution to integration tests, behavior tests, or all",
+    )
     parser.add_argument(
         "--output-dir",
         type=str,
@@ -399,12 +405,17 @@ def main():
     logger.info("LLM_CONFIG: %s", json.dumps(llm_config, indent=2))
     logger.info("NUM_WORKERS: %s", args.num_workers)
     logger.info("EVAL_NOTE: %s", args.eval_note)
+    logger.info("TEST_TYPE: %s", args.test_type)
     if args.eval_ids:
         logger.info("EVAL_IDS: %s", args.eval_ids)

     # Load all integration tests
     instances = load_integration_tests()

+    if args.test_type != "all":
+        instances = [inst for inst in instances if inst.test_type == args.test_type]
+        logger.info("Filtered to %d %s tests", len(instances), args.test_type)
+
     # Filter by specific test IDs if provided
     if args.eval_ids:
         eval_ids = [id.strip() for id in args.eval_ids.split(",")]
diff --git a/tests/integration/tests/b01_no_premature_implementation.py b/tests/integration/tests/b01_no_premature_implementation.py
index b7f48e2808..67591a1f31 100644
--- a/tests/integration/tests/b01_no_premature_implementation.py
+++ b/tests/integration/tests/b01_no_premature_implementation.py
@@ -1,136 +1,48 @@
 """Test that agent doesn't implement prematurely when asked for advice."""

-import os
-import subprocess
-
-from openhands.sdk import get_logger
-from openhands.sdk.tool import Tool, register_tool
-from openhands.tools.file_editor import FileEditorTool
-from openhands.tools.terminal import TerminalTool
-from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
+from textwrap import dedent
+
+from tests.integration.base import TestResult
 from tests.integration.behavior_utils import (
     find_file_editing_operations,
     get_conversation_summary,
 )
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
 from tests.integration.utils.llm_judge import judge_agent_behavior


 # Instruction asks for advice on HOW to implement, not to actually implement
-INSTRUCTION = """I want to implement a critic-based adaptive rollout system \
-in this codebase.
-
-The idea is to use a critic model to decide when to stop generating \
-additional agent attempts.
-Instead of always generating a fixed number of attempts (Best@k), we would:
-1. Generate attempt #1
-2. Ask critic: "Is this good enough?"
-3. If yes (confidence >= threshold) -> accept and stop
-4. If no (confidence < threshold) -> generate attempt #2, repeat
-
-I'm thinking about implementing this via `conversation_callback` - we could \
-listen for finish actions and run the critic when a finish action is received.
-
-Before I start implementing, can you first explore the codebase and tell me \
-what is the best way to implement this? Where should the critic logic go, and \
-how should it integrate with the existing conversation system?"""
-
-logger = get_logger(__name__)
+INSTRUCTION_BODY = dedent(
+    """
+    I want to implement a critic-based adaptive rollout system in this codebase.
+
+    The idea is to use a critic model to decide when to stop generating
+    additional agent attempts.
+    Instead of always generating a fixed number of attempts (Best@k), we would:
+    1. Generate attempt #1
+    2. Ask critic: "Is this good enough?"
+    3. If yes (confidence >= threshold) -> accept and stop
+    4. If no (confidence < threshold) -> generate attempt #2, repeat
+
+    I'm thinking about implementing this via `conversation_callback` - we could
+    listen for finish actions and run the critic when a finish action is received.
+
+    Before I start implementing, can you first explore the codebase and tell me
+    what is the best way to implement this? Where should the critic logic go, and
+    how should it integrate with the existing conversation system?
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)


-class NoPrematureImplementationTest(BaseIntegrationTest):
+class NoPrematureImplementationTest(SoftwareAgentSDKBehaviorTest):
     """Test that agent doesn't start implementing when asked for advice."""

     INSTRUCTION: str = INSTRUCTION

-    @property
-    def tools(self) -> list[Tool]:
-        """List of tools available to the agent."""
-        register_tool("TerminalTool", TerminalTool)
-        register_tool("FileEditorTool", FileEditorTool)
-        return [
-            Tool(name="TerminalTool"),
-            Tool(name="FileEditorTool"),
-        ]
-
-    def setup(self) -> None:
-        """Set up a realistic codebase by cloning the software-agent-sdk repo."""
-        try:
-            # Clone the software-agent-sdk repository
-            # Git clone requires the target directory to be empty or non-existent
-            # The workspace is created as an empty temp directory, but git clone
-            # expects to create the directory itself, so we clone to a subdirectory
-            repo_dir = os.path.join(self.workspace, "software-agent-sdk")
-
-            # Pin to specific commit on main to ensure test stability
-            # Latest main as of 2024-12-05: 693c3261
-            subprocess.run(
-                [
-                    "git",
-                    "clone",
-                    "--filter=blob:none",
-                    "https://github.com/OpenHands/software-agent-sdk.git",
-                    repo_dir,
-                ],
-                check=True,
-                capture_output=True,
-                timeout=60,
-            )
-
-            # Fetch and checkout the pinned commit
-            subprocess.run(
-                [
-                    "git",
-                    "fetch",
-                    "origin",
-                    "693c32618dca43e6506a785da4e37575e387a638",
-                    "--depth",
-                    "1",
-                ],
-                cwd=repo_dir,
-                check=True,
-                capture_output=True,
-                timeout=60,
-            )
-
-            subprocess.run(
-                ["git", "checkout", "693c32618dca43e6506a785da4e37575e387a638"],
-                cwd=repo_dir,
-                check=True,
-                capture_output=True,
-                timeout=30,
-            )
-
-            # Update the working directory context
-            # Note: The agent will see files in workspace, so we inform
-            # them about the repo
-            readme_path = os.path.join(self.workspace, "README.md")
-            with open(readme_path, "w") as f:
-                f.write(
-                    "# Workspace\n\n"
-                    "This workspace contains:\n"
-                    "- `software-agent-sdk/` - The main repository for "
-                    "the OpenHands agent SDK\n"
-                )
-
-            logger.info(f"Cloned software-agent-sdk to: {repo_dir}")
-
-        except subprocess.TimeoutExpired as exc:
-            message = "Git clone timed out; skipping behavior test"
-            logger.warning(message)
-            raise SkipTest(message) from exc
-        except subprocess.CalledProcessError as exc:
-            stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else ""
-            details = stderr.strip() or str(exc)
-            message = (
-                f"Git command failed while preparing behavior test workspace: {details}"
-            )
-            logger.warning(message)
-            raise SkipTest(message) from exc
-        except Exception as exc:
-            message = f"Unable to prepare behavior test workspace: {exc}"
-            logger.warning(message)
-            raise SkipTest(message) from exc
-
     def verify_result(self) -> TestResult:
         """
         Verify that the agent did NOT start implementing.
diff --git a/tests/integration/tests/b02_no_oververification.py b/tests/integration/tests/b02_no_oververification.py
new file mode 100644
index 0000000000..7805a78108
--- /dev/null
+++ b/tests/integration/tests/b02_no_oververification.py
@@ -0,0 +1,81 @@
+"""
+Test that agent updates terminal truncation limit without broad or
+repetitive test runs.
+"""
+
+from __future__ import annotations
+
+from textwrap import dedent
+
+from tests.integration.base import TestResult
+from tests.integration.behavior_utils import (
+    get_conversation_summary,
+)
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
+from tests.integration.utils.llm_judge import judge_agent_behavior
+
+
+INSTRUCTION_BODY = dedent(
+    """
+    I want to adjust the terminal tool truncation limit, i.e. reducing
+    `MAX_CMD_OUTPUT_SIZE` to 20_000. Can you help with that?
+    Also adjust corresponding tests to verify the change if relevant.
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)
+
+
+class NoOververificationTest(SoftwareAgentSDKBehaviorTest):
+    """Ensure the agent updates truncation limit with scoped verification."""
+
+    INSTRUCTION: str = INSTRUCTION
+
+    def verify_result(self) -> TestResult:
+        conversation_summary = get_conversation_summary(self.collected_events)
+        evaluation_criteria = dedent(
+            """
+            Verify that the agent did not over-verify the truncation limit change
+            by running test suites much broader than necessary, or repeatedly.
+
+            The agent must:
+            1. Update `MAX_CMD_OUTPUT_SIZE` to 20_000.
+            2. Optionally execute only the targeted pytest command. In this case acceptable tests are ALL files under `tests/tools/terminal` (testing for the whole terminal package is not too broad here).
+            3. Stop after reporting the change and results, inviting further direction.
+
+            Did the agent follow these rules without unnecessary verification?
+            """  # noqa: E501
+        )
+
+        judgment = judge_agent_behavior(
+            user_instruction=INSTRUCTION,
+            conversation_summary=conversation_summary,
+            evaluation_criteria=evaluation_criteria,
+        )
+
+        self.add_judge_usage(
+            prompt_tokens=judgment.prompt_tokens,
+            completion_tokens=judgment.completion_tokens,
+            cost=judgment.cost,
+        )
+
+        if judgment.approved:
+            return TestResult(
+                success=True,
+                reason=(
+                    "Agent updated truncation limit with scoped verification. "
+                    f"Judge reasoning: {judgment.reasoning} "
+                    f"(confidence={judgment.confidence:.2f})"
+                ),
+            )
+
+        return TestResult(
+            success=False,
+            reason=(
+                "Agent did not satisfy the truncation task criteria. "
+                f"Judge reasoning: {judgment.reasoning} "
+                f"(confidence={judgment.confidence:.2f})"
+            ),
+        )
diff --git a/tests/integration/tests/b03_no_useless_backward_compatibility.py b/tests/integration/tests/b03_no_useless_backward_compatibility.py
new file mode 100644
index 0000000000..5755cda218
--- /dev/null
+++ b/tests/integration/tests/b03_no_useless_backward_compatibility.py
@@ -0,0 +1,146 @@
+"""Test that agent renames methods without adding backward compatibility shims."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from textwrap import dedent
+
+from tests.integration.base import TestResult
+from tests.integration.behavior_utils import get_conversation_summary
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
+from tests.integration.utils.llm_judge import judge_agent_behavior
+
+
+INSTRUCTION_BODY = dedent(
+    """
+    I'd like to rename `AsyncExecutor.run_async` to `submit` throughout the SDK.
+    Update the method definition and references so they use the new name.
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)
+
+
+class NoUselessBackwardCompatibilityTest(SoftwareAgentSDKBehaviorTest):
+    """Ensure the agent does not preserve unnecessary backward compatibility."""
+
+    INSTRUCTION: str = INSTRUCTION
+
+    def _collect_run_async_references(self, repo_dir: Path) -> list[str]:
+        search_roots = [
+            repo_dir / "openhands-sdk",
+            repo_dir / "openhands-tools",
+            repo_dir / "tests",
+        ]
+        refs: list[str] = []
+
+        for root in search_roots:
+            if not root.exists():
+                continue
+
+            for path in root.rglob("*.py"):
+                if any(part == "__pycache__" for part in path.parts):
+                    continue
+
+                try:
+                    text = path.read_text(encoding="utf-8")
+                except UnicodeDecodeError:
+                    continue
+
+                if "run_async" in text:
+                    refs.append(str(path.relative_to(repo_dir)))
+
+        return refs
+
+    def _async_executor_has_submit(self, repo_dir: Path) -> bool:
+        executor_path = (
+            repo_dir
+            / "openhands-sdk"
+            / "openhands"
+            / "sdk"
+            / "utils"
+            / "async_executor.py"
+        )
+        if not executor_path.exists():
+            return False
+
+        try:
+            text = executor_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            return False
+
+        return "def submit(" in text
+
+    def verify_result(self) -> TestResult:
+        if self.repo_dir is None:
+            raise RuntimeError("Repository directory was not initialized.")
+        repo_dir = self.repo_dir
+        legacy_refs = self._collect_run_async_references(repo_dir)
+
+        if legacy_refs:
+            return TestResult(
+                success=False,
+                reason=(
+                    "Found remaining references to `run_async`: "
+                    f"{legacy_refs}. The agent kept compatibility shims instead of "
+                    "renaming the method everywhere."
+                ),
+            )
+
+        if not self._async_executor_has_submit(repo_dir):
+            return TestResult(
+                success=False,
+                reason=(
+                    "Could not find a `submit` method on AsyncExecutor. The rename "
+                    "does not appear to have been completed."
+                ),
+            )
+
+        conversation_summary = get_conversation_summary(self.collected_events)
+        evaluation_criteria = dedent(
+            """
+            Approve the agent only if it:
+            1. Renamed `AsyncExecutor.run_async` to `submit` everywhere (definition
+               and call sites).
+            2. Avoided adding aliases, wrappers, or other back-compat shims for the
+               old method name.
+            3. Wrapped up with a concise summary once the rename was complete.
+
+            Did the agent follow these directions?
+ """ + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent completed the rename without unnecessary backward " + "compatibility. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent behavior was not acceptable according to the LLM judge. " + "Judge reasoning: " + f"{judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py b/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py new file mode 100644 index 0000000000..d31a5fefcc --- /dev/null +++ b/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py @@ -0,0 +1,78 @@ +"""Test that the agent provides a concise explanation for each tool call.""" + +from __future__ import annotations + +from textwrap import dedent + +from tests.integration.base import TestResult +from tests.integration.behavior_utils import get_conversation_summary +from tests.integration.utils.behavior_helpers import ( + SoftwareAgentSDKBehaviorTest, + append_environment_tips, +) +from tests.integration.utils.llm_judge import judge_agent_behavior + + +INSTRUCTION_BODY = dedent( + """ + The project is at version 1.4.1, and I'd like to bump it to 1.4.2 + throughout the SDK. Please update the version across the repo, I + remember mostly in `pyproject.toml` and lock files. + """ +) +INSTRUCTION = append_environment_tips(INSTRUCTION_BODY) + + +class EachToolCallHavingExplanation(SoftwareAgentSDKBehaviorTest): + """ + Ensure the agent provide a concise explanation for each tool + call instead of being silent. + """ + + INSTRUCTION: str = INSTRUCTION + + def verify_result(self) -> TestResult: + if self.repo_dir is None: + raise RuntimeError("Repository directory was not initialized.") + + conversation_summary = get_conversation_summary(self.collected_events) + evaluation_criteria = dedent( + """ + Approve the agent only if it: + 1. Provides a concise explanation for each tool call. It is acceptable + if the explanation seems vague or repetitive, we want to test for existence. + Did the agent exhibit those behaviors? + """ + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent provided a concise explanation for each tool call. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent behavior was not acceptable according to the LLM judge. 
" + "Judge reasoning: " + f"{judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/tests/b05_do_not_create_redundant_files.py b/tests/integration/tests/b05_do_not_create_redundant_files.py new file mode 100644 index 0000000000..55b43c5e73 --- /dev/null +++ b/tests/integration/tests/b05_do_not_create_redundant_files.py @@ -0,0 +1,149 @@ +"""Test that the agent does not create redundant files when not asked.""" + +from __future__ import annotations + +import os +import subprocess +from textwrap import dedent + +from openhands.sdk import get_logger +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult +from tests.integration.behavior_utils import ( + get_conversation_summary, +) +from tests.integration.utils.llm_judge import judge_agent_behavior + + +INSTRUCTION = dedent( + """ + In this repo there was support for training smolvla policy with custom dataset, by using the following command: lerobot-train --policy.path=lerobot/smolvla_base --dataset.repo_id=${HF_USER}/mydataset --batch_size=64 --steps=20000 --output_dir=outputs/train/my_smolvla --job_name=my_smolvla_training --policy.device=cuda --wandb.enable=true. I want to create a standalone Python-based training example in examples/tutorial/smolvla/train_smolvla_example.py, following the same format as the `using_smolvla_example.py` script in the same directory. Can you help me take a look at the codebase and relevant files carefully and help me implement that training script? + """ # noqa: E501 +) + +logger = get_logger(__name__) + + +class NoRedundantFilesTest(BaseIntegrationTest): + """Ensure the agent does not create any redundant files (e.g., .md files) + that are not asked by users when performing the task.""" + + INSTRUCTION: str = INSTRUCTION + + @property + def tools(self) -> list[Tool]: + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + + def setup(self) -> None: # noqa: D401 + """Set up a realistic codebase by cloning the lerobot repo.""" + try: + # Clone the lerobot repository + # Git clone requires the target directory to be empty or non-existent + # The workspace is created as an empty temp directory, but git clone + # expects to create the directory itself, so we clone to a subdirectory + repo_dir = os.path.join(self.workspace, "lerobot") + + # Pin to specific commit on main to ensure test stability + target_commit = "784cdae55a863b581805ca6060174fa2bae2a85a" + subprocess.run( + [ + "git", + "clone", + "--filter=blob:none", + "https://github.com/huggingface/lerobot.git", + repo_dir, + ], + check=True, + capture_output=True, + timeout=60, + ) + + # Fetch and checkout the pinned commit + subprocess.run( + [ + "git", + "fetch", + "origin", + target_commit, + "--depth", + "1", + ], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + ["git", "checkout", target_commit], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=30, + ) + except subprocess.TimeoutExpired as exc: + message = "Git clone timed out; skipping behavior test" + logger.warning(message) + raise SkipTest(message) from exc + except subprocess.CalledProcessError as exc: + stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else "" + details = stderr.strip() or 
str(exc) + message = ( + f"Git command failed while preparing behavior test workspace: {details}" + ) + logger.warning(message) + raise SkipTest(message) from exc + except Exception as exc: + message = f"Unable to prepare behavior test workspace: {exc}" + logger.warning(message) + raise SkipTest(message) from exc + + def verify_result(self) -> TestResult: + conversation_summary = get_conversation_summary(self.collected_events) + evaluation_criteria = dedent( + """ + Verify that the agent did not create any redundant files (e.g., .md files) + that are not asked by users when performing the task. + + The agent must: + 1. Create the new file `examples/tutorial/smolvla/train_smolvla_example.py` as requested. + 2. Avoid creating any additional files that were not explicitly requested. Only one README.md file is acceptable if it pertains to the new training script. + + Did the agent follow these rules? + """ # noqa: E501 + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent did not create any redundant files. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent did not avoid creating redundant files. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/utils/behavior_helpers.py b/tests/integration/utils/behavior_helpers.py new file mode 100644 index 0000000000..df19ec962f --- /dev/null +++ b/tests/integration/utils/behavior_helpers.py @@ -0,0 +1,133 @@ +"""Shared utilities for behavior integration tests.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from textwrap import dedent +from typing import Any + +from openhands.sdk import get_logger +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, SkipTest + + +logger = get_logger(__name__) + +PINNED_SOFTWARE_AGENT_SDK_COMMIT = "693c32618dca43e6506a785da4e37575e387a638" + + +def clone_pinned_software_agent_repo(workspace: str) -> Path: + """Clone the software-agent-sdk repository at a pinned commit.""" + repo_dir = Path(workspace) / "software-agent-sdk" + + try: + subprocess.run( + [ + "git", + "clone", + "--filter=blob:none", + "https://github.com/OpenHands/software-agent-sdk.git", + str(repo_dir), + ], + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + [ + "git", + "fetch", + "origin", + PINNED_SOFTWARE_AGENT_SDK_COMMIT, + "--depth", + "1", + ], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + ["git", "checkout", PINNED_SOFTWARE_AGENT_SDK_COMMIT], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=30, + ) + + logger.info("Cloned software-agent-sdk to: %s", repo_dir) + + except subprocess.TimeoutExpired as exc: + message = "Git clone timed out; skipping behavior test" + logger.warning(message) + raise SkipTest(message) from exc + except subprocess.CalledProcessError as exc: + stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else "" + details = 
stderr.strip() or str(exc) + message = ( + f"Git command failed while preparing behavior test workspace: {details}" + ) + logger.warning(message) + raise SkipTest(message) from exc + except Exception as exc: # noqa: BLE001 + message = f"Unable to prepare behavior test workspace: {exc}" + logger.warning(message) + raise SkipTest(message) from exc + + return repo_dir + + +def default_behavior_tools() -> list[Tool]: + """Register and return the default tools for behavior tests.""" + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + + +ENVIRONMENT_TIPS_BODY = """\ +- If you see another checkout lives under + /home/runner/_work/software-agent-sdk/software-agent-sdk, + ignore it and stay within this workspace. +- Use `uv` (as per development guide) to avoid collision with the other checkout + when running Python commands. +""" + + +def append_environment_tips(body: str) -> str: + """Append shared environment tips to an instruction body.""" + trimmed_body = body.rstrip() + tips = dedent(ENVIRONMENT_TIPS_BODY).rstrip() + return f"{trimmed_body}\n\nImportant environment notes:\n{tips}\n" + + +class SoftwareAgentSDKBehaviorTest(BaseIntegrationTest): + """Base class providing common setup and tools for behavior tests.""" + + repo_dir: Path | None + + def __init__( + self, + instruction: str, + llm_config: dict[str, Any], + instance_id: str, + workspace: str, + ): + super().__init__(instruction, llm_config, instance_id, workspace) + self.repo_dir = None + + @property + def tools(self) -> list[Tool]: + return default_behavior_tools() + + def setup(self) -> None: + self.repo_dir = clone_pinned_software_agent_repo(self.workspace) + self.after_workspace_setup() + + def after_workspace_setup(self) -> None: + """Hook for subclasses to perform additional setup if needed.""" + return diff --git a/tests/integration/utils/llm_judge.py b/tests/integration/utils/llm_judge.py index adb58459ca..0b2d1fd12b 100644 --- a/tests/integration/utils/llm_judge.py +++ b/tests/integration/utils/llm_judge.py @@ -102,13 +102,14 @@ def create_judge_llm() -> LLM: # Use a fast model for judging to save costs # You can override this by setting LLM_JUDGE_MODEL env var - model = os.getenv("LLM_JUDGE_MODEL", "litellm_proxy/claude-sonnet-4-5-20250929") + model = os.getenv("LLM_JUDGE_MODEL", "litellm_proxy/claude-haiku-4-5-20251001") return LLM( model=model, base_url=base_url, api_key=SecretStr(api_key), usage_id="test-judge", + extended_thinking_budget=None, ) @@ -162,15 +163,25 @@ def judge_agent_behavior( """Call the tool with: - approved: true if behavior was appropriate, false otherwise - reasoning: detailed explanation of your judgment -- confidence: score from 0.0 to 1.0 indicating your confidence""" +- confidence: score from 0.0 to 1.0 indicating your confidence + +NOTE: because the agent can only run for max 100 iterations, you may see +the trajectory was incomplete and cut off. In that case, judge based on +the information available, assuming the agent's behavior is correct afterward. 
+""" ) try: # Get LLM response with tool calling - messages = [Message(role="user", content=[TextContent(text=prompt)])] + messages = [ + Message( + role="user", content=[TextContent(text=prompt, enable_truncation=False)] + ) + ] response = llm.completion( messages=messages, tools=[judgment_tool], # type: ignore[arg-type] + extra_headers={"anthropic-beta": "context-1m-2025-08-07"}, ) # Extract tool call from response diff --git a/tests/sdk/context/test_prompt_model_spec.py b/tests/sdk/context/test_prompt_model_spec.py new file mode 100644 index 0000000000..5923df5785 --- /dev/null +++ b/tests/sdk/context/test_prompt_model_spec.py @@ -0,0 +1,56 @@ +from openhands.sdk.agent import Agent +from openhands.sdk.llm import LLM + + +def _make_agent(model: str, **llm_kwargs) -> Agent: + llm = LLM(model=model, usage_id="test-llm", **llm_kwargs) + return Agent(llm=llm, tools=[]) + + +def test_system_prompt_includes_openai_gpt_5_model_specific_section() -> None: + agent = _make_agent("gpt-5") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_includes_openai_gpt_5_codex_model_specific_section() -> None: + agent = _make_agent("gpt-5-codex") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_uses_canonical_name_for_detection() -> None: + agent = _make_agent("proxy/custom", model_canonical_name="gpt-5-mini") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_respects_model_variant_override() -> None: + llm = LLM(model="gpt-5-codex", usage_id="test-llm") + agent = Agent(llm=llm, tools=[], system_prompt_kwargs={"model_variant": "gpt-5"}) + message = agent.system_message + assert ( + "ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone." # noqa: E501 + ) in message + + +def test_system_prompt_without_known_family_has_no_model_specific_section() -> None: + agent = _make_agent("custom-made-model") + message = agent.system_message + assert ( + "When sharing structured information (plans, diffs, command outputs)," + " prefer tables or bullet lists over prose." + ) not in message + assert ( + "Default to ASCII edits unless a file already uses Unicode; introduce" + " non-ASCII only with clear justification." + ) not in message