Merged
Changes from all commits (39 commits)
cc9264d
implement model-family specific system prompts
ryanhoangt Dec 8, 2025
9a50f66
update claude specific prompts
ryanhoangt Dec 8, 2025
d1c7b72
update claude prompt
ryanhoangt Dec 8, 2025
d0601b7
add gemini custom prompt
ryanhoangt Dec 8, 2025
3b4a5f7
add ability to define model variant-specific prompt
ryanhoangt Dec 8, 2025
e5d9335
add gpt-5 custom prompts
ryanhoangt Dec 8, 2025
e29f7b6
remove detection indicator in j2 templates
ryanhoangt Dec 8, 2025
c1e3ae0
Merge branch 'main' into ht/custom-prompts-per-models
ryanhoangt Dec 10, 2025
c074acc
use pydantic
ryanhoangt Dec 10, 2025
6998fd6
add behavior test for oververification
ryanhoangt Dec 10, 2025
4c6e59e
remove all custom prompt files
ryanhoangt Dec 10, 2025
ebe6301
add behavior-test label
ryanhoangt Dec 10, 2025
4eb49b4
allow running only behavior tests via workflow dispatch
ryanhoangt Dec 10, 2025
9d66b7c
add behavior test for useless backward compat
ryanhoangt Dec 10, 2025
18dbd0c
Merge branch 'main' into ht/custom-prompts-per-models
enyst Dec 10, 2025
c6d4fde
add enviroment notes
ryanhoangt Dec 11, 2025
6e3b4e9
refactor tests
ryanhoangt Dec 11, 2025
555ebc4
revert default system prompt & prioritize canonical name
ryanhoangt Dec 11, 2025
018b27a
Merge branch 'main' into ht/custom-prompts-per-models
enyst Dec 11, 2025
e7a865c
remove convo summary truncation for llm judge
ryanhoangt Dec 15, 2025
78ca66c
remove follow up requirement in b03
ryanhoangt Dec 15, 2025
44e5b5a
do not truncate when serializing content in judge llm
ryanhoangt Dec 15, 2025
392b237
add custom prompts for claude and gemini
ryanhoangt Dec 15, 2025
15dba97
add test b04 for gpt-5 and gemini
ryanhoangt Dec 15, 2025
1ba590b
add test b05
ryanhoangt Dec 15, 2025
3d5a622
truncate obs to max 5000 chars
ryanhoangt Dec 16, 2025
f07d811
reduce max iter of intg tests to 100
ryanhoangt Dec 16, 2025
cb3af19
use 1m context window for judge
ryanhoangt Dec 16, 2025
c521e9d
adjust criteria for b02
ryanhoangt Dec 16, 2025
71a6c8f
use haiku for judge
ryanhoangt Dec 16, 2025
dff8805
add custom prompts for gpt-5 and gpt-5-codex
ryanhoangt Dec 16, 2025
6b482e3
fix comments
ryanhoangt Dec 16, 2025
e8f7424
move <MODEL_SPECIFIC> tag to base prompt
ryanhoangt Dec 16, 2025
4c1e8d5
enable reasoning_summary in intg tests
ryanhoangt Dec 16, 2025
73752c5
Merge branch 'main' into ht/custom-prompts-per-models
ryanhoangt Dec 16, 2025
4c5aaa4
add note for cutoff due to max iter for llm judge
ryanhoangt Dec 16, 2025
1494055
Apply suggestion from @xingyaoww
xingyaoww Dec 16, 2025
708d6bc
Apply suggestion from @xingyaoww
xingyaoww Dec 16, 2025
402eae0
trim newlines from important
xingyaoww Dec 16, 2025
78 changes: 71 additions & 7 deletions .github/workflows/integration-runner.yml
@@ -13,6 +13,10 @@ on:
        description: Reason for manual trigger
        required: true
        default: ''
      test_type:
        description: Select which tests to run (all, integration, behavior)
        required: false
        default: all
  schedule:
    - cron: 30 22 * * * # Runs at 10:30pm UTC every day

@@ -21,25 +25,42 @@ env:

jobs:
  post-initial-comment:
    if: github.event_name == 'pull_request_target' && github.event.label.name == 'integration-test'
    if: >
      github.event_name == 'pull_request_target' && (
        github.event.label.name == 'integration-test' ||
        github.event.label.name == 'behavior-test'
      )
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
    steps:
      - name: Comment on PR
      - name: Comment on PR (integration tests)
        if: github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
      - name: Comment on PR (behavior tests)
        if: github.event.label.name == 'behavior-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
  run-integration-tests:
    # Security: Only run when 'integration-test' label is present, via workflow_dispatch, or on schedule
    # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
    # This prevents automatic execution on fork PRs without maintainer approval
    # Note: uses always() to run even when post-initial-comment is skipped (e.g., for workflow_dispatch)
    if: |
      always() && (
        github.event.label.name == 'integration-test' ||
        (
          github.event_name == 'pull_request_target' && (
            github.event.label.name == 'integration-test' ||
            github.event.label.name == 'behavior-test'
          )
        ) ||
        github.event_name == 'workflow_dispatch' ||
        github.event_name == 'schedule'
      )
@@ -63,6 +84,7 @@ jobs:
          llm-config:
            model: litellm_proxy/gpt-5.1-codex-max
            temperature: 1.0
            reasoning_summary: detailed
        - name: Deepseek Chat
          run-suffix: deepseek_run
          llm-config:
@@ -101,7 +123,36 @@
          uv sync --dev
          uv pip install pytest
      # Run integration test evaluation
      - name: Determine test selection
        run: |
          TEST_TYPE_ARGS=""
          if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
            TEST_TYPE_ARGS="--test-type behavior"
            echo "behavior-test label detected; running behavior tests only."
          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            test_type="${{ github.event.inputs.test_type }}"
            case "$test_type" in
              behavior)
                TEST_TYPE_ARGS="--test-type behavior"
                echo "workflow_dispatch requested behavior tests only."
                ;;
              integration)
                TEST_TYPE_ARGS="--test-type integration"
                echo "workflow_dispatch requested integration tests only."
                ;;
              ""|all)
                echo "workflow_dispatch requested full integration suite."
                ;;
              *)
                echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
                ;;
            esac
          else
            echo "Running full integration test suite."
          fi
          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
      - name: Run integration test evaluation for ${{ matrix.job-config.name }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
@@ -113,10 +164,13 @@
          AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
          EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"
          echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS'"
          uv run python tests/integration/run_infer.py \
            --llm-config "$LLM_CONFIG" \
            --num-workers $N_PROCESSES \
            --eval-note "$EVAL_NOTE"
            --eval-note "$EVAL_NOTE" \
            $TEST_TYPE_ARGS
          # get integration tests JSON results
          RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
@@ -169,7 +223,17 @@ jobs:

  consolidate-results:
    needs: run-integration-tests
    if: always() && (github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule')
    if: |
      always() && (
        (
          github.event_name == 'pull_request_target' && (
            github.event.label.name == 'integration-test' ||
            github.event.label.name == 'behavior-test'
          )
        ) ||
        github.event_name == 'workflow_dispatch' ||
        github.event_name == 'schedule'
      )
    runs-on: blacksmith-2vcpu-ubuntu-2404
    permissions:
      contents: read
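Note: with these workflow changes, the behavior suite can be triggered two ways: by adding the `behavior-test` label to a PR, or via manual dispatch, e.g. `gh workflow run integration-runner.yml -f reason="test prompts" -f test_type=behavior` (illustrative command; it assumes the GitHub CLI and the workflow file name shown above).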
2 changes: 2 additions & 0 deletions .gitignore
@@ -209,3 +209,5 @@ openapi.json
.worktrees/
agent-sdk.workspace.code-workspace

# Integration test outputs
tests/integration/outputs/
13 changes: 13 additions & 0 deletions openhands-sdk/openhands/sdk/agent/base.py
@@ -12,6 +12,7 @@
from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser
from openhands.sdk.context.prompts.prompt import render_template
from openhands.sdk.llm import LLM
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec
from openhands.sdk.logger import get_logger
from openhands.sdk.mcp import create_mcp_tools
from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool
@@ -164,6 +165,18 @@ def name(self) -> str:
    def system_message(self) -> str:
        """Compute system message on-demand to maintain statelessness."""
        template_kwargs = dict(self.system_prompt_kwargs)
        template_kwargs.setdefault("model_name", self.llm.model)
        if (
            "model_family" not in template_kwargs
            or "model_variant" not in template_kwargs
        ):
            spec = get_model_prompt_spec(
                self.llm.model, getattr(self.llm, "model_canonical_name", None)
            )
            if "model_family" not in template_kwargs and spec.family:
                template_kwargs["model_family"] = spec.family
            if "model_variant" not in template_kwargs and spec.variant:
                template_kwargs["model_variant"] = spec.variant
        system_message = render_template(
            prompt_dir=self.prompt_dir,
            template_name=self.system_prompt_filename,
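Detection here only fills gaps: anything the caller passes via `system_prompt_kwargs` takes precedence over the detected family and variant. A minimal standalone sketch of that precedence logic (the helper below is illustrative, not part of the SDK):

```python
# Mirrors the setdefault / "not in" checks in Agent.system_message above.
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec


def resolve_prompt_kwargs(model: str, system_prompt_kwargs: dict) -> dict:
    kwargs = dict(system_prompt_kwargs)
    kwargs.setdefault("model_name", model)
    if "model_family" not in kwargs or "model_variant" not in kwargs:
        spec = get_model_prompt_spec(model)
        if "model_family" not in kwargs and spec.family:
            kwargs["model_family"] = spec.family
        if "model_variant" not in kwargs and spec.variant:
            kwargs["model_variant"] = spec.variant
    return kwargs


# An explicitly passed family wins over detection:
print(resolve_prompt_kwargs("claude-sonnet-4", {"model_family": "google_gemini"}))
# -> {'model_family': 'google_gemini', 'model_name': 'claude-sonnet-4'}
```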
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2, per the include paths in system_prompt.j2 below)
@@ -0,0 +1,3 @@
* Try to follow the instructions exactly as given - don't make extra or fewer actions if not asked.
* Avoid unnecessary defensive programming; do not add redundant fallbacks or default values — fail fast instead of masking misconfigurations.
* When backward compatibility expectations are unclear, confirm with the user before making changes that could break existing behavior.
1 change: 1 addition & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2)
@@ -0,0 +1 @@
* Avoid being too proactive. Fulfill the user's request thoroughly: if they ask questions/investigations, answer them; if they ask for implementations, provide them. But do not take extra steps beyond what is requested.
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2)
@@ -0,0 +1,3 @@
* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2)
@@ -0,0 +1,3 @@
* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
17 changes: 17 additions & 0 deletions openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
@@ -106,3 +106,20 @@ You are OpenHands agent, a helpful AI assistant that can interact with a compute
- Prefer using `ps aux` to find the exact process ID (PID) first, then kill that specific PID
- When possible, use more targeted approaches like finding the PID from a pidfile or using application-specific shutdown commands
</PROCESS_MANAGEMENT>

{%- set _imp -%}
{%- if model_family -%}
{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}
{%- if model_variant -%}
{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}
{%- endif -%}
{%- endif -%}
{%- endset -%}

{%- set _imp_trimmed = _imp | trim -%}
{%- if _imp_trimmed %}

<IMPORTANT>
{{ _imp_trimmed }}
</IMPORTANT>
{%- endif %}
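The capture-and-trim pattern above means the `<IMPORTANT>` block disappears entirely when no model-specific include resolves. A minimal sketch of that behavior, assuming jinja2 and using an in-memory loader in place of the real prompts directory:

```python
# Reproduces the _imp / trim logic from system_prompt.j2 with stub templates.
# The google_gemini.j2 body is abbreviated from the new prompt file above.
from jinja2 import DictLoader, Environment

snippet = (
    '{%- set _imp -%}'
    '{%- if model_family -%}'
    '{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}'
    '{%- if model_variant -%}'
    '{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}'
    '{%- endif -%}'
    '{%- endif -%}'
    '{%- endset -%}'
    '{%- set _imp_trimmed = _imp | trim -%}'
    '{%- if _imp_trimmed %}<IMPORTANT>\n{{ _imp_trimmed }}\n</IMPORTANT>{%- endif %}'
)
env = Environment(
    loader=DictLoader(
        {
            "snippet.j2": snippet,
            "model_specific/google_gemini.j2": "* Avoid being too proactive.\n",
        }
    )
)
tmpl = env.get_template("snippet.j2")

print(tmpl.render(model_family="google_gemini"))   # wrapped in <IMPORTANT> tags
print(repr(tmpl.render(model_family="meta_llama")))  # no matching file -> ''
```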
3 changes: 2 additions & 1 deletion openhands-sdk/openhands/sdk/llm/message.py
@@ -169,11 +169,12 @@ class TextContent(BaseContent):
    model_config: ClassVar[ConfigDict] = ConfigDict(
        extra="forbid", populate_by_name=True
    )
    enable_truncation: bool = True

    def to_llm_dict(self) -> list[dict[str, str | dict[str, str]]]:
        """Convert to LLM API format."""
        text = self.text
        if len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
        if self.enable_truncation and len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
            logger.warning(
                f"TextContent text length ({len(text)}) exceeds limit "
                f"({DEFAULT_TEXT_CONTENT_LIMIT}), truncating"
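The new flag lets a caller (here, the LLM-judge path in the integration tests) opt out of truncation so the judge sees the full conversation. A sketch of both modes, with the import path taken from the diff above and the output shape assumed to be the usual `{"type": "text", "text": ...}`:

```python
# Sketch: TextContent serialization with and without truncation.
# Assumes this payload exceeds the module's DEFAULT_TEXT_CONTENT_LIMIT.
from openhands.sdk.llm.message import TextContent

long_text = "x" * 200_000

clipped = TextContent(text=long_text)                        # default: truncated in to_llm_dict()
full = TextContent(text=long_text, enable_truncation=False)  # judge path: full content

print(len(clipped.to_llm_dict()[0]["text"]))  # capped at the limit
print(len(full.to_llm_dict()[0]["text"]))     # 200000
```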
98 changes: 98 additions & 0 deletions openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
@@ -0,0 +1,98 @@
"""Utilities for detecting model families and variants.
These helpers allow prompts and other systems to tailor behavior for specific
LLM providers while keeping naming heuristics centralized.
"""

from __future__ import annotations

from pydantic import BaseModel, ConfigDict


class ModelPromptSpec(BaseModel):
"""Detected prompt metadata for a given model configuration."""

model_config = ConfigDict(frozen=True)

family: str | None = None
variant: str | None = None


_MODEL_FAMILY_PATTERNS: dict[str, tuple[str, ...]] = {
"openai_gpt": (
"gpt-",
"o1",
"o3",
"o4",
),
"anthropic_claude": ("claude",),
"google_gemini": ("gemini",),
"meta_llama": ("llama",),
"mistral": ("mistral",),
"deepseek": ("deepseek",),
"alibaba_qwen": ("qwen",),
}

# Ordered heuristics to pick the most specific variant available for a family.
_MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = {
"openai_gpt": (
("gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex")),
("gpt-5", ("gpt-5", "gpt-5.1")),
),
}


def _normalize(name: str | None) -> str:
return (name or "").strip().lower()


def _match_family(model_name: str) -> str | None:
normalized = _normalize(model_name)
if not normalized:
return None

for family, patterns in _MODEL_FAMILY_PATTERNS.items():
if any(pattern in normalized for pattern in patterns):
return family
return None


def _match_variant(
family: str,
model_name: str,
canonical_name: str | None = None,
) -> str | None:
patterns = _MODEL_VARIANT_PATTERNS.get(family)
if not patterns:
return None

# Choose canonical_name if available, otherwise fall back to model_name
candidate = _normalize(canonical_name) or _normalize(model_name)
if not candidate:
return None

for variant, substrings in patterns:
if any(sub in candidate for sub in substrings):
return variant

return None


def get_model_prompt_spec(
model_name: str,
canonical_name: str | None = None,
) -> ModelPromptSpec:
"""Return family and variant prompt metadata for the given identifiers."""

family = _match_family(model_name)
if family is None and canonical_name:
family = _match_family(canonical_name)

variant = None
if family is not None:
variant = _match_variant(family, model_name, canonical_name)

return ModelPromptSpec(family=family, variant=variant)


__all__ = ["ModelPromptSpec", "get_model_prompt_spec"]
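A few worked examples of these heuristics, traced directly from the patterns above:

```python
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec

# Provider prefixes are fine: matching is substring-based on the lowercased name.
spec = get_model_prompt_spec("litellm_proxy/gpt-5.1-codex-max")
assert spec.family == "openai_gpt" and spec.variant == "gpt-5-codex"

# The canonical name, when given, is preferred for variant matching
# and used as a fallback for family detection.
spec = get_model_prompt_spec("my-proxy-alias", canonical_name="gpt-5.1")
assert spec.family == "openai_gpt" and spec.variant == "gpt-5"

# Families without variant patterns yield variant=None.
spec = get_model_prompt_spec("claude-sonnet-4-20250514")
assert spec.family == "anthropic_claude" and spec.variant is None
```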
1 change: 1 addition & 0 deletions tests/integration/base.py
@@ -103,6 +103,7 @@ def __init__(
            workspace=self.workspace,
            callbacks=[self.conversation_callback],
            visualizer=DefaultConversationVisualizer(),  # Use default visualizer
            max_iteration_per_run=100,
        )

    def conversation_callback(self, event: Event):