diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
index e03b6a09bd..2966f6789a 100644
--- a/.github/workflows/integration-runner.yml
+++ b/.github/workflows/integration-runner.yml
@@ -13,6 +13,10 @@ on:
         description: Reason for manual trigger
         required: true
         default: ''
+      test_type:
+        description: Select which tests to run (all, integration, behavior)
+        required: false
+        default: all
   schedule:
     - cron: 30 22 * * * # Runs at 10:30pm UTC every day
@@ -21,25 +25,42 @@ env:
 jobs:
   post-initial-comment:
-    if: github.event_name == 'pull_request_target' && github.event.label.name == 'integration-test'
+    if: >
+      github.event_name == 'pull_request_target' && (
+        github.event.label.name == 'integration-test' ||
+        github.event.label.name == 'behavior-test'
+      )
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
     steps:
-      - name: Comment on PR
+      - name: Comment on PR (integration tests)
+        if: github.event.label.name == 'integration-test'
         uses: KeisukeYamashita/create-comment@v1
         with:
           unique: false
          comment: |
             Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+      - name: Comment on PR (behavior tests)
+        if: github.event.label.name == 'behavior-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.

   run-integration-tests:
-    # Security: Only run when 'integration-test' label is present, via workflow_dispatch, or on schedule
+    # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
     # This prevents automatic execution on fork PRs without maintainer approval
     # Note: uses always() to run even when post-initial-comment is skipped (e.g., for workflow_dispatch)
     if: |
       always() && (
-        github.event.label.name == 'integration-test' ||
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
         github.event_name == 'workflow_dispatch' ||
         github.event_name == 'schedule'
       )
@@ -63,6 +84,7 @@ jobs:
           llm-config:
             model: litellm_proxy/gpt-5.1-codex-max
             temperature: 1.0
+            reasoning_summary: detailed
         - name: Deepseek Chat
           run-suffix: deepseek_run
           llm-config:
@@ -101,7 +123,36 @@ jobs:
           uv sync --dev
           uv pip install pytest

-      # Run integration test evaluation
+      # Run integration test evaluation
+      - name: Determine test selection
+        run: |
+          TEST_TYPE_ARGS=""
+          if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
+            TEST_TYPE_ARGS="--test-type behavior"
+            echo "behavior-test label detected; running behavior tests only."
+          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            test_type="${{ github.event.inputs.test_type }}"
+            case "$test_type" in
+              behavior)
+                TEST_TYPE_ARGS="--test-type behavior"
+                echo "workflow_dispatch requested behavior tests only."
+                ;;
+              integration)
+                TEST_TYPE_ARGS="--test-type integration"
+                echo "workflow_dispatch requested integration tests only."
+                ;;
+              ""|all)
+                echo "workflow_dispatch requested full integration suite."
+                ;;
+              *)
+                echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
+                ;;
+            esac
+          else
+            echo "Running full integration test suite."
+          fi
+          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
+
       - name: Run integration test evaluation for ${{ matrix.job-config.name }}
         env:
           LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
@@ -113,10 +164,13 @@ jobs:
           AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
           EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"

+          echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS'"
+
           uv run python tests/integration/run_infer.py \
             --llm-config "$LLM_CONFIG" \
             --num-workers $N_PROCESSES \
-            --eval-note "$EVAL_NOTE"
+            --eval-note "$EVAL_NOTE" \
+            $TEST_TYPE_ARGS

           # get integration tests JSON results
           RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
@@ -169,7 +223,17 @@ jobs:
   consolidate-results:
     needs: run-integration-tests
-    if: always() && (github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule')
+    if: |
+      always() && (
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
+        github.event_name == 'workflow_dispatch' ||
+        github.event_name == 'schedule'
+      )
     runs-on: blacksmith-2vcpu-ubuntu-2404
     permissions:
       contents: read
diff --git a/.gitignore b/.gitignore
index 5dc044af1b..e693220d2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,5 @@ openapi.json
 .worktrees/
 agent-sdk.workspace.code-workspace

+# Integration test outputs
+tests/integration/outputs/
diff --git a/openhands-sdk/openhands/sdk/agent/base.py b/openhands-sdk/openhands/sdk/agent/base.py
index 4a55881766..072cd3c1b5 100644
--- a/openhands-sdk/openhands/sdk/agent/base.py
+++ b/openhands-sdk/openhands/sdk/agent/base.py
@@ -12,6 +12,7 @@
 from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser
 from openhands.sdk.context.prompts.prompt import render_template
 from openhands.sdk.llm import LLM
+from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp import create_mcp_tools
 from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool
@@ -164,6 +165,18 @@ def name(self) -> str:
     def system_message(self) -> str:
         """Compute system message on-demand to maintain statelessness."""
         template_kwargs = dict(self.system_prompt_kwargs)
+        template_kwargs.setdefault("model_name", self.llm.model)
+        if (
+            "model_family" not in template_kwargs
+            or "model_variant" not in template_kwargs
+        ):
+            spec = get_model_prompt_spec(
+                self.llm.model, getattr(self.llm, "model_canonical_name", None)
+            )
+            if "model_family" not in template_kwargs and spec.family:
+                template_kwargs["model_family"] = spec.family
+            if "model_variant" not in template_kwargs and spec.variant:
+                template_kwargs["model_variant"] = spec.variant
         system_message = render_template(
             prompt_dir=self.prompt_dir,
             template_name=self.system_prompt_filename,
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2
new file mode 100644
index 0000000000..cf97c5c2b3
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2
@@ -0,0 +1,3 @@
+* Try to follow the instructions exactly as given - don't make extra or fewer actions if not asked.
+* Avoid unnecessary defensive programming; do not add redundant fallbacks or default values — fail fast instead of masking misconfigurations.
+* When backward compatibility expectations are unclear, confirm with the user before making changes that could break existing behavior.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2
new file mode 100644
index 0000000000..a7ae45ffa6
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2
@@ -0,0 +1 @@
+* Avoid being too proactive. Fulfill the user's request thoroughly: if they ask questions/investigations, answer them; if they ask for implementations, provide them. But do not take extra steps beyond what is requested.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2
new file mode 100644
index 0000000000..2e25a7de18
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2
@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2 b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2
new file mode 100644
index 0000000000..2e25a7de18
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2
@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
\ No newline at end of file
diff --git a/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2 b/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
index 29871c0b6c..3100de178c 100644
--- a/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
+++ b/openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
@@ -106,3 +106,20 @@ You are OpenHands agent, a helpful AI assistant that can interact with a computer

 - Prefer using `ps aux` to find the exact process ID (PID) first, then kill that specific PID
 - When possible, use more targeted approaches like finding the PID from a pidfile or using application-specific shutdown commands
+
+{%- set _imp -%}
+{%- if model_family -%}
+{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}
+{%- if model_variant -%}
+{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}
+{%- endif -%}
+{%- endif -%}
+{%- endset -%}
+
+{%- set _imp_trimmed = _imp | trim -%}
+{%- if _imp_trimmed %}
+
+
+{{ _imp_trimmed }}
+
+{%- endif %}
diff --git a/openhands-sdk/openhands/sdk/llm/message.py b/openhands-sdk/openhands/sdk/llm/message.py
index e1ccf54711..44c44c1e4e 100644
--- a/openhands-sdk/openhands/sdk/llm/message.py
+++ b/openhands-sdk/openhands/sdk/llm/message.py
@@ -169,11 +169,12 @@ class TextContent(BaseContent):
     model_config: ClassVar[ConfigDict] = ConfigDict(
         extra="forbid", populate_by_name=True
     )
+    enable_truncation: bool = True

     def to_llm_dict(self) -> list[dict[str, str | dict[str, str]]]:
         """Convert to LLM API format."""
         text = self.text
-        if len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
+        if self.enable_truncation and len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
             logger.warning(
                 f"TextContent text length ({len(text)}) exceeds limit "
                 f"({DEFAULT_TEXT_CONTENT_LIMIT}), truncating"
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
new file mode 100644
index 0000000000..8cba0c16e5
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
@@ -0,0 +1,98 @@
+"""Utilities for detecting model families and variants.
+
+These helpers allow prompts and other systems to tailor behavior for specific
+LLM providers while keeping naming heuristics centralized.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ModelPromptSpec(BaseModel):
+    """Detected prompt metadata for a given model configuration."""
+
+    model_config = ConfigDict(frozen=True)
+
+    family: str | None = None
+    variant: str | None = None
+
+
+_MODEL_FAMILY_PATTERNS: dict[str, tuple[str, ...]] = {
+    "openai_gpt": (
+        "gpt-",
+        "o1",
+        "o3",
+        "o4",
+    ),
+    "anthropic_claude": ("claude",),
+    "google_gemini": ("gemini",),
+    "meta_llama": ("llama",),
+    "mistral": ("mistral",),
+    "deepseek": ("deepseek",),
+    "alibaba_qwen": ("qwen",),
+}
+
+# Ordered heuristics to pick the most specific variant available for a family.
+_MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = {
+    "openai_gpt": (
+        ("gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex")),
+        ("gpt-5", ("gpt-5", "gpt-5.1")),
+    ),
+}
+
+
+def _normalize(name: str | None) -> str:
+    return (name or "").strip().lower()
+
+
+def _match_family(model_name: str) -> str | None:
+    normalized = _normalize(model_name)
+    if not normalized:
+        return None
+
+    for family, patterns in _MODEL_FAMILY_PATTERNS.items():
+        if any(pattern in normalized for pattern in patterns):
+            return family
+    return None
+
+
+def _match_variant(
+    family: str,
+    model_name: str,
+    canonical_name: str | None = None,
+) -> str | None:
+    patterns = _MODEL_VARIANT_PATTERNS.get(family)
+    if not patterns:
+        return None
+
+    # Choose canonical_name if available, otherwise fall back to model_name
+    candidate = _normalize(canonical_name) or _normalize(model_name)
+    if not candidate:
+        return None
+
+    for variant, substrings in patterns:
+        if any(sub in candidate for sub in substrings):
+            return variant
+
+    return None
+
+
+def get_model_prompt_spec(
+    model_name: str,
+    canonical_name: str | None = None,
+) -> ModelPromptSpec:
+    """Return family and variant prompt metadata for the given identifiers."""
+
+    family = _match_family(model_name)
+    if family is None and canonical_name:
+        family = _match_family(canonical_name)
+
+    variant = None
+    if family is not None:
+        variant = _match_variant(family, model_name, canonical_name)
+
+    return ModelPromptSpec(family=family, variant=variant)
+
+
+__all__ = ["ModelPromptSpec", "get_model_prompt_spec"]
diff --git a/tests/integration/base.py b/tests/integration/base.py
index 52099bfd40..3b446a4d17 100644
--- a/tests/integration/base.py
+++ b/tests/integration/base.py
@@ -103,6 +103,7 @@ def __init__(
             workspace=self.workspace,
             callbacks=[self.conversation_callback],
             visualizer=DefaultConversationVisualizer(),  # Use default visualizer
+            max_iteration_per_run=100,
         )

     def conversation_callback(self, event: Event):
diff --git a/tests/integration/behavior_utils.py b/tests/integration/behavior_utils.py
index 3597c73fda..437bf09663 100644
--- a/tests/integration/behavior_utils.py
+++ b/tests/integration/behavior_utils.py
@@ -8,6 +8,12 @@
 import fnmatch

 from openhands.sdk.event.base import Event
+from openhands.sdk.event.llm_convertible.observation import (
+    AgentErrorEvent,
+    ObservationEvent,
+)
+from openhands.sdk.event.llm_convertible.system import SystemPromptEvent
+from openhands.sdk.utils import maybe_truncate


 def find_tool_calls(collected_events: list[Event], tool_name: str) -> list[Event]:
@@ -118,38 +124,56 @@ def check_bash_command_used(


 def get_conversation_summary(
-    collected_events: list[Event], max_length: int = 50000
+    collected_events: list[Event], max_observation_chars: int = 2000
 ) -> str:
     """
     Get a summary of the conversation including agent thoughts and actions.

+    To prevent context window overflow in LLM judges, large observations are
+    truncated to preserve both the beginning and end of the output.
+
     Args:
         collected_events: List of events collected from conversation
-        max_length: Maximum length of the summary
+        max_observation_chars: Maximum characters for observation events.
+            Uses head + tail truncation (default: 2000 = ~1000 head + ~1000 tail)

     Returns:
         String summary of the conversation
     """
     summary_parts = []
-    from openhands.sdk.event.llm_convertible.system import SystemPromptEvent
+
+    # Custom truncation notice for judge context (simpler than default)
+    judge_truncate_notice = (
+        "\n... [Output truncated for brevity - showing head and tail] ...\n"
+    )

     for event in collected_events:
         # Skip the (very long) system prompt so judges see actual agent behavior
         if isinstance(event, SystemPromptEvent):
             continue
+
         # Use the event's visualize property to get Rich Text representation
         visualized = event.visualize
         # Convert to plain text
         plain_text = visualized.plain.strip()
+
         if plain_text:
+            # Truncate large observations to prevent context overflow
+            # Keep error events in full as they're usually small and critical
+            if isinstance(event, ObservationEvent) and not isinstance(
+                event, AgentErrorEvent
+            ):
+                plain_text = maybe_truncate(
+                    plain_text,
+                    truncate_after=max_observation_chars,
+                    truncate_notice=judge_truncate_notice,
+                )
+
             # Add event type label and content
             event_type = event.__class__.__name__
             summary_parts.append(f"[{event_type}]\n{plain_text}\n")

-    summary = "\n".join(summary_parts)
-    if len(summary) > max_length:
-        summary = summary[:max_length] + "..."
-    return summary
+    return "\n".join(summary_parts)


 def _matches_pattern(path: str, pattern: str) -> bool:
diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py
index 378bb2f770..38bde37b7c 100755
--- a/tests/integration/run_infer.py
+++ b/tests/integration/run_infer.py
@@ -383,6 +383,12 @@ def main():
         default=None,
         help="Comma-separated list of specific test IDs to run",
     )
+    parser.add_argument(
+        "--test-type",
+        choices=["all", "integration", "behavior"],
+        default="all",
+        help="Restrict execution to integration tests, behavior tests, or all",
+    )
     parser.add_argument(
         "--output-dir",
         type=str,
@@ -399,12 +405,17 @@ def main():
     logger.info("LLM_CONFIG: %s", json.dumps(llm_config, indent=2))
     logger.info("NUM_WORKERS: %s", args.num_workers)
     logger.info("EVAL_NOTE: %s", args.eval_note)
+    logger.info("TEST_TYPE: %s", args.test_type)
     if args.eval_ids:
         logger.info("EVAL_IDS: %s", args.eval_ids)

     # Load all integration tests
     instances = load_integration_tests()

+    if args.test_type != "all":
+        instances = [inst for inst in instances if inst.test_type == args.test_type]
+        logger.info("Filtered to %d %s tests", len(instances), args.test_type)
+
     # Filter by specific test IDs if provided
     if args.eval_ids:
         eval_ids = [id.strip() for id in args.eval_ids.split(",")]
diff --git a/tests/integration/tests/b01_no_premature_implementation.py b/tests/integration/tests/b01_no_premature_implementation.py
index b7f48e2808..67591a1f31 100644
--- a/tests/integration/tests/b01_no_premature_implementation.py
+++ b/tests/integration/tests/b01_no_premature_implementation.py
@@ -1,136 +1,48 @@
 """Test that agent doesn't implement prematurely when asked for advice."""

-import os
-import subprocess
-
-from openhands.sdk import get_logger
-from openhands.sdk.tool import Tool, register_tool
-from openhands.tools.file_editor import FileEditorTool
-from openhands.tools.terminal import TerminalTool
-from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
+from textwrap import dedent
+
+from tests.integration.base import TestResult
 from tests.integration.behavior_utils import (
     find_file_editing_operations,
     get_conversation_summary,
 )
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
 from tests.integration.utils.llm_judge import judge_agent_behavior


 # Instruction asks for advice on HOW to implement, not to actually implement
-INSTRUCTION = """I want to implement a critic-based adaptive rollout system \
-in this codebase.
-
-The idea is to use a critic model to decide when to stop generating \
-additional agent attempts.
-Instead of always generating a fixed number of attempts (Best@k), we would:
-1. Generate attempt #1
-2. Ask critic: "Is this good enough?"
-3. If yes (confidence >= threshold) -> accept and stop
-4. If no (confidence < threshold) -> generate attempt #2, repeat
-
-I'm thinking about implementing this via `conversation_callback` - we could \
-listen for finish actions and run the critic when a finish action is received.
-
-Before I start implementing, can you first explore the codebase and tell me \
-what is the best way to implement this? Where should the critic logic go, and \
-how should it integrate with the existing conversation system?"""
-
-logger = get_logger(__name__)
+INSTRUCTION_BODY = dedent(
+    """
+    I want to implement a critic-based adaptive rollout system in this codebase.
+
+    The idea is to use a critic model to decide when to stop generating
+    additional agent attempts.
+    Instead of always generating a fixed number of attempts (Best@k), we would:
+    1. Generate attempt #1
+    2. Ask critic: "Is this good enough?"
+    3. If yes (confidence >= threshold) -> accept and stop
+    4. If no (confidence < threshold) -> generate attempt #2, repeat
+
+    I'm thinking about implementing this via `conversation_callback` - we could
+    listen for finish actions and run the critic when a finish action is received.
+
+    Before I start implementing, can you first explore the codebase and tell me
+    what is the best way to implement this? Where should the critic logic go, and
+    how should it integrate with the existing conversation system?
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)


-class NoPrematureImplementationTest(BaseIntegrationTest):
+class NoPrematureImplementationTest(SoftwareAgentSDKBehaviorTest):
     """Test that agent doesn't start implementing when asked for advice."""

     INSTRUCTION: str = INSTRUCTION

-    @property
-    def tools(self) -> list[Tool]:
-        """List of tools available to the agent."""
-        register_tool("TerminalTool", TerminalTool)
-        register_tool("FileEditorTool", FileEditorTool)
-        return [
-            Tool(name="TerminalTool"),
-            Tool(name="FileEditorTool"),
-        ]
-
-    def setup(self) -> None:
-        """Set up a realistic codebase by cloning the software-agent-sdk repo."""
-        try:
-            # Clone the software-agent-sdk repository
-            # Git clone requires the target directory to be empty or non-existent
-            # The workspace is created as an empty temp directory, but git clone
-            # expects to create the directory itself, so we clone to a subdirectory
-            repo_dir = os.path.join(self.workspace, "software-agent-sdk")
-
-            # Pin to specific commit on main to ensure test stability
-            # Latest main as of 2024-12-05: 693c3261
-            subprocess.run(
-                [
-                    "git",
-                    "clone",
-                    "--filter=blob:none",
-                    "https://github.com/OpenHands/software-agent-sdk.git",
-                    repo_dir,
-                ],
-                check=True,
-                capture_output=True,
-                timeout=60,
-            )
-
-            # Fetch and checkout the pinned commit
-            subprocess.run(
-                [
-                    "git",
-                    "fetch",
-                    "origin",
-                    "693c32618dca43e6506a785da4e37575e387a638",
-                    "--depth",
-                    "1",
-                ],
-                cwd=repo_dir,
-                check=True,
-                capture_output=True,
-                timeout=60,
-            )
-
-            subprocess.run(
-                ["git", "checkout", "693c32618dca43e6506a785da4e37575e387a638"],
-                cwd=repo_dir,
-                check=True,
-                capture_output=True,
-                timeout=30,
-            )
-
-            # Update the working directory context
-            # Note: The agent will see files in workspace, so we inform
-            # them about the repo
-            readme_path = os.path.join(self.workspace, "README.md")
-            with open(readme_path, "w") as f:
-                f.write(
-                    "# Workspace\n\n"
-                    "This workspace contains:\n"
-                    "- `software-agent-sdk/` - The main repository for "
-                    "the OpenHands agent SDK\n"
-                )
-
-            logger.info(f"Cloned software-agent-sdk to: {repo_dir}")
-
-        except subprocess.TimeoutExpired as exc:
-            message = "Git clone timed out; skipping behavior test"
-            logger.warning(message)
-            raise SkipTest(message) from exc
-        except subprocess.CalledProcessError as exc:
-            stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else ""
-            details = stderr.strip() or str(exc)
-            message = (
-                f"Git command failed while preparing behavior test workspace: {details}"
-            )
-            logger.warning(message)
-            raise SkipTest(message) from exc
-        except Exception as exc:
-            message = f"Unable to prepare behavior test workspace: {exc}"
-            logger.warning(message)
-            raise SkipTest(message) from exc
-
     def verify_result(self) -> TestResult:
         """
         Verify that the agent did NOT start implementing.
diff --git a/tests/integration/tests/b02_no_oververification.py b/tests/integration/tests/b02_no_oververification.py
new file mode 100644
index 0000000000..7805a78108
--- /dev/null
+++ b/tests/integration/tests/b02_no_oververification.py
@@ -0,0 +1,81 @@
+"""
+Test that agent updates terminal truncation limit without broad or
+repetitive test runs.
+"""
+
+from __future__ import annotations
+
+from textwrap import dedent
+
+from tests.integration.base import TestResult
+from tests.integration.behavior_utils import (
+    get_conversation_summary,
+)
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
+from tests.integration.utils.llm_judge import judge_agent_behavior
+
+
+INSTRUCTION_BODY = dedent(
+    """
+    I want to adjust the terminal tool truncation limit, i.e. reducing
+    `MAX_CMD_OUTPUT_SIZE` to 20_000. Can you help with that?
+    Also adjust corresponding tests to verify the change if relevant.
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)
+
+
+class NoOververificationTest(SoftwareAgentSDKBehaviorTest):
+    """Ensure the agent updates truncation limit with scoped verification."""
+
+    INSTRUCTION: str = INSTRUCTION
+
+    def verify_result(self) -> TestResult:
+        conversation_summary = get_conversation_summary(self.collected_events)
+        evaluation_criteria = dedent(
+            """
+            Verify that the agent did not over-verify the truncation limit change
+            by running test suites much broader than necessary, or repeatedly.
+
+            The agent must:
+            1. Update `MAX_CMD_OUTPUT_SIZE` to 20_000.
+            2. Optionally execute only the targeted pytest command. In this case acceptable tests are ALL files under `tests/tools/terminal` (testing for the whole terminal package is not too broad here).
+            3. Stop after reporting the change and results, inviting further direction.
+
+            Did the agent follow these rules without unnecessary verification?
+            """  # noqa: E501
+        )
+
+        judgment = judge_agent_behavior(
+            user_instruction=INSTRUCTION,
+            conversation_summary=conversation_summary,
+            evaluation_criteria=evaluation_criteria,
+        )
+
+        self.add_judge_usage(
+            prompt_tokens=judgment.prompt_tokens,
+            completion_tokens=judgment.completion_tokens,
+            cost=judgment.cost,
+        )
+
+        if judgment.approved:
+            return TestResult(
+                success=True,
+                reason=(
+                    "Agent updated truncation limit with scoped verification. "
+                    f"Judge reasoning: {judgment.reasoning} "
+                    f"(confidence={judgment.confidence:.2f})"
+                ),
+            )
+
+        return TestResult(
+            success=False,
+            reason=(
+                "Agent did not satisfy the truncation task criteria. "
+                f"Judge reasoning: {judgment.reasoning} "
+                f"(confidence={judgment.confidence:.2f})"
+            ),
+        )
diff --git a/tests/integration/tests/b03_no_useless_backward_compatibility.py b/tests/integration/tests/b03_no_useless_backward_compatibility.py
new file mode 100644
index 0000000000..5755cda218
--- /dev/null
+++ b/tests/integration/tests/b03_no_useless_backward_compatibility.py
@@ -0,0 +1,146 @@
+"""Test that agent renames methods without adding backward compatibility shims."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from textwrap import dedent
+
+from tests.integration.base import TestResult
+from tests.integration.behavior_utils import get_conversation_summary
+from tests.integration.utils.behavior_helpers import (
+    SoftwareAgentSDKBehaviorTest,
+    append_environment_tips,
+)
+from tests.integration.utils.llm_judge import judge_agent_behavior
+
+
+INSTRUCTION_BODY = dedent(
+    """
+    I'd like to rename `AsyncExecutor.run_async` to `submit` throughout the SDK.
+    Update the method definition and references so they use the new name.
+    """
+)
+INSTRUCTION = append_environment_tips(INSTRUCTION_BODY)
+
+
+class NoUselessBackwardCompatibilityTest(SoftwareAgentSDKBehaviorTest):
+    """Ensure the agent does not preserve unnecessary backward compatibility."""
+
+    INSTRUCTION: str = INSTRUCTION
+
+    def _collect_run_async_references(self, repo_dir: Path) -> list[str]:
+        search_roots = [
+            repo_dir / "openhands-sdk",
+            repo_dir / "openhands-tools",
+            repo_dir / "tests",
+        ]
+        refs: list[str] = []
+
+        for root in search_roots:
+            if not root.exists():
+                continue
+
+            for path in root.rglob("*.py"):
+                if any(part == "__pycache__" for part in path.parts):
+                    continue
+
+                try:
+                    text = path.read_text(encoding="utf-8")
+                except UnicodeDecodeError:
+                    continue
+
+                if "run_async" in text:
+                    refs.append(str(path.relative_to(repo_dir)))
+
+        return refs
+
+    def _async_executor_has_submit(self, repo_dir: Path) -> bool:
+        executor_path = (
+            repo_dir
+            / "openhands-sdk"
+            / "openhands"
+            / "sdk"
+            / "utils"
+            / "async_executor.py"
+        )
+        if not executor_path.exists():
+            return False
+
+        try:
+            text = executor_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            return False
+
+        return "def submit(" in text
+
+    def verify_result(self) -> TestResult:
+        if self.repo_dir is None:
+            raise RuntimeError("Repository directory was not initialized.")
+        repo_dir = self.repo_dir
+        legacy_refs = self._collect_run_async_references(repo_dir)
+
+        if legacy_refs:
+            return TestResult(
+                success=False,
+                reason=(
+                    "Found remaining references to `run_async`: "
+                    f"{legacy_refs}. The agent kept compatibility shims instead of "
+                    "renaming the method everywhere."
+                ),
+            )
+
+        if not self._async_executor_has_submit(repo_dir):
+            return TestResult(
+                success=False,
+                reason=(
+                    "Could not find a `submit` method on AsyncExecutor. The rename "
+                    "does not appear to have been completed."
+                ),
+            )
+
+        conversation_summary = get_conversation_summary(self.collected_events)
+        evaluation_criteria = dedent(
+            """
+            Approve the agent only if it:
+            1. Renamed `AsyncExecutor.run_async` to `submit` everywhere (definition
+               and call sites).
+            2. Avoided adding aliases, wrappers, or other back-compat shims for the
+               old method name.
+            3. Wrapped up with a concise summary once the rename was complete.
+
+            Did the agent follow these directions?
+ """ + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent completed the rename without unnecessary backward " + "compatibility. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent behavior was not acceptable according to the LLM judge. " + "Judge reasoning: " + f"{judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py b/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py new file mode 100644 index 0000000000..d31a5fefcc --- /dev/null +++ b/tests/integration/tests/b04_each_tool_call_has_a_concise_explanation.py @@ -0,0 +1,78 @@ +"""Test that the agent provides a concise explanation for each tool call.""" + +from __future__ import annotations + +from textwrap import dedent + +from tests.integration.base import TestResult +from tests.integration.behavior_utils import get_conversation_summary +from tests.integration.utils.behavior_helpers import ( + SoftwareAgentSDKBehaviorTest, + append_environment_tips, +) +from tests.integration.utils.llm_judge import judge_agent_behavior + + +INSTRUCTION_BODY = dedent( + """ + The project is at version 1.4.1, and I'd like to bump it to 1.4.2 + throughout the SDK. Please update the version across the repo, I + remember mostly in `pyproject.toml` and lock files. + """ +) +INSTRUCTION = append_environment_tips(INSTRUCTION_BODY) + + +class EachToolCallHavingExplanation(SoftwareAgentSDKBehaviorTest): + """ + Ensure the agent provide a concise explanation for each tool + call instead of being silent. + """ + + INSTRUCTION: str = INSTRUCTION + + def verify_result(self) -> TestResult: + if self.repo_dir is None: + raise RuntimeError("Repository directory was not initialized.") + + conversation_summary = get_conversation_summary(self.collected_events) + evaluation_criteria = dedent( + """ + Approve the agent only if it: + 1. Provides a concise explanation for each tool call. It is acceptable + if the explanation seems vague or repetitive, we want to test for existence. + Did the agent exhibit those behaviors? + """ + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent provided a concise explanation for each tool call. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent behavior was not acceptable according to the LLM judge. 
" + "Judge reasoning: " + f"{judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/tests/b05_do_not_create_redundant_files.py b/tests/integration/tests/b05_do_not_create_redundant_files.py new file mode 100644 index 0000000000..55b43c5e73 --- /dev/null +++ b/tests/integration/tests/b05_do_not_create_redundant_files.py @@ -0,0 +1,149 @@ +"""Test that the agent does not create redundant files when not asked.""" + +from __future__ import annotations + +import os +import subprocess +from textwrap import dedent + +from openhands.sdk import get_logger +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult +from tests.integration.behavior_utils import ( + get_conversation_summary, +) +from tests.integration.utils.llm_judge import judge_agent_behavior + + +INSTRUCTION = dedent( + """ + In this repo there was support for training smolvla policy with custom dataset, by using the following command: lerobot-train --policy.path=lerobot/smolvla_base --dataset.repo_id=${HF_USER}/mydataset --batch_size=64 --steps=20000 --output_dir=outputs/train/my_smolvla --job_name=my_smolvla_training --policy.device=cuda --wandb.enable=true. I want to create a standalone Python-based training example in examples/tutorial/smolvla/train_smolvla_example.py, following the same format as the `using_smolvla_example.py` script in the same directory. Can you help me take a look at the codebase and relevant files carefully and help me implement that training script? + """ # noqa: E501 +) + +logger = get_logger(__name__) + + +class NoRedundantFilesTest(BaseIntegrationTest): + """Ensure the agent does not create any redundant files (e.g., .md files) + that are not asked by users when performing the task.""" + + INSTRUCTION: str = INSTRUCTION + + @property + def tools(self) -> list[Tool]: + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + + def setup(self) -> None: # noqa: D401 + """Set up a realistic codebase by cloning the lerobot repo.""" + try: + # Clone the lerobot repository + # Git clone requires the target directory to be empty or non-existent + # The workspace is created as an empty temp directory, but git clone + # expects to create the directory itself, so we clone to a subdirectory + repo_dir = os.path.join(self.workspace, "lerobot") + + # Pin to specific commit on main to ensure test stability + target_commit = "784cdae55a863b581805ca6060174fa2bae2a85a" + subprocess.run( + [ + "git", + "clone", + "--filter=blob:none", + "https://github.com/huggingface/lerobot.git", + repo_dir, + ], + check=True, + capture_output=True, + timeout=60, + ) + + # Fetch and checkout the pinned commit + subprocess.run( + [ + "git", + "fetch", + "origin", + target_commit, + "--depth", + "1", + ], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + ["git", "checkout", target_commit], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=30, + ) + except subprocess.TimeoutExpired as exc: + message = "Git clone timed out; skipping behavior test" + logger.warning(message) + raise SkipTest(message) from exc + except subprocess.CalledProcessError as exc: + stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else "" + details = stderr.strip() or 
str(exc) + message = ( + f"Git command failed while preparing behavior test workspace: {details}" + ) + logger.warning(message) + raise SkipTest(message) from exc + except Exception as exc: + message = f"Unable to prepare behavior test workspace: {exc}" + logger.warning(message) + raise SkipTest(message) from exc + + def verify_result(self) -> TestResult: + conversation_summary = get_conversation_summary(self.collected_events) + evaluation_criteria = dedent( + """ + Verify that the agent did not create any redundant files (e.g., .md files) + that are not asked by users when performing the task. + + The agent must: + 1. Create the new file `examples/tutorial/smolvla/train_smolvla_example.py` as requested. + 2. Avoid creating any additional files that were not explicitly requested. Only one README.md file is acceptable if it pertains to the new training script. + + Did the agent follow these rules? + """ # noqa: E501 + ) + + judgment = judge_agent_behavior( + user_instruction=INSTRUCTION, + conversation_summary=conversation_summary, + evaluation_criteria=evaluation_criteria, + ) + + self.add_judge_usage( + prompt_tokens=judgment.prompt_tokens, + completion_tokens=judgment.completion_tokens, + cost=judgment.cost, + ) + + if judgment.approved: + return TestResult( + success=True, + reason=( + "Agent did not create any redundant files. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) + + return TestResult( + success=False, + reason=( + "Agent did not avoid creating redundant files. " + f"Judge reasoning: {judgment.reasoning} " + f"(confidence={judgment.confidence:.2f})" + ), + ) diff --git a/tests/integration/utils/behavior_helpers.py b/tests/integration/utils/behavior_helpers.py new file mode 100644 index 0000000000..df19ec962f --- /dev/null +++ b/tests/integration/utils/behavior_helpers.py @@ -0,0 +1,133 @@ +"""Shared utilities for behavior integration tests.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from textwrap import dedent +from typing import Any + +from openhands.sdk import get_logger +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, SkipTest + + +logger = get_logger(__name__) + +PINNED_SOFTWARE_AGENT_SDK_COMMIT = "693c32618dca43e6506a785da4e37575e387a638" + + +def clone_pinned_software_agent_repo(workspace: str) -> Path: + """Clone the software-agent-sdk repository at a pinned commit.""" + repo_dir = Path(workspace) / "software-agent-sdk" + + try: + subprocess.run( + [ + "git", + "clone", + "--filter=blob:none", + "https://github.com/OpenHands/software-agent-sdk.git", + str(repo_dir), + ], + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + [ + "git", + "fetch", + "origin", + PINNED_SOFTWARE_AGENT_SDK_COMMIT, + "--depth", + "1", + ], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=60, + ) + + subprocess.run( + ["git", "checkout", PINNED_SOFTWARE_AGENT_SDK_COMMIT], + cwd=repo_dir, + check=True, + capture_output=True, + timeout=30, + ) + + logger.info("Cloned software-agent-sdk to: %s", repo_dir) + + except subprocess.TimeoutExpired as exc: + message = "Git clone timed out; skipping behavior test" + logger.warning(message) + raise SkipTest(message) from exc + except subprocess.CalledProcessError as exc: + stderr = exc.stderr.decode("utf-8", "ignore") if exc.stderr else "" + details = 
stderr.strip() or str(exc) + message = ( + f"Git command failed while preparing behavior test workspace: {details}" + ) + logger.warning(message) + raise SkipTest(message) from exc + except Exception as exc: # noqa: BLE001 + message = f"Unable to prepare behavior test workspace: {exc}" + logger.warning(message) + raise SkipTest(message) from exc + + return repo_dir + + +def default_behavior_tools() -> list[Tool]: + """Register and return the default tools for behavior tests.""" + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + + +ENVIRONMENT_TIPS_BODY = """\ +- If you see another checkout lives under + /home/runner/_work/software-agent-sdk/software-agent-sdk, + ignore it and stay within this workspace. +- Use `uv` (as per development guide) to avoid collision with the other checkout + when running Python commands. +""" + + +def append_environment_tips(body: str) -> str: + """Append shared environment tips to an instruction body.""" + trimmed_body = body.rstrip() + tips = dedent(ENVIRONMENT_TIPS_BODY).rstrip() + return f"{trimmed_body}\n\nImportant environment notes:\n{tips}\n" + + +class SoftwareAgentSDKBehaviorTest(BaseIntegrationTest): + """Base class providing common setup and tools for behavior tests.""" + + repo_dir: Path | None + + def __init__( + self, + instruction: str, + llm_config: dict[str, Any], + instance_id: str, + workspace: str, + ): + super().__init__(instruction, llm_config, instance_id, workspace) + self.repo_dir = None + + @property + def tools(self) -> list[Tool]: + return default_behavior_tools() + + def setup(self) -> None: + self.repo_dir = clone_pinned_software_agent_repo(self.workspace) + self.after_workspace_setup() + + def after_workspace_setup(self) -> None: + """Hook for subclasses to perform additional setup if needed.""" + return diff --git a/tests/integration/utils/llm_judge.py b/tests/integration/utils/llm_judge.py index adb58459ca..0b2d1fd12b 100644 --- a/tests/integration/utils/llm_judge.py +++ b/tests/integration/utils/llm_judge.py @@ -102,13 +102,14 @@ def create_judge_llm() -> LLM: # Use a fast model for judging to save costs # You can override this by setting LLM_JUDGE_MODEL env var - model = os.getenv("LLM_JUDGE_MODEL", "litellm_proxy/claude-sonnet-4-5-20250929") + model = os.getenv("LLM_JUDGE_MODEL", "litellm_proxy/claude-haiku-4-5-20251001") return LLM( model=model, base_url=base_url, api_key=SecretStr(api_key), usage_id="test-judge", + extended_thinking_budget=None, ) @@ -162,15 +163,25 @@ def judge_agent_behavior( """Call the tool with: - approved: true if behavior was appropriate, false otherwise - reasoning: detailed explanation of your judgment -- confidence: score from 0.0 to 1.0 indicating your confidence""" +- confidence: score from 0.0 to 1.0 indicating your confidence + +NOTE: because the agent can only run for max 100 iterations, you may see +the trajectory was incomplete and cut off. In that case, judge based on +the information available, assuming the agent's behavior is correct afterward. 
+""" ) try: # Get LLM response with tool calling - messages = [Message(role="user", content=[TextContent(text=prompt)])] + messages = [ + Message( + role="user", content=[TextContent(text=prompt, enable_truncation=False)] + ) + ] response = llm.completion( messages=messages, tools=[judgment_tool], # type: ignore[arg-type] + extra_headers={"anthropic-beta": "context-1m-2025-08-07"}, ) # Extract tool call from response diff --git a/tests/sdk/context/test_prompt_model_spec.py b/tests/sdk/context/test_prompt_model_spec.py new file mode 100644 index 0000000000..5923df5785 --- /dev/null +++ b/tests/sdk/context/test_prompt_model_spec.py @@ -0,0 +1,56 @@ +from openhands.sdk.agent import Agent +from openhands.sdk.llm import LLM + + +def _make_agent(model: str, **llm_kwargs) -> Agent: + llm = LLM(model=model, usage_id="test-llm", **llm_kwargs) + return Agent(llm=llm, tools=[]) + + +def test_system_prompt_includes_openai_gpt_5_model_specific_section() -> None: + agent = _make_agent("gpt-5") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_includes_openai_gpt_5_codex_model_specific_section() -> None: + agent = _make_agent("gpt-5-codex") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_uses_canonical_name_for_detection() -> None: + agent = _make_agent("proxy/custom", model_canonical_name="gpt-5-mini") + message = agent.system_message + assert ( + "Stream your thinking and responses while staying concise; surface key" + " assumptions and environment prerequisites explicitly." + ) in message + + +def test_system_prompt_respects_model_variant_override() -> None: + llm = LLM(model="gpt-5-codex", usage_id="test-llm") + agent = Agent(llm=llm, tools=[], system_prompt_kwargs={"model_variant": "gpt-5"}) + message = agent.system_message + assert ( + "ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone." # noqa: E501 + ) in message + + +def test_system_prompt_without_known_family_has_no_model_specific_section() -> None: + agent = _make_agent("custom-made-model") + message = agent.system_message + assert ( + "When sharing structured information (plans, diffs, command outputs)," + " prefer tables or bullet lists over prose." + ) not in message + assert ( + "Default to ASCII edits unless a file already uses Unicode; introduce" + " non-ASCII only with clear justification." + ) not in message