Merged
Changes from all commits (39 commits)
cc9264d
implement model-family specific system prompts
ryanhoangt Dec 8, 2025
9a50f66
update claude specific prompts
ryanhoangt Dec 8, 2025
d1c7b72
update claude prompt
ryanhoangt Dec 8, 2025
d0601b7
add gemini custom prompt
ryanhoangt Dec 8, 2025
3b4a5f7
add ability to define model variant-specific prompt
ryanhoangt Dec 8, 2025
e5d9335
add gpt-5 custom prompts
ryanhoangt Dec 8, 2025
e29f7b6
remove detection indicator in j2 templates
ryanhoangt Dec 8, 2025
c1e3ae0
Merge branch 'main' into ht/custom-prompts-per-models
ryanhoangt Dec 10, 2025
c074acc
use pydantic
ryanhoangt Dec 10, 2025
6998fd6
add behavior test for oververification
ryanhoangt Dec 10, 2025
4c6e59e
remove all custom prompt files
ryanhoangt Dec 10, 2025
ebe6301
add behavior-test label
ryanhoangt Dec 10, 2025
4eb49b4
allow running only behavior tests via workflow dispatch
ryanhoangt Dec 10, 2025
9d66b7c
add behavior test for useless backward compat
ryanhoangt Dec 10, 2025
18dbd0c
Merge branch 'main' into ht/custom-prompts-per-models
enyst Dec 10, 2025
c6d4fde
add enviroment notes
ryanhoangt Dec 11, 2025
6e3b4e9
refactor tests
ryanhoangt Dec 11, 2025
555ebc4
revert default system prompt & prioritize canonical name
ryanhoangt Dec 11, 2025
018b27a
Merge branch 'main' into ht/custom-prompts-per-models
enyst Dec 11, 2025
e7a865c
remove convo summary truncation for llm judge
ryanhoangt Dec 15, 2025
78ca66c
remove follow up requirement in b03
ryanhoangt Dec 15, 2025
44e5b5a
do not truncate when serializing content in judge llm
ryanhoangt Dec 15, 2025
392b237
add custom prompts for claude and gemini
ryanhoangt Dec 15, 2025
15dba97
add test b04 for gpt-5 and gemini
ryanhoangt Dec 15, 2025
1ba590b
add test b05
ryanhoangt Dec 15, 2025
3d5a622
truncate obs to max 5000 chars
ryanhoangt Dec 16, 2025
f07d811
reduce max iter of intg tests to 100
ryanhoangt Dec 16, 2025
cb3af19
use 1m context window for judge
ryanhoangt Dec 16, 2025
c521e9d
adjust criteria for b02
ryanhoangt Dec 16, 2025
71a6c8f
use haiku for judge
ryanhoangt Dec 16, 2025
dff8805
add custom prompts for gpt-5 and gpt-5-codex
ryanhoangt Dec 16, 2025
6b482e3
fix comments
ryanhoangt Dec 16, 2025
e8f7424
move <MODEL_SPECIFIC> tag to base prompt
ryanhoangt Dec 16, 2025
4c1e8d5
enable reasoning_summary in intg tests
ryanhoangt Dec 16, 2025
73752c5
Merge branch 'main' into ht/custom-prompts-per-models
ryanhoangt Dec 16, 2025
4c5aaa4
add note for cutoff due to max iter for llm judge
ryanhoangt Dec 16, 2025
1494055
Apply suggestion from @xingyaoww
xingyaoww Dec 16, 2025
708d6bc
Apply suggestion from @xingyaoww
xingyaoww Dec 16, 2025
402eae0
trim newlines from important
xingyaoww Dec 16, 2025
78 changes: 71 additions & 7 deletions .github/workflows/integration-runner.yml
@@ -13,6 +13,10 @@ on:
        description: Reason for manual trigger
        required: true
        default: ''
      test_type:
        description: Select which tests to run (all, integration, behavior)
        required: false
        default: all
  schedule:
    - cron: 30 22 * * * # Runs at 10:30pm UTC every day

@@ -21,25 +25,42 @@ env:

jobs:
  post-initial-comment:
    if: github.event_name == 'pull_request_target' && github.event.label.name == 'integration-test'
    if: >
      github.event_name == 'pull_request_target' && (
        github.event.label.name == 'integration-test' ||
        github.event.label.name == 'behavior-test'
      )
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
    steps:
      - name: Comment on PR
      - name: Comment on PR (integration tests)
        if: github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
      - name: Comment on PR (behavior tests)
        if: github.event.label.name == 'behavior-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
  run-integration-tests:
    # Security: Only run when 'integration-test' label is present, via workflow_dispatch, or on schedule
    # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
    # This prevents automatic execution on fork PRs without maintainer approval
    # Note: uses always() to run even when post-initial-comment is skipped (e.g., for workflow_dispatch)
    if: |
      always() && (
        github.event.label.name == 'integration-test' ||
        (
          github.event_name == 'pull_request_target' && (
            github.event.label.name == 'integration-test' ||
            github.event.label.name == 'behavior-test'
          )
        ) ||
        github.event_name == 'workflow_dispatch' ||
        github.event_name == 'schedule'
      )
@@ -63,6 +84,7 @@ jobs:
          llm-config:
            model: litellm_proxy/gpt-5.1-codex-max
            temperature: 1.0
            reasoning_summary: detailed
        - name: Deepseek Chat
          run-suffix: deepseek_run
          llm-config:
@@ -101,7 +123,36 @@
          uv sync --dev
          uv pip install pytest
      # Run integration test evaluation
      - name: Determine test selection
        run: |
          TEST_TYPE_ARGS=""
          if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
            TEST_TYPE_ARGS="--test-type behavior"
            echo "behavior-test label detected; running behavior tests only."
          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            test_type="${{ github.event.inputs.test_type }}"
            case "$test_type" in
              behavior)
                TEST_TYPE_ARGS="--test-type behavior"
                echo "workflow_dispatch requested behavior tests only."
                ;;
              integration)
                TEST_TYPE_ARGS="--test-type integration"
                echo "workflow_dispatch requested integration tests only."
                ;;
              ""|all)
                echo "workflow_dispatch requested full integration suite."
                ;;
              *)
                echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
                ;;
            esac
          else
            echo "Running full integration test suite."
          fi
          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
      - name: Run integration test evaluation for ${{ matrix.job-config.name }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
@@ -113,10 +164,13 @@
          AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
          EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"
          echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS'"
          uv run python tests/integration/run_infer.py \
            --llm-config "$LLM_CONFIG" \
            --num-workers $N_PROCESSES \
            --eval-note "$EVAL_NOTE"
            --eval-note "$EVAL_NOTE" \
            $TEST_TYPE_ARGS
          # get integration tests JSON results
          RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
@@ -169,7 +223,17 @@ jobs:

  consolidate-results:
    needs: run-integration-tests
    if: always() && (github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule')
    if: |
      always() && (
        (
          github.event_name == 'pull_request_target' && (
            github.event.label.name == 'integration-test' ||
            github.event.label.name == 'behavior-test'
          )
        ) ||
        github.event_name == 'workflow_dispatch' ||
        github.event_name == 'schedule'
      )
    runs-on: blacksmith-2vcpu-ubuntu-2404
    permissions:
      contents: read
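Note: with these workflow changes, the behavior suite can be triggered two ways: by adding the `behavior-test` label to a PR, or via manual dispatch, e.g. `gh workflow run integration-runner.yml -f reason="test prompts" -f test_type=behavior` (illustrative command; it assumes the GitHub CLI and the workflow file name shown above).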
2 changes: 2 additions & 0 deletions .gitignore
@@ -209,3 +209,5 @@ openapi.json
.worktrees/
agent-sdk.workspace.code-workspace

# Integration test outputs
tests/integration/outputs/
13 changes: 13 additions & 0 deletions openhands-sdk/openhands/sdk/agent/base.py
@@ -12,6 +12,7 @@
from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser
from openhands.sdk.context.prompts.prompt import render_template
from openhands.sdk.llm import LLM
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec
from openhands.sdk.logger import get_logger
from openhands.sdk.mcp import create_mcp_tools
from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool
@@ -164,6 +165,18 @@ def name(self) -> str:
    def system_message(self) -> str:
        """Compute system message on-demand to maintain statelessness."""
        template_kwargs = dict(self.system_prompt_kwargs)
        template_kwargs.setdefault("model_name", self.llm.model)
        if (
            "model_family" not in template_kwargs
            or "model_variant" not in template_kwargs
        ):
            spec = get_model_prompt_spec(
                self.llm.model, getattr(self.llm, "model_canonical_name", None)
            )
            if "model_family" not in template_kwargs and spec.family:
                template_kwargs["model_family"] = spec.family
            if "model_variant" not in template_kwargs and spec.variant:
                template_kwargs["model_variant"] = spec.variant
        system_message = render_template(
            prompt_dir=self.prompt_dir,
            template_name=self.system_prompt_filename,
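Detection here only fills gaps: anything the caller passes via `system_prompt_kwargs` takes precedence over the detected family and variant. A minimal standalone sketch of that precedence logic (the helper below is illustrative, not part of the SDK):

```python
# Mirrors the setdefault / "not in" checks in Agent.system_message above.
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec


def resolve_prompt_kwargs(model: str, system_prompt_kwargs: dict) -> dict:
    kwargs = dict(system_prompt_kwargs)
    kwargs.setdefault("model_name", model)
    if "model_family" not in kwargs or "model_variant" not in kwargs:
        spec = get_model_prompt_spec(model)
        if "model_family" not in kwargs and spec.family:
            kwargs["model_family"] = spec.family
        if "model_variant" not in kwargs and spec.variant:
            kwargs["model_variant"] = spec.variant
    return kwargs


# An explicitly passed family wins over detection:
print(resolve_prompt_kwargs("claude-sonnet-4", {"model_family": "google_gemini"}))
# -> {'model_family': 'google_gemini', 'model_name': 'claude-sonnet-4'}
```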
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/anthropic_claude.j2, per the include paths in system_prompt.j2 below)
@@ -0,0 +1,3 @@
* Try to follow the instructions exactly as given - don't make extra or fewer actions if not asked.
* Avoid unnecessary defensive programming; do not add redundant fallbacks or default values — fail fast instead of masking misconfigurations.
* When backward compatibility expectations are unclear, confirm with the user before making changes that could break existing behavior.
1 change: 1 addition & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/google_gemini.j2)
@@ -0,0 +1 @@
* Avoid being too proactive. Fulfill the user's request thoroughly: if they ask questions/investigations, answer them; if they ask for implementations, provide them. But do not take extra steps beyond what is requested.
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5-codex.j2)
@@ -0,0 +1,3 @@
* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
3 changes: 3 additions & 0 deletions (path not captured; likely openhands-sdk/openhands/sdk/agent/prompts/model_specific/openai_gpt/gpt-5.j2)
@@ -0,0 +1,3 @@
* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
17 changes: 17 additions & 0 deletions openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2
@@ -106,3 +106,20 @@ You are OpenHands agent, a helpful AI assistant that can interact with a compute
- Prefer using `ps aux` to find the exact process ID (PID) first, then kill that specific PID
- When possible, use more targeted approaches like finding the PID from a pidfile or using application-specific shutdown commands
</PROCESS_MANAGEMENT>

{%- set _imp -%}
{%- if model_family -%}
{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}
{%- if model_variant -%}
{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}
{%- endif -%}
{%- endif -%}
{%- endset -%}

{%- set _imp_trimmed = _imp | trim -%}
{%- if _imp_trimmed %}

<IMPORTANT>
{{ _imp_trimmed }}
</IMPORTANT>
{%- endif %}
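The capture-and-trim pattern above means the `<IMPORTANT>` block disappears entirely when no model-specific include resolves. A minimal sketch of that behavior, assuming jinja2 and using an in-memory loader in place of the real prompts directory:

```python
# Reproduces the _imp / trim logic from system_prompt.j2 with stub templates.
# The google_gemini.j2 body is abbreviated from the new prompt file above.
from jinja2 import DictLoader, Environment

snippet = (
    '{%- set _imp -%}'
    '{%- if model_family -%}'
    '{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}'
    '{%- if model_variant -%}'
    '{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}'
    '{%- endif -%}'
    '{%- endif -%}'
    '{%- endset -%}'
    '{%- set _imp_trimmed = _imp | trim -%}'
    '{%- if _imp_trimmed %}<IMPORTANT>\n{{ _imp_trimmed }}\n</IMPORTANT>{%- endif %}'
)
env = Environment(
    loader=DictLoader(
        {
            "snippet.j2": snippet,
            "model_specific/google_gemini.j2": "* Avoid being too proactive.\n",
        }
    )
)
tmpl = env.get_template("snippet.j2")

print(tmpl.render(model_family="google_gemini"))   # wrapped in <IMPORTANT> tags
print(repr(tmpl.render(model_family="meta_llama")))  # no matching file -> ''
```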
3 changes: 2 additions & 1 deletion openhands-sdk/openhands/sdk/llm/message.py
@@ -169,11 +169,12 @@ class TextContent(BaseContent):
    model_config: ClassVar[ConfigDict] = ConfigDict(
        extra="forbid", populate_by_name=True
    )
    enable_truncation: bool = True

    def to_llm_dict(self) -> list[dict[str, str | dict[str, str]]]:
        """Convert to LLM API format."""
        text = self.text
        if len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
        if self.enable_truncation and len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
            logger.warning(
                f"TextContent text length ({len(text)}) exceeds limit "
                f"({DEFAULT_TEXT_CONTENT_LIMIT}), truncating"
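The new flag lets a caller (here, the LLM-judge path in the integration tests) opt out of truncation so the judge sees the full conversation. A sketch of both modes, with the import path taken from the diff above and the output shape assumed to be the usual `{"type": "text", "text": ...}`:

```python
# Sketch: TextContent serialization with and without truncation.
# Assumes this payload exceeds the module's DEFAULT_TEXT_CONTENT_LIMIT.
from openhands.sdk.llm.message import TextContent

long_text = "x" * 200_000

clipped = TextContent(text=long_text)                        # default: truncated in to_llm_dict()
full = TextContent(text=long_text, enable_truncation=False)  # judge path: full content

print(len(clipped.to_llm_dict()[0]["text"]))  # capped at the limit
print(len(full.to_llm_dict()[0]["text"]))     # 200000
```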
98 changes: 98 additions & 0 deletions openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
@@ -0,0 +1,98 @@
"""Utilities for detecting model families and variants.
These helpers allow prompts and other systems to tailor behavior for specific
LLM providers while keeping naming heuristics centralized.
"""

from __future__ import annotations

from pydantic import BaseModel, ConfigDict


class ModelPromptSpec(BaseModel):
"""Detected prompt metadata for a given model configuration."""

model_config = ConfigDict(frozen=True)

family: str | None = None
variant: str | None = None


_MODEL_FAMILY_PATTERNS: dict[str, tuple[str, ...]] = {
"openai_gpt": (
"gpt-",
"o1",
"o3",
"o4",
),
"anthropic_claude": ("claude",),
"google_gemini": ("gemini",),
"meta_llama": ("llama",),
"mistral": ("mistral",),
"deepseek": ("deepseek",),
"alibaba_qwen": ("qwen",),
}

# Ordered heuristics to pick the most specific variant available for a family.
_MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = {
"openai_gpt": (
("gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex")),
("gpt-5", ("gpt-5", "gpt-5.1")),
),
}


def _normalize(name: str | None) -> str:
return (name or "").strip().lower()


def _match_family(model_name: str) -> str | None:
normalized = _normalize(model_name)
if not normalized:
return None

for family, patterns in _MODEL_FAMILY_PATTERNS.items():
if any(pattern in normalized for pattern in patterns):
return family
return None


def _match_variant(
family: str,
model_name: str,
canonical_name: str | None = None,
) -> str | None:
patterns = _MODEL_VARIANT_PATTERNS.get(family)
if not patterns:
return None

# Choose canonical_name if available, otherwise fall back to model_name
candidate = _normalize(canonical_name) or _normalize(model_name)
if not candidate:
return None

for variant, substrings in patterns:
if any(sub in candidate for sub in substrings):
return variant

return None


def get_model_prompt_spec(
model_name: str,
canonical_name: str | None = None,
) -> ModelPromptSpec:
"""Return family and variant prompt metadata for the given identifiers."""

family = _match_family(model_name)
if family is None and canonical_name:
family = _match_family(canonical_name)

variant = None
if family is not None:
variant = _match_variant(family, model_name, canonical_name)

return ModelPromptSpec(family=family, variant=variant)


__all__ = ["ModelPromptSpec", "get_model_prompt_spec"]
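A few worked examples of these heuristics, traced directly from the patterns above:

```python
from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec

# Provider prefixes are fine: matching is substring-based on the lowercased name.
spec = get_model_prompt_spec("litellm_proxy/gpt-5.1-codex-max")
assert spec.family == "openai_gpt" and spec.variant == "gpt-5-codex"

# The canonical name, when given, is preferred for variant matching
# and used as a fallback for family detection.
spec = get_model_prompt_spec("my-proxy-alias", canonical_name="gpt-5.1")
assert spec.family == "openai_gpt" and spec.variant == "gpt-5"

# Families without variant patterns yield variant=None.
spec = get_model_prompt_spec("claude-sonnet-4-20250514")
assert spec.family == "anthropic_claude" and spec.variant is None
```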
1 change: 1 addition & 0 deletions tests/integration/base.py
@@ -103,6 +103,7 @@ def __init__(
            workspace=self.workspace,
            callbacks=[self.conversation_callback],
            visualizer=DefaultConversationVisualizer(),  # Use default visualizer
            max_iteration_per_run=100,
        )

    def conversation_callback(self, event: Event):