
Commit c3c59df

ryanhoangt, enyst, and xingyaoww authored

Support model-family and model-variant system prompts (#1348)

Co-authored-by: Engel Nyst <engel.nyst@gmail.com>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>

1 parent a8c5dd3 commit c3c59df

21 files changed: +942 additions, -135 deletions

.github/workflows/integration-runner.yml

Lines changed: 71 additions & 7 deletions
@@ -13,6 +13,10 @@ on:
         description: Reason for manual trigger
         required: true
         default: ''
+      test_type:
+        description: Select which tests to run (all, integration, behavior)
+        required: false
+        default: all
   schedule:
     - cron: 30 22 * * * # Runs at 10:30pm UTC every day
 
@@ -21,25 +25,42 @@ env:
 
 jobs:
   post-initial-comment:
-    if: github.event_name == 'pull_request_target' && github.event.label.name == 'integration-test'
+    if: >
+      github.event_name == 'pull_request_target' && (
+        github.event.label.name == 'integration-test' ||
+        github.event.label.name == 'behavior-test'
+      )
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
     steps:
-      - name: Comment on PR
+      - name: Comment on PR (integration tests)
+        if: github.event.label.name == 'integration-test'
         uses: KeisukeYamashita/create-comment@v1
         with:
           unique: false
           comment: |
             Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+      - name: Comment on PR (behavior tests)
+        if: github.event.label.name == 'behavior-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly.
 
   run-integration-tests:
-    # Security: Only run when 'integration-test' label is present, via workflow_dispatch, or on schedule
+    # Security: Only run when integration-related labels are present, via workflow_dispatch, or on schedule
     # This prevents automatic execution on fork PRs without maintainer approval
     # Note: uses always() to run even when post-initial-comment is skipped (e.g., for workflow_dispatch)
     if: |
       always() && (
-        github.event.label.name == 'integration-test' ||
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
         github.event_name == 'workflow_dispatch' ||
         github.event_name == 'schedule'
       )
@@ -63,6 +84,7 @@ jobs:
           llm-config:
             model: litellm_proxy/gpt-5.1-codex-max
             temperature: 1.0
+            reasoning_summary: detailed
         - name: Deepseek Chat
           run-suffix: deepseek_run
           llm-config:
@@ -101,7 +123,36 @@ jobs:
           uv sync --dev
           uv pip install pytest
 
-      # Run integration test evaluation
+      # Run integration test evaluation
+      - name: Determine test selection
+        run: |
+          TEST_TYPE_ARGS=""
+          if [ "${{ github.event_name }}" = "pull_request_target" ] && [ "${{ github.event.label.name }}" = "behavior-test" ]; then
+            TEST_TYPE_ARGS="--test-type behavior"
+            echo "behavior-test label detected; running behavior tests only."
+          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            test_type="${{ github.event.inputs.test_type }}"
+            case "$test_type" in
+              behavior)
+                TEST_TYPE_ARGS="--test-type behavior"
+                echo "workflow_dispatch requested behavior tests only."
+                ;;
+              integration)
+                TEST_TYPE_ARGS="--test-type integration"
+                echo "workflow_dispatch requested integration tests only."
+                ;;
+              ""|all)
+                echo "workflow_dispatch requested full integration suite."
+                ;;
+              *)
+                echo "workflow_dispatch provided unknown test_type '$test_type'; defaulting to full suite."
+                ;;
+            esac
+          else
+            echo "Running full integration test suite."
+          fi
+          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"
+
       - name: Run integration test evaluation for ${{ matrix.job-config.name }}
         env:
           LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
@@ -113,10 +164,13 @@ jobs:
           AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
           EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"
 
+          echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS'"
+
           uv run python tests/integration/run_infer.py \
             --llm-config "$LLM_CONFIG" \
             --num-workers $N_PROCESSES \
-            --eval-note "$EVAL_NOTE"
+            --eval-note "$EVAL_NOTE" \
+            $TEST_TYPE_ARGS
 
           # get integration tests JSON results
           RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
@@ -169,7 +223,17 @@ jobs:
 
   consolidate-results:
     needs: run-integration-tests
-    if: always() && (github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule')
+    if: |
+      always() && (
+        (
+          github.event_name == 'pull_request_target' && (
+            github.event.label.name == 'integration-test' ||
+            github.event.label.name == 'behavior-test'
+          )
+        ) ||
+        github.event_name == 'workflow_dispatch' ||
+        github.event_name == 'schedule'
+      )
     runs-on: blacksmith-2vcpu-ubuntu-2404
     permissions:
       contents: read
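The "Determine test selection" step only exports TEST_TYPE_ARGS; the filtering itself happens in tests/integration/run_infer.py, which now accepts a --test-type flag. A hypothetical sketch of how such a flag might be declared (the real parser lives in that script and may differ):

import argparse

# Hypothetical sketch: mirrors the values the workflow can forward
# ("", "--test-type integration", or "--test-type behavior"). The
# actual parser in tests/integration/run_infer.py may differ.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--test-type",
    choices=["all", "integration", "behavior"],
    default="all",
    help="Subset of the test suite to run.",
)

args = parser.parse_args(["--test-type", "behavior"])
print(args.test_type)  # behavior; an empty TEST_TYPE_ARGS leaves the default "all"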

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -209,3 +209,5 @@ openapi.json
 .worktrees/
 agent-sdk.workspace.code-workspace
 
+# Integration test outputs
+tests/integration/outputs/

openhands-sdk/openhands/sdk/agent/base.py

Lines changed: 13 additions & 0 deletions

@@ -12,6 +12,7 @@
 from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser
 from openhands.sdk.context.prompts.prompt import render_template
 from openhands.sdk.llm import LLM
+from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp import create_mcp_tools
 from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool
@@ -164,6 +165,18 @@ def name(self) -> str:
     def system_message(self) -> str:
         """Compute system message on-demand to maintain statelessness."""
         template_kwargs = dict(self.system_prompt_kwargs)
+        template_kwargs.setdefault("model_name", self.llm.model)
+        if (
+            "model_family" not in template_kwargs
+            or "model_variant" not in template_kwargs
+        ):
+            spec = get_model_prompt_spec(
+                self.llm.model, getattr(self.llm, "model_canonical_name", None)
+            )
+            if "model_family" not in template_kwargs and spec.family:
+                template_kwargs["model_family"] = spec.family
+            if "model_variant" not in template_kwargs and spec.variant:
+                template_kwargs["model_variant"] = spec.variant
         system_message = render_template(
             prompt_dir=self.prompt_dir,
             template_name=self.system_prompt_filename,
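A minimal sketch of the merge order implemented above, lifted out of the agent for clarity: caller-supplied system_prompt_kwargs always win, and detection only fills the missing keys. The plain dict below stands in for the agent's system_prompt_kwargs field; the import path matches the new module in this commit.

from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec

# Caller pins the family; the variant is still auto-detected because
# only model_family was overridden.
system_prompt_kwargs = {"model_family": "anthropic_claude"}
model = "litellm_proxy/gpt-5.1-codex-max"

template_kwargs = dict(system_prompt_kwargs)
template_kwargs.setdefault("model_name", model)
if "model_family" not in template_kwargs or "model_variant" not in template_kwargs:
    spec = get_model_prompt_spec(model, None)
    if "model_family" not in template_kwargs and spec.family:
        template_kwargs["model_family"] = spec.family
    if "model_variant" not in template_kwargs and spec.variant:
        template_kwargs["model_variant"] = spec.variant

print(template_kwargs)
# {'model_family': 'anthropic_claude',
#  'model_name': 'litellm_proxy/gpt-5.1-codex-max',
#  'model_variant': 'gpt-5-codex'}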
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+* Try to follow the instructions exactly as given - don't make extra or fewer actions if not asked.
+* Avoid unnecessary defensive programming; do not add redundant fallbacks or default values — fail fast instead of masking misconfigurations.
+* When backward compatibility expectations are unclear, confirm with the user before making changes that could break existing behavior.
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+* Avoid being too proactive. Fulfill the user's request thoroughly: if they ask questions/investigations, answer them; if they ask for implementations, provide them. But do not take extra steps beyond what is requested.
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+* Stream your thinking and responses while staying concise; surface key assumptions and environment prerequisites explicitly.
+* ALWAYS send a brief preamble to the user explaining what you're about to do before each tool call, using 8 - 12 words, with a friendly and curious tone.
+* You have access to external resources and should actively use available tools to try accessing them first, rather than claiming you can’t access something without making an attempt.

openhands-sdk/openhands/sdk/agent/prompts/system_prompt.j2

Lines changed: 17 additions & 0 deletions

@@ -113,3 +113,20 @@ You are OpenHands agent, a helpful AI assistant that can interact with a compute
 - Prefer using `ps aux` to find the exact process ID (PID) first, then kill that specific PID
 - When possible, use more targeted approaches like finding the PID from a pidfile or using application-specific shutdown commands
 </PROCESS_MANAGEMENT>
+
+{%- set _imp -%}
+{%- if model_family -%}
+{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}
+{%- if model_variant -%}
+{%- include "model_specific/" ~ model_family ~ "/" ~ model_variant ~ ".j2" ignore missing -%}
+{%- endif -%}
+{%- endif -%}
+{%- endset -%}
+
+{%- set _imp_trimmed = _imp | trim -%}
+{%- if _imp_trimmed %}
+
+<IMPORTANT>
+{{ _imp_trimmed }}
+</IMPORTANT>
+{%- endif %}
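Both includes use `ignore missing`, so a family or variant without a snippet contributes nothing, and the surrounding `if` drops the <IMPORTANT> block entirely when the buffer trims to empty. A minimal standalone sketch of that behavior, using an in-memory loader and illustrative template contents rather than the SDK's real prompt files:

from jinja2 import DictLoader, Environment

# Illustrative templates only; the SDK's actual prompt files live under
# the agent's prompt_dir.
env = Environment(loader=DictLoader({
    "system.j2": (
        "{%- set _imp -%}"
        "{%- if model_family -%}"
        '{%- include "model_specific/" ~ model_family ~ ".j2" ignore missing -%}'
        "{%- endif -%}"
        "{%- endset -%}"
        "{{ _imp | trim }}"
    ),
    "model_specific/anthropic_claude.j2": "* family-specific guidance",
}))

tmpl = env.get_template("system.j2")
print(tmpl.render(model_family="anthropic_claude"))  # "* family-specific guidance"
print(tmpl.render(model_family="no_such_family"))    # "" (missing include is ignored)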

openhands-sdk/openhands/sdk/llm/message.py

Lines changed: 2 additions & 1 deletion

@@ -169,11 +169,12 @@ class TextContent(BaseContent):
     model_config: ClassVar[ConfigDict] = ConfigDict(
         extra="forbid", populate_by_name=True
     )
+    enable_truncation: bool = True
 
     def to_llm_dict(self) -> list[dict[str, str | dict[str, str]]]:
         """Convert to LLM API format."""
         text = self.text
-        if len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
+        if self.enable_truncation and len(text) > DEFAULT_TEXT_CONTENT_LIMIT:
             logger.warning(
                 f"TextContent text length ({len(text)}) exceeds limit "
                 f"({DEFAULT_TEXT_CONTENT_LIMIT}), truncating"
openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py

Lines changed: 98 additions & 0 deletions

@@ -0,0 +1,98 @@
+"""Utilities for detecting model families and variants.
+
+These helpers allow prompts and other systems to tailor behavior for specific
+LLM providers while keeping naming heuristics centralized.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ModelPromptSpec(BaseModel):
+    """Detected prompt metadata for a given model configuration."""
+
+    model_config = ConfigDict(frozen=True)
+
+    family: str | None = None
+    variant: str | None = None
+
+
+_MODEL_FAMILY_PATTERNS: dict[str, tuple[str, ...]] = {
+    "openai_gpt": (
+        "gpt-",
+        "o1",
+        "o3",
+        "o4",
+    ),
+    "anthropic_claude": ("claude",),
+    "google_gemini": ("gemini",),
+    "meta_llama": ("llama",),
+    "mistral": ("mistral",),
+    "deepseek": ("deepseek",),
+    "alibaba_qwen": ("qwen",),
+}
+
+# Ordered heuristics to pick the most specific variant available for a family.
+_MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = {
+    "openai_gpt": (
+        ("gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex")),
+        ("gpt-5", ("gpt-5", "gpt-5.1")),
+    ),
+}
+
+
+def _normalize(name: str | None) -> str:
+    return (name or "").strip().lower()
+
+
+def _match_family(model_name: str) -> str | None:
+    normalized = _normalize(model_name)
+    if not normalized:
+        return None
+
+    for family, patterns in _MODEL_FAMILY_PATTERNS.items():
+        if any(pattern in normalized for pattern in patterns):
+            return family
+    return None
+
+
+def _match_variant(
+    family: str,
+    model_name: str,
+    canonical_name: str | None = None,
+) -> str | None:
+    patterns = _MODEL_VARIANT_PATTERNS.get(family)
+    if not patterns:
+        return None
+
+    # Choose canonical_name if available, otherwise fall back to model_name
+    candidate = _normalize(canonical_name) or _normalize(model_name)
+    if not candidate:
+        return None
+
+    for variant, substrings in patterns:
+        if any(sub in candidate for sub in substrings):
+            return variant
+
+    return None
+
+
+def get_model_prompt_spec(
+    model_name: str,
+    canonical_name: str | None = None,
+) -> ModelPromptSpec:
+    """Return family and variant prompt metadata for the given identifiers."""
+
+    family = _match_family(model_name)
+    if family is None and canonical_name:
+        family = _match_family(canonical_name)
+
+    variant = None
+    if family is not None:
+        variant = _match_variant(family, model_name, canonical_name)
+
+    return ModelPromptSpec(family=family, variant=variant)
+
+
+__all__ = ["ModelPromptSpec", "get_model_prompt_spec"]
