Skip to content

Commit dde4fcb

Browse files
Fix: tool call streaming when both reasoning and tool parsers are enabled
- Add early detection of tool call tokens in streaming mode
- Skip reasoning phase when tool calls are present
- Fixes issue #28297, where the hermes tool parser fails in streaming mode when used together with the qwen3 reasoning parser
1 parent 67a2da8 commit dde4fcb

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

vllm/entrypoints/openai/serving_chat.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -946,6 +946,24 @@ async def chat_completion_stream_generator(
946946
assert added_content_delta_arr is not None
947947
assert reasoning_end_arr is not None
948948
output_token_ids = as_list(output.token_ids)
949+
950+
# Check if tool call tokens are present in the output
951+
# If so, skip reasoning and go directly to tool parsing
952+
# This handles cases where models output tool calls without
953+
# reasoning content (e.g., Qwen3-VL with hermes tool parser)
954+
if (
955+
not reasoning_end_arr[i]
956+
and tool_parser
957+
and hasattr(tool_parser, "tool_call_start_token")
958+
and tool_parser.tool_call_start_token in current_text
959+
):
960+
reasoning_end_arr[i] = True
961+
# Prepare for tool parsing by resetting state
962+
if not added_content_delta_arr[i]:
963+
added_content_delta_arr[i] = True
964+
previous_text = ""
965+
previous_token_ids = []
966+
949967
if not reasoning_end_arr[i]:
950968
delta_message = (
951969
reasoning_parser.extract_reasoning_content_streaming(

0 commit comments

Comments (0)