From 2df2ff401057864cf1c3d8dc8efa9f79c77f7bf6 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Fri, 7 Nov 2025 18:01:07 -0500
Subject: [PATCH 1/2] Fix: tool call streaming when both reasoning and tool
 parsers are enabled

- Add early detection of tool call tokens in streaming mode
- Skip the reasoning phase when tool call tokens are present
- Fixes issue #28297, where the hermes tool parser fails in streaming
  mode when used together with the qwen3 reasoning parser

Signed-off-by: baonudesifeizhai
---
 vllm/entrypoints/openai/serving_chat.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 888aa4eb6fa8..9b7cd9e69743 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -946,7 +946,15 @@ async def chat_completion_stream_generator(
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
                     output_token_ids = as_list(output.token_ids)
-                    if not reasoning_end_arr[i]:
+
+                    # Process reasoning only if tool call tokens are not present
+                    # This handles cases where models output tool calls without
+                    # reasoning content (e.g., Qwen3-VL with hermes tool parser)
+                    if not reasoning_end_arr[i] and (
+                        not tool_parser
+                        or not hasattr(tool_parser, "tool_call_start_token")
+                        or tool_parser.tool_call_start_token not in current_text
+                    ):
                         delta_message = (
                             reasoning_parser.extract_reasoning_content_streaming(
                                 previous_text,
@@ -992,8 +1000,18 @@ async def chat_completion_stream_generator(
                             else:
                                 current_text = ""
 
-                    # handle tool calls only after reasoning is done,
+                    # handle tool calls after reasoning is done or when tool call
+                    # tokens are detected
                     else:
+                        # If entering here due to tool call detection, mark
+                        # reasoning as ended and prepare state for tool parsing
+                        if not reasoning_end_arr[i]:
+                            reasoning_end_arr[i] = True
+                        if not added_content_delta_arr[i]:
+                            added_content_delta_arr[i] = True
+                            previous_text = ""
+                            previous_token_ids = []
+
                         delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids

From fe3814f8bf2661638cf73edea65410e8ce8f2ddd Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Sat, 8 Nov 2025 17:43:10 -0500
Subject: [PATCH 2/2] Simplify early tool-call state handling per reviewer
 advice

---
 vllm/entrypoints/openai/serving_chat.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9b7cd9e69743..3f8abb445cd3 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1003,15 +1003,7 @@ async def chat_completion_stream_generator(
                     # handle tool calls after reasoning is done or when tool call
                     # tokens are detected
                     else:
-                        # If entering here due to tool call detection, mark
-                        # reasoning as ended and prepare state for tool parsing
-                        if not reasoning_end_arr[i]:
-                            reasoning_end_arr[i] = True
-                        if not added_content_delta_arr[i]:
-                            added_content_delta_arr[i] = True
-                            previous_text = ""
-                            previous_token_ids = []
-
+                        reasoning_end_arr[i] = True
                         delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids
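
For anyone reviewing the series without the full file, below is a minimal
standalone sketch of the routing condition these two patches leave in place.
The ToolParser class and the <tool_call> token string are illustrative
stand-ins, not vLLM's real parser classes; only the branch logic mirrors the
patched code in serving_chat.py.

    from __future__ import annotations

    # Standalone sketch (not vLLM code): mirrors the branch condition
    # left in serving_chat.py after PATCH 2/2.


    class ToolParser:
        tool_call_start_token = "<tool_call>"  # hermes-style start marker


    def route_delta(current_text: str, reasoning_ended: bool,
                    tool_parser: ToolParser | None) -> str:
        """Return which parser the accumulated stream text should go to."""
        tool_call_started = (
            tool_parser is not None
            and hasattr(tool_parser, "tool_call_start_token")
            and tool_parser.tool_call_start_token in current_text
        )
        if not reasoning_ended and not tool_call_started:
            # Reasoning branch (the condition added in PATCH 1/2): the real
            # code calls reasoning_parser.extract_reasoning_content_streaming.
            return "reasoning"
        # Tool branch: PATCH 2/2 marks reasoning as ended here and lets the
        # pre-existing "First time to tool call" setup code run unchanged.
        return "tool"


    if __name__ == "__main__":
        tp = ToolParser()
        # Tool call emitted with no reasoning content: routed to "tool".
        print(route_delta('<tool_call>{"name": "f"}', False, tp))
        # Ordinary reasoning text: stays with the reasoning parser.
        print(route_delta("Thinking about the request...", False, tp))

The simplification in PATCH 2/2 leans on the pre-existing "First time to tool
call" block visible in the context lines: once reasoning is marked as ended,
that block presumably resets previous_text and previous_token_ids itself, so
the extra state preparation added in PATCH 1/2 was redundant.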