From 2df2ff401057864cf1c3d8dc8efa9f79c77f7bf6 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Fri, 7 Nov 2025 18:01:07 -0500
Subject: [PATCH 1/2] Fix: tool call streaming when both reasoning and tool
 parsers are enabled

- Add early detection of tool call tokens in streaming mode
- Skip the reasoning phase when tool call tokens are present
- Fixes issue #28297, where the hermes tool parser fails in streaming
  mode when used together with the qwen3 reasoning parser

Signed-off-by: baonudesifeizhai
---
 vllm/entrypoints/openai/serving_chat.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 888aa4eb6fa8..9b7cd9e69743 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -946,7 +946,15 @@ async def chat_completion_stream_generator(
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
                     output_token_ids = as_list(output.token_ids)
-                    if not reasoning_end_arr[i]:
+
+                    # Process reasoning only if tool call tokens are not present
+                    # This handles cases where models output tool calls without
+                    # reasoning content (e.g., Qwen3-VL with hermes tool parser)
+                    if not reasoning_end_arr[i] and (
+                        not tool_parser
+                        or not hasattr(tool_parser, "tool_call_start_token")
+                        or tool_parser.tool_call_start_token not in current_text
+                    ):
                         delta_message = (
                             reasoning_parser.extract_reasoning_content_streaming(
                                 previous_text,
@@ -992,8 +1000,18 @@ async def chat_completion_stream_generator(
                             else:
                                 current_text = ""
 
-                    # handle tool calls only after reasoning is done,
+                    # handle tool calls after reasoning is done or when tool call
+                    # tokens are detected
                     else:
+                        # If entering here due to tool call detection, mark
+                        # reasoning as ended and prepare state for tool parsing
+                        if not reasoning_end_arr[i]:
+                            reasoning_end_arr[i] = True
+                        if not added_content_delta_arr[i]:
+                            added_content_delta_arr[i] = True
+                            previous_text = ""
+                            previous_token_ids = []
+
                         delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids

From fe3814f8bf2661638cf73edea65410e8ce8f2ddd Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Sat, 8 Nov 2025 17:43:10 -0500
Subject: [PATCH 2/2] Simplify early tool-call state handling per reviewer
 advice

---
 vllm/entrypoints/openai/serving_chat.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9b7cd9e69743..3f8abb445cd3 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1003,15 +1003,7 @@ async def chat_completion_stream_generator(
                     # handle tool calls after reasoning is done or when tool call
                     # tokens are detected
                     else:
-                        # If entering here due to tool call detection, mark
-                        # reasoning as ended and prepare state for tool parsing
-                        if not reasoning_end_arr[i]:
-                            reasoning_end_arr[i] = True
-                        if not added_content_delta_arr[i]:
-                            added_content_delta_arr[i] = True
-                            previous_text = ""
-                            previous_token_ids = []
-
+                        reasoning_end_arr[i] = True
                         delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids
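
For anyone reviewing the series without the full file, below is a minimal
standalone sketch of the routing condition these two patches leave in place.
The ToolParser class and the <tool_call> token string are illustrative
stand-ins, not vLLM's real parser classes; only the branch logic mirrors the
patched code in serving_chat.py.

    from __future__ import annotations

    # Standalone sketch (not vLLM code): mirrors the branch condition
    # left in serving_chat.py after PATCH 2/2.


    class ToolParser:
        tool_call_start_token = "<tool_call>"  # hermes-style start marker


    def route_delta(current_text: str, reasoning_ended: bool,
                    tool_parser: ToolParser | None) -> str:
        """Return which parser the accumulated stream text should go to."""
        tool_call_started = (
            tool_parser is not None
            and hasattr(tool_parser, "tool_call_start_token")
            and tool_parser.tool_call_start_token in current_text
        )
        if not reasoning_ended and not tool_call_started:
            # Reasoning branch (the condition added in PATCH 1/2): the real
            # code calls reasoning_parser.extract_reasoning_content_streaming.
            return "reasoning"
        # Tool branch: PATCH 2/2 marks reasoning as ended here and lets the
        # pre-existing "First time to tool call" setup code run unchanged.
        return "tool"


    if __name__ == "__main__":
        tp = ToolParser()
        # Tool call emitted with no reasoning content: routed to "tool".
        print(route_delta('<tool_call>{"name": "f"}', False, tp))
        # Ordinary reasoning text: stays with the reasoning parser.
        print(route_delta("Thinking about the request...", False, tp))

The simplification in PATCH 2/2 leans on the pre-existing "First time to tool
call" block visible in the context lines: once reasoning is marked as ended,
that block presumably resets previous_text and previous_token_ids itself, so
the extra state preparation added in PATCH 1/2 was redundant.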