Skip to content

Commit dde4fcb

Browse files
Fix: tool call streaming when both reasoning and tool parsers are enabled
- Add early detection of tool call tokens in streaming mode
- Skip reasoning phase when tool calls are present
- Fixes issue #28297, where the hermes tool parser fails in streaming mode when used together with the qwen3 reasoning parser
1 parent 67a2da8 commit dde4fcb

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

vllm/entrypoints/openai/serving_chat.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -946,6 +946,24 @@ async def chat_completion_stream_generator(
946946
assert added_content_delta_arr is not None
947947
assert reasoning_end_arr is not None
948948
output_token_ids = as_list(output.token_ids)
949+
950+
# Check if tool call tokens are present in the output
951+
# If so, skip reasoning and go directly to tool parsing
952+
# This handles cases where models output tool calls without
953+
# reasoning content (e.g., Qwen3-VL with hermes tool parser)
954+
if (
955+
not reasoning_end_arr[i]
956+
and tool_parser
957+
and hasattr(tool_parser, "tool_call_start_token")
958+
and tool_parser.tool_call_start_token in current_text
959+
):
960+
reasoning_end_arr[i] = True
961+
# Prepare for tool parsing by resetting state
962+
if not added_content_delta_arr[i]:
963+
added_content_delta_arr[i] = True
964+
previous_text = ""
965+
previous_token_ids = []
966+
949967
if not reasoning_end_arr[i]:
950968
delta_message = (
951969
reasoning_parser.extract_reasoning_content_streaming(

0 commit comments

Comments (0)