14 changes: 12 additions & 2 deletions vllm/entrypoints/openai/serving_chat.py
@@ -946,7 +946,15 @@ async def chat_completion_stream_generator(
                 assert added_content_delta_arr is not None
                 assert reasoning_end_arr is not None
                 output_token_ids = as_list(output.token_ids)
-                if not reasoning_end_arr[i]:
+
+                # Process reasoning only if tool call tokens are not present
+                # This handles cases where models output tool calls without
+                # reasoning content (e.g., Qwen3-VL with hermes tool parser)
+                if not reasoning_end_arr[i] and (
+                    not tool_parser
+                    or not hasattr(tool_parser, "tool_call_start_token")
+                    or tool_parser.tool_call_start_token not in current_text
+                ):
                     delta_message = (
                         reasoning_parser.extract_reasoning_content_streaming(
                             previous_text,
@@ -992,8 +1000,10 @@ async def chat_completion_stream_generator(
                     else:
                         current_text = ""

-                # handle tool calls only after reasoning is done,
+                # handle tool calls after reasoning is done or when tool call
+                # tokens are detected
                 else:
+                    reasoning_end_arr[i] = True
                     delta_token_ids = output_token_ids
                     # First time to tool call,
                     # add the remaining text and token ids
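Taken together, the two hunks change the streaming path in `chat_completion_stream_generator`: the reasoning branch is skipped as soon as the tool parser's `tool_call_start_token` shows up in the accumulated text, and `reasoning_end_arr[i]` is flipped when the tool-call branch runs, so later chunks route there unconditionally. Below is a minimal, self-contained sketch of that gate; the attribute name `tool_call_start_token` and the branch structure come from the diff, while the `HermesLikeToolParser` stub, the `should_run_reasoning_parser` helper, and the sample chunks are illustrative stand-ins rather than vLLM's actual types.

```python
# Minimal sketch of the gating logic from the diff, outside vLLM.
# `tool_call_start_token` mirrors the attribute checked in the PR;
# everything else here is an illustrative stand-in.


class HermesLikeToolParser:
    """Stand-in for a tool parser that marks tool calls with a token."""

    tool_call_start_token = "<tool_call>"


def should_run_reasoning_parser(
    reasoning_ended: bool,
    tool_parser: object | None,
    current_text: str,
) -> bool:
    """Return True when the chunk should go to the reasoning parser.

    Reasoning parsing is skipped as soon as a tool-call start token
    appears in the accumulated text, so models that emit tool calls
    without reasoning content (e.g., Qwen3-VL with the hermes parser)
    fall through to the tool-call branch instead of being swallowed
    by the reasoning one.
    """
    if reasoning_ended:
        return False
    if tool_parser is None:
        return True
    start = getattr(tool_parser, "tool_call_start_token", None)
    return start is None or start not in current_text


# Streamed chunks: reasoning never ends explicitly, but a tool call starts.
parser = HermesLikeToolParser()
chunks = ["Let me check", " the weather. ", '<tool_call>{"name": "get_weather"}']

reasoning_ended = False
current_text = ""
for chunk in chunks:
    current_text += chunk
    if should_run_reasoning_parser(reasoning_ended, parser, current_text):
        print("reasoning branch:", repr(chunk))
    else:
        # Mirrors the second hunk: mark reasoning done so later chunks
        # take the tool-call path unconditionally.
        reasoning_ended = True
        print("tool-call branch:", repr(chunk))
```

The `getattr` fallback in the sketch plays the same role as the `hasattr` check in the diff: a missing tool parser, or one that never defines a start token, leaves the reasoning path untouched.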