diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 888aa4eb6fa8..3f8abb445cd3 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -946,7 +946,15 @@ async def chat_completion_stream_generator(
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
                     output_token_ids = as_list(output.token_ids)
-                    if not reasoning_end_arr[i]:
+
+                    # Process reasoning only if tool call tokens are not present
+                    # This handles cases where models output tool calls without
+                    # reasoning content (e.g., Qwen3-VL with hermes tool parser)
+                    if not reasoning_end_arr[i] and (
+                        not tool_parser
+                        or not hasattr(tool_parser, "tool_call_start_token")
+                        or tool_parser.tool_call_start_token not in current_text
+                    ):
                         delta_message = (
                             reasoning_parser.extract_reasoning_content_streaming(
                                 previous_text,
@@ -992,8 +1000,10 @@
                         else:
                             current_text = ""
 
-                    # handle tool calls only after reasoning is done,
+                    # handle tool calls after reasoning is done or when tool call
+                    # tokens are detected
                     else:
+                        reasoning_end_arr[i] = True
                         delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids
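
For review purposes, a minimal sketch of the new guard in isolation, outside the streaming loop. The names tool_parser and current_text mirror the diff; the HermesLikeToolParser stand-in and the "<tool_call>" token value are assumptions for illustration (hermes-style parsers expose a tool_call_start_token attribute, but the concrete value is not taken from this diff):

class HermesLikeToolParser:
    """Stand-in exposing the attribute the guard probes via hasattr()."""
    tool_call_start_token = "<tool_call>"


def should_run_reasoning(reasoning_done, tool_parser, current_text):
    """Mirror of the guard: True when reasoning extraction should proceed."""
    if reasoning_done:
        return False
    # Skip reasoning extraction as soon as the tool-call start token shows
    # up in the accumulated text, even if no reasoning content was ever
    # emitted (the Qwen3-VL case described in the diff comment).
    return (
        not tool_parser
        or not hasattr(tool_parser, "tool_call_start_token")
        or tool_parser.tool_call_start_token not in current_text
    )


parser = HermesLikeToolParser()
assert should_run_reasoning(False, parser, "Let me think step by step...")
assert not should_run_reasoning(False, parser, '<tool_call>{"name": "f"}')
assert should_run_reasoning(False, None, "<tool_call>")  # no tool parser: behavior unchanged

Note the hasattr() check keeps the fallback conservative: parsers that do not define tool_call_start_token, or requests with no tool parser at all, take the original reasoning path, so only hermes-style parsers with an early tool-call token are affected.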