[Bug] InternVL3.5 4B is significantly slower than Qwen3-4B when served with vllm #1187

@mertunsall

Description

Checklist

  • 1. I have searched related issues but cannot get the expected help.
  • 2. The bug has not been fixed in the latest version.
  • 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.

Describe the bug

I serve Qwen/Qwen3-4B and OpenGVLab/InternVL3_5-4B with vllm in the same way and query each with the same script, varying the number of input and output tokens.

Results with Qwen3-4B

[Image: benchmark results for Qwen3-4B]

Results with InternVL3_5-4B

[Image: benchmark results for InternVL3_5-4B]

Reproduction

Serving Qwen with

vllm serve Qwen/Qwen3-4B --uvicorn-log-level=info --host 0.0.0.0 --port 8000 --max-model-len 32768 --trust-remote-code --no-enforce-eager --tensor-parallel-size 1

Serving InternVL with

vllm serve OpenGVLab/InternVL3_5-4B --uvicorn-log-level=info --host 0.0.0.0 --port 8000 --max-model-len 32768 --trust-remote-code --limit-mm-per-prompt.image 1 --no-enforce-eager --tensor-parallel-size 1
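
Before running the query script, a quick pre-flight check can confirm that the server is up and which model it is serving. This snippet is not part of the original report; it is a minimal sketch that only assumes the host/port used above and vLLM's OpenAI-compatible /v1/models endpoint.

#!/usr/bin/env python3
"""Hypothetical pre-flight check: list the models the vLLM server is serving.

A minimal sketch assuming the host/port from the serve commands above and
the OpenAI-compatible /v1/models endpoint.
"""

import requests

def check_server(base_url: str = "http://localhost:8000") -> None:
    # Ask the server which models it is currently serving.
    resp = requests.get(f"{base_url}/v1/models", timeout=10)
    resp.raise_for_status()
    for model in resp.json().get("data", []):
        print("Serving:", model["id"])

if __name__ == "__main__":
    check_server()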

Query script:

#!/usr/bin/env python3
"""
Script to query VLLM OpenAI-compatible server and measure response time.
"""

import time
import requests
import json
from typing import Dict, Any
import random

def query_vllm_server(
    prompt: str = "Explain quantum computing in simple terms.",
    model: str = "Qwen/Qwen3-4B",
    base_url: str = "http://localhost:8000",
    max_tokens: int = 100,
    temperature: float = 0.7
) -> Dict[str, Any]:
    """
    Query the VLLM server and return response with timing info.
    
    Args:
        prompt: The input prompt
        model: Model name
        base_url: Server base URL
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        
    Returns:
        Dictionary with response text, timing info, and metadata
    """
    
    # Prepare the request payload
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False
    }
    
    headers = {
        "Content-Type": "application/json"
    }
    
    url = f"{base_url}/v1/chat/completions"
    
    print(f"🚀 Querying VLLM server at {url}")
    #print(f"📝 Prompt: {prompt}")
    #print(f"🤖 Model: {model}")
    print("-" * 60)
    
    # Time the request
    start_time = time.time()
    
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        end_time = time.time()
        
        response.raise_for_status()
        
        # Parse response
        response_data = response.json()
        
        # Extract the generated text
        generated_text = response_data["choices"][0]["message"]["content"]
        
        # Calculate timing metrics
        total_time = end_time - start_time
        
        # Get token usage if available
        usage = response_data.get("usage", {})
        prompt_tokens = usage.get("prompt_tokens", 0)
        completion_tokens = usage.get("completion_tokens", 0)
        total_tokens = usage.get("total_tokens", 0)
        
        # Calculate tokens per second
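        # (total_time covers the whole request, including prompt/prefill
        #  processing, so this is end-to-end throughput, not pure decode speed)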
        tokens_per_second = completion_tokens / total_time if total_time > 0 else 0
        
        result = {
            "success": True,
            "generated_text": generated_text,
            "timing": {
                "total_time_seconds": round(total_time, 3),
                "tokens_per_second": round(tokens_per_second, 2)
            },
            "token_usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens
            },
            "model": model
        }
        
        return result
        
    except requests.exceptions.RequestException as e:
        end_time = time.time()
        total_time = end_time - start_time
        
        return {
            "success": False,
            "error": str(e),
            "timing": {
                "total_time_seconds": round(total_time, 3)
            }
        }

def main():
    """Main function to run benchmark with varying input/output sizes."""
    
    print("=" * 60)
    print("🔥 VLLM Server Benchmark")
    print("=" * 60)
    
    # Results table
    results = []
    
    # Base prompt (repeat to get different character counts)
    base_prompt = "Write a detailed analysis of artificial intelligence and its impact on society, economy, technology, and human relationships. Discuss the benefits, challenges, and future implications."
    
    # Test parameters
    char_counts = list(range(4000, 12000, 4000))  # 4000 and 8000 characters
    max_tokens_list = [1] + list(range(10, 40, 10))  # 1, then 10, 20, 30 output tokens
    
    test_num = 0
    total_tests = len(char_counts) * len(max_tokens_list)
    
    for idx1, char_count in enumerate(char_counts):
        for idx2, max_tokens in enumerate(max_tokens_list):
            test_num += 1
            
            # Create prompt with target character count and unique prefix for cache busting
            unique_prefix = f"[TEST_{test_num}_{idx1}_{idx2}_{random.randint(1, 1000000)}] "
            repeat_count = max(1, (char_count - len(unique_prefix)) // len(base_prompt))
            test_prompt = unique_prefix + base_prompt * repeat_count
            
            print(f"[{test_num}/{total_tests}] Chars: {len(test_prompt)}, Max tokens: {max_tokens}")
            
            result = query_vllm_server(
                prompt=test_prompt,
                max_tokens=max_tokens,
                temperature=0.7
            )
            
            if result["success"]:
                results.append({
                    'test_id': test_num,
                    'prompt_chars': len(test_prompt),
                    'input_tokens': result['token_usage']['prompt_tokens'],
                    'output_tokens': result['token_usage']['completion_tokens'],
                    'max_tokens': max_tokens,
                    'latency_seconds': result['timing']['total_time_seconds'],
                    'tokens_per_second': result['timing']['tokens_per_second']
                })
                print(f"  ✅ {result['token_usage']['prompt_tokens']} → {result['token_usage']['completion_tokens']} tokens, {result['timing']['total_time_seconds']:.3f}s")
            else:
                results.append({
                    'test_id': test_num,
                    'prompt_chars': len(test_prompt),
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'max_tokens': max_tokens,
                    'latency_seconds': result['timing']['total_time_seconds'],
                    'tokens_per_second': 0,
                    'error': result.get('error', 'Unknown')
                })
                print(f"  ❌ Error: {result.get('error', 'Unknown')}")
            
            time.sleep(0.1)  # Brief pause
    
    # Print results table
    print("\n" + "=" * 80)
    print("📊 BENCHMARK RESULTS")
    print("=" * 80)
    print(f"{'Test':<4} {'Chars':<6} {'InTok':<6} {'OutTok':<7} {'MaxTok':<7} {'Latency':<8} {'Tok/s':<8}")
    print("-" * 80)
    
    for r in results:
        if 'error' not in r:
            print(f"{r['test_id']:<4} {r['prompt_chars']:<6} {r['input_tokens']:<6} {r['output_tokens']:<7} {r['max_tokens']:<7} {r['latency_seconds']:<8.3f} {r['tokens_per_second']:<8.1f}")
        else:
            print(f"{r['test_id']:<4} {r['prompt_chars']:<6} {'ERR':<6} {'ERR':<7} {r['max_tokens']:<7} {r['latency_seconds']:<8.3f} {'ERR':<8}")
    
    # Save to CSV
    import csv
    filename = f"vllm_benchmark_{int(time.time())}.csv"
    with open(filename, 'w', newline='') as f:
        if results:
            # Include the optional 'error' column so failed rows don't break DictWriter
            fieldnames = list(results[0].keys())
            if any('error' in r for r in results) and 'error' not in fieldnames:
                fieldnames.append('error')
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
    
    print(f"\n💾 Results saved to: {filename}")
    successful = [r for r in results if 'error' not in r]
    print(f"✅ Successful: {len(successful)}/{len(results)} tests")

if __name__ == "__main__":
    main()
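
The script above times the full request, so the reported numbers mix prompt processing (prefill) and token generation (decode). To see where the gap between the two models comes from, a streaming variant can time the first token separately. The sketch below is a hypothetical follow-up, not part of the original report; it assumes the same /v1/chat/completions endpoint with stream=True, and the measure_ttft helper and its defaults are illustrative only.

#!/usr/bin/env python3
"""Hypothetical follow-up: measure time-to-first-token (TTFT) via streaming.

A sketch assuming the same OpenAI-compatible /v1/chat/completions endpoint
with stream=True; not part of the original benchmark script.
"""

import json
import time

import requests

def measure_ttft(
    prompt: str,
    model: str = "Qwen/Qwen3-4B",
    base_url: str = "http://localhost:8000",
    max_tokens: int = 32,
) -> dict:
    """Stream a chat completion and time the first content chunk separately."""
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "stream": True,
    }
    start = time.time()
    first_token_time = None
    with requests.post(f"{base_url}/v1/chat/completions",
                       json=payload, stream=True, timeout=120) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            data = line.decode("utf-8")
            if data.startswith("data: "):
                data = data[len("data: "):]
            if data.strip() == "[DONE]":
                break
            chunk = json.loads(data)
            choices = chunk.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta", {})
            # Record the arrival time of the first non-empty content delta.
            if delta.get("content") and first_token_time is None:
                first_token_time = time.time()
    end = time.time()
    return {
        "ttft_seconds": round((first_token_time or end) - start, 3),
        "total_seconds": round(end - start, 3),
    }

if __name__ == "__main__":
    print(measure_ttft("Explain quantum computing in simple terms."))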

Environment

vllm=0.10.2

Error traceback
