Description
Checklist
- 1. I have searched related issues but cannot get the expected help.
- 2. The bug has not been fixed in the latest version.
- 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
Describe the bug
I serve Qwen/Qwen3-4B and OpenGVLab/InternVL3_5-4B with vLLM and query both models in exactly the same way, varying the number of input and output tokens.
Results with Qwen3-4B:
(see attached results)
Results with InternVL3_5-4B:
(see attached results)
Reproduction
Serving Qwen with
vllm serve Qwen/Qwen3-4B --uvicorn-log-level=info --host 0.0.0.0 --port 8000 --max-model-len 32768 --trust-remote-code --no-enforce-eager --tensor-parallel-size 1
Serving InternVL with
vllm serve OpenGVLab/InternVL3_5-4B --uvicorn-log-level=info --host 0.0.0.0 --port 8000 --max-model-len 32768 --trust-remote-code --limit-mm-per-prompt.image 1 --no-enforce-eager --tensor-parallel-size 1
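As a sanity check (not strictly part of the reproduction), the server can be confirmed reachable before benchmarking. A minimal sketch, assuming the default host/port from the commands above and the standard OpenAI-compatible /v1/models route:

import requests

# Sanity check: list the model(s) currently served by the vLLM
# OpenAI-compatible server started with one of the commands above.
# Assumes the default address http://localhost:8000 used there.
resp = requests.get("http://localhost:8000/v1/models", timeout=10)
resp.raise_for_status()
for entry in resp.json().get("data", []):
    print(entry["id"])  # e.g. "Qwen/Qwen3-4B" or "OpenGVLab/InternVL3_5-4B"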
Query script:
#!/usr/bin/env python3
"""
Script to query VLLM OpenAI-compatible server and measure response time.
"""
import time
import requests
import json
from typing import Dict, Any
import random


def query_vllm_server(
    prompt: str = "Explain quantum computing in simple terms.",
    model: str = "Qwen/Qwen3-4B",
    base_url: str = "http://localhost:8000",
    max_tokens: int = 100,
    temperature: float = 0.7
) -> Dict[str, Any]:
    """
    Query the VLLM server and return response with timing info.

    Args:
        prompt: The input prompt
        model: Model name
        base_url: Server base URL
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature

    Returns:
        Dictionary with response text, timing info, and metadata
    """
    # Prepare the request payload
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False
    }
    headers = {
        "Content-Type": "application/json"
    }
    url = f"{base_url}/v1/chat/completions"
    print(f"🚀 Querying VLLM server at {url}")
    # print(f"📝 Prompt: {prompt}")
    # print(f"🤖 Model: {model}")
    print("-" * 60)

    # Time the request
    start_time = time.time()
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        end_time = time.time()
        response.raise_for_status()

        # Parse response
        response_data = response.json()

        # Extract the generated text
        generated_text = response_data["choices"][0]["message"]["content"]

        # Calculate timing metrics
        total_time = end_time - start_time

        # Get token usage if available
        usage = response_data.get("usage", {})
        prompt_tokens = usage.get("prompt_tokens", 0)
        completion_tokens = usage.get("completion_tokens", 0)
        total_tokens = usage.get("total_tokens", 0)

        # Calculate tokens per second
        tokens_per_second = completion_tokens / total_time if total_time > 0 else 0

        result = {
            "success": True,
            "generated_text": generated_text,
            "timing": {
                "total_time_seconds": round(total_time, 3),
                "tokens_per_second": round(tokens_per_second, 2)
            },
            "token_usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens
            },
            "model": model
        }
        return result
    except requests.exceptions.RequestException as e:
        end_time = time.time()
        total_time = end_time - start_time
        return {
            "success": False,
            "error": str(e),
            "timing": {
                "total_time_seconds": round(total_time, 3)
            }
        }


def main():
    """Main function to run benchmark with varying input/output sizes."""
    print("=" * 60)
    print("🔥 VLLM Server Benchmark")
    print("=" * 60)

    # Results table
    results = []

    # Base prompt (repeated to reach different character counts)
    base_prompt = "Write a detailed analysis of artificial intelligence and its impact on society, economy, technology, and human relationships. Discuss the benefits, challenges, and future implications."

    # Test parameters
    char_counts = list(range(4000, 12000, 4000))  # prompt sizes of 4000 and 8000 characters
    max_tokens_list = [1] + list(range(10, 40, 10))  # 1, 10, 20, 30 output tokens
    test_num = 0
    total_tests = len(char_counts) * len(max_tokens_list)
    for idx1, char_count in enumerate(char_counts):
        for idx2, max_tokens in enumerate(max_tokens_list):
            test_num += 1
            # Create prompt with target character count and unique prefix for cache busting
            unique_prefix = f"[TEST_{test_num}_{idx1}_{idx2}_{random.randint(1, 1000000)}] "
            repeat_count = max(1, (char_count - len(unique_prefix)) // len(base_prompt))
            test_prompt = unique_prefix + base_prompt * repeat_count
            print(f"[{test_num}/{total_tests}] Chars: {len(test_prompt)}, Max tokens: {max_tokens}")
            result = query_vllm_server(
                prompt=test_prompt,
                max_tokens=max_tokens,
                temperature=0.7
            )
            if result["success"]:
                results.append({
                    'test_id': test_num,
                    'prompt_chars': len(test_prompt),
                    'input_tokens': result['token_usage']['prompt_tokens'],
                    'output_tokens': result['token_usage']['completion_tokens'],
                    'max_tokens': max_tokens,
                    'latency_seconds': result['timing']['total_time_seconds'],
                    'tokens_per_second': result['timing']['tokens_per_second']
                })
                print(f"  ✅ {result['token_usage']['prompt_tokens']} → {result['token_usage']['completion_tokens']} tokens, {result['timing']['total_time_seconds']:.3f}s")
            else:
                results.append({
                    'test_id': test_num,
                    'prompt_chars': len(test_prompt),
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'max_tokens': max_tokens,
                    'latency_seconds': result['timing']['total_time_seconds'],
                    'tokens_per_second': 0,
                    'error': result.get('error', 'Unknown')
                })
                print(f"  ❌ Error: {result.get('error', 'Unknown')}")
            time.sleep(0.1)  # Brief pause between requests

    # Print results table
    print("\n" + "=" * 80)
    print("📊 BENCHMARK RESULTS")
    print("=" * 80)
    print(f"{'Test':<4} {'Chars':<6} {'InTok':<6} {'OutTok':<7} {'MaxTok':<7} {'Latency':<8} {'Tok/s':<8}")
    print("-" * 80)
    for r in results:
        if 'error' not in r:
            print(f"{r['test_id']:<4} {r['prompt_chars']:<6} {r['input_tokens']:<6} {r['output_tokens']:<7} {r['max_tokens']:<7} {r['latency_seconds']:<8.3f} {r['tokens_per_second']:<8.1f}")
        else:
            print(f"{r['test_id']:<4} {r['prompt_chars']:<6} {'ERR':<6} {'ERR':<7} {r['max_tokens']:<7} {r['latency_seconds']:<8.3f} {'ERR':<8}")

    # Save to CSV
    import csv
    filename = f"vllm_benchmark_{int(time.time())}.csv"
    with open(filename, 'w', newline='') as f:
        if results:
            writer = csv.DictWriter(f, fieldnames=results[0].keys())
            writer.writeheader()
            writer.writerows(results)
    print(f"\n💾 Results saved to: {filename}")

    successful = [r for r in results if 'error' not in r]
    print(f"✅ Successful: {len(successful)}/{len(results)} tests")


if __name__ == "__main__":
    main()
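To compare the two runs side by side, the CSVs written by the script can be summarized per prompt size. A minimal sketch, not part of the original script (standard library only; the file path argument is a placeholder for the actual vllm_benchmark_*.csv produced by each run):

import csv
import sys
from collections import defaultdict
from statistics import mean

def summarize(path: str) -> None:
    """Group a vllm_benchmark_*.csv by prompt size and print mean latency/throughput."""
    buckets = defaultdict(list)
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            if row.get('error'):
                continue  # skip failed requests
            buckets[int(row['prompt_chars'])].append(row)
    print(f"{'Chars':<8} {'Tests':<6} {'AvgLatency(s)':<14} {'AvgTok/s':<10}")
    for chars in sorted(buckets):
        rows = buckets[chars]
        avg_latency = mean(float(r['latency_seconds']) for r in rows)
        avg_tps = mean(float(r['tokens_per_second']) for r in rows)
        print(f"{chars:<8} {len(rows):<6} {avg_latency:<14.3f} {avg_tps:<10.1f}")

if __name__ == "__main__":
    # Usage: python summarize_benchmark.py vllm_benchmark_<timestamp>.csv
    summarize(sys.argv[1])

Running this on the CSV from the Qwen3-4B run and the one from the InternVL3_5-4B run makes the latency difference per prompt size directly comparable.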
Environment
vllm==0.10.2
Error traceback