From b492cb661828b9dde77c0d57ccba76d7be516cc7 Mon Sep 17 00:00:00 2001
From: Michael Zuo
Date: Tue, 16 Sep 2025 21:19:02 -0700
Subject: [PATCH] Fix: Added vLLM CLI command to run benchmark in README.md

---
 inference/trillium/vLLM/Qwen2.5-32B/README.md | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/inference/trillium/vLLM/Qwen2.5-32B/README.md b/inference/trillium/vLLM/Qwen2.5-32B/README.md
index ccbd188..272fbb6 100644
--- a/inference/trillium/vLLM/Qwen2.5-32B/README.md
+++ b/inference/trillium/vLLM/Qwen2.5-32B/README.md
@@ -142,6 +142,27 @@ python benchmarks/benchmark_serving.py \
 #     --random-prefix-len=$PREFIX_LEN
 ```
 
+In newer vLLM Docker images, `benchmark_serving.py` has been moved into the vLLM CLI. Run the benchmark with:
+
+```bash
+export MAX_INPUT_LEN=1800
+export MAX_OUTPUT_LEN=128
+export HF_TOKEN=
+
+cd /workspace/vllm
+
+vllm bench serve \
+    --backend vllm \
+    --model "Qwen/Qwen2.5-32B" \
+    --dataset-name random \
+    --num-prompts 1000 \
+    --random-input-len=$MAX_INPUT_LEN \
+    --random-output-len=$MAX_OUTPUT_LEN \
+    --seed 100
+#     --random-range-ratio=$RATIO \
+#     --random-prefix-len=$PREFIX_LEN
+```
+
 The snippet below is what you’d expect to see - the numbers vary based on the vllm version, the model size and the TPU instance type/size.
 
 ```bash