2525 A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
2626 required : true
2727 type : string
28- default : h100,rocm,spr,gnr,b200
28+ default : h100,rocm,spr,gnr,b200,gaudi3
2929 pull_request :
3030 paths :
3131 - .github/workflows/vllm-benchmark.yml
@@ -104,6 +104,9 @@ jobs:
104104 elif command -v rocm-smi; then
105105 DEVICE_NAME=rocm
106106 rocm-smi
107+ elif command -v hl-smi; then
108+ DEVICE_NAME=hpu
109+ hl-smi
107110 else
108111 DEVICE_NAME=cpu
109112 lscpu
@@ -120,6 +123,8 @@ jobs:
120123 DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
121124 elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
122125 DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
126+ elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
127+ DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
123128 elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
124129 DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
125130 fi
@@ -133,6 +138,9 @@ jobs:
133138 if [[ "${DEVICE_NAME}" == "rocm" ]]; then
134139 pip install -r .github/scripts/requirements.txt \
135140 --extra-index-url https://download.pytorch.org/whl/rocm6.3
141+ elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
142+ grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt
143+ pip install -r /tmp/requirements_no_torch.txt
136144 else
137145 pip install -r .github/scripts/requirements.txt \
138146 --extra-index-url https://download.pytorch.org/whl/cu128
@@ -155,6 +163,8 @@ jobs:
155163 DOCKER_IMAGE_SUFFIX=""
156164 if [[ "${DEVICE_NAME}" == "rocm" ]]; then
157165 DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
166+ elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
167+ DOCKER_IMAGE_SUFFIX=-hpu
158168 elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
159169 DOCKER_IMAGE_SUFFIX=-cpu
160170 fi
@@ -188,34 +198,83 @@ jobs:
188198
189199 if [[ -z "${HEAD_SHA}" ]]; then
190200 pushd vllm
191- # Looking back the latest 100 commits is enough
192- for i in {0..99}
193- do
194- # Check if the image is there, if it doesn't then check an older one
195- # because the commit is too recent
196- HEAD_SHA=$(git rev-parse --verify HEAD~${i})
197- DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
198-
199- # No Docker image available yet because the commit is too recent
200- if ! docker manifest inspect "${DOCKER_IMAGE}"; then
201- continue
201+
202+ # Special handling for HPU: use vllm-gaudi compatible commit
203+ #
204+ # Problem: VLLM_STABLE_COMMIT might change between when CI builds the image
205+ # and when this benchmark runs (every 12 hours), causing image tag mismatches.
206+ #
207+ # Solution: Query git history of VLLM_STABLE_COMMIT file to find the most recent
208+ # compatible vLLM commit that has an actual Docker image built by CI.
209+ if [[ "${DEVICE_NAME}" == "hpu" ]]; then
210+ echo "HPU device detected - finding compatible vLLM commit from vllm-gaudi history"
211+
212+ # Clone only the last-good-commit-for-vllm-gaudi branch (lightweight, single file)
213+ git clone --depth 50 --single-branch --branch vllm/last-good-commit-for-vllm-gaudi \
214+ https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi
215+ pushd /tmp/vllm-gaudi
216+
217+ # Get the last 30 commits - each commit represents a VLLM_STABLE_COMMIT update
218+ # This gives us a history of compatible vLLM versions
219+ CANDIDATE_COMMITS=$(git log -30 --pretty=format:"%H")
220+ popd
221+
222+ # Try each candidate commit (newest to oldest) until we find an existing image
223+ FOUND_IMAGE=0
224+ for VLLM_GAUDI_COMMIT in ${CANDIDATE_COMMITS}; do
225+ # Get the vLLM commit from this version of the branch
226+ CANDIDATE_VLLM_COMMIT=$(curl -s "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/${VLLM_GAUDI_COMMIT}/VLLM_STABLE_COMMIT" | tr -d '\n')
227+
228+ if [[ -z "${CANDIDATE_VLLM_COMMIT}" ]]; then
229+ continue
230+ fi
231+
232+ DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${CANDIDATE_VLLM_COMMIT}${DOCKER_IMAGE_SUFFIX}"
233+ echo "Checking if image exists: ${DOCKER_IMAGE}"
234+
235+ if docker manifest inspect "${DOCKER_IMAGE}" > /dev/null 2>&1; then
236+ echo "Found existing HPU image for vLLM commit: ${CANDIDATE_VLLM_COMMIT}"
237+ HEAD_SHA="${CANDIDATE_VLLM_COMMIT}"
238+ FOUND_IMAGE=1
239+ break
240+ fi
241+ done
242+
243+ if [[ ${FOUND_IMAGE} == 0 ]]; then
244+ echo "ERROR: No HPU Docker image found in the last 30 versions of VLLM_STABLE_COMMIT"
245+ echo "This likely means ci-infra hasn't successfully built any HPU images yet"
246+ exit 1
202247 fi
203-
204- NOT_EXIST=0
205- S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
206- aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
207-
208- if [[ ${NOT_EXIST} == "1" ]]; then
209- echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
210- break
211- fi
212- done
248+ else
249+ # For non-HPU devices: Looking back the latest 100 commits
250+ for i in {0..99}
251+ do
252+ # Check if the image is there; if it isn't, check an older one
253+ # because the commit is too recent
254+ HEAD_SHA=$(git rev-parse --verify HEAD~${i})
255+ DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
256+
257+ # No Docker image available yet because the commit is too recent
258+ if ! docker manifest inspect "${DOCKER_IMAGE}"; then
259+ continue
260+ fi
261+
262+ NOT_EXIST=0
263+ S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
264+ aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
265+
266+ if [[ ${NOT_EXIST} == "1" ]]; then
267+ echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
268+ break
269+ fi
270+ done
271+ fi
213272 popd
214273 fi
215274
216275 echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
217276
218- # Print the benchmark commit for rereference
277+ # Print the benchmark commit for reference
219278 echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
220279
221280 - name : Setup CUDA GPU_FLAG for docker run
0 commit comments