@@ -198,34 +198,83 @@ jobs:
198198
199199 if [[ -z "${HEAD_SHA}" ]]; then
200200 pushd vllm
201- # Looking back the latest 100 commits is enough
202- for i in {0..99}
203- do
204- # Check if the image is there, if it doesn't then check an older one
205- # because the commit is too recent
206- HEAD_SHA=$(git rev-parse --verify HEAD~${i})
207- DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
208-
209- # No Docker image available yet because the commit is too recent
210- if ! docker manifest inspect "${DOCKER_IMAGE}"; then
211- continue
201+
202+ # Special handling for HPU: use a vllm-gaudi compatible vLLM commit
203+ #
204+ # Problem: VLLM_STABLE_COMMIT might change between when CI builds the image
205+ # and when this benchmark runs (every 12 hours), causing image tag mismatches.
206+ #
207+ # Solution: Walk the git history of the VLLM_STABLE_COMMIT file to find the most
208+ # recent compatible vLLM commit that has an actual Docker image built by CI.
209+ if [[ "${DEVICE_NAME}" == "hpu" ]]; then
210+ echo "HPU device detected - finding compatible vLLM commit from vllm-gaudi history"
211+ rm -rf -- /tmp/vllm-gaudi # git clone refuses a non-empty target, e.g. one left behind by an earlier run
212+ # Shallow-clone only the vllm/last-good-commit-for-vllm-gaudi branch
213+ git clone --depth 50 --single-branch --branch vllm/last-good-commit-for-vllm-gaudi \
214+ https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi
215+ pushd /tmp/vllm-gaudi
216+
217+ # Get the last 30 commits - each commit represents a VLLM_STABLE_COMMIT update
218+ # This gives us a history of compatible vLLM versions, newest first
219+ CANDIDATE_COMMITS=$(git log -30 --pretty=format:"%H")
220+ popd
221+
222+ # Try each candidate commit (newest to oldest) until we find an existing image
223+ FOUND_IMAGE=0
224+ for VLLM_GAUDI_COMMIT in ${CANDIDATE_COMMITS}; do
225+ # Read the pinned vLLM commit at this revision; -f turns an HTTP error into empty output (skipped below), || true keeps -e/pipefail from aborting
226+ CANDIDATE_VLLM_COMMIT=$(curl -fsS "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/${VLLM_GAUDI_COMMIT}/VLLM_STABLE_COMMIT" | tr -d '[:space:]') || true
227+
228+ if [[ -z "${CANDIDATE_VLLM_COMMIT}" ]]; then
229+ continue
230+ fi
231+
232+ DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${CANDIDATE_VLLM_COMMIT}${DOCKER_IMAGE_SUFFIX}"
233+ echo "Checking if image exists: ${DOCKER_IMAGE}"
234+
235+ if docker manifest inspect "${DOCKER_IMAGE}" > /dev/null 2>&1; then
236+ echo "Found existing HPU image for vLLM commit: ${CANDIDATE_VLLM_COMMIT}"
237+ HEAD_SHA="${CANDIDATE_VLLM_COMMIT}"
238+ FOUND_IMAGE=1
239+ break
240+ fi
241+ done
242+
243+ if [[ "${FOUND_IMAGE}" == 0 ]]; then
244+ echo "ERROR: No HPU Docker image found in the last 30 versions of VLLM_STABLE_COMMIT" >&2
245+ echo "This likely means ci-infra hasn't successfully built any HPU images yet" >&2
246+ exit 1
212247 fi
213-
214- NOT_EXIST=0
215- S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
216- aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
217-
218- if [[ ${NOT_EXIST} == "1" ]]; then
219- echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
220- break
221- fi
222- done
248+ else
249+ # For non-HPU devices: look back over the latest 100 commits (HEAD~0 .. HEAD~99)
250+ for i in {0..99}
251+ do
252+ # Check if the image is there; if it isn't, check an older commit
253+ # because this one is too recent
254+ HEAD_SHA=$(git rev-parse --verify "HEAD~${i}")
255+ DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
256+
257+ # No Docker image available yet because the commit is too recent
258+ if ! docker manifest inspect "${DOCKER_IMAGE}"; then
259+ continue
260+ fi
261+ # Any head-object failure below (missing key or otherwise) marks the commit as not yet benchmarked
262+ NOT_EXIST=0
263+ S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
264+ aws s3api head-object --bucket ossci-benchmarks --key "${S3_PATH}" || NOT_EXIST=1
265+
266+ if [[ ${NOT_EXIST} == "1" ]]; then
267+ echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
268+ break
269+ fi
270+ done
271+ fi
223272 popd
224273 fi
225274
226275 echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
227276
228- # Print the benchmark commit for rereference
277+ # Print the benchmark commit for reference
229278 echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
230279
231280 - name : Setup CUDA GPU_FLAG for docker run
0 commit comments