Commit 55c19ce

[CI/Build][Intel] Enable benchmarks on Intel Gaudi 3 runner (#94)
* Enable Intel Gaudi 3 benchmarks, runner placeholder
* Add Intel Gaudi3 HPU benchmark support with version compatibility
1 parent e141125 commit 55c19ce

5 files changed: +296 -23 lines changed

.github/scripts/generate_vllm_benchmark_matrix.py
Lines changed: 16 additions & 0 deletions

@@ -20,22 +20,26 @@
         "linux.24xl.spr-metal",
         "linux.24xl.gnr",
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     # NB: There is no 2xH100 runner at the momement, so let's use the next one
     # in the list here which is 4xH100
     2: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.2",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
+        "linux.hpu.gaudi3.8",
     ],
     8: [
         "linux.aws.h100.8",
         "linux.rocm.gpu.gfx942.8",
         "linux.dgx.b200.8",
+        "linux.hpu.gaudi3.8",
     ],
 }

@@ -53,6 +57,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
+    "linux.hpu.gaudi3.8": "hpu",
 }

 # All the different names vLLM uses to refer to their benchmark configs

@@ -82,60 +87,71 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [
         "linux.aws.a100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-V3.2-Exp": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.24xl.gnr",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
 }
 # Lower case all the model names for consistency
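For orientation, here is a minimal sketch of how the two mappings touched above (tensor-parallel size to runner list, runner to device type) can be combined with the per-model runner allow-list when the benchmark matrix is generated. The names RUNNERS_BY_TP, DEVICE_BY_RUNNER, MODEL_TO_RUNNERS and the select_runners helper are illustrative placeholders, not the identifiers actually used in generate_vllm_benchmark_matrix.py.

# Illustrative sketch only: mapping names and helper are hypothetical,
# not the identifiers defined by generate_vllm_benchmark_matrix.py.
RUNNERS_BY_TP = {
    1: ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
    8: ["linux.aws.h100.8", "linux.hpu.gaudi3.8"],
}
DEVICE_BY_RUNNER = {
    "linux.dgx.b200": "cuda",
    "linux.hpu.gaudi3.8": "hpu",
}
MODEL_TO_RUNNERS = {
    "qwen/qwen3-8b": ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
}

def select_runners(model: str, tp_size: int) -> list[dict]:
    """Pick every runner that supports the requested tensor-parallel size
    and is allow-listed for the model, tagging each with its device type."""
    allowed = MODEL_TO_RUNNERS.get(model.lower(), [])
    return [
        {"runner": r, "device": DEVICE_BY_RUNNER.get(r, "cpu")}
        for r in RUNNERS_BY_TP.get(tp_size, [])
        if not allowed or r in allowed
    ]

print(select_runners("Qwen/Qwen3-8B", 8))
# -> [{'runner': 'linux.hpu.gaudi3.8', 'device': 'hpu'}]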

.github/workflows/vllm-benchmark.yml
Lines changed: 82 additions & 23 deletions

@@ -25,7 +25,7 @@ on:
         A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
       required: true
       type: string
-      default: h100,rocm,spr,gnr,b200
+      default: h100,rocm,spr,gnr,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml

@@ -104,6 +104,9 @@ jobs:
           elif command -v rocm-smi; then
             DEVICE_NAME=rocm
             rocm-smi
+          elif command -v hl-smi; then
+            DEVICE_NAME=hpu
+            hl-smi
           else
             DEVICE_NAME=cpu
             lscpu

@@ -120,6 +123,8 @@ jobs:
             DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
           elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi

@@ -133,6 +138,9 @@ jobs:
           if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt
+            pip install -r /tmp/requirements_no_torch.txt
           else
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/cu128

@@ -155,6 +163,8 @@ jobs:
           DOCKER_IMAGE_SUFFIX=""
           if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-hpu
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
           fi

@@ -188,34 +198,83 @@ jobs:

           if [[ -z "${HEAD_SHA}" ]]; then
             pushd vllm
-            # Looking back the latest 100 commits is enough
-            for i in {0..99}
-            do
-              # Check if the image is there, if it doesn't then check an older one
-              # because the commit is too recent
-              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-              # No Docker image available yet because the commit is too recent
-              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-                continue
+
+            # Special handling for HPU: use a vllm-gaudi compatible commit
+            #
+            # Problem: VLLM_STABLE_COMMIT might change between when CI builds the image
+            # and when this benchmark runs (every 12 hours), causing image tag mismatches.
+            #
+            # Solution: Query the git history of the VLLM_STABLE_COMMIT file to find the most
+            # recent compatible vLLM commit that has an actual Docker image built by CI.
+            if [[ "${DEVICE_NAME}" == "hpu" ]]; then
+              echo "HPU device detected - finding compatible vLLM commit from vllm-gaudi history"
+
+              # Clone only the last-good-commit-for-vllm-gaudi branch (lightweight, single file)
+              git clone --depth 50 --single-branch --branch vllm/last-good-commit-for-vllm-gaudi \
+                https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi
+              pushd /tmp/vllm-gaudi
+
+              # Get the last 30 commits - each commit represents a VLLM_STABLE_COMMIT update
+              # This gives us a history of compatible vLLM versions
+              CANDIDATE_COMMITS=$(git log -30 --pretty=format:"%H")
+              popd
+
+              # Try each candidate commit (newest to oldest) until we find an existing image
+              FOUND_IMAGE=0
+              for VLLM_GAUDI_COMMIT in ${CANDIDATE_COMMITS}; do
+                # Get the vLLM commit from this version of the branch
+                CANDIDATE_VLLM_COMMIT=$(curl -s "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/${VLLM_GAUDI_COMMIT}/VLLM_STABLE_COMMIT" | tr -d '\n')
+
+                if [[ -z "${CANDIDATE_VLLM_COMMIT}" ]]; then
+                  continue
+                fi
+
+                DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${CANDIDATE_VLLM_COMMIT}${DOCKER_IMAGE_SUFFIX}"
+                echo "Checking if image exists: ${DOCKER_IMAGE}"
+
+                if docker manifest inspect "${DOCKER_IMAGE}" > /dev/null 2>&1; then
+                  echo "Found existing HPU image for vLLM commit: ${CANDIDATE_VLLM_COMMIT}"
+                  HEAD_SHA="${CANDIDATE_VLLM_COMMIT}"
+                  FOUND_IMAGE=1
+                  break
+                fi
+              done
+
+              if [[ ${FOUND_IMAGE} == 0 ]]; then
+                echo "ERROR: No HPU Docker image found in the last 30 versions of VLLM_STABLE_COMMIT"
+                echo "This likely means ci-infra hasn't successfully built any HPU images yet"
+                exit 1
               fi
-
-              NOT_EXIST=0
-              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
-              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
-
-              if [[ ${NOT_EXIST} == "1" ]]; then
-                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
-                break
-              fi
-            done
+            else
+              # For non-HPU devices: looking back the latest 100 commits
+              for i in {0..99}
+              do
+                # Check if the image is there, if it doesn't then check an older one
+                # because the commit is too recent
+                HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+                DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+                # No Docker image available yet because the commit is too recent
+                if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+                  continue
+                fi
+
+                NOT_EXIST=0
+                S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
+                aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
+
+                if [[ ${NOT_EXIST} == "1" ]]; then
+                  echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
+                  break
+                fi
+              done
+            fi
             popd
           fi

           echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV

-          # Print the benchmark commit for rereference
+          # Print the benchmark commit for reference
           echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"

       - name: Setup CUDA GPU_FLAG for docker run
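The HPU branch above does the commit resolution in shell inside the workflow. As a readability aid only, here is the same idea restated as a small Python sketch; the repository, branch, and file names come from the workflow, while the function name, 30-commit window, and error handling are illustrative assumptions.

# Sketch of the HPU image-resolution strategy above, in Python for clarity.
# The shell in the workflow is authoritative; names here are illustrative.
import subprocess
import tempfile
import urllib.error
import urllib.request

BRANCH = "vllm/last-good-commit-for-vllm-gaudi"
REPO = "https://github.com/vllm-project/vllm-gaudi.git"
RAW = "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/{}/VLLM_STABLE_COMMIT"

def resolve_hpu_commit(image_prefix: str, suffix: str = "-hpu", depth: int = 50) -> str:
    """Walk the VLLM_STABLE_COMMIT history (newest first) and return the first
    vLLM commit whose Docker image already exists in the registry."""
    with tempfile.TemporaryDirectory() as tmp:
        # Clone only the single-file branch that tracks the last good vLLM commit
        subprocess.run(
            ["git", "clone", "--depth", str(depth), "--single-branch",
             "--branch", BRANCH, REPO, tmp],
            check=True,
        )
        log = subprocess.run(
            ["git", "-C", tmp, "log", "-30", "--pretty=format:%H"],
            check=True, capture_output=True, text=True,
        )
    for branch_commit in log.stdout.splitlines():
        try:
            with urllib.request.urlopen(RAW.format(branch_commit)) as resp:
                vllm_commit = resp.read().decode().strip()
        except urllib.error.URLError:
            continue
        if not vllm_commit:
            continue
        image = f"{image_prefix}:{vllm_commit}{suffix}"
        # `docker manifest inspect` exits non-zero when the tag does not exist yet
        if subprocess.run(["docker", "manifest", "inspect", image],
                          capture_output=True).returncode == 0:
            return vllm_commit
    raise RuntimeError("no HPU Docker image found for any recent VLLM_STABLE_COMMIT")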
Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  }
]
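Each entry above is meant to expand into one standalone latency-benchmark run: the environment_variables block is exported for the process, and every key in parameters becomes a command-line flag, with an empty string value marking a boolean switch. A rough sketch of that expansion follows; the benchmarks/benchmark_latency.py entry point and the key-to-flag conversion are assumptions of this sketch, not something the commit defines.

# Hypothetical expansion of one latency test entry into a command line.
# The actual runner script lives in the vLLM benchmark suite, not in this commit.
import json
import shlex

entry = json.loads("""{
  "test_name": "latency_llama8B_tp1",
  "environment_variables": {"PT_HPU_LAZY_MODE": 1, "VLLM_CONTIGUOUS_PA": 1, "VLLM_DEFRAG": 1},
  "parameters": {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "tensor_parallel_size": 1, "load_format": "dummy",
    "num-iters-warmup": 5, "num-iters": 15,
    "max-model-len": 256, "async-scheduling": ""
  }
}""")

# Environment variables are exported for the benchmark process
env = " ".join(f"{k}={v}" for k, v in entry["environment_variables"].items())

# Every parameter key becomes a --flag; empty values become bare switches
args = []
for key, value in entry["parameters"].items():
    flag = "--" + key.replace("_", "-")
    args.append(flag if value == "" else f"{flag} {shlex.quote(str(value))}")

print(f"{env} python benchmarks/benchmark_latency.py " + " ".join(args))
# -> PT_HPU_LAZY_MODE=1 ... --model meta-llama/Meta-Llama-3.1-8B-Instruct
#    --tensor-parallel-size 1 --load-format dummy ... --async-scheduling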
Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
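The serving configs add one more dimension: the server is started once per test from server_parameters and server_environment_variables, and the client is then replayed once for every value in qps_list, with "inf" conventionally meaning an unthrottled request rate. A hedged sketch of that sweep for the first entry; the client script name and the --request-rate flag are assumptions of this sketch, not defined by this commit.

# Hypothetical sweep over qps_list for one serving test entry; the client
# script name and --request-rate flag are assumptions, not part of this commit.
qps_list = [1, 4, 16, "inf"]          # from serving_llama8B_tp1_sharegpt above
client = ("python benchmarks/benchmark_serving.py "
          "--backend vllm --model meta-llama/Meta-Llama-3.1-8B-Instruct "
          "--dataset-name sharegpt --num-prompts 200")

for qps in qps_list:
    # "inf" means: send all prompts as fast as possible (no request throttling)
    rate = "inf" if qps == "inf" else str(qps)
    print(f"{client} --request-rate {rate}")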
