Commit bd1e80c

Enable Intel Gaudi 3 benchmarks, runner placeholder
Parent: 4bde5d3

5 files changed (+225, -1 lines)

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 16 additions & 0 deletions
@@ -19,21 +19,25 @@
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     # NB: There is no 2xH100 runner at the moment, so let's use the next one
     # in the list here which is 4xH100
     2: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.2",
+        "linux.hpu.gaudi3.8",
     ],
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
+        "linux.hpu.gaudi3.8",
     ],
     8: [
         "linux.aws.h100.8",
         "linux.rocm.gpu.gfx942.8",
         "linux.dgx.b200.8",
+        "linux.hpu.gaudi3.8",
     ],
 }

@@ -50,6 +54,7 @@
     "linux.rocm.gpu.gfx942.4": "rocm",
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
+    "linux.hpu.gaudi3.8": "hpu",
 }

 # All the different names vLLM uses to refer to their benchmark configs
@@ -78,51 +83,62 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [
         "linux.aws.a100",
+        "linux.hpu.gaudi3.8",
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
+        "linux.hpu.gaudi3.8",
     ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-V3.2-Exp": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
 }
 # Lower case all the model names for consistency
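For context on how these structures fit together: the script keeps a mapping from tensor-parallel size to the runners that can host it, and a mapping from each runner to its device class (the new "hpu" entry above). A minimal Python sketch of how such structures could combine into a benchmark matrix; the names TP_TO_RUNNERS, RUNNERS_MAPPING, and generate_matrix are illustrative stand-ins, not the script's real interface:

# Sketch only: a simplified version of the pairing logic implied above.
TP_TO_RUNNERS = {
    1: ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
    8: ["linux.aws.h100.8", "linux.hpu.gaudi3.8"],
}
RUNNERS_MAPPING = {
    "linux.dgx.b200": "cuda",
    "linux.aws.h100.8": "cuda",
    "linux.hpu.gaudi3.8": "hpu",  # the device class this commit adds
}

def generate_matrix(models_to_tp: dict) -> list:
    """Emit one matrix entry per (model, runner) pair allowed by the TP size."""
    matrix = []
    for model, tp_size in models_to_tp.items():
        for runner in TP_TO_RUNNERS.get(tp_size, []):
            matrix.append({
                "model": model,
                "runner": runner,
                "device": RUNNERS_MAPPING[runner],
                "tensor_parallel_size": tp_size,
            })
    return matrix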

.github/workflows/vllm-benchmark.yml

Lines changed: 11 additions & 1 deletion
@@ -25,7 +25,7 @@ on:
         A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
       required: true
       type: string
-      default: h100,rocm,spr,b200
+      default: h100,rocm,spr,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -104,6 +104,9 @@ jobs:
           elif command -v rocm-smi; then
             DEVICE_NAME=rocm
             rocm-smi
+          elif command -v hl-smi; then
+            DEVICE_NAME=hpu
+            hl-smi
           else
             DEVICE_NAME=cpu
             lscpu
@@ -120,6 +123,8 @@ jobs:
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi
@@ -133,6 +138,9 @@ jobs:
          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt
+           pip install -r /tmp/requirements_no_torch.txt
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
@@ -155,6 +163,8 @@ jobs:
          DOCKER_IMAGE_SUFFIX=""
          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           DOCKER_IMAGE_SUFFIX=-hpu
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DOCKER_IMAGE_SUFFIX=-cpu
          fi
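The detection chain above drives everything downstream (which requirements get installed, which Docker image suffix is used) from whichever device tool is found on PATH. The same probe order rendered in Python, as an illustrative sketch rather than the workflow's actual code:

import shutil
import subprocess

def detect_device_name() -> str:
    # Same probe order as the workflow step: CUDA, then ROCm, then HPU,
    # falling back to CPU when no accelerator tool is on PATH.
    for tool, name in (("nvidia-smi", "cuda"), ("rocm-smi", "rocm"), ("hl-smi", "hpu")):
        if shutil.which(tool):
            subprocess.run([tool], check=False)  # print the device inventory, as the workflow does
            return name
    return "cpu"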
New file (Gaudi 3 latency benchmark configs; filename not shown)

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
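A note on the parameter encoding in these entries: keys mix underscores and dashes, and an empty-string value (for example "async-scheduling": "") marks a value-less flag. A hedged sketch of how a harness could turn one "parameters" object into CLI arguments; the converter name params_to_args is an assumption, not part of this commit:

def params_to_args(parameters: dict) -> list:
    args = []
    for key, value in parameters.items():
        args.append("--" + key.replace("_", "-"))  # normalize to dashed flags
        if value != "":  # empty string means a boolean flag with no value
            args.append(str(value))
    return args

# e.g. params_to_args({"tensor_parallel_size": 1, "async-scheduling": ""})
# -> ["--tensor-parallel-size", "1", "--async-scheduling"]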
New file (Gaudi 3 serving benchmark configs; filename not shown)

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
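Each serving entry splits into three parts: env vars exported before the server starts, server_parameters for launching the server, and client_parameters replayed once per qps_list value ("inf" meaning an unthrottled client). A sketch of that flow, reusing the params_to_args converter above; the vllm serve and vllm bench serve commands are assumptions about the harness, not something this commit pins down:

import os
import subprocess

def run_serving_test(test: dict) -> None:
    # Overlay the server env vars on the current environment.
    env = {**os.environ,
           **{k: str(v) for k, v in test["server_environment_variables"].items()}}
    server_params = dict(test["server_parameters"])
    model = server_params.pop("model")
    server = subprocess.Popen(
        ["vllm", "serve", model] + params_to_args(server_params), env=env
    )
    # (readiness polling omitted for brevity)
    try:
        for qps in test["qps_list"]:  # "inf" disables client-side rate limiting
            subprocess.run(
                ["vllm", "bench", "serve", "--request-rate", str(qps)]
                + params_to_args(test["client_parameters"]),
                check=True,
            )
    finally:
        server.terminate()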
New file (Gaudi 3 throughput benchmark configs; filename not shown)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
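All three new suites pin the same Gaudi runtime knobs (PT_HPU_LAZY_MODE, VLLM_CONTIGUOUS_PA, VLLM_DEFRAG, plus PT_HPU_ENABLE_LAZY_COLLECTIVES for the multi-card tp2/tp4 tests). A minimal sketch of applying a test's environment block before launching the benchmark command; the helper name apply_test_env is hypothetical:

import os

def apply_test_env(test: dict) -> None:
    # Overlay the entry's environment_variables onto the process env,
    # e.g. PT_HPU_LAZY_MODE=1, before the benchmark command runs.
    for key, value in test.get("environment_variables", {}).items():
        os.environ[key] = str(value)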
