Commit bd1e80c

Enable Intel Gaudi 3 benchmarks, runner placeholder
Parent: 4bde5d3

5 files changed (+225, -1 lines)

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 16 additions & 0 deletions
@@ -19,21 +19,25 @@
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     # NB: There is no 2xH100 runner at the moment, so let's use the next one
     # in the list here which is 4xH100
     2: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.2",
+        "linux.hpu.gaudi3.8",
     ],
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
+        "linux.hpu.gaudi3.8",
     ],
     8: [
         "linux.aws.h100.8",
         "linux.rocm.gpu.gfx942.8",
         "linux.dgx.b200.8",
+        "linux.hpu.gaudi3.8",
     ],
 }

@@ -50,6 +54,7 @@
     "linux.rocm.gpu.gfx942.4": "rocm",
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
+    "linux.hpu.gaudi3.8": "hpu",
 }

 # All the different names vLLM uses to refer to their benchmark configs
@@ -78,51 +83,62 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [
         "linux.aws.a100",
+        "linux.hpu.gaudi3.8",
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
+        "linux.hpu.gaudi3.8",
     ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-V3.2-Exp": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
 }
 # Lower case all the model names for consistency
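For context on how these structures fit together: the script keeps a mapping from tensor-parallel size to the runners that can host it, and a mapping from each runner to its device class (the new "hpu" entry above). A minimal Python sketch of how such structures could combine into a benchmark matrix; the names TP_TO_RUNNERS, RUNNERS_MAPPING, and generate_matrix are illustrative stand-ins, not the script's real interface:

# Sketch only: a simplified version of the pairing logic implied above.
TP_TO_RUNNERS = {
    1: ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
    8: ["linux.aws.h100.8", "linux.hpu.gaudi3.8"],
}
RUNNERS_MAPPING = {
    "linux.dgx.b200": "cuda",
    "linux.aws.h100.8": "cuda",
    "linux.hpu.gaudi3.8": "hpu",  # the device class this commit adds
}

def generate_matrix(models_to_tp: dict) -> list:
    """Emit one matrix entry per (model, runner) pair allowed by the TP size."""
    matrix = []
    for model, tp_size in models_to_tp.items():
        for runner in TP_TO_RUNNERS.get(tp_size, []):
            matrix.append({
                "model": model,
                "runner": runner,
                "device": RUNNERS_MAPPING[runner],
                "tensor_parallel_size": tp_size,
            })
    return matrix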

.github/workflows/vllm-benchmark.yml

Lines changed: 11 additions & 1 deletion
@@ -25,7 +25,7 @@ on:
         A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
       required: true
       type: string
-      default: h100,rocm,spr,b200
+      default: h100,rocm,spr,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -104,6 +104,9 @@ jobs:
           elif command -v rocm-smi; then
             DEVICE_NAME=rocm
             rocm-smi
+          elif command -v hl-smi; then
+            DEVICE_NAME=hpu
+            hl-smi
           else
             DEVICE_NAME=cpu
             lscpu
@@ -120,6 +123,8 @@ jobs:
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi
@@ -133,6 +138,9 @@ jobs:
          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt
+           pip install -r /tmp/requirements_no_torch.txt
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
@@ -155,6 +163,8 @@ jobs:
          DOCKER_IMAGE_SUFFIX=""
          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+         elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+           DOCKER_IMAGE_SUFFIX=-hpu
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DOCKER_IMAGE_SUFFIX=-cpu
          fi
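The detection chain above drives everything downstream (which requirements get installed, which Docker image suffix is used) from whichever device tool is found on PATH. The same probe order rendered in Python, as an illustrative sketch rather than the workflow's actual code:

import shutil
import subprocess

def detect_device_name() -> str:
    # Same probe order as the workflow step: CUDA, then ROCm, then HPU,
    # falling back to CPU when no accelerator tool is on PATH.
    for tool, name in (("nvidia-smi", "cuda"), ("rocm-smi", "rocm"), ("hl-smi", "hpu")):
        if shutil.which(tool):
            subprocess.run([tool], check=False)  # print the device inventory, as the workflow does
            return name
    return "cpu"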
New file (Gaudi 3 latency benchmark configs; filename not shown)

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
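A note on the parameter encoding in these entries: keys mix underscores and dashes, and an empty-string value (for example "async-scheduling": "") marks a value-less flag. A hedged sketch of how a harness could turn one "parameters" object into CLI arguments; the converter name params_to_args is an assumption, not part of this commit:

def params_to_args(parameters: dict) -> list:
    args = []
    for key, value in parameters.items():
        args.append("--" + key.replace("_", "-"))  # normalize to dashed flags
        if value != "":  # empty string means a boolean flag with no value
            args.append(str(value))
    return args

# e.g. params_to_args({"tensor_parallel_size": 1, "async-scheduling": ""})
# -> ["--tensor-parallel-size", "1", "--async-scheduling"]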
New file (Gaudi 3 serving benchmark configs; filename not shown)

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
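Each serving entry splits into three parts: env vars exported before the server starts, server_parameters for launching the server, and client_parameters replayed once per qps_list value ("inf" meaning an unthrottled client). A sketch of that flow, reusing the params_to_args converter above; the vllm serve and vllm bench serve commands are assumptions about the harness, not something this commit pins down:

import os
import subprocess

def run_serving_test(test: dict) -> None:
    # Overlay the server env vars on the current environment.
    env = {**os.environ,
           **{k: str(v) for k, v in test["server_environment_variables"].items()}}
    server_params = dict(test["server_parameters"])
    model = server_params.pop("model")
    server = subprocess.Popen(
        ["vllm", "serve", model] + params_to_args(server_params), env=env
    )
    # (readiness polling omitted for brevity)
    try:
        for qps in test["qps_list"]:  # "inf" disables client-side rate limiting
            subprocess.run(
                ["vllm", "bench", "serve", "--request-rate", str(qps)]
                + params_to_args(test["client_parameters"]),
                check=True,
            )
    finally:
        server.terminate()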
New file (Gaudi 3 throughput benchmark configs; filename not shown)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
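All three new suites pin the same Gaudi runtime knobs (PT_HPU_LAZY_MODE, VLLM_CONTIGUOUS_PA, VLLM_DEFRAG, plus PT_HPU_ENABLE_LAZY_COLLECTIVES for the multi-card tp2/tp4 tests). A minimal sketch of applying a test's environment block before launching the benchmark command; the helper name apply_test_env is hypothetical:

import os

def apply_test_env(test: dict) -> None:
    # Overlay the entry's environment_variables onto the process env,
    # e.g. PT_HPU_LAZY_MODE=1, before the benchmark command runs.
    for key, value in test.get("environment_variables", {}).items():
        os.environ[key] = str(value)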
