Commit 55c19ce

[CI/Build][Intel] Enable benchmarks on Intel Gaudi 3 runner (#94)
* Enable Intel Gaudi 3 benchmarks, runner placeholder
* Add Intel Gaudi3 HPU benchmark support with version compatibility
1 parent e141125 commit 55c19ce

5 files changed: +296 -23 lines changed

.github/scripts/generate_vllm_benchmark_matrix.py
Lines changed: 16 additions & 0 deletions

@@ -20,22 +20,26 @@
         "linux.24xl.spr-metal",
         "linux.24xl.gnr",
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     # NB: There is no 2xH100 runner at the momement, so let's use the next one
     # in the list here which is 4xH100
     2: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.2",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
+        "linux.hpu.gaudi3.8",
     ],
     8: [
         "linux.aws.h100.8",
         "linux.rocm.gpu.gfx942.8",
         "linux.dgx.b200.8",
+        "linux.hpu.gaudi3.8",
     ],
 }

@@ -53,6 +57,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
+    "linux.hpu.gaudi3.8": "hpu",
 }

 # All the different names vLLM uses to refer to their benchmark configs

@@ -82,60 +87,71 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
+        "linux.hpu.gaudi3.8",
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO: Fail on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [
         "linux.aws.a100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-V3.2-Exp": [
         "linux.aws.a100",
         "linux.aws.h100",
         "linux.24xl.gnr",
+        "linux.hpu.gaudi3.8",
     ],
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.24xl.gnr",
         "linux.aws.h100",
+        "linux.hpu.gaudi3.8",
     ],
 }
 # Lower case all the model names for consistency
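For orientation, here is a minimal sketch of how the two mappings touched above (tensor-parallel size to runner list, runner to device type) can be combined with the per-model runner allow-list when the benchmark matrix is generated. The names RUNNERS_BY_TP, DEVICE_BY_RUNNER, MODEL_TO_RUNNERS and the select_runners helper are illustrative placeholders, not the identifiers actually used in generate_vllm_benchmark_matrix.py.

# Illustrative sketch only: mapping names and helper are hypothetical,
# not the identifiers defined by generate_vllm_benchmark_matrix.py.
RUNNERS_BY_TP = {
    1: ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
    8: ["linux.aws.h100.8", "linux.hpu.gaudi3.8"],
}
DEVICE_BY_RUNNER = {
    "linux.dgx.b200": "cuda",
    "linux.hpu.gaudi3.8": "hpu",
}
MODEL_TO_RUNNERS = {
    "qwen/qwen3-8b": ["linux.dgx.b200", "linux.hpu.gaudi3.8"],
}

def select_runners(model: str, tp_size: int) -> list[dict]:
    """Pick every runner that supports the requested tensor-parallel size
    and is allow-listed for the model, tagging each with its device type."""
    allowed = MODEL_TO_RUNNERS.get(model.lower(), [])
    return [
        {"runner": r, "device": DEVICE_BY_RUNNER.get(r, "cpu")}
        for r in RUNNERS_BY_TP.get(tp_size, [])
        if not allowed or r in allowed
    ]

print(select_runners("Qwen/Qwen3-8B", 8))
# -> [{'runner': 'linux.hpu.gaudi3.8', 'device': 'hpu'}]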

.github/workflows/vllm-benchmark.yml
Lines changed: 82 additions & 23 deletions

@@ -25,7 +25,7 @@ on:
         A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
       required: true
       type: string
-      default: h100,rocm,spr,gnr,b200
+      default: h100,rocm,spr,gnr,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml

@@ -104,6 +104,9 @@ jobs:
           elif command -v rocm-smi; then
             DEVICE_NAME=rocm
             rocm-smi
+          elif command -v hl-smi; then
+            DEVICE_NAME=hpu
+            hl-smi
           else
             DEVICE_NAME=cpu
             lscpu

@@ -120,6 +123,8 @@ jobs:
             DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
           elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi

@@ -133,6 +138,9 @@ jobs:
           if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt
+            pip install -r /tmp/requirements_no_torch.txt
           else
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/cu128

@@ -155,6 +163,8 @@ jobs:
           DOCKER_IMAGE_SUFFIX=""
           if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-hpu
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
           fi

@@ -188,34 +198,83 @@ jobs:

           if [[ -z "${HEAD_SHA}" ]]; then
             pushd vllm
-            # Looking back the latest 100 commits is enough
-            for i in {0..99}
-            do
-              # Check if the image is there, if it doesn't then check an older one
-              # because the commit is too recent
-              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-              # No Docker image available yet because the commit is too recent
-              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-                continue
+
+            # Special handling for HPU: use a vllm-gaudi compatible commit
+            #
+            # Problem: VLLM_STABLE_COMMIT might change between when CI builds the image
+            # and when this benchmark runs (every 12 hours), causing image tag mismatches.
+            #
+            # Solution: Query the git history of the VLLM_STABLE_COMMIT file to find the most
+            # recent compatible vLLM commit that has an actual Docker image built by CI.
+            if [[ "${DEVICE_NAME}" == "hpu" ]]; then
+              echo "HPU device detected - finding compatible vLLM commit from vllm-gaudi history"
+
+              # Clone only the last-good-commit-for-vllm-gaudi branch (lightweight, single file)
+              git clone --depth 50 --single-branch --branch vllm/last-good-commit-for-vllm-gaudi \
+                https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi
+              pushd /tmp/vllm-gaudi
+
+              # Get the last 30 commits - each commit represents a VLLM_STABLE_COMMIT update
+              # This gives us a history of compatible vLLM versions
+              CANDIDATE_COMMITS=$(git log -30 --pretty=format:"%H")
+              popd
+
+              # Try each candidate commit (newest to oldest) until we find an existing image
+              FOUND_IMAGE=0
+              for VLLM_GAUDI_COMMIT in ${CANDIDATE_COMMITS}; do
+                # Get the vLLM commit from this version of the branch
+                CANDIDATE_VLLM_COMMIT=$(curl -s "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/${VLLM_GAUDI_COMMIT}/VLLM_STABLE_COMMIT" | tr -d '\n')
+
+                if [[ -z "${CANDIDATE_VLLM_COMMIT}" ]]; then
+                  continue
+                fi
+
+                DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${CANDIDATE_VLLM_COMMIT}${DOCKER_IMAGE_SUFFIX}"
+                echo "Checking if image exists: ${DOCKER_IMAGE}"
+
+                if docker manifest inspect "${DOCKER_IMAGE}" > /dev/null 2>&1; then
+                  echo "Found existing HPU image for vLLM commit: ${CANDIDATE_VLLM_COMMIT}"
+                  HEAD_SHA="${CANDIDATE_VLLM_COMMIT}"
+                  FOUND_IMAGE=1
+                  break
+                fi
+              done
+
+              if [[ ${FOUND_IMAGE} == 0 ]]; then
+                echo "ERROR: No HPU Docker image found in the last 30 versions of VLLM_STABLE_COMMIT"
+                echo "This likely means ci-infra hasn't successfully built any HPU images yet"
+                exit 1
               fi
-
-              NOT_EXIST=0
-              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
-              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
-
-              if [[ ${NOT_EXIST} == "1" ]]; then
-                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
-                break
-              fi
-            done
+            else
+              # For non-HPU devices: looking back the latest 100 commits
+              for i in {0..99}
+              do
+                # Check if the image is there, if it doesn't then check an older one
+                # because the commit is too recent
+                HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+                DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+                # No Docker image available yet because the commit is too recent
+                if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+                  continue
+                fi
+
+                NOT_EXIST=0
+                S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
+                aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
+
+                if [[ ${NOT_EXIST} == "1" ]]; then
+                  echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
+                  break
+                fi
+              done
+            fi
             popd
           fi

           echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV

-          # Print the benchmark commit for rereference
+          # Print the benchmark commit for reference
           echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"

       - name: Setup CUDA GPU_FLAG for docker run
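The HPU branch above does the commit resolution in shell inside the workflow. As a readability aid only, here is the same idea restated as a small Python sketch; the repository, branch, and file names come from the workflow, while the function name, 30-commit window, and error handling are illustrative assumptions.

# Sketch of the HPU image-resolution strategy above, in Python for clarity.
# The shell in the workflow is authoritative; names here are illustrative.
import subprocess
import tempfile
import urllib.error
import urllib.request

BRANCH = "vllm/last-good-commit-for-vllm-gaudi"
REPO = "https://github.com/vllm-project/vllm-gaudi.git"
RAW = "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/{}/VLLM_STABLE_COMMIT"

def resolve_hpu_commit(image_prefix: str, suffix: str = "-hpu", depth: int = 50) -> str:
    """Walk the VLLM_STABLE_COMMIT history (newest first) and return the first
    vLLM commit whose Docker image already exists in the registry."""
    with tempfile.TemporaryDirectory() as tmp:
        # Clone only the single-file branch that tracks the last good vLLM commit
        subprocess.run(
            ["git", "clone", "--depth", str(depth), "--single-branch",
             "--branch", BRANCH, REPO, tmp],
            check=True,
        )
        log = subprocess.run(
            ["git", "-C", tmp, "log", "-30", "--pretty=format:%H"],
            check=True, capture_output=True, text=True,
        )
    for branch_commit in log.stdout.splitlines():
        try:
            with urllib.request.urlopen(RAW.format(branch_commit)) as resp:
                vllm_commit = resp.read().decode().strip()
        except urllib.error.URLError:
            continue
        if not vllm_commit:
            continue
        image = f"{image_prefix}:{vllm_commit}{suffix}"
        # `docker manifest inspect` exits non-zero when the tag does not exist yet
        if subprocess.run(["docker", "manifest", "inspect", image],
                          capture_output=True).returncode == 0:
            return vllm_commit
    raise RuntimeError("no HPU Docker image found for any recent VLLM_STABLE_COMMIT")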
Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  }
]
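Each entry above is meant to expand into one standalone latency-benchmark run: the environment_variables block is exported for the process, and every key in parameters becomes a command-line flag, with an empty string value marking a boolean switch. A rough sketch of that expansion follows; the benchmarks/benchmark_latency.py entry point and the key-to-flag conversion are assumptions of this sketch, not something the commit defines.

# Hypothetical expansion of one latency test entry into a command line.
# The actual runner script lives in the vLLM benchmark suite, not in this commit.
import json
import shlex

entry = json.loads("""{
  "test_name": "latency_llama8B_tp1",
  "environment_variables": {"PT_HPU_LAZY_MODE": 1, "VLLM_CONTIGUOUS_PA": 1, "VLLM_DEFRAG": 1},
  "parameters": {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "tensor_parallel_size": 1, "load_format": "dummy",
    "num-iters-warmup": 5, "num-iters": 15,
    "max-model-len": 256, "async-scheduling": ""
  }
}""")

# Environment variables are exported for the benchmark process
env = " ".join(f"{k}={v}" for k, v in entry["environment_variables"].items())

# Every parameter key becomes a --flag; empty values become bare switches
args = []
for key, value in entry["parameters"].items():
    flag = "--" + key.replace("_", "-")
    args.append(flag if value == "" else f"{flag} {shlex.quote(str(value))}")

print(f"{env} python benchmarks/benchmark_latency.py " + " ".join(args))
# -> PT_HPU_LAZY_MODE=1 ... --model meta-llama/Meta-Llama-3.1-8B-Instruct
#    --tensor-parallel-size 1 --load-format dummy ... --async-scheduling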
Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
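The serving configs add one more dimension: the server is started once per test from server_parameters and server_environment_variables, and the client is then replayed once for every value in qps_list, with "inf" conventionally meaning an unthrottled request rate. A hedged sketch of that sweep for the first entry; the client script name and the --request-rate flag are assumptions of this sketch, not defined by this commit.

# Hypothetical sweep over qps_list for one serving test entry; the client
# script name and --request-rate flag are assumptions, not part of this commit.
qps_list = [1, 4, 16, "inf"]          # from serving_llama8B_tp1_sharegpt above
client = ("python benchmarks/benchmark_serving.py "
          "--backend vllm --model meta-llama/Meta-Llama-3.1-8B-Instruct "
          "--dataset-name sharegpt --num-prompts 200")

for qps in qps_list:
    # "inf" means: send all prompts as fast as possible (no request throttling)
    rate = "inf" if qps == "inf" else str(qps)
    print(f"{client} --request-rate {rate}")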
