Run a one-off benchmark for gpt-oss #15

name: gpt-oss benchmark

on:
  pull_request:
    paths:
      - .github/workflows/gpt-oss-benchmark.yml
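
# One run per workflow and PR (falling back to the commit SHA on non-PR
# triggers); a newer run cancels any in-progress run with the same group key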
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: Run gpt-oss benchmarks
    strategy:
      matrix:
        include:
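          # Each entry pins a runner, model, and image: CUDA runners use a
          # vLLM CI image from public.ecr.aws, ROCm runners a rocm/vllm-dev image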
          # gpt-oss-120b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.4
            model: openai/gpt-oss-120b
            docker-image: rocm/vllm-dev:open-mi300-08052025
          # gpt-oss-20b
          - runner: linux.aws.h100
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.2
            model: openai/gpt-oss-20b
            docker-image: rocm/vllm-dev:open-mi300-08052025
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      id-token: write
      contents: read
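    # Steps: check out the repos, detect the device, generate the benchmark
    # configs, run perf and accuracy tests in Docker, then upload the results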
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm

      - name: Checkout gpt-oss repository
        uses: actions/checkout@v4
        with:
          repository: openai/gpt-oss
          path: vllm-benchmarks/gpt-oss

      - uses: actions/setup-python@v5
        # Amazon Linux fails on this step
        continue-on-error: true
        with:
          python-version: '3.12'
          cache: 'pip'

      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux
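
          # Probe for vendor tools to classify this runner's accelerator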
          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            DEVICE_NAME=cpu
            lscpu
          fi
          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux
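
          # Derive a short device label: the second token of the CUDA GPU name
          # (e.g. H100), the ROCm marketing name, or the CPU model name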
if [[ "${DEVICE_NAME}" == "cuda" ]]; then
DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
fi
echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

      - name: Install dependencies
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
          fi

      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      - name: Setup benchmark tests
        env:
          MODEL: ${{ matrix.model }}
        run: |
          set -eux
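
          # Drop the stock benchmark configs; setup_vllm_benchmark.py
          # regenerates only the ones matching this model and device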
          pushd vllm-benchmarks/vllm
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python3 .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODEL}" \
            --device "${DEVICE_NAME}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
          popd

      - name: Run vLLM gpt-oss benchmark
        env:
          # To login to public.ecr.aws
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ matrix.docker-image }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
          MODEL: ${{ matrix.model }}
        run: |
          set -eux
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
# Just to unblock this change on B200
aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
fi
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
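          # Per the recipe linked above: on B200, route attention through the
          # TRT-LLM kernels and use the FlashInfer MXFP4/BF16 MoE path;
          # everywhere else these are explicitly disabled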
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
export VLLM_USE_TRTLLM_ATTENTION=1
export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
else
export VLLM_USE_TRTLLM_ATTENTION=0
export VLLM_USE_TRTLLM_DECODE_ATTENTION=0
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0
fi
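
          # On ROCm, enable the AITER unified attention path (the AITER MHA
          # path stays off in both branches)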
if [[ "${DEVICE_NAME}" == *rocm* ]]; then
export VLLM_ROCM_USE_AITER=1
export VLLM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
else
export VLLM_ROCM_USE_AITER=0
export VLLM_USE_AITER_UNIFIED_ATTENTION=0
export VLLM_ROCM_USE_AITER_MHA=0
fi
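
          # Launch the benchmark container detached; GPU_FLAG is set on CUDA
          # runners, while ROCm runners are prepared by the setup-rocm action
          # earlier in the job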
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e MODEL \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            -e VLLM_USE_TRTLLM_ATTENTION \
            -e VLLM_USE_TRTLLM_DECODE_ATTENTION \
            -e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \
            -e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \
            -e VLLM_ROCM_USE_AITER \
            -e VLLM_USE_AITER_UNIFIED_ATTENTION \
            -e VLLM_ROCM_USE_AITER_MHA \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          # Run perf tests
          docker exec -t "${container_name}" bash -c "
            pushd vllm-benchmarks/vllm
            cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true

            if [[ $DEVICE_NAME != 'rocm' ]]; then
              pip install -U openai transformers
              pip install --pre vllm==0.10.1+gptoss \
                --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
                --extra-index-url https://download.pytorch.org/whl/nightly/cu128
            fi

            pip freeze
            bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            popd
          "

          # Run accuracy check. Variables that must expand inside the
          # container are escaped so the host shell leaves them alone
          docker exec -t "${container_name}" bash -c "
            tp=0
            if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then
              tp=4
            elif [[ $MODEL == 'openai/gpt-oss-20b' ]]; then
              tp=1
            fi

            # Prepare the accuracy test
            vllm serve $MODEL --tensor_parallel_size \$tp &
            server_pid=\$!
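
            # Poll the completions endpoint until the server answers, for up
            # to 20 minutes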
            wait_for_server() {
              timeout 1200 bash -c '
                until curl -X POST localhost:8000/v1/completions; do
                  sleep 1
                done' && return 0 || return 1
            }

            if wait_for_server; then
              echo 'vLLM server is up and running'
            else
              echo 'vLLM failed to start within the timeout period'
              exit 1
            fi
            pushd vllm-benchmarks/gpt-oss
            mkdir -p /tmp/gpqa_openai

            # Low
            OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
              --model $MODEL \
              --eval gpqa \
              --reasoning-effort low \
              --n-threads $(expr $(nproc) / 2)

            # Medium
            OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
              --model $MODEL \
              --eval gpqa \
              --reasoning-effort medium \
              --n-threads $(expr $(nproc) / 2)

            # High
            OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
              --model $MODEL \
              --eval gpqa \
              --reasoning-effort high \
              --n-threads $(expr $(nproc) / 2)

            mv /tmp/gpqa_openai .
            popd

            kill -9 \$server_pid
          "

      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via their runner IAM role
        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload the benchmark results
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"
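
          # Sanitize the device type and model for use in env vars and
          # artifact names: spaces become underscores, and any character other
          # than alphanumerics, '.', and '-' is replaced with an underscore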
          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
          SANITIZED_MODEL="${MODEL//\//_}"

          python3 .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${SANITIZED_MODEL}"

          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV

      # Keep a copy of the benchmark results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: vllm-benchmarks/vllm/benchmarks/results

      # Keep a copy of the accuracy results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: |
            vllm-benchmarks/gpt-oss/gpqa_openai