Run a one-off benchmark for gpt-oss #10
```yaml
name: gpt-oss benchmark

on:
  pull_request:
    paths:
      - .github/workflows/gpt-oss-benchmark.yml

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: Run gpt-oss benchmarks
    strategy:
      matrix:
        include:
          # gpt-oss-120b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-120b
            docker-image: 'vllm/vllm-openai:gptoss'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-120b
            docker-image: 'vllm/vllm-openai:gptoss'
          - runner: linux.rocm.gpu.gfx942.4
            model: openai/gpt-oss-120b
            docker-image: rocm/vllm-dev:open-mi300-08052025
          # gpt-oss-20b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-20b
            docker-image: 'vllm/vllm-openai:gptoss'
          - runner: linux.dgx.b200
            model: openai/gpt-oss-20b
            docker-image: 'vllm/vllm-openai:gptoss'
          - runner: linux.rocm.gpu.gfx942.2
            model: openai/gpt-oss-20b
            docker-image: rocm/vllm-dev:open-mi300-08052025
      fail-fast: false
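    # Assumption, not verified against the runner fleet: the numeric suffix on
    # each runner label above (.2, .4, .8) appears to encode the number of
    # GPUs allocated to the job.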
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm

      - name: Checkout gpt-oss repository
        uses: actions/checkout@v4
        with:
          repository: openai/gpt-oss
          path: vllm-benchmarks/gpt-oss

      - uses: actions/setup-python@v5
        # Amazon Linux fails on this step
        continue-on-error: true
        with:
          python-version: '3.12'
          cache: 'pip'
      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux

          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            DEVICE_NAME=cpu
            lscpu
          fi

          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi

          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
      - name: Install dependencies
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
          fi
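      # The extra index URLs select PyTorch wheels built against ROCm 6.3 or
      # CUDA 12.8, matching the accelerator detected above.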
      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      - name: Setup benchmark tests
        env:
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          pushd vllm-benchmarks/vllm
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python3 .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODEL}" \
            --device "${DEVICE_NAME}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
          popd
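      # The step above deletes vLLM's stock nightly benchmark configs and
      # copies in this repository's own, presumably filtered to the current
      # model and device via the --models/--device flags.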
      - name: Run vLLM gpt-oss benchmark
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ matrix.docker-image }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
        run: |
          set -eux

          # https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
            export VLLM_USE_TRTLLM_ATTENTION=1
            export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
          else
            export VLLM_USE_TRTLLM_ATTENTION=0
            export VLLM_USE_TRTLLM_DECODE_ATTENTION=0
            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0
            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0
          fi

          if [[ "${DEVICE_NAME}" == *rocm* ]]; then
            export VLLM_ROCM_USE_AITER=1
            export VLLM_USE_AITER_UNIFIED_ATTENTION=1
            export VLLM_ROCM_USE_AITER_MHA=0
          else
            export VLLM_ROCM_USE_AITER=0
            export VLLM_USE_AITER_UNIFIED_ATTENTION=0
            export VLLM_ROCM_USE_AITER_MHA=0
          fi
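          # Per the GPT-OSS recipe linked above: TRT-LLM attention kernels and
          # the FlashInfer MXFP4 MoE path are enabled only on B200, and AITER
          # kernels only on ROCm; everywhere else the toggles are forced off.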
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            -e VLLM_USE_TRTLLM_ATTENTION \
            -e VLLM_USE_TRTLLM_DECODE_ATTENTION \
            -e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \
            -e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \
            -e VLLM_ROCM_USE_AITER \
            -e VLLM_USE_AITER_UNIFIED_ATTENTION \
            -e VLLM_ROCM_USE_AITER_MHA \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          docker exec -t "${container_name}" bash -c "
            cd vllm-benchmarks/vllm
            cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
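            # NOTE: the two-hour sleep below appears to be a deliberate hold
            # for this one-off run (e.g. to exec into the container before the
            # benchmarks start); drop it for unattended runs.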
            sleep 7200
            bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh || true
          "

      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via their runner IAM role
        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload the benchmark results
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
          # Fix: SANITIZED_MODEL was echoed to $GITHUB_ENV without ever being
          # assigned, leaving the artifact name below without a model suffix.
          SANITIZED_MODEL="${MODEL//\//_}"

          python3 .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${SANITIZED_MODEL}"

          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV
      # Keep a copy of the benchmark results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: vllm-benchmarks/vllm/benchmarks/results
```
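For reference, a minimal sketch of what the upload step's sanitization produces. The device strings are illustrative stand-ins, not values captured from the runners:

```bash
#!/usr/bin/env bash
# Mimic the upload step's sanitization on some plausible inputs.
for DEVICE_TYPE in "H100" "AMD Instinct MI300X"; do
  # Spaces -> underscores, then any character that is not alphanumeric,
  # '.', or '-' -> underscore (same pipeline as the workflow).
  SANITIZED=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
  echo "${DEVICE_TYPE} -> ${SANITIZED}"   # e.g. AMD Instinct MI300X -> AMD_Instinct_MI300X
done

MODEL=openai/gpt-oss-120b
echo "${MODEL} -> ${MODEL//\//_}"        # openai/gpt-oss-120b -> openai_gpt-oss-120b
```

These sanitized values feed both the upload script's --device-type/--model flags and the actions/upload-artifact name, so they must contain no slashes or spaces.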