Run a one-off benchmark for gpt-oss #18
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: gpt-oss benchmark

on:
  pull_request:
    paths:
      - .github/workflows/gpt-oss-benchmark.yml

# One run per PR (or per SHA for non-PR events); newer runs cancel older ones
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmarks:
    name: Run gpt-oss benchmarks
    strategy:
      matrix:
        include:
          # gpt-oss-120b
          - runner: linux.aws.h100.4
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.4
            model: openai/gpt-oss-120b
            docker-image: rocm/vllm-dev:open-mi300-08052025
          # gpt-oss-20b
          - runner: linux.aws.h100
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.dgx.b200
            model: openai/gpt-oss-20b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
          - runner: linux.rocm.gpu.gfx942.2
            model: openai/gpt-oss-20b
            docker-image: rocm/vllm-dev:open-mi300-08052025
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      # id-token is needed by aws-actions/configure-aws-credentials (OIDC)
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm

      - name: Checkout gpt-oss repository
        uses: actions/checkout@v4
        with:
          repository: openai/gpt-oss
          path: vllm-benchmarks/gpt-oss

      - uses: actions/setup-python@v5
        # Amazon Linux fails on this step
        continue-on-error: true
        with:
          python-version: '3.12'
          cache: 'pip'

      # Detect the accelerator and export DEVICE_NAME (cuda / rocm / cpu)
      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux

          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            DEVICE_NAME=cpu
            lscpu
          fi
          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

      # Export DEVICE_TYPE (e.g. H100, B200, MI300 marketing name) for naming results
      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
          fi
          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

      - name: Install dependencies
        shell: bash
        run: |
          set -eux

          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          else
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/cu128
          fi

      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      # Replace vLLM's bundled benchmark configs with the ones selected for
      # this runner's model/device combination
      - name: Setup benchmark tests
        env:
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          pushd vllm-benchmarks/vllm
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python3 .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODEL}" \
            --device "${DEVICE_NAME}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
          popd

      - name: Run vLLM gpt-oss benchmark
        env:
          # To login to public.ecr.aws
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ matrix.docker-image }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
            # Just to unblock this change on B200
            aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
            aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
            aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
          fi

          # https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
            export VLLM_USE_TRTLLM_ATTENTION=1
            export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
          else
            export VLLM_USE_TRTLLM_ATTENTION=0
            export VLLM_USE_TRTLLM_DECODE_ATTENTION=0
            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0
            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0
          fi

          if [[ "${DEVICE_NAME}" == *rocm* ]]; then
            export VLLM_ROCM_USE_AITER=1
            export VLLM_USE_AITER_UNIFIED_ATTENTION=1
            export VLLM_ROCM_USE_AITER_MHA=0
          else
            export VLLM_ROCM_USE_AITER=0
            export VLLM_USE_AITER_UNIFIED_ATTENTION=0
            export VLLM_ROCM_USE_AITER_MHA=0
          fi

          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e MODEL \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            -e VLLM_USE_TRTLLM_ATTENTION \
            -e VLLM_USE_TRTLLM_DECODE_ATTENTION \
            -e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \
            -e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \
            -e VLLM_ROCM_USE_AITER \
            -e VLLM_USE_AITER_UNIFIED_ATTENTION \
            -e VLLM_ROCM_USE_AITER_MHA \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          # Run accuracy check
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
          # Run perf tests
          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh

      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via its runner IAM role
        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload the benchmark results
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
          # Bug fix: the env var declared above is MODEL (singular). The old
          # `SANITIZED_MODELS="${MODELS//\//_}"` expanded an unset variable and
          # the exported name never matched the SANITIZED_MODEL read by the
          # artifact-upload steps below, leaving their names without a model.
          SANITIZED_MODEL="${MODEL//\//_}"

          python3 .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${SANITIZED_MODEL}"

          # Export sanitized names for the artifact-upload steps below
          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV

      # Keep a copy of the benchmark results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: vllm-benchmarks/vllm/benchmarks/results

      # Keep a copy of the accuracy results on GitHub for reference
      - uses: actions/upload-artifact@v4
        with:
          name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
          path: |
            vllm-benchmarks/gpt-oss/gpqa_openai