From bee13ba00d71155b455cac6730d59c82f0837f9c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 00:37:29 -0700 Subject: [PATCH 01/36] Add an one-off workflow to benchmark gpt-oss Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 202 ++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 .github/workflows/gpt-oss-benchmark.yml diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml new file mode 100644 index 00000000..a1a172c8 --- /dev/null +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -0,0 +1,202 @@ +name: gpt-oss benchmark + +on: + pull_request: + paths: + - .github/workflows/gpt-oss-benchmark.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + benchmarks: + name: Run gpt-oss benchmarks + needs: set-parameters + strategy: + matrix: + # gpt-oss-120b + # - runner: linux.aws.h100.4 + # model: openai/gpt-oss-120b + # docker-image: 'vllm/vllm-openai:gptoss' + # - runner: linux.dgx.b200.8 + # model: openai/gpt-oss-120b + # docker-image: 'vllm/vllm-openai:gptoss' + # - runner: linux.rocm.gpu.gfx942.4 + # model: openai/gpt-oss-120b + # docker-image: rocm/vllm-dev:open-mi300-08052025 + # gpt-oss-20b + - runner: linux.aws.h100 + model: openai/gpt-oss-20b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.dgx.b200 + model: openai/gpt-oss-20b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.rocm.gpu.gfx942.2 + model: openai/gpt-oss-20b + docker-image: rocm/vllm-dev:open-mi300-08052025 + fail-fast: false + runs-on: ${{ matrix.runner }} + environment: pytorch-x-vllm + permissions: + id-token: write + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Checkout vLLM repository + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-benchmarks/vllm + + - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true + with: + python-version: '3.12' + cache: 'pip' + + - name: Check if the device is supported + shell: bash + run: | + set -eux + + if command -v nvidia-smi; then + DEVICE_NAME=cuda + nvidia-smi + elif command -v rocm-smi; then + DEVICE_NAME=rocm + rocm-smi + else + DEVICE_NAME=cpu + lscpu + fi + echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV + + - name: Set GPU name and type + working-directory: vllm-benchmarks + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "cuda" ]]; then + DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") + fi + echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV + + - name: Install dependencies + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/cu128 + fi + + - name: Setup CUDA GPU_FLAG for docker run + if: env.DEVICE_NAME == 'cuda' + run: | + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> 
"${GITHUB_ENV}" + + - name: Setup ROCm + if: env.DEVICE_NAME == 'rocm' + uses: pytorch/pytorch/./.github/actions/setup-rocm@main + + - name: Setup benchmark tests + env: + MODEL: ${{ matrix.model }} + run: | + set -eux + + pushd vllm-benchmarks/vllm + git checkout "${HEAD_SHA}" + rm .buildkite/nightly-benchmarks/tests/*.json + popd + + # Set the list of benchmarks we want to cover in this runner + python3 .github/scripts/setup_vllm_benchmark.py \ + --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ + --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ + --models "${MODEL}" \ + --device "${DEVICE_NAME}" + + pushd vllm-benchmarks/vllm + ls -lah .buildkite/nightly-benchmarks/tests + find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \; + popd + + - name: Run vLLM gpt-oss benchmark + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + DOCKER_IMAGE: ${{ matrix.docker-image }} + # vLLM-related environment variables + ENGINE_VERSION: v1 + SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + run: | + set -eux + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e DEVICE_NAME \ + -e DEVICE_TYPE \ + -e HF_TOKEN \ + -e ENGINE_VERSION \ + -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + --ipc=host \ + --tty \ + --detach \ + --security-opt seccomp=unconfined \ + --shm-size=4g \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + + - name: Authenticate with AWS + # AWS CUDA runners already have access to the bucket via its runner IAM role + if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Upload the benchmark results + env: + BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results + MODEL: ${{ matrix.model }} + run: | + set -eux + + sudo chown -R ${UID} "${BENCHMARK_RESULTS}" + ls -lah "${BENCHMARK_RESULTS}" + + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") + python3 .github/scripts/upload_benchmark_results.py \ + --repo vllm-benchmarks/vllm \ + --benchmark-name "vLLM benchmark" \ + --benchmark-results "${BENCHMARK_RESULTS}" \ + --device-name "${DEVICE_NAME}" \ + --device-type "${SANITIZED_DEVICE_TYPE}" \ + --model "${MODEL//\//_}" + + echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV + echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV + + # Keep a copy of the benchmark results on GitHub for reference + - uses: actions/upload-artifact@v4 + with: + name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }} + path: vllm-benchmarks/vllm/benchmarks/results From 56e4451e4789376ee50eb07c6fb29c98c49de20a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 00:40:30 -0700 Subject: [PATCH 02/36] Fix workflow syntax Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index a1a172c8..8948f5fe 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -26,15 +26,16 @@ jobs: 
# model: openai/gpt-oss-120b # docker-image: rocm/vllm-dev:open-mi300-08052025 # gpt-oss-20b - - runner: linux.aws.h100 - model: openai/gpt-oss-20b - docker-image: 'vllm/vllm-openai:gptoss' - - runner: linux.dgx.b200 - model: openai/gpt-oss-20b - docker-image: 'vllm/vllm-openai:gptoss' - - runner: linux.rocm.gpu.gfx942.2 - model: openai/gpt-oss-20b - docker-image: rocm/vllm-dev:open-mi300-08052025 + include: + - runner: linux.aws.h100 + model: openai/gpt-oss-20b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.dgx.b200 + model: openai/gpt-oss-20b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.rocm.gpu.gfx942.2 + model: openai/gpt-oss-20b + docker-image: rocm/vllm-dev:open-mi300-08052025 fail-fast: false runs-on: ${{ matrix.runner }} environment: pytorch-x-vllm @@ -51,6 +52,12 @@ jobs: repository: vllm-project/vllm path: vllm-benchmarks/vllm + - name: Checkout gpt-oss repository + uses: actions/checkout@v4 + with: + repository: openai/gpt-oss + path: vllm-benchmarks/gpt-oss + - uses: actions/setup-python@v5 # Amazon Linux fails on this step continue-on-error: true From 19e9e9b9c17b191e037392df1a3735cf8a41a4b3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 01:18:46 -0700 Subject: [PATCH 03/36] Add the models Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 24 ++++++- .../benchmarks/cuda/latency-tests.json | 33 ++++++++++ .../benchmarks/cuda/serving-tests.json | 60 ++++++++++++++++++ .../benchmarks/cuda/throughput-tests.json | 36 +++++++++++ .../benchmarks/rocm/latency-tests.json | 33 ++++++++++ .../benchmarks/rocm/serving-tests.json | 63 +++++++++++++++++++ .../benchmarks/rocm/throughput-tests.json | 36 +++++++++++ 7 files changed, 284 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 8948f5fe..df4a1295 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -12,7 +12,6 @@ concurrency: jobs: benchmarks: name: Run gpt-oss benchmarks - needs: set-parameters strategy: matrix: # gpt-oss-120b @@ -152,6 +151,29 @@ jobs: run: | set -eux + # https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html + if [[ "${DEVICE_TYPE}" == *B200* ]]; then + export VLLM_USE_TRTLLM_ATTENTION=1 + export VLLM_USE_TRTLLM_DECODE_ATTENTION=1 + export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1 + export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1 + else + export VLLM_USE_TRTLLM_ATTENTION= + export VLLM_USE_TRTLLM_DECODE_ATTENTION= + export VLLM_USE_TRTLLM_CONTEXT_ATTENTION= + export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE= + fi + + if [[ "${DEVICE_NAME}" == *rocm* ]]; then + export VLLM_ROCM_USE_AITER=1 + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 + export VLLM_ROCM_USE_AITER_MHA=0 + else + export VLLM_ROCM_USE_AITER= + export VLLM_USE_AITER_UNIFIED_ATTENTION= + export VLLM_ROCM_USE_AITER_MHA= + fi + container_name=$(docker run \ ${GPU_FLAG:-} \ -e DEVICE_NAME \ diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 9e9f15f8..30b5b83f 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -50,5 +50,38 @@ "num_iters": 15, "max_model_len": 8192 } + }, + { + "test_name": "latency_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": 
"latency_gpt_oss_120b_tp2", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 66b7c4de..d9395e80 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -411,5 +411,65 @@ "random_input_len": 30720, "random_output_len": 100 } + }, + { + "test_name": "serving_gpt_oss_20b_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_gpt_oss_120b_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_gpt_oss_120b_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 647ac2f3..d244eb81 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -55,5 +55,41 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_gpt_oss_120b_tp2", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json 
b/vllm-benchmarks/benchmarks/rocm/latency-tests.json index 9e9f15f8..30b5b83f 100644 --- a/vllm-benchmarks/benchmarks/rocm/latency-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -50,5 +50,38 @@ "num_iters": 15, "max_model_len": 8192 } + }, + { + "test_name": "latency_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_gpt_oss_120b_tp2", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json index 7b32d384..a6dfd80f 100644 --- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -410,5 +410,68 @@ "random_input_len": 1024, "random_output_len": 2048 } + }, + { + "test_name": "serving_gpt_oss_20b_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192, + "compilation-config": "{'full_cuda_graph': true}" + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_gpt_oss_120b_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192, + "compilation-config": "{'full_cuda_graph': true}" + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_gpt_oss_120b_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192, + "compilation-config": "{'full_cuda_graph': true}" + }, + "client_parameters": { + "model": "", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json index 647ac2f3..d244eb81 100644 --- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -55,5 +55,41 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + 
"max_model_len": 8192 + } + }, + { + "test_name": "throughput_gpt_oss_120b_tp2", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ] From 96e08d63dec461af7990dc7556b996e573a9cc8a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 01:27:01 -0700 Subject: [PATCH 04/36] More tweaks Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 8 +++++++- .github/workflows/vllm-benchmark.yml | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index df4a1295..f8ac2a27 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -125,7 +125,6 @@ jobs: set -eux pushd vllm-benchmarks/vllm - git checkout "${HEAD_SHA}" rm .buildkite/nightly-benchmarks/tests/*.json popd @@ -181,6 +180,13 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e VLLM_USE_TRTLLM_ATTENTION \ + -e VLLM_USE_TRTLLM_DECODE_ATTENTION \ + -e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \ + -e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \ + -e VLLM_ROCM_USE_AITER \ + -e VLLM_USE_AITER_UNIFIED_ATTENTION \ + -e VLLM_ROCM_USE_AITER_MHA \ --ipc=host \ --tty \ --detach \ diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 0b2965b8..42763170 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -26,10 +26,6 @@ on: required: true type: string default: h100,mi300,spr - pull_request: - paths: - - .github/workflows/vllm-benchmark.yml - - vllm-benchmarks/** concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} From f76c72f819c887c365a750a3b55d3ce10103504c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 02:01:50 -0700 Subject: [PATCH 05/36] Would it work? 
Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 21 ++++++++++++------- .../benchmarks/cuda/latency-tests.json | 3 ++- .../benchmarks/cuda/serving-tests.json | 3 ++- .../benchmarks/cuda/throughput-tests.json | 3 ++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index f8ac2a27..91b5cd40 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -157,10 +157,10 @@ jobs: export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1 export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1 else - export VLLM_USE_TRTLLM_ATTENTION= - export VLLM_USE_TRTLLM_DECODE_ATTENTION= - export VLLM_USE_TRTLLM_CONTEXT_ATTENTION= - export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE= + export VLLM_USE_TRTLLM_ATTENTION=0 + export VLLM_USE_TRTLLM_DECODE_ATTENTION=0 + export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0 + export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0 fi if [[ "${DEVICE_NAME}" == *rocm* ]]; then @@ -168,9 +168,9 @@ jobs: export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 else - export VLLM_ROCM_USE_AITER= - export VLLM_USE_AITER_UNIFIED_ATTENTION= - export VLLM_ROCM_USE_AITER_MHA= + export VLLM_ROCM_USE_AITER=0 + export VLLM_USE_AITER_UNIFIED_ATTENTION=0 + export VLLM_ROCM_USE_AITER_MHA=0 fi container_name=$(docker run \ @@ -196,7 +196,12 @@ jobs: -w /tmp/workspace \ "${DOCKER_IMAGE}" ) - docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + docker exec -t "${container_name}" bash -c " + set -x + cd vllm-benchmarks/vllm + cp vllm/benchmarks/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true + bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + " - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 30b5b83f..40fb96ab 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -59,7 +59,8 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 8192, + "gpu_memory_utilization": 0.95 } }, { diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index d9395e80..fda293f6 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -422,7 +422,8 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192 + "max_model_len": 8192, + "gpu_memory_utilization": 0.95 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index d244eb81..0888f85b 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -65,7 +65,8 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 8192, + "gpu_memory_utilization": 0.95 } }, { From ca94597d1be82ffd494f749bed168d5239f141ff Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 02:23:42 -0700 Subject: [PATCH 06/36] More tweaks Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 22 
+++++++++++-------- .../benchmarks/cuda/latency-tests.json | 7 +++--- .../benchmarks/cuda/serving-tests.json | 7 +++--- .../benchmarks/cuda/throughput-tests.json | 7 +++--- .../benchmarks/rocm/latency-tests.json | 6 ++--- .../benchmarks/rocm/serving-tests.json | 6 ++--- .../benchmarks/rocm/throughput-tests.json | 6 ++--- 7 files changed, 31 insertions(+), 30 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 91b5cd40..72547b64 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -15,15 +15,15 @@ jobs: strategy: matrix: # gpt-oss-120b - # - runner: linux.aws.h100.4 - # model: openai/gpt-oss-120b - # docker-image: 'vllm/vllm-openai:gptoss' - # - runner: linux.dgx.b200.8 - # model: openai/gpt-oss-120b - # docker-image: 'vllm/vllm-openai:gptoss' - # - runner: linux.rocm.gpu.gfx942.4 - # model: openai/gpt-oss-120b - # docker-image: rocm/vllm-dev:open-mi300-08052025 + - runner: linux.aws.h100.4 + model: openai/gpt-oss-120b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.dgx.b200.8 + model: openai/gpt-oss-120b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.rocm.gpu.gfx942.4 + model: openai/gpt-oss-120b + docker-image: rocm/vllm-dev:open-mi300-08052025 # gpt-oss-20b include: - runner: linux.aws.h100 @@ -199,6 +199,10 @@ jobs: docker exec -t "${container_name}" bash -c " set -x cd vllm-benchmarks/vllm + ls -la . + ls -la vllm + ls -la vllm/benchmarks + cp vllm/benchmarks/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh " diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 40fb96ab..809aac31 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -59,8 +59,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 } }, { @@ -71,7 +70,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -82,7 +81,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 4096 } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index fda293f6..1ddf1080 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -422,8 +422,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 }, "client_parameters": { "model": "", @@ -443,7 +442,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192 + "max_model_len": 4096 }, "client_parameters": { "model": "", @@ -463,7 +462,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192 + "max_model_len": 4096 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 0888f85b..4b358bb2 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -65,8 +65,7 @@ "dataset": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 } }, { @@ -78,7 +77,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -90,7 +89,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 4096 } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json index 30b5b83f..809aac31 100644 --- a/vllm-benchmarks/benchmarks/rocm/latency-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -59,7 +59,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -70,7 +70,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -81,7 +81,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 8192 + "max_model_len": 4096 } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json index a6dfd80f..0e7c115c 100644 --- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -421,7 +421,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192, + "max_model_len": 4096, "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { @@ -442,7 +442,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192, + "max_model_len": 4096, "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { @@ -463,7 +463,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 8192, + "max_model_len": 4096, "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json index d244eb81..4b358bb2 100644 --- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -65,7 +65,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -77,7 +77,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 4096 } }, { @@ -89,7 +89,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 8192 + "max_model_len": 4096 } } ] From 456df1ae22581343a5b7b953fcd5f8a1233bf5cf Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 02:32:06 -0700 Subject: [PATCH 07/36] Fix workflow syntax Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 72547b64..b183964e 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -14,18 +14,18 @@ jobs: name: Run gpt-oss benchmarks strategy: matrix: - # gpt-oss-120b - - runner: linux.aws.h100.4 - 
model: openai/gpt-oss-120b - docker-image: 'vllm/vllm-openai:gptoss' - - runner: linux.dgx.b200.8 - model: openai/gpt-oss-120b - docker-image: 'vllm/vllm-openai:gptoss' - - runner: linux.rocm.gpu.gfx942.4 - model: openai/gpt-oss-120b - docker-image: rocm/vllm-dev:open-mi300-08052025 - # gpt-oss-20b include: + # gpt-oss-120b + - runner: linux.aws.h100.4 + model: openai/gpt-oss-120b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.dgx.b200.8 + model: openai/gpt-oss-120b + docker-image: 'vllm/vllm-openai:gptoss' + - runner: linux.rocm.gpu.gfx942.4 + model: openai/gpt-oss-120b + docker-image: rocm/vllm-dev:open-mi300-08052025 + # gpt-oss-20b - runner: linux.aws.h100 model: openai/gpt-oss-20b docker-image: 'vllm/vllm-openai:gptoss' From 052b329fd3449de4f65129442344aafeee0fc6d4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 03:00:32 -0700 Subject: [PATCH 08/36] Debug Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 8 ++------ vllm-benchmarks/benchmarks/cuda/latency-tests.json | 3 ++- vllm-benchmarks/benchmarks/cuda/serving-tests.json | 3 ++- vllm-benchmarks/benchmarks/cuda/throughput-tests.json | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index b183964e..ef26e706 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -197,14 +197,10 @@ jobs: "${DOCKER_IMAGE}" ) docker exec -t "${container_name}" bash -c " - set -x cd vllm-benchmarks/vllm - ls -la . - ls -la vllm - ls -la vllm/benchmarks - - cp vllm/benchmarks/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true + cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + sleep 7200 " - name: Authenticate with AWS diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 809aac31..65da1ef5 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -59,7 +59,8 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 4096, + "gpu_memory_utilization": 0.95 } }, { diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 1ddf1080..c1c6f511 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -422,7 +422,8 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096 + "max_model_len": 4096, + "gpu_memory_utilization": 0.95 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 4b358bb2..5f3d01ba 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -65,7 +65,8 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 4096, + "gpu_memory_utilization": 0.95 } }, { From 180a976e59668c0c3a66d1e7f704a9bf7803b374 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 03:28:26 -0700 Subject: [PATCH 09/36] KISS Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 4 ++-- 
vllm-benchmarks/benchmarks/cuda/latency-tests.json | 3 +-- vllm-benchmarks/benchmarks/cuda/serving-tests.json | 3 +-- vllm-benchmarks/benchmarks/cuda/throughput-tests.json | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index ef26e706..f0a9f3bd 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -26,7 +26,7 @@ jobs: model: openai/gpt-oss-120b docker-image: rocm/vllm-dev:open-mi300-08052025 # gpt-oss-20b - - runner: linux.aws.h100 + - runner: linux.aws.h100.4 model: openai/gpt-oss-20b docker-image: 'vllm/vllm-openai:gptoss' - runner: linux.dgx.b200 @@ -199,7 +199,7 @@ jobs: docker exec -t "${container_name}" bash -c " cd vllm-benchmarks/vllm cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh || true sleep 7200 " diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 65da1ef5..809aac31 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -59,8 +59,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 } }, { diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index c1c6f511..1ddf1080 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -422,8 +422,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 5f3d01ba..4b358bb2 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -65,8 +65,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096, - "gpu_memory_utilization": 0.95 + "max_model_len": 4096 } }, { From dd3464945a3b58a924dd28b10479377eae47af3e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 15:06:40 -0700 Subject: [PATCH 10/36] Ready to debug Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index f0a9f3bd..3a67dab2 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -199,8 +199,8 @@ jobs: docker exec -t "${container_name}" bash -c " cd vllm-benchmarks/vllm cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh || true sleep 7200 + bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh || true " - name: Authenticate with AWS From fbfb2bb44dfe1e97b47740f05640d7b148c33810 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 18:03:51 -0700 Subject: [PATCH 11/36] Another try Signed-off-by: Huy Do --- 
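Note: with two extra indexes in play, the 0.10.1+gptoss local version only
lands if pip actually resolves vllm from wheels.vllm.ai rather than the
nightly index, so it is worth confirming which build won after the install.
A quick post-install probe, assuming standard wheel metadata:

    # Both probes should report 0.10.1+gptoss
    pip show vllm | grep -i '^version'
    python3 -c 'import vllm; print(vllm.__version__)'
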
.github/workflows/gpt-oss-benchmark.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 3a67dab2..83b17578 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -18,20 +18,20 @@ jobs: # gpt-oss-120b - runner: linux.aws.h100.4 model: openai/gpt-oss-120b - docker-image: 'vllm/vllm-openai:gptoss' + docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b' - runner: linux.dgx.b200.8 model: openai/gpt-oss-120b - docker-image: 'vllm/vllm-openai:gptoss' + docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b' - runner: linux.rocm.gpu.gfx942.4 model: openai/gpt-oss-120b docker-image: rocm/vllm-dev:open-mi300-08052025 # gpt-oss-20b - runner: linux.aws.h100.4 model: openai/gpt-oss-20b - docker-image: 'vllm/vllm-openai:gptoss' + docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b' - runner: linux.dgx.b200 model: openai/gpt-oss-20b - docker-image: 'vllm/vllm-openai:gptoss' + docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b' - runner: linux.rocm.gpu.gfx942.2 model: openai/gpt-oss-20b docker-image: rocm/vllm-dev:open-mi300-08052025 @@ -199,8 +199,14 @@ jobs: docker exec -t "${container_name}" bash -c " cd vllm-benchmarks/vllm cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true + + pip install --pre vllm==0.10.1+gptoss \ + --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ + --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \ + --index-strategy unsafe-best-match sleep 7200 - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh || true + + bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh " - name: Authenticate with AWS From 3189a5d7a79185522e48aef0911ae6d66f3308a8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 19:48:47 -0700 Subject: [PATCH 12/36] Login to ECR Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 83b17578..4c187da2 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -142,6 +142,9 @@ jobs: - name: Run vLLM gpt-oss benchmark env: + # To login to public.ecr.aws + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} DOCKER_IMAGE: ${{ matrix.docker-image }} # vLLM-related environment variables @@ -150,6 +153,12 @@ jobs: run: | set -eux + if [[ "${DEVICE_TYPE}" == *B200* ]]; then + aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}" + aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}" + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws + fi + # https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html if [[ "${DEVICE_TYPE}" == *B200* ]]; then export VLLM_USE_TRTLLM_ATTENTION=1 @@ -199,13 +208,11 @@ jobs: docker exec -t "${container_name}" bash -c " cd vllm-benchmarks/vllm cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true - - pip install --pre vllm==0.10.1+gptoss \ - --extra-index-url 
https://wheels.vllm.ai/gpt-oss/ \ - --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \ - --index-strategy unsafe-best-match - sleep 7200 - + if [[ $DEVICE_NAME != 'rocm' ]]; then + pip install --pre vllm==0.10.1+gptoss \ + --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ + --extra-index-url https://download.pytorch.org/whl/nightly/cu128 + fi bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh " From 5176b114122c6376ebd5011cfe16db644d83d4b4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 20:42:12 -0700 Subject: [PATCH 13/36] Debug Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 4c187da2..c623b00d 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -213,6 +213,7 @@ jobs: --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ --extra-index-url https://download.pytorch.org/whl/nightly/cu128 fi + sleep 7200 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh " @@ -237,6 +238,8 @@ jobs: ls -lah "${BENCHMARK_RESULTS}" SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") + SANITIZED_MODELS="${MODELS//\//_}" + python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ From 3ef58c4ac8e10f95a09addecb2d41744ab52efa1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 23:07:41 -0700 Subject: [PATCH 14/36] Add accuracy check Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 76 ++++++++++++++++++- .../benchmarks/cuda/latency-tests.json | 6 +- .../benchmarks/cuda/serving-tests.json | 6 +- .../benchmarks/cuda/throughput-tests.json | 6 +- .../benchmarks/rocm/latency-tests.json | 6 +- .../benchmarks/rocm/serving-tests.json | 9 +-- .../benchmarks/rocm/throughput-tests.json | 6 +- 7 files changed, 91 insertions(+), 24 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index c623b00d..e776d457 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -26,7 +26,7 @@ jobs: model: openai/gpt-oss-120b docker-image: rocm/vllm-dev:open-mi300-08052025 # gpt-oss-20b - - runner: linux.aws.h100.4 + - runner: linux.aws.h100 model: openai/gpt-oss-20b docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b' - runner: linux.dgx.b200 @@ -150,10 +150,12 @@ jobs: # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + MODEL: ${{ matrix.model }} run: | set -eux if [[ "${DEVICE_TYPE}" == *B200* ]]; then + # Just to unblock this change on B200 aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}" aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}" aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws @@ -184,6 +186,7 @@ jobs: container_name=$(docker run \ ${GPU_FLAG:-} \ + -e MODEL \ -e DEVICE_NAME \ -e DEVICE_TYPE \ -e HF_TOKEN \ @@ -205,16 +208,76 @@ jobs: -w /tmp/workspace \ "${DOCKER_IMAGE}" ) + + # Run perf tests docker exec -t "${container_name}" bash -c " - cd vllm-benchmarks/vllm + pushd vllm-benchmarks/vllm cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true if [[ $DEVICE_NAME != 'rocm' ]]; then + pip install -U openai transformers pip install 
--pre vllm==0.10.1+gptoss \ --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ --extra-index-url https://download.pytorch.org/whl/nightly/cu128 fi - sleep 7200 + pip freeze bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + popd + " + + # Run accuracy check + docker exec -t "${container_name}" bash -c " + local tp + if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then + tp=4 + elfi [[ $MODEL == 'openai/gpt-oss-20b' ]]; then + tp=1 + fi + + # Prepare the accuracy test + vllm serve $MODEL --tensor_parallel_size $tp & + server_pid=$! + + wait_for_server() { + timeout 1200 bash -c ' + until curl -X POST localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 + } + + if wait_for_server; then + echo 'vLLM server is up and running' + else + echo 'vLLM failed to start within the timeout period' + fi + + pushd vllm-benchmarks/gpt-oss + mkdir -p /tmp/gpqa_openai + + # Low + OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort low \ + --n-threads $(expr $(nproc) / 2) + + # Mid + OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort medium \ + --n-threads $(expr $(nproc) / 2) + + # High + OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort high \ + --n-threads $(expr $(nproc) / 2) + + mv /tmp/gpqa_openai . + popd + + kill -9 $server_pid " - name: Authenticate with AWS @@ -256,3 +319,10 @@ jobs: with: name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }} path: vllm-benchmarks/vllm/benchmarks/results + + # Keep a copy of the accuracy results on GitHub for reference + - uses: actions/upload-artifact@v4 + with: + name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }} + path: | + vllm-benchmarks/gpt-oss/gpqa_openai diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 809aac31..30b5b83f 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -59,7 +59,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -70,7 +70,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -81,7 +81,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 1ddf1080..d9395e80 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -422,7 +422,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096 + "max_model_len": 8192 }, "client_parameters": { "model": "", @@ -442,7 +442,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096 + "max_model_len": 8192 }, "client_parameters": { "model": "", @@ -462,7 +462,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096 + "max_model_len": 8192 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json 
b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 4b358bb2..d244eb81 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -65,7 +65,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -77,7 +77,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -89,7 +89,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json index 809aac31..30b5b83f 100644 --- a/vllm-benchmarks/benchmarks/rocm/latency-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -59,7 +59,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -70,7 +70,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -81,7 +81,7 @@ "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15, - "max_model_len": 4096 + "max_model_len": 8192 } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json index 0e7c115c..b9e814e0 100644 --- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -421,8 +421,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096, - "compilation-config": "{'full_cuda_graph': true}" + "max_model_len": 8192 }, "client_parameters": { "model": "", @@ -442,8 +441,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096, - "compilation-config": "{'full_cuda_graph': true}" + "max_model_len": 8192 }, "client_parameters": { "model": "", @@ -463,8 +461,7 @@ "disable_log_stats": "", "disable_log_requests": "", "load_format": "dummy", - "max_model_len": 4096, - "compilation-config": "{'full_cuda_graph': true}" + "max_model_len": 8192 }, "client_parameters": { "model": "", diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json index 4b358bb2..d244eb81 100644 --- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -65,7 +65,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -77,7 +77,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } }, { @@ -89,7 +89,7 @@ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm", - "max_model_len": 4096 + "max_model_len": 8192 } } ] From 30e013828f47b3b8539037ccf92adc3ca5f6a588 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 7 Aug 2025 00:01:46 -0700 Subject: [PATCH 15/36] Another tweak Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index e776d457..2e27a4ae 
100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -226,10 +226,10 @@ jobs: # Run accuracy check docker exec -t "${container_name}" bash -c " - local tp + local tp=0 if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then tp=4 - elfi [[ $MODEL == 'openai/gpt-oss-20b' ]]; then + elif [[ $MODEL == 'openai/gpt-oss-20b' ]]; then tp=1 fi From ba28e1e1acce282b5623e6b6c940169aeec1a073 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 7 Aug 2025 00:18:01 -0700 Subject: [PATCH 16/36] Really? Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index 2e27a4ae..f8d1bf04 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -226,7 +226,7 @@ jobs: # Run accuracy check docker exec -t "${container_name}" bash -c " - local tp=0 + tp=0 if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then tp=4 elif [[ $MODEL == 'openai/gpt-oss-20b' ]]; then From 044f1c3f909e2c86cafe44286e5ab4bc5644982c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 7 Aug 2025 00:36:49 -0700 Subject: [PATCH 17/36] Debug Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 35 ++++++++++++++----------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index f8d1bf04..20380b07 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -209,23 +209,12 @@ jobs: "${DOCKER_IMAGE}" ) - # Run perf tests - docker exec -t "${container_name}" bash -c " - pushd vllm-benchmarks/vllm - cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true - if [[ $DEVICE_NAME != 'rocm' ]]; then - pip install -U openai transformers - pip install --pre vllm==0.10.1+gptoss \ - --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ - --extra-index-url https://download.pytorch.org/whl/nightly/cu128 - fi - pip freeze - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - popd - " - # Run accuracy check docker exec -t "${container_name}" bash -c " + set -eux + + echo $MODEL + tp=0 if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then tp=4 @@ -233,6 +222,7 @@ jobs: tp=1 fi + echo $tp # Prepare the accuracy test vllm serve $MODEL --tensor_parallel_size $tp & server_pid=$! 
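The readiness gate here only needs the server to start answering HTTP on
port 8000, which is why wait_for_server polls /v1/completions without
checking the status code. A stricter standalone variant, assuming the
default port and vLLM's GET /health endpoint, would insist on a healthy
response before the evals begin:

    wait_for_server() {
      # --fail makes curl return non-zero on HTTP errors, so the loop only
      # ends once /health reports healthy, within the same 20-minute budget
      timeout 1200 bash -c '
        until curl --silent --fail http://localhost:8000/health; do
          sleep 1
        done' && return 0 || return 1
    }
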
@@ -280,6 +270,21 @@ jobs: kill -9 $server_pid " + # Run perf tests + docker exec -t "${container_name}" bash -c " + pushd vllm-benchmarks/vllm + cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true + if [[ $DEVICE_NAME != 'rocm' ]]; then + pip install -U openai transformers + pip install --pre vllm==0.10.1+gptoss \ + --extra-index-url https://wheels.vllm.ai/gpt-oss/ \ + --extra-index-url https://download.pytorch.org/whl/nightly/cu128 + fi + pip freeze + bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + popd + " + - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200') From a17fe1a538ce1fc5342116c3af8240b4685afeeb Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 7 Aug 2025 01:00:08 -0700 Subject: [PATCH 18/36] Move the logic to a script Signed-off-by: Huy Do --- .../scripts/gpt-oss/run_accuracy_checks.sh | 57 ++++++++++++++ .github/scripts/gpt-oss/run_benchmarks.sh | 17 +++++ .github/workflows/gpt-oss-benchmark.yml | 74 +------------------ 3 files changed, 76 insertions(+), 72 deletions(-) create mode 100755 .github/scripts/gpt-oss/run_accuracy_checks.sh create mode 100755 .github/scripts/gpt-oss/run_benchmarks.sh diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh new file mode 100755 index 00000000..d1fb36a5 --- /dev/null +++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +set -eux + +tp=0 +if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then + tp=4 +elif [[ "${MODEL}" == "openai/gpt-oss-20b" ]]; then + tp=1 +fi + +echo $tp +# Prepare the accuracy test +vllm serve $MODEL --tensor_parallel_size $tp & +server_pid=$! + +wait_for_server() { + timeout 1200 bash -c ' + until curl -X POST localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +if wait_for_server; then + echo "vLLM server is up and running" +else + echo "vLLM failed to start within the timeout period" +fi + +pushd vllm-benchmarks/gpt-oss +mkdir -p /tmp/gpqa_openai + +# Low +OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort low \ + --n-threads $(expr $(nproc) / 2) + +# Mid +OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort medium \ + --n-threads $(expr $(nproc) / 2) + +# High +OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort high \ + --n-threads $(expr $(nproc) / 2) + +mv /tmp/gpqa_openai . 
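# The three gpt_oss.evals runs above differ only in --reasoning-effort; an
# equivalent loop form (same flags assumed) would keep them in sync:
#   for effort in low medium high; do
#     OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
#       --model $MODEL --eval gpqa --reasoning-effort $effort \
#       --n-threads $(expr $(nproc) / 2)
#   done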
From a17fe1a538ce1fc5342116c3af8240b4685afeeb Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 01:00:08 -0700
Subject: [PATCH 18/36] Move the logic to a script

Signed-off-by: Huy Do
---
 .../scripts/gpt-oss/run_accuracy_checks.sh | 57 ++++++++++++++
 .github/scripts/gpt-oss/run_benchmarks.sh  | 17 +++++
 .github/workflows/gpt-oss-benchmark.yml    | 74 +------------------
 3 files changed, 76 insertions(+), 72 deletions(-)
 create mode 100755 .github/scripts/gpt-oss/run_accuracy_checks.sh
 create mode 100755 .github/scripts/gpt-oss/run_benchmarks.sh

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
new file mode 100755
index 00000000..d1fb36a5
--- /dev/null
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+set -eux
+
+tp=0
+if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then
+  tp=4
+elif [[ "${MODEL}" == "openai/gpt-oss-20b" ]]; then
+  tp=1
+fi
+
+echo $tp
+# Prepare the accuracy test
+vllm serve $MODEL --tensor_parallel_size $tp &
+server_pid=$!
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+if wait_for_server; then
+  echo "vLLM server is up and running"
+else
+  echo "vLLM failed to start within the timeout period"
+fi
+
+pushd vllm-benchmarks/gpt-oss
+mkdir -p /tmp/gpqa_openai
+
+# Low
+OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  --model $MODEL \
+  --eval gpqa \
+  --reasoning-effort low \
+  --n-threads $(expr $(nproc) / 2)
+
+# Mid
+OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  --model $MODEL \
+  --eval gpqa \
+  --reasoning-effort medium \
+  --n-threads $(expr $(nproc) / 2)
+
+# High
+OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  --model $MODEL \
+  --eval gpqa \
+  --reasoning-effort high \
+  --n-threads $(expr $(nproc) / 2)
+
+mv /tmp/gpqa_openai .
+popd
+
+kill -9 $server_pid
diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
new file mode 100755
index 00000000..5cf25ad5
--- /dev/null
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -eux
+
+pushd vllm-benchmarks/vllm
+cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
+
+if [[ $DEVICE_NAME != 'rocm' ]]; then
+  pip install -U openai transformers
+  pip install --pre vllm==0.10.1+gptoss \
+    --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
+    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+fi
+
+pip freeze
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+popd
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index 20380b07..213e746a 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -210,80 +210,10 @@ jobs:
         )
 
         # Run accuracy check
-        docker exec -t "${container_name}" bash -c "
-          set -eux
-
-          echo $MODEL
-
-          tp=0
-          if [[ $MODEL == 'openai/gpt-oss-120b' ]]; then
-            tp=4
-          elif [[ $MODEL == 'openai/gpt-oss-20b' ]]; then
-            tp=1
-          fi
-
-          echo $tp
-          # Prepare the accuracy test
-          vllm serve $MODEL --tensor_parallel_size $tp &
-          server_pid=$!
-
-          wait_for_server() {
-            timeout 1200 bash -c '
-              until curl -X POST localhost:8000/v1/completions; do
-                sleep 1
-              done' && return 0 || return 1
-          }
-
-          if wait_for_server; then
-            echo 'vLLM server is up and running'
-          else
-            echo 'vLLM failed to start within the timeout period'
-          fi
-
-          pushd vllm-benchmarks/gpt-oss
-          mkdir -p /tmp/gpqa_openai
-
-          # Low
-          OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-            --model $MODEL \
-            --eval gpqa \
-            --reasoning-effort low \
-            --n-threads $(expr $(nproc) / 2)
-
-          # Mid
-          OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-            --model $MODEL \
-            --eval gpqa \
-            --reasoning-effort medium \
-            --n-threads $(expr $(nproc) / 2)
-
-          # High
-          OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-            --model $MODEL \
-            --eval gpqa \
-            --reasoning-effort high \
-            --n-threads $(expr $(nproc) / 2)
-
-          mv /tmp/gpqa_openai .
-          popd
-
-          kill -9 $server_pid
-        "
+        docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
 
         # Run perf tests
-        docker exec -t "${container_name}" bash -c "
-          pushd vllm-benchmarks/vllm
-          cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
-          if [[ $DEVICE_NAME != 'rocm' ]]; then
-            pip install -U openai transformers
-            pip install --pre vllm==0.10.1+gptoss \
-              --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
-              --extra-index-url https://download.pytorch.org/whl/nightly/cu128
-          fi
-          pip freeze
-          bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          popd
-        "
+        docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh
 
       - name: Authenticate with AWS
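The endpoint `wait_for_server` polls is vLLM's OpenAI-compatible completions API, so once the loop exits a manual smoke test looks roughly like this (hypothetical request, not part of the scripts):

curl -s http://localhost:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "openai/gpt-oss-20b", "prompt": "Say hello", "max_tokens": 8}'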
From a7a1664692664ce08936a290c59e5fa096d5aff2 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 01:11:34 -0700
Subject: [PATCH 19/36] Better now

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index 213e746a..f0ae3e47 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -209,12 +209,12 @@ jobs:
           "${DOCKER_IMAGE}"
         )
 
-        # Run accuracy check
-        docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
-
         # Run perf tests
         docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh
 
+        # Run accuracy check
+        docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
+
       - name: Authenticate with AWS
From 1ddea7677df9f1d5d1fb8af77aa4b3c1141e3963 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 01:50:45 -0700
Subject: [PATCH 20/36] You're an odd one, ain't you?

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_accuracy_checks.sh | 13 ++++++++++---
 .github/workflows/gpt-oss-benchmark.yml        |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index d1fb36a5..802dfd44 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -30,22 +30,29 @@ fi
 pushd vllm-benchmarks/gpt-oss
 mkdir -p /tmp/gpqa_openai
 
+# Not sure why this is needed on ROCm image
+if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+  ls -la gpt_oss
+  ls -la gpt_oss/evals
+  export PYTHONPATH=$(pwd):$PYTHONPATH
+fi
+
 # Low
-OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
   --model $MODEL \
   --eval gpqa \
   --reasoning-effort low \
   --n-threads $(expr $(nproc) / 2)
 
 # Mid
-OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
   --model $MODEL \
   --eval gpqa \
   --reasoning-effort medium \
   --n-threads $(expr $(nproc) / 2)
 
 # High
-OPENAI_API_KEY='' python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
   --model $MODEL \
   --eval gpqa \
   --reasoning-effort high \
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index f0ae3e47..ce4d92c4 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -22,7 +22,7 @@ jobs:
           - runner: linux.dgx.b200.8
             model: openai/gpt-oss-120b
             docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
-          - runner: linux.rocm.gpu.gfx942.4
+          - runner: linux.rocm.gpu.gfx942.8
             model: openai/gpt-oss-120b
             docker-image: rocm/vllm-dev:open-mi300-08052025
           # gpt-oss-20b
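The `PYTHONPATH` export above is the usual workaround when a checkout contains the `gpt_oss` package but it was never pip-installed into the image, which would explain why only the ROCm container needs it. A quick check along these lines (illustrative):

cd vllm-benchmarks/gpt-oss
# does the module resolve on its own, or only with the repo root on sys.path?
python3 -c 'import gpt_oss.evals' \
  || PYTHONPATH=$(pwd):$PYTHONPATH python3 -c 'import gpt_oss.evals'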
From d46a0a67a0fb4434bb8c6aad600e6758b30e7987 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 03:09:52 -0700
Subject: [PATCH 21/36] Another attempt

Signed-off-by: Huy Do
---
 .../scripts/gpt-oss/run_accuracy_checks.sh | 12 ++++++++
 .github/scripts/gpt-oss/run_benchmarks.sh  | 18 ++++++++++-
 .github/workflows/gpt-oss-benchmark.yml    | 30 -------------------
 3 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index 802dfd44..4de4cf01 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -2,6 +2,18 @@
 
 set -eux
 
+# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
+if [[ "${DEVICE_TYPE}" == *B200* ]]; then
+  export VLLM_USE_TRTLLM_ATTENTION=1
+  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
+  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
+  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
+elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
+  export VLLM_ROCM_USE_AITER=1
+  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
+  export VLLM_ROCM_USE_AITER_MHA=0
+fi
+
 tp=0
diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
index 5cf25ad5..8b74ae34 100755
--- a/.github/scripts/gpt-oss/run_benchmarks.sh
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -2,14 +2,30 @@
 
 set -eux
 
+# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
+if [[ "${DEVICE_TYPE}" == *B200* ]]; then
+  export VLLM_USE_TRTLLM_ATTENTION=1
+  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
+  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
+  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
+elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
+  export VLLM_ROCM_USE_AITER=1
+  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
+  export VLLM_ROCM_USE_AITER_MHA=0
+fi
+
 pushd vllm-benchmarks/vllm
 cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
 
-if [[ $DEVICE_NAME != 'rocm' ]]; then
+if [[ "${DEVICE_NAME}" != "rocm" ]]; then
   pip install -U openai transformers
   pip install --pre vllm==0.10.1+gptoss \
     --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
     --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+
+  export TORCH_CUDA_ARCH_LIST='8.9 9.0'
+  pip install --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" \
+    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
 fi
 
 pip freeze
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index ce4d92c4..bfe38209 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -161,29 +161,6 @@ jobs:
             aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
           fi
 
-          # https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
-          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
-            export VLLM_USE_TRTLLM_ATTENTION=1
-            export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
-            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
-            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
-          else
-            export VLLM_USE_TRTLLM_ATTENTION=0
-            export VLLM_USE_TRTLLM_DECODE_ATTENTION=0
-            export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0
-            export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0
-          fi
-
-          if [[ "${DEVICE_NAME}" == *rocm* ]]; then
-            export VLLM_ROCM_USE_AITER=1
-            export VLLM_USE_AITER_UNIFIED_ATTENTION=1
-            export VLLM_ROCM_USE_AITER_MHA=0
-          else
-            export VLLM_ROCM_USE_AITER=0
-            export VLLM_USE_AITER_UNIFIED_ATTENTION=0
-            export VLLM_ROCM_USE_AITER_MHA=0
-          fi
-
           container_name=$(docker run \
             ${GPU_FLAG:-} \
             -e MODEL \
@@ -192,13 +169,6 @@ jobs:
             -e HF_TOKEN \
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-            -e VLLM_USE_TRTLLM_ATTENTION \
-            -e VLLM_USE_TRTLLM_DECODE_ATTENTION \
-            -e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \
-            -e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \
-            -e VLLM_ROCM_USE_AITER \
-            -e VLLM_USE_AITER_UNIFIED_ATTENTION \
-            -e VLLM_ROCM_USE_AITER_MHA \
             --ipc=host \
             --tty \
             --detach \

From 898b35ec745dee63b29c62ff59bc7cb90bdaa47e Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 03:23:32 -0700
Subject: [PATCH 22/36] Need newer setuptools

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
index 8b74ae34..edea1005 100755
--- a/.github/scripts/gpt-oss/run_benchmarks.sh
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -18,7 +18,7 @@ pushd vllm-benchmarks/vllm
 cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
 
 if [[ "${DEVICE_NAME}" != "rocm" ]]; then
-  pip install -U openai transformers
+  pip install -U openai transformers setuptools
   pip install --pre vllm==0.10.1+gptoss \
     --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
     --extra-index-url https://download.pytorch.org/whl/nightly/cu128
From 78f14938f83acabbc6ba4c5706d9cdb0227d150e Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 03:48:22 -0700
Subject: [PATCH 23/36] Another try

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_accuracy_checks.sh | 6 +++---
 .github/scripts/gpt-oss/run_benchmarks.sh      | 6 ++----
 .github/workflows/gpt-oss-benchmark.yml        | 2 +-
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index 4de4cf01..296a722f 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -12,6 +12,8 @@ elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
   export VLLM_ROCM_USE_AITER=1
   export VLLM_USE_AITER_UNIFIED_ATTENTION=1
   export VLLM_ROCM_USE_AITER_MHA=0
+else
+  export VLLM_FLASH_ATTN_VERSION=2
 fi
 
 tp=0
@@ -44,9 +46,7 @@ mkdir -p /tmp/gpqa_openai
 
 # Not sure why this is needed on ROCm image
 if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-  ls -la gpt_oss
-  ls -la gpt_oss/evals
-  export PYTHONPATH=$(pwd):$PYTHONPATH
+  export PYTHONPATH=$(pwd)
 fi
 
 # Low
diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
index edea1005..2c74e6d0 100755
--- a/.github/scripts/gpt-oss/run_benchmarks.sh
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -12,6 +12,8 @@ elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
   export VLLM_ROCM_USE_AITER=1
   export VLLM_USE_AITER_UNIFIED_ATTENTION=1
   export VLLM_ROCM_USE_AITER_MHA=0
+else
+  export VLLM_FLASH_ATTN_VERSION=2
 fi
 
 pushd vllm-benchmarks/vllm
@@ -22,10 +24,6 @@ if [[ "${DEVICE_NAME}" != "rocm" ]]; then
   pip install --pre vllm==0.10.1+gptoss \
     --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
     --extra-index-url https://download.pytorch.org/whl/nightly/cu128
-
-  export TORCH_CUDA_ARCH_LIST='8.9 9.0'
-  pip install --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" \
-    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
 fi
 
 pip freeze
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index bfe38209..cd1722cf 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -16,7 +16,7 @@ jobs:
       matrix:
         include:
           # gpt-oss-120b
-          - runner: linux.aws.h100.4
+          - runner: linux.aws.h100.8
            model: openai/gpt-oss-120b
            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
From 577c1ab78e7509cfea10e720ed3048216ab74089 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 12:51:04 -0700
Subject: [PATCH 24/36] Increase the timeout to 12h

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_accuracy_checks.sh | 2 --
 .github/scripts/gpt-oss/run_benchmarks.sh      | 2 --
 .github/workflows/gpt-oss-benchmark.yml        | 5 +++--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index 296a722f..214193b0 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -12,8 +12,6 @@ elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
   export VLLM_ROCM_USE_AITER=1
   export VLLM_USE_AITER_UNIFIED_ATTENTION=1
   export VLLM_ROCM_USE_AITER_MHA=0
-else
-  export VLLM_FLASH_ATTN_VERSION=2
 fi
 
 tp=0
diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
index 2c74e6d0..14c2bf11 100755
--- a/.github/scripts/gpt-oss/run_benchmarks.sh
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -12,8 +12,6 @@ elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
   export VLLM_ROCM_USE_AITER=1
   export VLLM_USE_AITER_UNIFIED_ATTENTION=1
   export VLLM_ROCM_USE_AITER_MHA=0
-else
-  export VLLM_FLASH_ATTN_VERSION=2
 fi
 
 pushd vllm-benchmarks/vllm
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index cd1722cf..b6cfdc8e 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -41,6 +41,7 @@ jobs:
     permissions:
       id-token: write
       contents: read
+    timeout-minutes: 720
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -138,7 +139,7 @@ jobs:
           pushd vllm-benchmarks/vllm
           ls -lah .buildkite/nightly-benchmarks/tests
           find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
-          popd
+
 
       - name: Run vLLM gpt-oss benchmark
         env:
@@ -206,7 +207,7 @@ jobs:
           ls -lah "${BENCHMARK_RESULTS}"
 
           SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
-          SANITIZED_MODELS="${MODELS//\//_}"
+          SANITIZED_MODEL="${MODEL//\//_}"
 
           python3 .github/scripts/upload_benchmark_results.py \

From a7a1664692664ce08936a290c59e5fa096d5aff2 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 15:15:23 -0700
Subject: [PATCH 25/36] Another round

Signed-off-by: Huy Do
---
 .../scripts/gpt-oss/run_accuracy_checks.sh | 52 +++++++++++--------
 .github/workflows/gpt-oss-benchmark.yml    |  8 +++
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index 214193b0..7fe8e47b 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -44,30 +44,38 @@ mkdir -p /tmp/gpqa_openai
 
 # Not sure why this is needed on ROCm image
 if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-  export PYTHONPATH=$(pwd)
+  pushd gpt_oss
+  # Low
+  OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval gpqa \
+    --reasoning-effort low \
+    --n-threads $(expr $(nproc) / 2)
+  popd
+else
+  sleep 7200
+  # Low
+  #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  #  --model $MODEL \
+  #  --eval gpqa \
+  #  --reasoning-effort low \
+  #  --n-threads $(expr $(nproc) / 2)
+  #
+  # Mid
+  #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  #  --model $MODEL \
+  #  --eval gpqa \
+  #  --reasoning-effort medium \
+  #  --n-threads $(expr $(nproc) / 2)
+  #
+  ## High
+  #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+  #  --model $MODEL \
+  #  --eval gpqa \
+  #  --reasoning-effort high \
+  #  --n-threads $(expr $(nproc) / 2)
 fi
 
-# Low
-OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-  --model $MODEL \
-  --eval gpqa \
-  --reasoning-effort low \
-  --n-threads $(expr $(nproc) / 2)
-
-# Mid
-OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-  --model $MODEL \
-  --eval gpqa \
-  --reasoning-effort medium \
-  --n-threads $(expr $(nproc) / 2)
-
-# High
-OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-  --model $MODEL \
-  --eval gpqa \
-  --reasoning-effort high \
-  --n-threads $(expr $(nproc) / 2)
-
 mv /tmp/gpqa_openai .
 popd
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index b6cfdc8e..3500a13d 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -162,6 +162,12 @@ jobs:
             aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
           fi
 
+          # Leaving 1GB for the runner and other things
+          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
+          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
+          # comes from https://github.com/pytorch/test-infra/pull/6058
+          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
+
           container_name=$(docker run \
             ${GPU_FLAG:-} \
             -e MODEL \
@@ -170,6 +176,8 @@ jobs:
             -e HF_TOKEN \
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
+            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
             --ipc=host \
             --tty \
             --detach \
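The memory arithmetic patch 25 adds, stepped through with an assumed 1.5 TB host (values illustrative): `%.*` truncates the awk output to whole gigabytes, and the swap headroom is a fixed 3 GB on top of the limit.

awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo
# -> 1511.219  (total RAM in GB, minus 1 GB reserved for the runner)
TOTAL_AVAILABLE_MEMORY_IN_GB=1511.219
echo "${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}"           # -> 1511, used for --memory=1511g
echo $(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))  # -> 1514, used for --memory-swap=1514g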
From 9cab5277f5ed7c4b4fc618b2c46b9ba4a6d27ca7 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 16:34:28 -0700
Subject: [PATCH 26/36] It seems to work now

Signed-off-by: Huy Do
---
 .../scripts/gpt-oss/run_accuracy_checks.sh | 15 +++++++-------
 .../benchmarks/rocm/latency-tests.json     | 11 ----------
 .../benchmarks/rocm/serving-tests.json     | 20 -------------------
 .../benchmarks/rocm/throughput-tests.json  | 12 -----------
 4 files changed, 7 insertions(+), 51 deletions(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index 7fe8e47b..67f0e03a 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -42,8 +42,8 @@ pushd vllm-benchmarks/gpt-oss
 mkdir -p /tmp/gpqa_openai
 
-# Not sure why this is needed on ROCm image
 if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+  # Not sure why this is needed on ROCm image
   pushd gpt_oss
   # Low
   OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
@@ -53,14 +53,13 @@ if [[ "${DEVICE_NAME}" == "rocm" ]]; then
     --n-threads $(expr $(nproc) / 2)
   popd
 else
-  sleep 7200
   # Low
-  #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
-  #  --model $MODEL \
-  #  --eval gpqa \
-  #  --reasoning-effort low \
-  #  --n-threads $(expr $(nproc) / 2)
-  #
+  OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval gpqa \
+    --reasoning-effort low \
+    --n-threads $(expr $(nproc) / 2)
+
   # Mid
   #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
   #  --model $MODEL \
diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json
index 30b5b83f..f1bc3498 100644
--- a/vllm-benchmarks/benchmarks/rocm/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json
@@ -72,16 +72,5 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
-    },
-    {
-        "test_name": "latency_gpt_oss_120b_tp4",
-        "parameters": {
-            "model": "openai/gpt-oss-120b",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15,
-            "max_model_len": 8192
-        }
     }
 ]
diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
index b9e814e0..30924f30 100644
--- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
@@ -450,25 +450,5 @@
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
-    },
-    {
-        "test_name": "serving_gpt_oss_120b_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "openai/gpt-oss-120b",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy",
-            "max_model_len": 8192
-        },
-        "client_parameters": {
-            "model": "",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
     }
 ]
diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
index d244eb81..72272748 100644
--- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
@@ -79,17 +79,5 @@
             "backend": "vllm",
             "max_model_len": 8192
         }
-    },
-    {
-        "test_name": "throughput_gpt_oss_120b_tp4",
-        "parameters": {
-            "model": "openai/gpt-oss-120b",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm",
-            "max_model_len": 8192
-        }
     }
 ]
"./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } - }, - { - "test_name": "serving_gpt_oss_120b_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "openai/gpt-oss-120b", - "tensor_parallel_size": 4, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy", - "max_model_len": 8192 - }, - "client_parameters": { - "model": "", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json index d244eb81..72272748 100644 --- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -79,17 +79,5 @@ "backend": "vllm", "max_model_len": 8192 } - }, - { - "test_name": "throughput_gpt_oss_120b_tp4", - "parameters": { - "model": "openai/gpt-oss-120b", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm", - "max_model_len": 8192 - } } ] From 585b42bbc9f28f958bf99b04b155feb310ff592d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 7 Aug 2025 18:01:21 -0700 Subject: [PATCH 27/36] Let's run everything now Signed-off-by: Huy Do --- .../scripts/gpt-oss/run_accuracy_checks.sh | 40 +++++++++++++------ .../benchmarks/cuda/serving-tests.json | 2 +- .../benchmarks/rocm/latency-tests.json | 11 +++++ .../benchmarks/rocm/serving-tests.json | 24 ++++++++++- .../benchmarks/rocm/throughput-tests.json | 12 ++++++ 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh index 67f0e03a..8d081529 100755 --- a/.github/scripts/gpt-oss/run_accuracy_checks.sh +++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh @@ -43,7 +43,7 @@ pushd vllm-benchmarks/gpt-oss mkdir -p /tmp/gpqa_openai if [[ "${DEVICE_NAME}" == "rocm" ]]; then - # Not sure why this is needed on ROCm image + # Not sure why this is needed on ROCm pushd gpt_oss # Low OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ @@ -51,6 +51,20 @@ if [[ "${DEVICE_NAME}" == "rocm" ]]; then --eval gpqa \ --reasoning-effort low \ --n-threads $(expr $(nproc) / 2) + + # Mid + OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort medium \ + --n-threads $(expr $(nproc) / 2) + + # High + OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort high \ + --n-threads $(expr $(nproc) / 2) popd else # Low @@ -61,18 +75,18 @@ else --n-threads $(expr $(nproc) / 2) # Mid - #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ - # --model $MODEL \ - # --eval gpqa \ - # --reasoning-effort medium \ - # --n-threads $(expr $(nproc) / 2) - # - ## High - #OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ - # --model $MODEL \ - # --eval gpqa \ - # --reasoning-effort high \ - # --n-threads $(expr $(nproc) / 2) + OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + --reasoning-effort medium \ + --n-threads $(expr $(nproc) / 2) + + # High + OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \ + --model $MODEL \ + --eval gpqa \ + 
From 9960023529931de97bdb238819b0674caefad7de Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 7 Aug 2025 23:27:36 -0700
Subject: [PATCH 28/36] Use bigger runners

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml            |  6 +++---
 vllm-benchmarks/benchmarks/rocm/serving-tests.json | 13 ++++++++-----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index 3500a13d..ad6e0594 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -26,13 +26,13 @@ jobs:
             model: openai/gpt-oss-120b
             docker-image: rocm/vllm-dev:open-mi300-08052025
           # gpt-oss-20b
-          - runner: linux.aws.h100
+          - runner: linux.aws.h100.4
             model: openai/gpt-oss-20b
             docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
-          - runner: linux.dgx.b200
+          - runner: linux.dgx.b200.8
             model: openai/gpt-oss-20b
             docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
-          - runner: linux.rocm.gpu.gfx942.2
+          - runner: linux.rocm.gpu.gfx942.8
             model: openai/gpt-oss-20b
             docker-image: rocm/vllm-dev:open-mi300-08052025
           fail-fast: false
diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
index ed53e686..95a6a7d3 100644
--- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
@@ -421,7 +421,8 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "max_model_len": 8192,
+            "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
             "model": "",
@@ -437,11 +438,12 @@
         "server_parameters": {
             "model": "openai/gpt-oss-120b",
             "tensor_parallel_size": 2,
-            "swap_space": 128,
+            "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "max_model_len": 8192,
+            "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
             "model": "",
@@ -457,11 +459,12 @@
         "server_parameters": {
             "model": "openai/gpt-oss-120b",
             "tensor_parallel_size": 4,
-            "swap_space": 128,
+            "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "max_model_len": 8192,
+            "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
             "model": "",
From 623d2939d17a64d104759a8725213eb815977bad Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 10:28:11 -0700
Subject: [PATCH 29/36] Another round of perf benchmark

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml            | 2 +-
 vllm-benchmarks/benchmarks/cuda/serving-tests.json | 6 +++---
 vllm-benchmarks/benchmarks/rocm/serving-tests.json | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index ad6e0594..a9adb78a 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -192,7 +192,7 @@ jobs:
           docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh
 
           # Run accuracy check
-          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
+          # docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
 
       - name: Authenticate with AWS
diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json
index 01f59aaf..3ebdcadb 100644
--- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json
@@ -425,7 +425,7 @@
             "max_model_len": 8192
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-20b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -445,7 +445,7 @@
             "max_model_len": 8192
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-120b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -465,7 +465,7 @@
             "max_model_len": 8192
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-120b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
index 95a6a7d3..e07c562b 100644
--- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
@@ -425,7 +425,7 @@
             "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-20b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -446,7 +446,7 @@
             "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-120b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -467,7 +467,7 @@
             "compilation-config": "{'full_cuda_graph': true}"
         },
         "client_parameters": {
-            "model": "",
+            "model": "openai/gpt-oss-120b",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
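For reference, those `client_parameters` are ultimately fed to vLLM's serving benchmark client; filled in, the invocation looks roughly like this (an approximation of what the harness runs, not a verbatim command):

python3 benchmarks/benchmark_serving.py \
  --backend vllm \
  --model openai/gpt-oss-120b \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --request-rate inf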
"model": "", + "model": "openai/gpt-oss-120b", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json index 95a6a7d3..e07c562b 100644 --- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -425,7 +425,7 @@ "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { - "model": "", + "model": "openai/gpt-oss-20b", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -446,7 +446,7 @@ "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { - "model": "", + "model": "openai/gpt-oss-120b", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -467,7 +467,7 @@ "compilation-config": "{'full_cuda_graph': true}" }, "client_parameters": { - "model": "", + "model": "openai/gpt-oss-120b", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 12375c8c576c06ac54ea0f218dbaa2b8ff164079 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 8 Aug 2025 10:32:07 -0700 Subject: [PATCH 30/36] Why CI not running Signed-off-by: Huy Do --- .github/workflows/gpt-oss-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml index a9adb78a..93890e96 100644 --- a/.github/workflows/gpt-oss-benchmark.yml +++ b/.github/workflows/gpt-oss-benchmark.yml @@ -191,7 +191,7 @@ jobs: # Run perf tests docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh - # Run accuracy check + # Run accuracy check (turning this on later if needed) # docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh - name: Authenticate with AWS From ce29b20c681bb9bf5b2f31bc282d5334d9552d0d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 8 Aug 2025 11:59:13 -0700 Subject: [PATCH 31/36] Benchmark aime25 Signed-off-by: Huy Do --- .github/scripts/gpt-oss/run_accuracy_checks.sh | 14 ++++++++------ .github/scripts/gpt-oss/run_benchmarks.sh | 3 ++- .github/workflows/gpt-oss-benchmark.yml | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh index 8d081529..bb65ab70 100755 --- a/.github/scripts/gpt-oss/run_accuracy_checks.sh +++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh @@ -41,6 +41,7 @@ fi pushd vllm-benchmarks/gpt-oss mkdir -p /tmp/gpqa_openai +mkdir -p /tmp/aime25_openai if [[ "${DEVICE_NAME}" == "rocm" ]]; then # Not sure why this is needed on ROCm @@ -48,21 +49,21 @@ if [[ "${DEVICE_NAME}" == "rocm" ]]; then # Low OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ --model $MODEL \ - --eval gpqa \ + --eval aime25 \ --reasoning-effort low \ --n-threads $(expr $(nproc) / 2) # Mid OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ --model $MODEL \ - --eval gpqa \ + --eval aime25 \ --reasoning-effort medium \ --n-threads $(expr $(nproc) / 2) # High OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \ --model $MODEL \ - --eval gpqa \ + --eval \ --reasoning-effort high \ --n-threads $(expr $(nproc) / 2) popd @@ -70,26 +71,27 @@ else # Low OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url 
From 8cb19eed1db57d5b039b743abbe0474c7bbc617b Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 11:59:56 -0700
Subject: [PATCH 32/36] Small bug

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_accuracy_checks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
index bb65ab70..900d9fc3 100755
--- a/.github/scripts/gpt-oss/run_accuracy_checks.sh
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -63,7 +63,7 @@ if [[ "${DEVICE_NAME}" == "rocm" ]]; then
   # High
   OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
     --model $MODEL \
-    --eval \
+    --eval aime25 \
     --reasoning-effort high \
     --n-threads $(expr $(nproc) / 2)
   popd

From d22c815319950108cfa303124476fa2034031798 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 13:05:35 -0700
Subject: [PATCH 33/36] Make upload benchmark results optional

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index daa866da..1f08f57c 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -205,6 +205,7 @@ jobs:
           aws-region: us-east-1
 
       - name: Upload the benchmark results
+        continue-on-error: true
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODEL: ${{ matrix.model }}

From 0514931347905f05b0999c1d5cebc6256418ea43 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 13:11:17 -0700
Subject: [PATCH 34/36] Ugh

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index 1f08f57c..3d01118f 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -218,6 +218,9 @@ jobs:
           SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
           SANITIZED_MODEL="${MODEL//\//_}"
 
+          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
+          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV
+
           python3 .github/scripts/upload_benchmark_results.py \
             --repo vllm-benchmarks/vllm \
             --benchmark-name "vLLM benchmark" \
@@ -226,9 +229,6 @@ jobs:
             --device-type "${SANITIZED_DEVICE_TYPE}" \
             --model "${MODEL//\//_}"
 
-          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
-          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV
-
       # Keep a copy of the benchmark results on GitHub for reference
       - uses: actions/upload-artifact@v4
        with:
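The reshuffle in patch 34 matters because the step runs under `set -eux`: once the upload command fails (now tolerated by `continue-on-error`), nothing after it executes, so the `GITHUB_ENV` writes have to happen first or the artifact step below loses its name variables. The pattern in miniature (illustrative):

set -e
echo "SANITIZED_MODEL=${SANITIZED_MODEL}" >> "${GITHUB_ENV}"  # safe: runs before the risky command
false                                                         # stand-in for a failing upload
echo "NEVER_SET=1" >> "${GITHUB_ENV}"                         # skipped under set -e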
From c2f3dc70c204e375b1dfc49a3b248e3d052b6c03 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 16:17:03 -0700
Subject: [PATCH 35/36] Darn it, I forgot to upload the results

Signed-off-by: Huy Do
---
 .github/workflows/gpt-oss-benchmark.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
index 3d01118f..d4a292d2 100644
--- a/.github/workflows/gpt-oss-benchmark.yml
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -241,3 +241,4 @@ jobs:
           name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
           path: |
             vllm-benchmarks/gpt-oss/gpqa_openai
+            vllm-benchmarks/gpt-oss/aime25_openai

From d64186643ed8fafbef91a6f3e6f2dbbf7dff109d Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 8 Aug 2025 17:26:39 -0700
Subject: [PATCH 36/36] Let's just keep perf run then

Signed-off-by: Huy Do
---
 .github/scripts/gpt-oss/run_benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
index 4895647a..c186023b 100755
--- a/.github/scripts/gpt-oss/run_benchmarks.sh
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -26,5 +26,5 @@ fi
 
 pip freeze
 # Just run accuracy tests for now
-# bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 popd