diff --git a/.github/scripts/gpt-oss/run_accuracy_checks.sh b/.github/scripts/gpt-oss/run_accuracy_checks.sh
new file mode 100755
index 00000000..900d9fc3
--- /dev/null
+++ b/.github/scripts/gpt-oss/run_accuracy_checks.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -eux
+
+# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
+if [[ "${DEVICE_TYPE}" == *B200* ]]; then
+  export VLLM_USE_TRTLLM_ATTENTION=1
+  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
+  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
+  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
+elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
+  export VLLM_ROCM_USE_AITER=1
+  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
+  export VLLM_ROCM_USE_AITER_MHA=0
+fi
+
+tp=0
+if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then
+  tp=4
+elif [[ "${MODEL}" == "openai/gpt-oss-20b" ]]; then
+  tp=1
+fi
+
+echo $tp
+# Prepare the accuracy test
+vllm serve $MODEL --tensor_parallel_size $tp &
+server_pid=$!
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+if wait_for_server; then
+  echo "vLLM server is up and running"
+else
+  echo "vLLM failed to start within the timeout period"
+fi
+
+pushd vllm-benchmarks/gpt-oss
+mkdir -p /tmp/gpqa_openai
+mkdir -p /tmp/aime25_openai
+
+if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+  # Not sure why this is needed on ROCm
+  pushd gpt_oss
+  # Low
+  OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort low \
+    --n-threads $(expr $(nproc) / 2)
+
+  # Mid
+  OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort medium \
+    --n-threads $(expr $(nproc) / 2)
+
+  # High
+  OPENAI_API_KEY="" python3 -mevals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort high \
+    --n-threads $(expr $(nproc) / 2)
+  popd
+else
+  # Low
+  OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort low \
+    --n-threads $(expr $(nproc) / 2)
+
+  # Mid
+  OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort medium \
+    --n-threads $(expr $(nproc) / 2)
+
+  # High
+  OPENAI_API_KEY="" python3 -m gpt_oss.evals --base-url http://localhost:8000/v1 \
+    --model $MODEL \
+    --eval aime25 \
+    --reasoning-effort high \
+    --n-threads $(expr $(nproc) / 2)
+fi
+
+mv /tmp/gpqa_openai .
+mv /tmp/aime25_openai .
+popd
+
+kill -9 $server_pid
diff --git a/.github/scripts/gpt-oss/run_benchmarks.sh b/.github/scripts/gpt-oss/run_benchmarks.sh
new file mode 100755
index 00000000..c186023b
--- /dev/null
+++ b/.github/scripts/gpt-oss/run_benchmarks.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -eux
+
+# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
+if [[ "${DEVICE_TYPE}" == *B200* ]]; then
+  export VLLM_USE_TRTLLM_ATTENTION=1
+  export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
+  export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
+  export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
+elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
+  export VLLM_ROCM_USE_AITER=1
+  export VLLM_USE_AITER_UNIFIED_ATTENTION=1
+  export VLLM_ROCM_USE_AITER_MHA=0
+fi
+
+pushd vllm-benchmarks/vllm
+cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
+
+if [[ "${DEVICE_NAME}" != "rocm" ]]; then
+  pip install -U openai transformers setuptools
+  pip install --pre vllm==0.10.1+gptoss \
+    --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
+    --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+fi
+
+pip freeze
+# Just run the performance benchmarks for now
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+popd
diff --git a/.github/workflows/gpt-oss-benchmark.yml b/.github/workflows/gpt-oss-benchmark.yml
new file mode 100644
index 00000000..d4a292d2
--- /dev/null
+++ b/.github/workflows/gpt-oss-benchmark.yml
@@ -0,0 +1,244 @@
+name: gpt-oss benchmark
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/gpt-oss-benchmark.yml
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  benchmarks:
+    name: Run gpt-oss benchmarks
+    strategy:
+      matrix:
+        include:
+          # gpt-oss-120b
+          - runner: linux.aws.h100.8
+            model: openai/gpt-oss-120b
+            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
+          - runner: linux.dgx.b200.8
+            model: openai/gpt-oss-120b
+            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
+          - runner: linux.rocm.gpu.gfx942.8
+            model: openai/gpt-oss-120b
+            docker-image: rocm/vllm-dev:open-mi300-08052025
+          # gpt-oss-20b
+          - runner: linux.aws.h100.4
+            model: openai/gpt-oss-20b
+            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
+          - runner: linux.dgx.b200.8
+            model: openai/gpt-oss-20b
+            docker-image: 'public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:6d8d0a24c02bfd84d46b3016b865a44f048ae84b'
+          - runner: linux.rocm.gpu.gfx942.8
+            model: openai/gpt-oss-20b
+            docker-image: rocm/vllm-dev:open-mi300-08052025
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    environment: pytorch-x-vllm
+    permissions:
+      id-token: write
+      contents: read
+    timeout-minutes: 720
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Checkout vLLM repository
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: vllm-benchmarks/vllm
+
+      - name: Checkout gpt-oss repository
+        uses: actions/checkout@v4
+        with:
+          repository: openai/gpt-oss
+          path: vllm-benchmarks/gpt-oss
+
+      - uses: actions/setup-python@v5
+        # Amazon Linux fails on this step
+        continue-on-error: true
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Check if the device is supported
+        shell: bash
+        run: |
+          set -eux
+
+          if command -v nvidia-smi; then
+            DEVICE_NAME=cuda
+            nvidia-smi
+          elif command -v rocm-smi; then
+            DEVICE_NAME=rocm
+            rocm-smi
+          else
+            DEVICE_NAME=cpu
+            lscpu
+          fi
+          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
+
+      - name: Set GPU name and type
+        working-directory: vllm-benchmarks
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
+          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+          fi
+          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          else
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/cu128
+          fi
+
+      - name: Setup CUDA GPU_FLAG for docker run
+        if: env.DEVICE_NAME == 'cuda'
+        run: |
+          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup ROCm
+        if: env.DEVICE_NAME == 'rocm'
+        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
+
+      - name: Setup benchmark tests
+        env:
+          MODEL: ${{ matrix.model }}
+        run: |
+          set -eux
+
+          pushd vllm-benchmarks/vllm
+          rm .buildkite/nightly-benchmarks/tests/*.json
+          popd
+
+          # Set the list of benchmarks we want to cover in this runner
+          python3 .github/scripts/setup_vllm_benchmark.py \
+            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
+            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
+            --models "${MODEL}" \
+            --device "${DEVICE_NAME}"
+
+          pushd vllm-benchmarks/vllm
+          ls -lah .buildkite/nightly-benchmarks/tests
+          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
+
+
+      - name: Run vLLM gpt-oss benchmark
+        env:
+          # To login to public.ecr.aws
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          DOCKER_IMAGE: ${{ matrix.docker-image }}
+          # vLLM-related environment variables
+          ENGINE_VERSION: v1
+          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          MODEL: ${{ matrix.model }}
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_TYPE}" == *B200* ]]; then
+            # Just to unblock this change on B200
+            aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
+            aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
+            aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+          fi
+
+          # Leaving 1GB for the runner and other things
+          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
+          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
+          # comes from https://github.com/pytorch/test-infra/pull/6058
+          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e MODEL \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e HF_TOKEN \
+            -e ENGINE_VERSION \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
+            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=4g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+
+          # Run perf tests
+          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_benchmarks.sh
+
+          # Run accuracy checks
+          docker exec -t "${container_name}" bash .github/scripts/gpt-oss/run_accuracy_checks.sh
+
+      - name: Authenticate with AWS
+        # AWS CUDA runners already have access to the bucket via its runner IAM role
+        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      - name: Upload the benchmark results
+        continue-on-error: true
+        env:
+          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
+          MODEL: ${{ matrix.model }}
+        run: |
+          set -eux
+
+          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
+          ls -lah "${BENCHMARK_RESULTS}"
+
+          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
+          SANITIZED_MODEL="${MODEL//\//_}"
+
+          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
+          echo "SANITIZED_MODEL=$SANITIZED_MODEL" >> $GITHUB_ENV
+
+          python3 .github/scripts/upload_benchmark_results.py \
+            --repo vllm-benchmarks/vllm \
+            --benchmark-name "vLLM benchmark" \
+            --benchmark-results "${BENCHMARK_RESULTS}" \
+            --device-name "${DEVICE_NAME}" \
+            --device-type "${SANITIZED_DEVICE_TYPE}" \
+            --model "${MODEL//\//_}"
+
+      # Keep a copy of the benchmark results on GitHub for reference
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
+          path: vllm-benchmarks/vllm/benchmarks/results
+
+      # Keep a copy of the accuracy results on GitHub for reference
+      - uses: actions/upload-artifact@v4
+        with:
+          name: accuracy-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODEL }}
+          path: |
+            vllm-benchmarks/gpt-oss/gpqa_openai
+            vllm-benchmarks/gpt-oss/aime25_openai
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index dc7281c7..dd5dbeb0 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -26,10 +26,6 @@ on:
         required: true
         type: string
         default: h100,rocm,spr,b200
-  pull_request:
-    paths:
-      - .github/workflows/vllm-benchmark.yml
-      - vllm-benchmarks/**
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
index 9e9f15f8..30b5b83f 100644
--- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -50,5 +50,38 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_gpt_oss_20b_tp1",
+    "parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_gpt_oss_120b_tp2",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_gpt_oss_120b_tp4",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json
index 66b7c4de..3ebdcadb 100644
--- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json
@@ -411,5 +411,65 @@
       "random_input_len": 30720,
       "random_output_len": 100
     }
+  },
+  {
+    "test_name": "serving_gpt_oss_20b_tp1_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-20b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_gpt_oss_120b_tp2_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_gpt_oss_120b_tp4_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
   }
 ]
diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
index 647ac2f3..d244eb81 100644
--- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json
@@ -55,5 +55,41 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_gpt_oss_20b_tp1",
+    "parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_gpt_oss_120b_tp2",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_gpt_oss_120b_tp4",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]
diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json
index 9e9f15f8..30b5b83f 100644
--- a/vllm-benchmarks/benchmarks/rocm/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json
@@ -50,5 +50,38 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_gpt_oss_20b_tp1",
+    "parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_gpt_oss_120b_tp2",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_gpt_oss_120b_tp4",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
index 7b32d384..e07c562b 100644
--- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json
@@ -410,5 +410,68 @@
       "random_input_len": 1024,
       "random_output_len": 2048
     }
+  },
+  {
+    "test_name": "serving_gpt_oss_20b_tp1_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192,
+      "compilation-config": "{'full_cuda_graph': true}"
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-20b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_gpt_oss_120b_tp2_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192,
+      "compilation-config": "{'full_cuda_graph': true}"
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_gpt_oss_120b_tp4_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy",
+      "max_model_len": 8192,
+      "compilation-config": "{'full_cuda_graph': true}"
+    },
+    "client_parameters": {
+      "model": "openai/gpt-oss-120b",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
   }
 ]
diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
index 647ac2f3..d244eb81 100644
--- a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
+++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json
@@ -55,5 +55,41 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_gpt_oss_20b_tp1",
+    "parameters": {
+      "model": "openai/gpt-oss-20b",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_gpt_oss_120b_tp2",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 2,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_gpt_oss_120b_tp4",
+    "parameters": {
+      "model": "openai/gpt-oss-120b",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]