diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index dc7281c7..366647a1 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -53,7 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - RUNNERS: ${{ inputs.runners || '' }} + RUNNERS: ${{ inputs.runners || 'h100' }} run: | set -eux diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 9e9f15f8..c397a5bc 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -50,5 +50,27 @@ "num_iters": 15, "max_model_len": 8192 } + }, + { + "test_name": "latency_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 66b7c4de..59e1a659 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -411,5 +411,45 @@ "random_input_len": 30720, "random_output_len": 100 } + }, + { + "test_name": "serving_gpt_oss_20b_tp1_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "openai/gpt-oss-20b", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } + }, + { + "test_name": "serving_gpt_oss_120b_tp4_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "openai/gpt-oss-120b", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 647ac2f3..f159426e 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -55,5 +55,29 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_gpt_oss_20b_tp1", + "parameters": { + "model": "openai/gpt-oss-20b", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_gpt_oss_120b_tp4", + "parameters": { + "model": "openai/gpt-oss-120b", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ]