Skip to content

Commit d46a0a6

Browse files
committed
Another attempt
Signed-off-by: Huy Do <huydhn@gmail.com>
1 parent 1ddea76 commit d46a0a6

File tree

3 files changed

+29
-31
lines changed

3 files changed

+29
-31
lines changed

.github/scripts/gpt-oss/run_accuracy_checks.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22

33
set -eux
44

5+
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
6+
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
7+
export VLLM_USE_TRTLLM_ATTENTION=1
8+
export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
9+
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
10+
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
11+
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
12+
export VLLM_ROCM_USE_AITER=1
13+
export VLLM_USE_AITER_UNIFIED_ATTENTION=1
14+
export VLLM_ROCM_USE_AITER_MHA=0
15+
fi
16+
517
tp=0
618
if [[ "${MODEL}" == "openai/gpt-oss-120b" ]]; then
719
tp=4

.github/scripts/gpt-oss/run_benchmarks.sh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,30 @@
22

33
set -eux
44

5+
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
6+
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
7+
export VLLM_USE_TRTLLM_ATTENTION=1
8+
export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
9+
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
10+
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
11+
elif [[ "${DEVICE_NAME}" == *rocm* ]]; then
12+
export VLLM_ROCM_USE_AITER=1
13+
export VLLM_USE_AITER_UNIFIED_ATTENTION=1
14+
export VLLM_ROCM_USE_AITER_MHA=0
15+
fi
16+
517
pushd vllm-benchmarks/vllm
618
cp vllm/benchmarks/lib/utils.py /app/vllm-os-mini/vllm/benchmarks/utils.py || true
719

8-
if [[ $DEVICE_NAME != 'rocm' ]]; then
20+
if [[ "${DEVICE_NAME}" != "rocm" ]]; then
921
pip install -U openai transformers
1022
pip install --pre vllm==0.10.1+gptoss \
1123
--extra-index-url https://wheels.vllm.ai/gpt-oss/ \
1224
--extra-index-url https://download.pytorch.org/whl/nightly/cu128
25+
26+
export TORCH_CUDA_ARCH_LIST='8.9 9.0'
27+
pip install --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" \
28+
--extra-index-url https://download.pytorch.org/whl/nightly/cu128
1329
fi
1430

1531
pip freeze

.github/workflows/gpt-oss-benchmark.yml

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -161,29 +161,6 @@ jobs:
161161
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
162162
fi
163163
164-
# https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
165-
if [[ "${DEVICE_TYPE}" == *B200* ]]; then
166-
export VLLM_USE_TRTLLM_ATTENTION=1
167-
export VLLM_USE_TRTLLM_DECODE_ATTENTION=1
168-
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=1
169-
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
170-
else
171-
export VLLM_USE_TRTLLM_ATTENTION=0
172-
export VLLM_USE_TRTLLM_DECODE_ATTENTION=0
173-
export VLLM_USE_TRTLLM_CONTEXT_ATTENTION=0
174-
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=0
175-
fi
176-
177-
if [[ "${DEVICE_NAME}" == *rocm* ]]; then
178-
export VLLM_ROCM_USE_AITER=1
179-
export VLLM_USE_AITER_UNIFIED_ATTENTION=1
180-
export VLLM_ROCM_USE_AITER_MHA=0
181-
else
182-
export VLLM_ROCM_USE_AITER=0
183-
export VLLM_USE_AITER_UNIFIED_ATTENTION=0
184-
export VLLM_ROCM_USE_AITER_MHA=0
185-
fi
186-
187164
container_name=$(docker run \
188165
${GPU_FLAG:-} \
189166
-e MODEL \
@@ -192,13 +169,6 @@ jobs:
192169
-e HF_TOKEN \
193170
-e ENGINE_VERSION \
194171
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
195-
-e VLLM_USE_TRTLLM_ATTENTION \
196-
-e VLLM_USE_TRTLLM_DECODE_ATTENTION \
197-
-e VLLM_USE_TRTLLM_CONTEXT_ATTENTION \
198-
-e VLLM_USE_FLASHINFER_MXFP4_BF16_MOE \
199-
-e VLLM_ROCM_USE_AITER \
200-
-e VLLM_USE_AITER_UNIFIED_ATTENTION \
201-
-e VLLM_ROCM_USE_AITER_MHA \
202172
--ipc=host \
203173
--tty \
204174
--detach \

0 commit comments

Comments (0)