   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
   timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -73,13 +73,15 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
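
Note for reviewers of this pipeline: every entry under `steps:` follows the same shape, pairing `source_file_dependencies` (path prefixes that gate whether the step is scheduled for a change) with a list of shell `commands`, plus scheduling fields such as `no_gpu`, `gpu`, `timeout_in_minutes`, `agent_pool`, and `mirror_hardwares`. A minimal sketch of that shape, with a hypothetical label and paths:

```yaml
# Hypothetical step; the field names are the ones used throughout this file.
- label: Example CPU-only Test    # name shown in the CI UI
  timeout_in_minutes: 10          # hard cap before the step is killed
  no_gpu: true                    # schedule on a CPU-only agent
  source_file_dependencies:       # only run when these paths are touched
  - vllm/example/
  - tests/example
  commands:                       # shell commands, executed in order
  - pytest -v -s example
```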
@@ -390,6 +392,15 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
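
The new B200 step disables the FlashInfer prefill path for a single pytest invocation by prefixing the command with an environment assignment. That `KEY=value cmd` form is plain POSIX shell: the variable exists only in the environment of that one process. A sketch under that assumption (the second command is illustrative):

```yaml
commands:
# The prefix exports VLLM_DISABLE_FLASHINFER_PREFILL only into this
# pytest process; it is not set for later commands in the step.
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
# A later command in the same step no longer sees the variable.
- python3 -c 'import os; print(os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL"))'
```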
@@ -529,7 +540,7 @@ steps:
   - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
   timeout_in_minutes: 20
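
Several hunks in this diff adjust pytest `-k` filters, and two details are worth keeping in mind: `-k` takes a boolean expression matched against test IDs as substrings, and the command is wrapped in double quotes whenever the expression could trip the YAML parser (for example a token starting with `-`, as the comment above notes). A sketch of both points using the filter from this hunk:

```yaml
commands:
# Plain commands can stay unquoted scalars...
- pytest -v -s compile/fullgraph/test_full_graph.py
# ...but quote the scalar when the -k expression contains tokens such as
# "+quant_fp8". -k matches substrings of test IDs, so this keeps TRITON
# cases and drops any parametrization whose ID contains "+quant_fp8".
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8'"
```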
@@ -694,7 +705,7 @@ steps:
   - vllm/model_executor/models/whisper.py
   commands: # LMEval
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/

 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -995,12 +1006,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  # - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

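The nightly-Transformers job now narrows scope in two different ways, both visible above: `-k 'not (A or B)'` deselects any test whose ID contains one of the listed names while the rest of the file keeps running, whereas commenting an entry out disables it wholesale. Side by side, using only commands from this hunk:

```yaml
commands:
# 1) Deselect by test-ID substring; everything else in the file still runs.
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
# 2) Comment the whole entry out; nothing in it runs until re-enabled.
# - pytest -v -s tests/models/multimodal/processing/
```
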
@@ -1045,7 +1056,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -1066,7 +1077,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
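
Also note the `path::test_name` form used for the relocated kv-scale test: a pytest node ID selects exactly one test item directly, with no substring matching involved, unlike `-k`:

```yaml
commands:
# A node ID (file::function) runs a single test item directly.
- pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
```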
@@ -1088,15 +1101,13 @@ steps:
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   agent_pool: mi325_1
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
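
Per the inline comment, `optional: true` keeps the ROCm eval out of the default per-PR run (it is picked up by the nightly schedule instead), and the step now mirrors onto both AMD pools rather than production only. The gating fields in isolation, values copied from this hunk:

```yaml
- label: ROCm GPT-OSS Eval
  agent_pool: mi325_1
  mirror_hardwares: [amdexperimental, amdproduction] # also queue on these AMD pools
  optional: true # skipped on PRs, run on nightlies
```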
@@ -1416,7 +1427,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
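
The data-parallel example above pins itself to two specific GPUs: `CUDA_VISIBLE_DEVICES=1,2` masks the process down to physical devices 1 and 2, which CUDA then re-enumerates as devices 0 and 1, matching `--dp-size=2`. A minimal, runnable check of that masking on any multi-GPU host:

```yaml
commands:
# Only physical GPUs 1 and 2 are visible, so torch reports two devices,
# enumerated inside the process as cuda:0 and cuda:1.
- CUDA_VISIBLE_DEVICES=1,2 python3 -c 'import torch; print(torch.cuda.device_count())'
```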