vllm-project · AndreasKaratzas · Nov 17, 2025 · Nov 18, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -46,6 +46,9 @@ steps:
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
   commands:
+  # NOTE: This test is meant to be skipped on the ROCm platform, because
+  # ROCm does not use PyTorch nightly builds; only stable PyTorch releases
+  # built with ROCm support are used.
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
 - label: Async Engine, Inputs, Utils, Worker Test # 10min

@@ -88,10 +88,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 
 # install development dependencies (for testing)
 RUN cd /vllm-workspace \
-    && rm -rf vllm \
     && python3 -m pip install -e tests/vllm_test_utils \
     && python3 -m pip install pytest-shard
 
+# Enable fast downloads from the Hugging Face Hub via hf_transfer (for testing)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system hf_transfer
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# Copy the vllm/v1 sources from the build context into the installed vllm package
+COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
+
+# The source code is used by the `python_only_compile.sh` test.
+# It is moved under `src/` so that other tests do not import it
+# in place of the installed package.
+RUN mkdir src && mv vllm src/vllm
+
 # -----------------------
 # Final vLLM image
 FROM base AS final
@@ -116,6 +128,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
+# Copy the vllm/v1 sources from the build context into the installed vllm package
+COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
+
 ARG COMMON_WORKDIR
 
 # Copy over the benchmark scripts as well

@@ -5,6 +5,8 @@ ARG PYTORCH_BRANCH="1c57644d"
 ARG PYTORCH_VISION_BRANCH="v0.23.0"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
+ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
+ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
 ARG AITER_BRANCH="59bd8ff2"
@@ -23,6 +25,7 @@ ENV AITER_ROCM_ARCH=gfx942;gfx950
 ENV HSA_NO_SCRATCH_RECLAIM=1
 
 ARG PYTHON_VERSION=3.12
+ENV PYTHON_VERSION=${PYTHON_VERSION}
 
 RUN mkdir -p /app
 WORKDIR /app
@@ -45,6 +48,7 @@ RUN apt-get update -y \
     && python3 --version && python3 -m pip --version
 
 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
+# Image/audio system libraries (libjpeg, SoX).
+# NOTE(review): presumably needed by the vision/audio source builds - confirm.
+RUN apt-get update \
+    && apt-get install -y \
+        libjpeg-dev \
+        libsox-dev \
+        libsox-fmt-all \
+        sox \
+    && rm -rf /var/lib/apt/lists/*
 
 FROM base AS build_triton
 ARG TRITON_BRANCH
@@ -66,20 +70,30 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
 FROM base AS build_pytorch
 ARG PYTORCH_BRANCH
 ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_AUDIO_BRANCH
 ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
+ARG PYTORCH_AUDIO_REPO
+
 RUN git clone ${PYTORCH_REPO} pytorch
-RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
-    pip install -r requirements.txt && git submodule update --init --recursive \
+RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
+    && pip install -r requirements.txt && git submodule update --init --recursive \
     && python3 tools/amd_build/build_amd.py \
     && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
     && pip install dist/*.whl
 RUN git clone ${PYTORCH_VISION_REPO} vision
 RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
     && python3 setup.py bdist_wheel --dist-dir=dist \
     && pip install dist/*.whl
+RUN git clone ${PYTORCH_AUDIO_REPO} audio
+RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
-    && cp /app/vision/dist/*.whl /app/install
+    && cp /app/vision/dist/*.whl /app/install \
+    && cp /app/audio/dist/*.whl /app/install
 
 FROM base AS build_fa
 ARG FA_BRANCH
@@ -130,6 +144,8 @@ ARG PYTORCH_BRANCH
 ARG PYTORCH_VISION_BRANCH
 ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
+ARG PYTORCH_AUDIO_BRANCH
+ARG PYTORCH_AUDIO_REPO
 ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
@@ -141,6 +157,8 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
     && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
+    && echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
+    && echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
     && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \

@@ -1,43 +1,62 @@
 # Common dependencies
 -r common.txt
+
+# Test infrastructure
 tblib==3.1.0
-bm25s==0.2.13
-pystemmer==3.0.0
+pytest==8.3.5
+pytest-asyncio==0.24.0
+pytest-timeout==2.3.1
+pytest-cov==6.3.0
+pytest-forked==1.6.0
+pytest-rerunfailures==14.0
+pytest-shard==0.1.2
+
+# Async/HTTP dependencies
+anyio==4.6.2.post1
+    # via httpx, starlette
+aiohttp==3.13.0
+    # via gpt-oss
+httpx==0.27.2
+    # HTTP testing
 
-# Entrypoints test
-# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
+# Audio processing dependencies
 audioread==3.0.1
+    # via librosa
 cffi==1.17.1
+    # via soundfile
 decorator==5.2.1
+    # via librosa
 lazy-loader==0.4
+    # via librosa
 platformdirs==4.3.6
+    # via pooch
 pooch==1.8.2
-#pycparse==2.22
+    # via librosa
 soundfile==0.13.1
+    # via librosa
 soxr==0.5.0.post1
+    # via librosa
 librosa==0.10.2.post1
 
-# Entrypoints test
-#vllm[video] # required by entrypoints/openai/test_video.py
-decord==0.6.0
-
-# Entrypoints test
-#sentence-transformers # required by entrypoints/openai/test_score.py
-sentence-transformers==3.4.1
-
-# Basic Models Test
-matplotlib==3.10.3
+# Retrieval and search
+bm25s==0.2.13
+    # via mteb
+pystemmer==3.0.0
+    # via mteb
 
-# Multi-Modal Models Test (Extended) 3
+# Multi-modal processing
 blobfile==3.0.0
+    # Multi-Modal Models Test
+decord==0.6.0
+    # video processing, required by entrypoints/openai/test_video.py
 
-# Required for openai schema test.
+# OpenAI compatibility and testing
+gpt-oss==0.0.8
+    # OpenAI compatibility tests
 schemathesis==3.39.15
+    # OpenAI schema test
 
-# Required for mteb test
-mteb[bm25s]>=1.38.11, <2
-
-# Required for eval tests
+# Evaluation and benchmarking
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
 
 # Required for multiprocessed tests that use spawn method
@@ -46,6 +65,20 @@ multiprocess==0.70.16
 # Plugins test
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
 torchgeo==0.7.0
+    # via terratorch
+# MTEB Benchmark Test
+mteb==2.1.2
+# Data processing
+multiprocess==0.70.16
+    # Datasets and Evaluate Test
+xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628
+    # Test async scheduling
+
+# Utilities
+num2words==0.5.14
+    # via lm-eval
+pqdm==0.2.0
+    # via lm-eval
 
 # Required for suffix decoding test
 arctic-inference == 0.1.1
diff --git a/setup.py b/setup.py
@@ -49,15 +49,15 @@ def load_module_from_path(module_name, path):
         sys.platform,
     )
     VLLM_TARGET_DEVICE = "empty"
-elif (
-    sys.platform.startswith("linux")
-    and torch.version.cuda is None
-    and os.getenv("VLLM_TARGET_DEVICE") is None
-    and torch.version.hip is None
-):
-    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
-    # fallback to cpu
-    VLLM_TARGET_DEVICE = "cpu"
+elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
+    # VLLM_TARGET_DEVICE is not set in the environment: choose the target
+    # from the installed torch build. HIP is checked before CUDA, and CPU is
+    # the fallback when torch reports neither.
+    if torch.version.hip is not None:
+        VLLM_TARGET_DEVICE = "rocm"
+        logger.info("Auto-detected ROCm")
+    elif torch.version.cuda is not None:
+        VLLM_TARGET_DEVICE = "cuda"
+        logger.info("Auto-detected CUDA")
+    else:
+        VLLM_TARGET_DEVICE = "cpu"
+        # Log the fallback too, so every auto-detected outcome is visible
+        # in the build output.
+        logger.info("Neither ROCm nor CUDA detected; falling back to CPU")
 
 
 def is_sccache_available() -> bool:
@@ -103,20 +103,26 @@ def compute_num_jobs(self):
                 num_jobs = os.cpu_count()
 
         nvcc_threads = None
-        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
-            # `nvcc_threads` is either the value of the NVCC_THREADS
-            # environment variable (if defined) or 1.
-            # when it is set, we reduce `num_jobs` to avoid
-            # overloading the system.
-            nvcc_threads = envs.NVCC_THREADS
-            if nvcc_threads is not None:
-                nvcc_threads = int(nvcc_threads)
-                logger.info(
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
-                )
-            else:
-                nvcc_threads = 1
-            num_jobs = max(1, num_jobs // nvcc_threads)
+        if _is_cuda() and CUDA_HOME is not None:
+            try:
+                nvcc_version = get_nvcc_cuda_version()
+                if nvcc_version >= Version("11.2"):
+                    # `nvcc_threads` is either the value of the NVCC_THREADS
+                    # environment variable (if defined) or 1.
+                    # when it is set, we reduce `num_jobs` to avoid
+                    # overloading the system.
+                    nvcc_threads = envs.NVCC_THREADS
+                    if nvcc_threads is not None:
+                        nvcc_threads = int(nvcc_threads)
+                        logger.info(
+                            "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                            nvcc_threads,
+                        )
+                    else:
+                        nvcc_threads = 1
+                    num_jobs = max(1, num_jobs // nvcc_threads)
+            except Exception as e:
+                logger.warning("Failed to get NVCC version: %s", e)
 
         return num_jobs, nvcc_threads
 
@@ -194,9 +200,9 @@ def configure(self, ext: CMakeExtension) -> None:
             # Default build tool to whatever cmake picks.
             build_tool = []
         # Make sure we use the nvcc from CUDA_HOME
-        if _is_cuda():
+        if _is_cuda() and CUDA_HOME is not None:
             cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
-        elif _is_hip():
+        elif _is_hip() and ROCM_HOME is not None:
             cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
 
         other_cmake_args = os.environ.get("CMAKE_ARGS")
@@ -306,7 +312,9 @@ class precompiled_build_ext(build_ext):
     """Disables extension building when using precompiled binaries."""
 
     def run(self) -> None:
-        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+        assert _is_cuda() or _is_hip(), (
+            "VLLM_USE_PRECOMPILED is only supported for CUDA or ROCm builds."
+        )
 
     def build_extensions(self) -> None:
         print("Skipping build_ext: using precompiled extensions.")
@@ -478,6 +486,8 @@ def get_rocm_version():
     # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
     # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
     try:
+        if ROCM_HOME is None:
+            return None
         librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
         if not librocm_core_file.is_file():
             return None
@@ -622,7 +632,9 @@ def _read_requirements(filename: str) -> list[str]:
 
 if _is_cuda():
     ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+    if envs.VLLM_USE_PRECOMPILED or (
+        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
+    ):
         # FA3 requires CUDA 12.3 or later
         ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
         # Optional since this doesn't get built (produce an .so file) when
@@ -645,7 +657,7 @@ def _read_requirements(filename: str) -> list[str]:
 
 # If using precompiled, extract and patch package_data (in advance of setup)
 if envs.VLLM_USE_PRECOMPILED:
-    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds."
     wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
     if wheel_location is not None:
         wheel_url = wheel_location

@@ -1,10 +1,42 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 
 from vllm.assets.audio import AudioAsset
 
 
+@pytest.fixture(scope="module")
+def rocm_aiter_fa_attention():
+    """
+    Force VLLM_ATTENTION_BACKEND=ROCM_AITER_FA on ROCm for the duration
+    of a test module.
+
+    On non-ROCm platforms the fixture does nothing. On ROCm, the previous
+    value of the variable (or its absence) is restored at teardown.
+    """
+    from vllm.platforms import current_platform
+
+    if not current_platform.is_rocm():
+        yield
+        return
+
+    env_var = "VLLM_ATTENTION_BACKEND"
+    old_backend = os.environ.get(env_var)
+    os.environ[env_var] = "ROCM_AITER_FA"
+    try:
+        yield
+    finally:
+        if old_backend is None:
+            # pop() instead of del: if a test already removed the variable,
+            # teardown must not fail with KeyError.
+            os.environ.pop(env_var, None)
+        else:
+            os.environ[env_var] = old_backend
+
+
+def pytest_collection_modifyitems(session, config, items):
+    """Auto-use rocm_aiter_fa_attention fixture for specific test files.
+
+    Every collected item whose node ID contains one of the target file
+    names gets the fixture name added to its ``fixturenames`` list.
+    ``session`` and ``config`` are part of the hook signature and unused.
+    """
+    fixture_name = "rocm_aiter_fa_attention"
+    target_files = (
+        "test_transcription_validation.py",
+        "test_translation_validation.py",
+    )
+    for item in items:
+        node_id = item.nodeid
+        if not node_id or not any(name in node_id for name in target_files):
+            continue
+        # Skip names that are already present so a list that already holds
+        # the fixture (or is shared between items) gets no duplicates.
+        if fixture_name not in item.fixturenames:
+            item.fixturenames.append(fixture_name)
+
+
 @pytest.fixture
 def mary_had_lamb():
     path = AudioAsset("mary_had_lamb").get_local_path()