Commits (53)
5d3294e
[ROCm][CI] Installation test modifications and improvements
AndreasKaratzas Nov 17, 2025
65e6376
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 18, 2025
6364378
[ROCm][CI] fix for pytorch/pytorch standalone tests
AndreasKaratzas Nov 19, 2025
d3ff04b
Merge upstream/main into akaratza_ci
AndreasKaratzas Nov 19, 2025
16ebdd0
[ROCm][CI] Merged reviews
AndreasKaratzas Nov 19, 2025
42f4b6c
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 19, 2025
2722576
[ROCm][CI] Fixed assertion condition for prebuilt wheels
AndreasKaratzas Nov 19, 2025
a5b0106
[ROCm][CI] Merged reviews
AndreasKaratzas Nov 19, 2025
0754972
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 19, 2025
b1f171f
ROCm CI fixes: FlexAttention backend support and test adjustments
AndreasKaratzas Nov 20, 2025
1a55574
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 20, 2025
f23cf89
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 20, 2025
72e3a0f
ROCm CI fixes: LoRA related adjustments and whisper test fixes.
AndreasKaratzas Nov 21, 2025
d7776cf
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 21, 2025
0e62494
Properly renamed the attn backend fixtures
AndreasKaratzas Nov 21, 2025
2a4c027
[ROCm][CI] Changed to flex attention for cross-attention
AndreasKaratzas Nov 21, 2025
aa2a7f7
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 21, 2025
e8334d9
[ROCm][CI] Keeping AITER FA attention for whisper pending #28376
AndreasKaratzas Nov 21, 2025
eda5676
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 21, 2025
694f8f4
[ROCm][CI] Increased timeout window for video tests
AndreasKaratzas Nov 21, 2025
4913b2d
[ROCm][CI] Vision tests were not tailored for ROCm backend
AndreasKaratzas Nov 21, 2025
14c82d4
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 21, 2025
e48274a
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 21, 2025
315c44e
[ROCm][CI] Resolved
AndreasKaratzas Nov 22, 2025
dc94057
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 22, 2025
32a944b
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 22, 2025
84f899b
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 23, 2025
75f7a93
[ROCm][CI] Resolved
AndreasKaratzas Nov 23, 2025
702a498
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 23, 2025
f6b2fdb
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 24, 2025
2ce0caf
Added Triton encoder only self attention support
AndreasKaratzas Nov 24, 2025
788765c
Merge remote-tracking branch 'upstream/main' into akaratza_encoder_at…
AndreasKaratzas Nov 24, 2025
b31f035
Added FlexAttention logic
AndreasKaratzas Nov 24, 2025
a7a09cb
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 24, 2025
d93f049
Fixes and other entrypoint tests on ROCm
AndreasKaratzas Nov 24, 2025
146856c
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 24, 2025
3990acd
Synced Triton kernel with upstream and slightly modified versioning f…
AndreasKaratzas Nov 25, 2025
9ad6c9d
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 25, 2025
099d05f
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 25, 2025
c10401f
[Bugfix] Both AITER and AITER unified attention need to be set
AndreasKaratzas Nov 25, 2025
18e3c68
Merge remote-tracking branch 'upstream/main' into akaratza_ci
AndreasKaratzas Nov 25, 2025
880e698
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Nov 27, 2025
63372cb
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Nov 28, 2025
8565bf0
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Nov 28, 2025
db5f486
Docker and test dependencies reconfiguration
AndreasKaratzas Nov 29, 2025
c662331
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Nov 29, 2025
564a75b
Removed duplicate definition
AndreasKaratzas Nov 29, 2025
576aca9
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Dec 1, 2025
4a7d0fd
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Dec 2, 2025
57e14cc
Removed v1 package copy from final Docker stage
AndreasKaratzas Dec 2, 2025
2ce5704
Reverted xgrammar to a specific version
AndreasKaratzas Dec 2, 2025
c6883f0
Merge remote-tracking branch 'origin/main' into akaratza_ci
AndreasKaratzas Dec 2, 2025
d3e751d
[ROCm] Simplified ROCM_LDS_SIZE fallback in flex attention
AndreasKaratzas Dec 2, 2025
3 changes: 3 additions & 0 deletions .buildkite/test-amd.yaml
@@ -46,6 +46,9 @@ steps:
source_file_dependencies:
- requirements/nightly_torch_test.txt
commands:
# NOTE: We are going to skip this test on ROCm platform
# as we don't use pytorch nightly builds on ROCm. We
# only use stable PyTorch releases built with ROCm support.
- bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 10min
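For reference, the same gate could be expressed directly in a pytest suite. A minimal sketch, assuming `vllm.platforms.current_platform` (the same helper the test diffs below use); the test name and body are illustrative:

```python
# Minimal sketch of a ROCm skip gate, mirroring the NOTE above.
# Assumes vllm.platforms.current_platform, as used in the test diffs below.
import pytest
from vllm.platforms import current_platform


@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="ROCm CI uses stable PyTorch releases, not nightly builds",
)
def test_pytorch_nightly_dependency():
    import torch  # would assert nightly-specific versions on CUDA runners

    assert torch.__version__
```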
17 changes: 16 additions & 1 deletion docker/Dockerfile.rocm
@@ -88,10 +88,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace

# install development dependencies (for testing)
RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install pytest-shard

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Copy in the v1 package
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
Review comment (Collaborator): Looks kinda hacky, why do we need it for tests, but not for the normal distribution?


# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src && mv vllm src/vllm

# -----------------------
# Final vLLM image
FROM base AS final
@@ -116,6 +128,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \
&& uv pip install --system *.whl

# Copy in the v1 package
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

ARG COMMON_WORKDIR

# Copy over the benchmark scripts as well
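The `src/` move above exists so that tests exercise the installed wheel rather than the checked-out source tree. A quick sanity check along these lines, a sketch assuming the Debian-style `dist-packages` layout targeted by the COPY:

```python
# Sketch: confirm `import vllm` resolves to the installed wheel under
# dist-packages, not the in-tree source that was hidden inside src/.
import vllm

assert "dist-packages" in vllm.__file__, (
    f"vllm imported from unexpected location: {vllm.__file__}"
)
print(f"vllm resolved from {vllm.__file__}")
```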
24 changes: 21 additions & 3 deletions docker/Dockerfile.rocm_base
@@ -5,6 +5,8 @@ ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2"
@@ -23,6 +25,7 @@ ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV HSA_NO_SCRATCH_RECLAIM=1

ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}

RUN mkdir -p /app
WORKDIR /app
@@ -45,6 +48,7 @@ RUN apt-get update -y \
&& python3 --version && python3 -m pip --version

RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*

FROM base AS build_triton
ARG TRITON_BRANCH
@@ -66,20 +70,30 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO

RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
pip install -r requirements.txt && git submodule update --init --recursive \
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
&& pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
Review comment on lines 88 to 93 (Contributor, severity: high): To reduce the number of Docker image layers and improve build efficiency, it's recommended to combine the git clone and the subsequent build commands for torchaudio into a single RUN instruction:

```dockerfile
RUN git clone ${PYTORCH_AUDIO_REPO} audio && cd audio \
    && git checkout ${PYTORCH_AUDIO_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
```

RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/audio/dist/*.whl /app/install

FROM base AS build_fa
ARG FA_BRANCH
@@ -130,6 +144,8 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
@@ -141,6 +157,8 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
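The final stage pins every repo and branch into `/app/versions.txt`. A sketch of how CI could read that manifest back for provenance checks; the `KEY: value` format follows the echo chain above, and the helper name is an assumption:

```python
# Sketch: parse /app/versions.txt ("KEY: value" per line, as written by
# the echo commands above) into a dict.
from pathlib import Path


def read_versions(path: str = "/app/versions.txt") -> dict[str, str]:
    pins: dict[str, str] = {}
    for line in Path(path).read_text().splitlines():
        key, sep, value = line.partition(": ")
        if sep:  # skip malformed lines
            pins[key] = value
    return pins


# e.g. read_versions()["PYTORCH_AUDIO_BRANCH"] == "v2.9.0"
```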
72 changes: 53 additions & 19 deletions requirements/rocm-test.txt
@@ -1,41 +1,75 @@
# Common dependencies
-r common.txt

# Test infrastructure
tblib==3.1.0
bm25s==0.2.13
pystemmer==3.0.0

# Entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
anyio==4.6.2.post1
# via
# httpx
# starlette

# Audio processing dependencies
audioread==3.0.1
# via librosa
cffi==1.17.1
# via soundfile
decorator==5.2.1
# via librosa
lazy-loader==0.4
# via librosa
platformdirs==4.3.6
# via pooch
pooch==1.8.2
#pycparse==2.22
# via librosa
soundfile==0.13.1
# via librosa
soxr==0.5.0.post1
# via librosa
librosa==0.10.2.post1

# Entrypoints test
#vllm[video] # required by entrypoints/openai/test_video.py
decord==0.6.0
# HTTP/async dependencies
aiohttp==3.13.0
# via gpt-oss
pytest-asyncio==0.24.0
# via httpx

# Entrypoints test
#sentence-transformers # required by entrypoints/openai/test_score.py
sentence-transformers==3.4.1

# Basic Models Test
matplotlib==3.10.3
# Retrieval and search
bm25s==0.2.13
# via mteb
pystemmer==3.0.0
# via mteb

# Multi-Modal Models Test (Extended) 3
blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
# video processing, required by entrypoints/openai/test_video.py

# Required for openai schema test.
# OpenAI compatibility and testing
gpt-oss==0.0.8
# OpenAI compatibility tests
schemathesis==3.39.15
# OpenAI schema test

# Required for mteb test
# Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# eval tests
mteb[bm25s]>=1.38.11, <2
# MTEB Benchmark Test
sentence-transformers==3.4.1
# required by entrypoints/openai/test_score.py

# Required for eval tests
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# Visualization and plotting
matplotlib==3.10.3
# Basic Models Test

# Data processing
multiprocess==0.70.16
# Datasets and Evaluate Test

# Utilities
num2words==0.5.14
# via lm-eval
pqdm==0.2.0
# via lm-eval
terratorch==1.1.1
68 changes: 40 additions & 28 deletions setup.py
@@ -49,15 +49,15 @@ def load_module_from_path(module_name, path):
sys.platform,
)
VLLM_TARGET_DEVICE = "empty"
elif (
sys.platform.startswith("linux")
and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None
):
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
# fallback to cpu
VLLM_TARGET_DEVICE = "cpu"
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
if torch.version.hip is not None:
VLLM_TARGET_DEVICE = "rocm"
logger.info("Auto-detected ROCm")
elif torch.version.cuda is not None:
VLLM_TARGET_DEVICE = "cuda"
logger.info("Auto-detected CUDA")
else:
VLLM_TARGET_DEVICE = "cpu"
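Stated standalone, the new detection order checks HIP before CUDA because ROCm builds of PyTorch set `torch.version.hip` and leave `torch.version.cuda` unset; a sketch:

```python
# Standalone sketch of the auto-detection order introduced above.
import torch


def detect_target_device() -> str:
    if torch.version.hip is not None:
        return "rocm"  # ROCm wheels set torch.version.hip
    if torch.version.cuda is not None:
        return "cuda"
    return "cpu"  # no GPU toolchain detected: fall back to CPU
```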


def is_sccache_available() -> bool:
@@ -115,20 +115,26 @@ def compute_num_jobs(self):
num_jobs = os.cpu_count()

nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
if _is_cuda() and CUDA_HOME is not None:
try:
nvcc_version = get_nvcc_cuda_version()
if nvcc_version >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.",
nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)

return num_jobs, nvcc_threads
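As a worked example of the division above: with 32 CPUs and `NVCC_THREADS=4`, the build runs 8 parallel compile jobs so nvcc's internal threads do not oversubscribe the machine (numbers are illustrative):

```python
# Illustrative numbers for the num_jobs / nvcc_threads split above.
import os

num_jobs = os.cpu_count() or 1                           # e.g. 32
nvcc_threads = int(os.getenv("NVCC_THREADS", "1") or 1)  # e.g. 4
num_jobs = max(1, num_jobs // nvcc_threads)              # 32 // 4 -> 8
```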

@@ -206,9 +212,9 @@ def configure(self, ext: CMakeExtension) -> None:
# Default build tool to whatever cmake picks.
build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip():
elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]

other_cmake_args = os.environ.get("CMAKE_ARGS")
@@ -318,7 +324,9 @@ class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries."""

def run(self) -> None:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
assert _is_cuda() or _is_hip(), (
"VLLM_USE_PRECOMPILED is only supported for CUDA or ROCm builds."
)

def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.")
@@ -490,6 +498,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file():
return None
@@ -656,7 +666,9 @@ def _read_requirements(filename: str) -> list[str]:

if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when
@@ -679,7 +691,7 @@ def _read_requirements(filename: str) -> list[str]:

# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds."
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
wheel_url = wheel_location
23 changes: 18 additions & 5 deletions tests/entrypoints/openai/test_chat.py
@@ -242,11 +242,13 @@ async def test_more_than_one_prompt_logprobs_chat(
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
# Finish reason may be "length" or "stop" on ROCm due to different tokenization
from vllm.platforms import current_platform

messages = [
{"role": "system", "content": "you are a helpful assistant"},
{"role": "user", "content": "what is 1+1?"},
]

# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
@@ -259,10 +261,21 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47
)

if current_platform.is_rocm():
assert choice.finish_reason in ["length", "stop"]
if choice.finish_reason == "length":
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47
)
else:
assert chat_completion.usage.completion_tokens <= 10
assert chat_completion.usage.prompt_tokens == 37
else:
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47
)

message = choice.message
assert message.content is not None and len(message.content) >= 10
12 changes: 8 additions & 4 deletions tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -499,11 +499,15 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
# Code interpreter needs more time for container init + code execution
# Extend timeout especially for ROCm
from vllm.platforms import current_platform

timeout_value = client.timeout * 3 if current_platform.is_rocm() else client.timeout
client_with_timeout = client.with_options(timeout=timeout_value)

response = await client_with_timeout.responses.create(
model=model_name,
# TODO: Ideally should be able to set max tool calls
# to prevent multi-turn, but it is not currently supported
# would speed up the test
input=(
"What's the first 4 digits after the decimal point of "
"cube root of `19910212 * 20250910`? "
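`with_options` here is the OpenAI SDK's standard way to derive a client with overridden settings; the original client is untouched. A usage sketch (the base URL and key are placeholders, and the timeout is passed as a plain float so it can be scaled):

```python
# Sketch: derive a longer-timeout client without mutating the original.
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8000/v1", api_key="EMPTY", timeout=120.0
)
slow_client = client.with_options(timeout=client.timeout * 3)  # 360s copy
```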