Skip to content
43 changes: 40 additions & 3 deletions tests/v1/e2e/test_async_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_without_spec_decoding(
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)


def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
def test_with_eagle3_spec_decoding(monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
Expand Down Expand Up @@ -111,6 +111,42 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)


def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    """Test ngram_gpu speculative decoding across scheduling combinations.

    A single ngram_gpu speculative config (3 speculative tokens, prompt
    lookup window of 2 to 3) is exercised alongside one run without
    speculative decoding, while varying:
    - preemption on/off
    - executor ("mp" / "uni")
    - async scheduling on/off
    - prefill chunking on/off
    """

    # The only speculative config used here; every run below that enables
    # speculative decoding shares it.
    ngram_gpu_config = {
        "method": "ngram_gpu",
        "num_speculative_tokens": 3,
        "prompt_lookup_max": 3,
        "prompt_lookup_min": 2,
    }

    # Each tuple is:
    # (test_preemption, executor, async_scheduling,
    #  spec_config, test_prefill_chunking)
    test_configs = [
        # No speculative decoding (presumably the reference run that the
        # others are compared against -- confirm in run_tests).
        (False, "mp", False, None, False),
        # ngram_gpu with synchronous scheduling.
        (False, "mp", False, ngram_gpu_config, False),
        (True, "mp", False, ngram_gpu_config, True),
        # ngram_gpu with async scheduling.
        (False, "mp", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, False),
        (True, "uni", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, True),
    ]

    # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
    # and ngram_gpu doesn't require a specific draft model.
    # The last argument is a one-element list holding an empty dict, i.e.
    # no sampling-parameter overrides are supplied.
    run_tests(monkeypatch, MODEL, test_configs, [{}])


@dynamo_config.patch(cache_size_limit=16)
def run_tests(
monkeypatch: pytest.MonkeyPatch,
Expand Down Expand Up @@ -222,18 +258,19 @@ def run_test(
else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
spec_method = (spec_config or {}).get("method", "none")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
with VllmRunner(
model,
max_model_len=512,
max_model_len=4096,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
Expand Down
7 changes: 7 additions & 0 deletions vllm/compilation/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,13 @@ def __call__(
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(self.inductor_config)

# TODO(patchy): ngram gpu kernel will cause vllm torch compile cache errors.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? Can this be fixed?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I enabled torch.compile in the ngram GPU kernel. The computational graph for the ngram operator would then hit a precompiled graph that was cached for the main model, leading to mismatched computational graph results. Therefore, I disabled the compile cache here directly. I tested this locally, and disabling the cache had no impact on performance.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume disabling the compile cache would lead to longer startup time? I'm not an expert here but maybe it's possible to add an identifier to the compile cache to avoid extraneous cache hits?

is_ngram_gpu_enabled = (
vllm_config.speculative_config
and vllm_config.speculative_config.method == "ngram_gpu"
)
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
else:
Expand Down
9 changes: 7 additions & 2 deletions vllm/config/speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@
"pangu_ultra_moe_mtp",
]
# Method names for EAGLE-style speculative decoding: "eagle", "eagle3",
# plus every member of MTPModelTypes.
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
# Method name for ngram_gpu speculative decoding. Kept as its own alias so
# callers can test membership with typing.get_args(NgramGPUTypes).
NgramGPUTypes = Literal["ngram_gpu"]
# Every accepted speculative decoding method name, including the members of
# the two aliases above.
SpeculativeMethod = Literal[
"ngram",
"medusa",
"mlp_speculator",
"draft_model",
"suffix",
EagleModelTypes,
NgramGPUTypes,
]


Expand Down Expand Up @@ -259,6 +261,8 @@ def __post_init__(self):
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "ngram_gpu":
self.model = "ngram_gpu"
elif self.method == "suffix":
self.model = "suffix"
else:
Expand All @@ -273,9 +277,10 @@ def __post_init__(self):
):
self.method = "ngram"

if self.method in ("ngram", "[ngram]"):
if self.method in ("ngram", "[ngram]", "ngram_gpu"):
# Unified to "ngram" internally
self.method = "ngram"
if self.method in ("ngram", "[ngram]"):
self.method = "ngram"
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
Expand Down
8 changes: 5 additions & 3 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pydantic.dataclasses import dataclass

import vllm.envs as envs
from vllm.config.speculative import EagleModelTypes
from vllm.config.speculative import EagleModelTypes, NgramGPUTypes
from vllm.logger import enable_trace_function_call, init_logger
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
Expand Down Expand Up @@ -378,10 +378,12 @@ def __post_init__(self):
# Currently, async scheduling only support eagle speculative
# decoding.
if self.speculative_config is not None:
if self.speculative_config.method not in get_args(EagleModelTypes):
if self.speculative_config.method not in get_args(
EagleModelTypes
) and self.speculative_config.method not in get_args(NgramGPUTypes):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP kind of speculative decoding"
"with EAGLE/MTP/NGram GPU kind of speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
Expand Down
Loading