Skip to content

Commit f6347a8

Browse files
committed
Fix OOM.
Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
1 parent b5af379 commit f6347a8

File tree

3 files changed

+5
-0
lines changed

3 files changed

+5
-0
lines changed

tests/unittest/_torch/speculative/test_draft_len_schedule.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def enforce_single_worker():
3333
],
3434
)
3535
@pytest.mark.high_cuda_memory
36+
@pytest.mark.xdist_group("speculative_high_mem")
3637
def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict):
3738
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
3839
memory_required = 30 if drafter_type == "model_drafter" else 20
@@ -146,6 +147,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict):
146147
],
147148
)
148149
@pytest.mark.high_cuda_memory
150+
@pytest.mark.xdist_group("speculative_high_mem")
149151
def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dict):
150152
if not torch.cuda.is_available():
151153
pytest.skip("CUDA not available")

tests/unittest/_torch/speculative/test_dynamic_spec_decode.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def enforce_single_worker(monkeypatch):
2424

2525
@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
2626
@pytest.mark.high_cuda_memory
27+
@pytest.mark.xdist_group("speculative_high_mem")
2728
def test_dynamic_spec_decode(enforce_single_worker,
2829
disable_overlap_scheduler: bool):
2930
# mock_should_use_spec_decode doesn't work with multiple processes,
@@ -124,6 +125,7 @@ def mock_should_use_spec_decode(requests, max_batch_size, max_num_tokens,
124125
# Later: len(requests): 1, max_batch_size: 3, token_cap: 1638 -> num_effective_requests: 1, self.max_concurrency: 2 -> spec decode ON
125126
@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
126127
@pytest.mark.high_cuda_memory
128+
@pytest.mark.xdist_group("speculative_high_mem")
127129
def test_dynamic_spec_decode_without_force_single_process(
128130
disable_overlap_scheduler: bool):
129131
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

tests/unittest/_torch/speculative/test_spec_gate.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# This test set the max_concurrency to a large value to prevent spec decode turned off due to number of effective requests > max_concurrency,
2222
# So that we can only focus on the turning off effect from the SpeculationGate.
2323
@pytest.mark.high_cuda_memory
24+
@pytest.mark.xdist_group("speculative_high_mem")
2425
def test_spec_gate_e2e():
2526
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
2627
if total_mem_gb < 35:

0 commit comments

Comments (0)