Commit 8ea70c9

sumingZero and y00945504 authored
adapt for vllm 0.9.1 (#113)
Co-authored-by: y00945504 <yuhui87@huawei.com>
1 parent e9734b6 commit 8ea70c9

File tree

10 files changed: +262 -11 lines

.github/workflows/unifiedcache_test.yml

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ jobs:
           ls -l "$GITHUB_WORKSPACE"
       - name: Apply patch
         run: |
-          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/vllm-adapt.patch
+          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/0.9.2/vllm-adapt.patch


       - name: Install unified-cache-management

docker/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -11,6 +11,6 @@ RUN export PLATFORM="cuda" && \

 # Apply patch for vLLM
 RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch

 ENTRYPOINT ["/bin/bash"]

docker/Dockerfile-NPU

Lines changed: 2 additions & 2 deletions

@@ -12,11 +12,11 @@ RUN export PLATFORM="ascend" && \

 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch

 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch


 CMD ["/bin/bash"]

docs/source/getting-started/installation.md

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ docker run \
 Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container. After installation, please apply patch to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 ```
 Refer to this [issue](https://github.com/vllm-project/vllm/issues/21702) to see details of this patch's changes.

docs/source/getting-started/installation_npu.md

Lines changed: 2 additions & 2 deletions

@@ -42,9 +42,9 @@ docker run --rm \
 Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 cd /vllm-workspace/vllm-ascend
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch
 ```
 Refer to these issues [vllm-issue](https://github.com/vllm-project/vllm/issues/21702) and [vllm-ascend-issue](https://github.com/vllm-project/vllm-ascend/issues/2057) to see details of patches' changes.

unifiedcache/integration/vllm/uc_connector.py

Lines changed: 29 additions & 4 deletions

@@ -651,13 +651,37 @@ def build_connector_meta(
         # clear all load_paras when build meta for new reqs done
         self.load_paras.clear()

-        # When prompt tokens > max_num_batched_tokens, request of running requests may need to save
         cached_request_data = scheduler_output.scheduled_cached_reqs
-        for i, req_id in enumerate(cached_request_data.req_ids):
-            save_paras = self.save_paras.get(req_id, None)
+
+        # Adapted for vllm 0.9.1, 0.9.2 and later versions
+        def get_requests():
+            # 0.9.1
+            if isinstance(cached_request_data, list):
+                return [
+                    (
+                        request_data.req_id,
+                        request_data.new_block_ids,
+                    )
+                    for request_data in cached_request_data
+                ]
+            # >= 0.9.2
+            else:
+                return [
+                    (
+                        req_id,
+                        cached_request_data.new_block_ids[i],
+                    )
+                    for i, req_id in enumerate(cached_request_data.req_ids)
+                ]
+
+        # When prompt tokens > max_num_batched_tokens, request of running requests may need to save
+        for req_id, new_block_ids in get_requests():
+            save_paras = self.save_paras.get(req_id)
             if save_paras is None:
                 continue
+
             save_paras.num_blocks_saved += save_paras.num_blocks_to_save
+
             if save_paras.num_blocks_need_save > save_paras.num_blocks_saved:
                 logger.debug(f"Running request {req_id} has blocks to save")
                 save_paras.start_save_position = 0
@@ -667,10 +691,11 @@ def build_connector_meta(
                 save_paras.num_blocks_to_save = new_scheduled_blocks
             meta.add_request(
                 req_id,
-                vllm_block_ids=cached_request_data.new_block_ids[i][0],
+                vllm_block_ids=new_block_ids[0],
                 load_paras=None,
                 save_paras=save_paras,
             )
+
         return meta

     def request_finished(
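The `get_requests()` shim exists because `scheduler_output.scheduled_cached_reqs` changed shape between releases: vLLM 0.9.1 hands the connector a list of per-request objects, while 0.9.2 and later batch the same fields into one object with parallel lists. The sketch below illustrates the two shapes and the normalization with illustrative stand-in dataclasses (`CachedReq091`, `CachedReqs092`, `iter_cached_requests`), not the real vLLM types.

```python
# Hedged sketch: the dataclasses below are stand-ins, not vLLM's CachedRequestData.
from dataclasses import dataclass, field


@dataclass
class CachedReq091:
    # vLLM 0.9.1: scheduled_cached_reqs is a list of per-request objects.
    req_id: str
    new_block_ids: list = field(default_factory=list)  # one list per KV cache group


@dataclass
class CachedReqs092:
    # vLLM >= 0.9.2: one batched object with parallel lists.
    req_ids: list = field(default_factory=list)
    new_block_ids: list = field(default_factory=list)


def iter_cached_requests(cached_request_data):
    """Yield (req_id, new_block_ids) pairs regardless of which shape we got."""
    if isinstance(cached_request_data, list):  # 0.9.1
        return [(r.req_id, r.new_block_ids) for r in cached_request_data]
    return list(zip(cached_request_data.req_ids,  # >= 0.9.2
                    cached_request_data.new_block_ids))


old_style = [CachedReq091("req-0", new_block_ids=[[7, 8]])]
new_style = CachedReqs092(req_ids=["req-0"], new_block_ids=[[[7, 8]]])
assert iter_cached_requests(old_style) == iter_cached_requests(new_style)
# Either way, new_block_ids[0] is the per-group block list used for vllm_block_ids.
```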
Lines changed: 86 additions & 0 deletions

From 2973ac33a413c00020a6f5dec01498d9fff909c3 Mon Sep 17 00:00:00 2001
From: y00945504 <yuhui87@huawei.com>
Date: Thu, 21 Aug 2025 10:46:38 +0800
Subject: [PATCH] vllm v0.9.1 adapt patch

---
 vllm/v1/core/sched/scheduler.py    | 3 +++
 vllm/v1/outputs.py                 | 1 +
 vllm/v1/request.py                 | 1 +
 vllm/v1/worker/gpu_model_runner.py | 7 ++++---
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 3d7bbe7e0..1e382e319 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -761,6 +761,9 @@ class Scheduler(SchedulerInterface):
             new_logprobs = None
             new_token_ids = generated_token_ids
             kv_transfer_params = None
+
+            if model_runner_output.finished_dumping is not None:
+                request.succeed_dumped_blocks.extend(model_runner_output.finished_dumping.get(req_id, []))

             # Append generated tokens and check for stop. Note that if
             # a request is still being prefilled, we expect the model runner
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 17a299d57..d6c767f78 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -104,6 +104,7 @@ class ModelRunnerOutput:
     # [req_ids]
     finished_sending: Optional[set[str]] = None
     finished_recving: Optional[set[str]] = None
+    finished_dumping: Optional[dict[str, list[str]]] = None


 EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[],
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 53fd70fab..ff22263fd 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -82,6 +82,7 @@ class Request:
         # State
         # The number of tokens with prefix cache hits.
         self.num_cached_tokens = -1
+        self.succeed_dumped_blocks: list[str] = []

     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b1bc727e1..5d2aebb5c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1264,7 +1264,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 inputs_embeds=inputs_embeds,
             )

-            self.maybe_wait_for_kv_save()
+            finished_dumping = self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = (
                 self.get_finished_kv_transfers(scheduler_output))

@@ -1505,6 +1505,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             prompt_logprobs_dict=prompt_logprobs_dict,
             finished_sending=finished_sending,
             finished_recving=finished_recving,
+            finished_dumping=finished_dumping
         )

     def kv_connector_no_forward(
@@ -1540,9 +1541,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         kv_connector.start_load_kv(get_forward_context())

     @staticmethod
-    def maybe_wait_for_kv_save() -> None:
+    def maybe_wait_for_kv_save() -> Optional[dict[str, list[str]]]:
         if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
+            return get_kv_transfer_group().wait_for_save()

     @staticmethod
     def get_finished_kv_transfers(
--
2.50.1.windows.1
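Taken together, the hunks in this patch thread one new piece of state through the v0.9.1 engine: `maybe_wait_for_kv_save()` returns whatever the KV connector's `wait_for_save()` reports, the model runner attaches it to `ModelRunnerOutput.finished_dumping`, and the scheduler folds it into `Request.succeed_dumped_blocks`. The sketch below mirrors only that flow; `FakeConnector`, `RunnerOutput`, `Req`, and the two helper functions are simplified stand-ins, not the real vLLM classes.

```python
# Hedged sketch of the finished_dumping flow; these classes are simplified
# stand-ins, not the real vLLM Scheduler/ModelRunner/Request definitions.
from dataclasses import dataclass, field
from typing import Optional


class FakeConnector:
    """Pretend KV connector whose wait_for_save() reports dumped block ids."""

    def wait_for_save(self) -> dict[str, list[str]]:
        return {"req-0": ["block-a", "block-b"]}


@dataclass
class RunnerOutput:
    # Mirrors the new ModelRunnerOutput.finished_dumping field.
    finished_dumping: Optional[dict[str, list[str]]] = None


@dataclass
class Req:
    req_id: str
    succeed_dumped_blocks: list[str] = field(default_factory=list)


def model_runner_step(connector: FakeConnector) -> RunnerOutput:
    # Mirrors: finished_dumping = self.maybe_wait_for_kv_save()
    return RunnerOutput(finished_dumping=connector.wait_for_save())


def scheduler_update(request: Req, output: RunnerOutput) -> None:
    # Mirrors the scheduler.py hunk: accumulate per-request dumped blocks.
    if output.finished_dumping is not None:
        request.succeed_dumped_blocks.extend(
            output.finished_dumping.get(request.req_id, []))


req = Req("req-0")
scheduler_update(req, model_runner_step(FakeConnector()))
assert req.succeed_dumped_blocks == ["block-a", "block-b"]
```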
Lines changed: 140 additions & 0 deletions

From e45ed500c23f3b8905c68ada894657fd0794906b Mon Sep 17 00:00:00 2001
From: y00945504 <yuhui87@huawei.com>
Date: Fri, 22 Aug 2025 11:46:48 +0800
Subject: [PATCH] manually apply patch

---
 vllm_ascend/attention/attention_v1.py | 33 +++++++++++++++++++++++++++
 vllm_ascend/worker/model_runner_v1.py | 14 +++++++-----
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 694adab..487b12b 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -24,6 +24,9 @@ import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.distributed.kv_transfer import (get_kv_transfer_group,
+                                          has_kv_transfer_group,
+                                          is_v1_kv_transfer_group)
 from vllm.config import get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
@@ -458,6 +461,8 @@ def unified_ascend_attention_with_output(
     output: torch.Tensor,
     layer_name: str,
 ) -> None:
+    wait_for_kv_layer_from_connector(layer_name)
+
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     self = forward_context.no_compile_layers[layer_name]
@@ -470,8 +475,36 @@ def unified_ascend_attention_with_output(
                       attn_metadata,
                       output,
                       trace_flag=False)
+    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
     return

+def wait_for_kv_layer_from_connector(layer_name: str):
+    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
+        return
+
+    connector = get_kv_transfer_group()
+
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if attn_metadata is None:
+        return
+    connector.wait_for_layer_load(layer_name)
+
+def maybe_save_kv_layer_to_connector(
+    layer_name: str,
+    kv_cache_layer: List[torch.Tensor],
+):
+    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
+        return
+
+    connector = get_kv_transfer_group()
+
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if attn_metadata is None:
+        return
+    connector.save_kv_layer(layer_name, kv_cache_layer,
+                            attn_metadata)

 def unified_attention_with_output_fake(
     query: torch.Tensor,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index dc28bfa..ddc996b 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -889,7 +889,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> tuple[SpecDecodeMetadata, torch.Tensor, SpecDecodeMetadata,
                torch.Tensor, int, torch.Tensor, Optional[set[str]],
-               Optional[set[str]]]:
+               Optional[set[str]], Optional[dict[str, list[str]]]]:
         # Check input valid
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
@@ -1140,6 +1140,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         positions = self.positions[:padded_num_tokens_across_dp]

         # Run forward pass
+        finished_dumping = None
         # TODO(zzzzwwjj): check param `num_tokens_across_dp` later.
         with set_ascend_forward_context(
                 attn_metadata,
@@ -1174,7 +1175,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     inputs_embeds=inputs_embeds,
                     **model_kwargs)

-            self.maybe_wait_for_kv_save()
+            finished_dumping = self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = self.get_finished_kv_transfer(
                 scheduler_output)
             use_spec_decode = len(
@@ -1202,7 +1203,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

         return (attn_metadata, hidden_states, spec_decode_metadata, positions,
                 total_num_scheduled_tokens, sample_indices, finished_sending,
-                finished_recving)
+                finished_recving, finished_dumping)

     def _calc_spec_decode_metadata(
         self,
@@ -1386,7 +1387,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

         (attn_metadata, hidden_states, spec_decode_metadata, positions,
          num_scheduled_tokens, sample_indices, finished_sending,
-         finished_recving) = (self._process_reqs(scheduler_output,
+         finished_recving, finished_dumping) = (self._process_reqs(scheduler_output,
                                                  intermediate_tensors))

         if self.dynamic_eplb:
@@ -1493,6 +1494,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             prompt_logprobs_dict={},
             finished_sending=finished_sending,
             finished_recving=finished_recving,
+            finished_dumping=finished_dumping
         )

         durations = ProfileExecuteDuration().pop_captured_sync()
@@ -1543,8 +1545,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     @staticmethod
     def maybe_wait_for_kv_save() -> None:
         if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
-
+            return get_kv_transfer_group().wait_for_save()
+
     @staticmethod
     def get_finished_kv_transfer(
         scheduler_output: "SchedulerOutput",
--
2.50.1.windows.1
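Both helpers added to attention_v1.py follow the same guard-then-delegate pattern: do nothing unless a v1 KV transfer group and attention metadata exist, otherwise hand the layer over to the connector before and after the attention kernel. The sketch below shows only that call order around a single layer; `FakeConnector` and `attention_with_kv_hooks` are illustrative mocks, not vllm_ascend code.

```python
# Hedged sketch of the per-layer hook order this patch establishes; the
# connector, cache, and metadata objects here are mocks, not vllm_ascend code.
from typing import Any, Optional


class FakeConnector:
    def wait_for_layer_load(self, layer_name: str) -> None:
        print(f"load KV for {layer_name} before attention")

    def save_kv_layer(self, layer_name: str, kv_cache, attn_metadata) -> None:
        print(f"save KV for {layer_name} after attention")


def attention_with_kv_hooks(layer_name: str, kv_cache: list,
                            attn_metadata: Optional[Any],
                            connector: Optional[FakeConnector]) -> None:
    # wait_for_kv_layer_from_connector: no-op without a connector or metadata.
    if connector is not None and attn_metadata is not None:
        connector.wait_for_layer_load(layer_name)

    # ... the layer's attention kernel would run here ...

    # maybe_save_kv_layer_to_connector: same guards, then hand the cache over.
    if connector is not None and attn_metadata is not None:
        connector.save_kv_layer(layer_name, kv_cache, attn_metadata)


attention_with_kv_hooks("model.layers.0.self_attn", kv_cache=[],
                        attn_metadata=object(), connector=FakeConnector())
```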
File renamed without changes.
