Commit 8ea70c9

sumingZero and y00945504 authored
adapt for vllm 0.9.1 (#113)
Co-authored-by: y00945504 <yuhui87@huawei.com>
1 parent e9734b6 commit 8ea70c9

File tree

10 files changed: +262 -11 lines

.github/workflows/unifiedcache_test.yml

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ jobs:
           ls -l "$GITHUB_WORKSPACE"
       - name: Apply patch
         run: |
-          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/vllm-adapt.patch
+          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/0.9.2/vllm-adapt.patch


       - name: Install unified-cache-management

docker/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -11,6 +11,6 @@ RUN export PLATFORM="cuda" && \

 # Apply patch for vLLM
 RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch

 ENTRYPOINT ["/bin/bash"]

docker/Dockerfile-NPU

Lines changed: 2 additions & 2 deletions

@@ -12,11 +12,11 @@ RUN export PLATFORM="ascend" && \

 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch

 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch


 CMD ["/bin/bash"]

docs/source/getting-started/installation.md

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ docker run \
 Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container. After installation, please apply patch to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 ```
 Refer to this [issue](https://github.com/vllm-project/vllm/issues/21702) to see details of this patch's changes.

docs/source/getting-started/installation_npu.md

Lines changed: 2 additions & 2 deletions

@@ -42,9 +42,9 @@ docker run --rm \
 Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 cd /vllm-workspace/vllm-ascend
-git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch
 ```
 Refer to these issues [vllm-issue](https://github.com/vllm-project/vllm/issues/21702) and [vllm-ascend-issue](https://github.com/vllm-project/vllm-ascend/issues/2057) to see details of patches' changes.

unifiedcache/integration/vllm/uc_connector.py

Lines changed: 29 additions & 4 deletions

@@ -651,13 +651,37 @@ def build_connector_meta(
         # clear all load_paras when build meta for new reqs done
         self.load_paras.clear()

-        # When prompt tokens > max_num_batched_tokens, request of running requests may need to save
         cached_request_data = scheduler_output.scheduled_cached_reqs
-        for i, req_id in enumerate(cached_request_data.req_ids):
-            save_paras = self.save_paras.get(req_id, None)
+
+        # Adapted for vllm 0.9.1, 0.9.2 and later versions
+        def get_requests():
+            # 0.9.1
+            if isinstance(cached_request_data, list):
+                return [
+                    (
+                        request_data.req_id,
+                        request_data.new_block_ids,
+                    )
+                    for request_data in cached_request_data
+                ]
+            # >= 0.9.2
+            else:
+                return [
+                    (
+                        req_id,
+                        cached_request_data.new_block_ids[i],
+                    )
+                    for i, req_id in enumerate(cached_request_data.req_ids)
+                ]
+
+        # When prompt tokens > max_num_batched_tokens, request of running requests may need to save
+        for req_id, new_block_ids in get_requests():
+            save_paras = self.save_paras.get(req_id)
             if save_paras is None:
                 continue
+
             save_paras.num_blocks_saved += save_paras.num_blocks_to_save
+
             if save_paras.num_blocks_need_save > save_paras.num_blocks_saved:
                 logger.debug(f"Running request {req_id} has blocks to save")
                 save_paras.start_save_position = 0
@@ -667,10 +691,11 @@ def build_connector_meta(
                 save_paras.num_blocks_to_save = new_scheduled_blocks
             meta.add_request(
                 req_id,
-                vllm_block_ids=cached_request_data.new_block_ids[i][0],
+                vllm_block_ids=new_block_ids[0],
                 load_paras=None,
                 save_paras=save_paras,
             )
+
         return meta

     def request_finished(
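The `get_requests()` shim exists because `scheduler_output.scheduled_cached_reqs` changed shape between releases: vLLM 0.9.1 hands the connector a list of per-request objects, while 0.9.2 and later batch the same fields into one object with parallel lists. The sketch below illustrates the two shapes and the normalization with illustrative stand-in dataclasses (`CachedReq091`, `CachedReqs092`, `iter_cached_requests`), not the real vLLM types.

```python
# Hedged sketch: the dataclasses below are stand-ins, not vLLM's CachedRequestData.
from dataclasses import dataclass, field


@dataclass
class CachedReq091:
    # vLLM 0.9.1: scheduled_cached_reqs is a list of per-request objects.
    req_id: str
    new_block_ids: list = field(default_factory=list)  # one list per KV cache group


@dataclass
class CachedReqs092:
    # vLLM >= 0.9.2: one batched object with parallel lists.
    req_ids: list = field(default_factory=list)
    new_block_ids: list = field(default_factory=list)


def iter_cached_requests(cached_request_data):
    """Yield (req_id, new_block_ids) pairs regardless of which shape we got."""
    if isinstance(cached_request_data, list):  # 0.9.1
        return [(r.req_id, r.new_block_ids) for r in cached_request_data]
    return list(zip(cached_request_data.req_ids,  # >= 0.9.2
                    cached_request_data.new_block_ids))


old_style = [CachedReq091("req-0", new_block_ids=[[7, 8]])]
new_style = CachedReqs092(req_ids=["req-0"], new_block_ids=[[[7, 8]]])
assert iter_cached_requests(old_style) == iter_cached_requests(new_style)
# Either way, new_block_ids[0] is the per-group block list used for vllm_block_ids.
```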
Lines changed: 86 additions & 0 deletions

From 2973ac33a413c00020a6f5dec01498d9fff909c3 Mon Sep 17 00:00:00 2001
From: y00945504 <yuhui87@huawei.com>
Date: Thu, 21 Aug 2025 10:46:38 +0800
Subject: [PATCH] vllm v0.9.1 adapt patch

---
 vllm/v1/core/sched/scheduler.py    | 3 +++
 vllm/v1/outputs.py                 | 1 +
 vllm/v1/request.py                 | 1 +
 vllm/v1/worker/gpu_model_runner.py | 7 ++++---
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 3d7bbe7e0..1e382e319 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -761,6 +761,9 @@ class Scheduler(SchedulerInterface):
             new_logprobs = None
             new_token_ids = generated_token_ids
             kv_transfer_params = None
+
+            if model_runner_output.finished_dumping is not None:
+                request.succeed_dumped_blocks.extend(model_runner_output.finished_dumping.get(req_id, []))

             # Append generated tokens and check for stop. Note that if
             # a request is still being prefilled, we expect the model runner
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index 17a299d57..d6c767f78 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -104,6 +104,7 @@ class ModelRunnerOutput:
     # [req_ids]
     finished_sending: Optional[set[str]] = None
     finished_recving: Optional[set[str]] = None
+    finished_dumping: Optional[dict[str, list[str]]] = None


 EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[],
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 53fd70fab..ff22263fd 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -82,6 +82,7 @@ class Request:
         # State
         # The number of tokens with prefix cache hits.
         self.num_cached_tokens = -1
+        self.succeed_dumped_blocks: list[str] = []

     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b1bc727e1..5d2aebb5c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1264,7 +1264,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 inputs_embeds=inputs_embeds,
             )

-            self.maybe_wait_for_kv_save()
+            finished_dumping = self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = (
                 self.get_finished_kv_transfers(scheduler_output))

@@ -1505,6 +1505,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             prompt_logprobs_dict=prompt_logprobs_dict,
             finished_sending=finished_sending,
             finished_recving=finished_recving,
+            finished_dumping=finished_dumping
         )

     def kv_connector_no_forward(
@@ -1540,9 +1541,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         kv_connector.start_load_kv(get_forward_context())

     @staticmethod
-    def maybe_wait_for_kv_save() -> None:
+    def maybe_wait_for_kv_save() -> Optional[dict[str, list[str]]]:
         if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
+            return get_kv_transfer_group().wait_for_save()

     @staticmethod
     def get_finished_kv_transfers(
--
2.50.1.windows.1
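Taken together, the hunks in this patch thread one new piece of state through the v0.9.1 engine: `maybe_wait_for_kv_save()` returns whatever the KV connector's `wait_for_save()` reports, the model runner attaches it to `ModelRunnerOutput.finished_dumping`, and the scheduler folds it into `Request.succeed_dumped_blocks`. The sketch below mirrors only that flow; `FakeConnector`, `RunnerOutput`, `Req`, and the two helper functions are simplified stand-ins, not the real vLLM classes.

```python
# Hedged sketch of the finished_dumping flow; these classes are simplified
# stand-ins, not the real vLLM Scheduler/ModelRunner/Request definitions.
from dataclasses import dataclass, field
from typing import Optional


class FakeConnector:
    """Pretend KV connector whose wait_for_save() reports dumped block ids."""

    def wait_for_save(self) -> dict[str, list[str]]:
        return {"req-0": ["block-a", "block-b"]}


@dataclass
class RunnerOutput:
    # Mirrors the new ModelRunnerOutput.finished_dumping field.
    finished_dumping: Optional[dict[str, list[str]]] = None


@dataclass
class Req:
    req_id: str
    succeed_dumped_blocks: list[str] = field(default_factory=list)


def model_runner_step(connector: FakeConnector) -> RunnerOutput:
    # Mirrors: finished_dumping = self.maybe_wait_for_kv_save()
    return RunnerOutput(finished_dumping=connector.wait_for_save())


def scheduler_update(request: Req, output: RunnerOutput) -> None:
    # Mirrors the scheduler.py hunk: accumulate per-request dumped blocks.
    if output.finished_dumping is not None:
        request.succeed_dumped_blocks.extend(
            output.finished_dumping.get(request.req_id, []))


req = Req("req-0")
scheduler_update(req, model_runner_step(FakeConnector()))
assert req.succeed_dumped_blocks == ["block-a", "block-b"]
```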
Lines changed: 140 additions & 0 deletions

From e45ed500c23f3b8905c68ada894657fd0794906b Mon Sep 17 00:00:00 2001
From: y00945504 <yuhui87@huawei.com>
Date: Fri, 22 Aug 2025 11:46:48 +0800
Subject: [PATCH] manually apply patch

---
 vllm_ascend/attention/attention_v1.py | 33 +++++++++++++++++++++++++++
 vllm_ascend/worker/model_runner_v1.py | 14 +++++++-----
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 694adab..487b12b 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -24,6 +24,9 @@ import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.distributed.kv_transfer import (get_kv_transfer_group,
+                                          has_kv_transfer_group,
+                                          is_v1_kv_transfer_group)
 from vllm.config import get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
@@ -458,6 +461,8 @@ def unified_ascend_attention_with_output(
     output: torch.Tensor,
     layer_name: str,
 ) -> None:
+    wait_for_kv_layer_from_connector(layer_name)
+
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.attn_metadata
     self = forward_context.no_compile_layers[layer_name]
@@ -470,8 +475,36 @@ def unified_ascend_attention_with_output(
                       attn_metadata,
                       output,
                       trace_flag=False)
+    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
     return

+def wait_for_kv_layer_from_connector(layer_name: str):
+    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
+        return
+
+    connector = get_kv_transfer_group()
+
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if attn_metadata is None:
+        return
+    connector.wait_for_layer_load(layer_name)
+
+def maybe_save_kv_layer_to_connector(
+    layer_name: str,
+    kv_cache_layer: List[torch.Tensor],
+):
+    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
+        return
+
+    connector = get_kv_transfer_group()
+
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if attn_metadata is None:
+        return
+    connector.save_kv_layer(layer_name, kv_cache_layer,
+                            attn_metadata)

 def unified_attention_with_output_fake(
     query: torch.Tensor,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index dc28bfa..ddc996b 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -889,7 +889,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> tuple[SpecDecodeMetadata, torch.Tensor, SpecDecodeMetadata,
                torch.Tensor, int, torch.Tensor, Optional[set[str]],
-               Optional[set[str]]]:
+               Optional[set[str]], Optional[dict[str, list[str]]]]:
         # Check input valid
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
@@ -1140,6 +1140,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         positions = self.positions[:padded_num_tokens_across_dp]

         # Run forward pass
+        finished_dumping = None
         # TODO(zzzzwwjj): check param `num_tokens_across_dp` later.
         with set_ascend_forward_context(
                 attn_metadata,
@@ -1174,7 +1175,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     inputs_embeds=inputs_embeds,
                     **model_kwargs)

-            self.maybe_wait_for_kv_save()
+            finished_dumping = self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = self.get_finished_kv_transfer(
                 scheduler_output)
             use_spec_decode = len(
@@ -1202,7 +1203,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

         return (attn_metadata, hidden_states, spec_decode_metadata, positions,
                 total_num_scheduled_tokens, sample_indices, finished_sending,
-                finished_recving)
+                finished_recving, finished_dumping)

     def _calc_spec_decode_metadata(
         self,
@@ -1386,7 +1387,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

         (attn_metadata, hidden_states, spec_decode_metadata, positions,
          num_scheduled_tokens, sample_indices, finished_sending,
-         finished_recving) = (self._process_reqs(scheduler_output,
+         finished_recving, finished_dumping) = (self._process_reqs(scheduler_output,
                                                  intermediate_tensors))

         if self.dynamic_eplb:
@@ -1493,6 +1494,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             prompt_logprobs_dict={},
             finished_sending=finished_sending,
             finished_recving=finished_recving,
+            finished_dumping=finished_dumping
         )

         durations = ProfileExecuteDuration().pop_captured_sync()
@@ -1543,8 +1545,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     @staticmethod
     def maybe_wait_for_kv_save() -> None:
         if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
-
+            return get_kv_transfer_group().wait_for_save()
+
     @staticmethod
     def get_finished_kv_transfer(
         scheduler_output: "SchedulerOutput",
--
2.50.1.windows.1
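Both helpers added to attention_v1.py follow the same guard-then-delegate pattern: do nothing unless a v1 KV transfer group and attention metadata exist, otherwise hand the layer over to the connector before and after the attention kernel. The sketch below shows only that call order around a single layer; `FakeConnector` and `attention_with_kv_hooks` are illustrative mocks, not vllm_ascend code.

```python
# Hedged sketch of the per-layer hook order this patch establishes; the
# connector, cache, and metadata objects here are mocks, not vllm_ascend code.
from typing import Any, Optional


class FakeConnector:
    def wait_for_layer_load(self, layer_name: str) -> None:
        print(f"load KV for {layer_name} before attention")

    def save_kv_layer(self, layer_name: str, kv_cache, attn_metadata) -> None:
        print(f"save KV for {layer_name} after attention")


def attention_with_kv_hooks(layer_name: str, kv_cache: list,
                            attn_metadata: Optional[Any],
                            connector: Optional[FakeConnector]) -> None:
    # wait_for_kv_layer_from_connector: no-op without a connector or metadata.
    if connector is not None and attn_metadata is not None:
        connector.wait_for_layer_load(layer_name)

    # ... the layer's attention kernel would run here ...

    # maybe_save_kv_layer_to_connector: same guards, then hand the cache over.
    if connector is not None and attn_metadata is not None:
        connector.save_kv_layer(layer_name, kv_cache, attn_metadata)


attention_with_kv_hooks("model.layers.0.self_attn", kv_cache=[],
                        attn_metadata=object(), connector=FakeConnector())
```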
File renamed without changes.
