
Commit 2187a2e

Merge branch 'develop' into develop_mooncake

2 parents: 8d015cb + 4885393

File tree: 87 files changed, +4348 / -1751 lines


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ BEFORE SUBMITTING, PLEASE READ OUR OFFICIAL WEBSITE.
 
 -->
 
-# Prupose
+# Purpose
 
 What this PR does / why we need it?
 <!--
```
Lines changed: 55 additions & 0 deletions (new workflow file)

```yaml
# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage.
# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml
name: ucmnfsstore-ut

on:
  push:
    branches: [ "dev*", "main", "*release" ]
  pull_request:
    branches: [ "dev*", "main", "*release" ]

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Debug

jobs:
  ci:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    # You can convert this to a matrix build if you need cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Install googletest
        run: |
          git clone https://github.com/google/googletest.git --depth=1 --branch=release-1.11.0
          cd googletest
          mkdir build && cd build
          cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True ..
          sudo make install -j

      - name: Install mockcpp
        run: |
          git clone https://github.com/sinojelly/mockcpp.git --depth=1
          cd mockcpp
          mkdir build && cd build
          cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True -DMOCKCPP_XUNIT="gtest" -DMOCKCPP_XUNIT_HOME=/usr/local/ ..
          sudo make install -j

      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        working-directory: ${{github.workspace}}/unifiedcache/csrc/ucmnfsstore
        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_TESTS=ON -DCOVERAGE_ENABLED=ON -DDOWNLOAD_DEPENDENCE=ON -DRUNTIME_ENVIRONMENT=simu

      - name: Build
        # Build your program with the given configuration
        run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}

      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -C ${{env.BUILD_TYPE}}
```
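
For local debugging, the job can be approximated outside Actions. A sketch, assuming an Ubuntu host with CMake 3.20+ (for `ctest --test-dir`), run from the repository root, with mockcpp installed the same way as googletest:

```bash
# Build-and-test sketch mirroring the ucmnfsstore-ut job above.
git clone https://github.com/google/googletest.git --depth=1 --branch=release-1.11.0
cmake -S googletest -B googletest/build \
      -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" \
      -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True
cmake --build googletest/build -j && sudo cmake --install googletest/build
# (install mockcpp the same way, adding the -DMOCKCPP_XUNIT flags shown above)

# Configure, build, and run the unit tests in a local 'build' directory.
cmake -S unifiedcache/csrc/ucmnfsstore -B build \
      -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTS=ON -DCOVERAGE_ENABLED=ON \
      -DDOWNLOAD_DEPENDENCE=ON -DRUNTIME_ENVIRONMENT=simu
cmake --build build --config Debug
ctest --test-dir build -C Debug
```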

.github/workflows/unifiedcache_test.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -60,7 +60,7 @@ jobs:
           ls -l "$GITHUB_WORKSPACE"
       - name: Apply patch
         run: |
-          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/vllm-adapt.patch
+          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 
       - name: Install unified-cache-management
```

.gitignore

Lines changed: 44 additions & 1 deletion
```diff
@@ -1,4 +1,47 @@
-# Development Enviroment
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Linker files
+*.ilk
+
+# Debugger Files
+*.pdb
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Debug information files
+*.dwo
+
+# Development environment files
+*.code-workspace
 .vscode/**
 .idea/**
 .git/**
```

MANIFEST.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+recursive-include unifiedcache/csrc *
```

docker/Dockerfile

Lines changed: 2 additions & 12 deletions
```diff
@@ -3,24 +3,14 @@ FROM vllm/vllm-openai:v0.9.2
 
 WORKDIR /workspace
 
-# ReInstall vLLM for editing
-RUN pip uninstall -y vllm && rm -rf /vllm-workspace/*
-ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
-RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-
-# Set other VLLM_TARGET_DEVICE or other extra-index if needed
-ENV VLLM_USE_PRECOMPILED=1
-RUN VLLM_TARGET_DEVICE=cuda pip install -v -e /vllm-workspace/vllm --extra-index=https://download.pytorch.org/whl/nightly/cu128
-
 # Install unified-cache-management
 COPY . /vllm-workspace/unified-cache-management
 
 RUN export PLATFORM="cuda" && \
     pip install -v -e /vllm-workspace/unified-cache-management
 
 # Apply patch for vLLM
-RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 ENTRYPOINT ["/bin/bash"]
```
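
The revised Dockerfile no longer rebuilds vLLM from source; it patches the pip-installed copy in place, using `pip show vllm` to resolve the site-packages directory the wheel landed in. As a quick sanity check of that resolution (a sketch; the echoed path varies by image and Python version):

```bash
# Resolve the directory pip installed vllm into; the exact path varies by environment.
VLLM_DIR=$(pip show vllm | grep Location | awk '{print $2}')
echo "$VLLM_DIR"   # e.g. /usr/local/lib/python3.12/dist-packages

# Dry-run the patch before baking it into the image for real:
git -C "$VLLM_DIR" apply --check \
    /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
```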

docker/Dockerfile-NPU

Lines changed: 2 additions & 2 deletions
```diff
@@ -12,11 +12,11 @@ RUN export PLATFORM="ascend" && \
 
 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch
 
 
 CMD ["/bin/bash"]
```

docs/source/feature/sparse_attn.md

Lines changed: 40 additions & 0 deletions
```diff
@@ -1 +1,41 @@
 # Sparse Attention
+## Motivations
+Attention mechanisms, especially in LLMs, are often the latency bottleneck during inference due to their computational complexity. Despite their importance in capturing contextual relationships, traditional attention requires processing all token interactions, leading to significant delays.
+
+<p align="center">
+<img alt="UCM" src="../images/attention_overhead.png" width="80%">
+</p>
+
+Researchers have found that attention in LLMs is highly dispersed:
+<p align="center">
+<img alt="UCM" src="../images/attention_sparsity.png" width="80%">
+</p>
+
+This motivates the active development of sparse attention algorithms to address the latency issue. These algorithms reduce the number of token interactions by attending only to the most relevant parts of the input, thereby lowering compute and memory requirements.
+While promising, the gap between theoretical prototypes and practical implementations in inference frameworks remains a significant challenge.
+
+Many existing frameworks, like vLLM, are optimized for traditional attention mechanisms. Adapting them for sparse attention can be complex and may require substantial modifications to the underlying architecture.
+Issues such as maintaining compatibility with existing model architectures, ensuring efficient memory usage, and leveraging hardware acceleration must be addressed to facilitate the adoption of sparse attention in real-world applications.
+
+We present a **unified sparse attention framework** under UCM. A unified framework streamlines the integration of various sparse attention algorithms into inference engines like vLLM by providing **standardized interfaces and utilities** that simplify the implementation process.
+With UCM, researchers can rapidly prototype and test different sparse attention algorithms without extensive re-engineering of the inference engine, and shared optimizations within the framework help ensure that the performance gains of sparse attention are realized in real-world scenarios.
+
+## Architecture
+### Overview
+The core concept of our UCMSparse attention framework is to offload the complete Key-Value (KV) cache to a dedicated KV cache store. We then identify the crucial KV pairs relevant to the current context, as determined by our sparse attention algorithms, and selectively load only the necessary portions of the KV cache from storage into High Bandwidth Memory (HBM). This design significantly reduces the HBM footprint while accelerating generation.
+<p align="center">
+<img alt="UCM" src="../images/sparse_attn_arch.png" width="80%">
+</p>
+
+
+### Key Concepts
+- UCMSparse in the scheduler: this instance lives in the same process as the `EngineCore` and acts as a sparse attention budget controller. It estimates the number of slots required by a specific sparse attention algorithm, and `KVCacheManager` then allocates the necessary blocks based on `num_slots_sparse`. For example, `ESA` needs only 20%~30% of the blocks of normal attention.
+- UCMSparse in the model_runner: this instance lives in the same process as the `Worker`.
+  A typical sparse attention algorithm works like this:
+  1. In prefill, it dumps the full KV cache from HBM to storage.
+  2. In decode, it retrieves the most relevant blocks for the current context and loads them from the store into HBM.
+  3. In decode, it also dumps newly generated blocks to keep the latest context accessible.
+- With fine-grained task scheduling, retrieval and loading run asynchronously and overlap with model execution, so UCMSparse introduces no extra overhead while generation speed improves thanks to less computation and fewer memory accesses (see the sketch below).
+
+
+See `ESA` for more details.
```
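
To make the dump/retrieve/load flow above concrete, here is a minimal Python sketch. It is illustrative only: the store methods (`dump`, `retrieve_top_k`, `load`, `num_blocks`) are assumed stand-ins for the operations the document describes, not UCM's actual API.

```python
# Hypothetical sketch of the UCMSparse prefill/decode flow described above.
# All names below are illustrative assumptions, not UCM's actual API.

class UcmSparseSketch:
    def __init__(self, store, budget_ratio=0.25):
        self.store = store                # offloaded KV cache storage (e.g. NFS-backed)
        self.budget_ratio = budget_ratio  # e.g. ESA keeps ~20%-30% of blocks in HBM

    def on_prefill(self, request_id, kv_blocks):
        # 1. Prefill: dump the full KV cache from HBM to the store.
        self.store.dump(request_id, kv_blocks)

    def on_decode_step(self, request_id, context, new_block):
        # 2. Decode: retrieve the ids of the blocks most relevant to the
        #    current context, staying within the sparse block budget...
        budget = max(1, int(self.store.num_blocks(request_id) * self.budget_ratio))
        relevant_ids = self.store.retrieve_top_k(request_id, context, budget)
        # ...and load only those blocks from the store into HBM.
        hbm_blocks = self.store.load(request_id, relevant_ids)
        # 3. Decode: dump the newly generated block so the latest
        #    context remains accessible for later retrieval.
        self.store.dump(request_id, [new_block])
        return hbm_blocks
```

In the real framework these steps are scheduled asynchronously so they overlap with model execution, which is what keeps retrieval and loading off the critical path.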
Lines changed: 105 additions & 0 deletions (new file)

````markdown
# 1p1d

## Overview
This example demonstrates how to run unified-cache-management with disaggregated prefill using the NFS connector on a single node with a 1 prefiller + 1 decoder setup.

## Prerequisites
- UCM: installed per the Installation documentation.
- Hardware: at least 2 GPUs

## Start disaggregated service
For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instruct.

### Run prefill server
Prefiller launch command:
```bash
export PYTHONHASHSEED=123456
CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
    --max-model-len 20000 \
    --tensor-parallel-size 1 \
    --gpu_memory_utilization 0.87 \
    --trust-remote-code \
    --enforce-eager \
    --no-enable-prefix-caching \
    --port 7800 \
    --block-size 128 \
    --kv-transfer-config \
    '{
        "kv_connector": "UnifiedCacheConnectorV1",
        "kv_connector_module_path": "unifiedcache.integration.vllm.uc_connector",
        "kv_role": "kv_producer",
        "kv_connector_extra_config": {
            "ucm_connector_name": "UcmNfsStore",
            "ucm_connector_config": {
                "storage_backends": "/mnt/test1",
                "kv_block_size": 33554432
            }
        }
    }'
```

### Run decode server
Decoder launch command:
```bash
export PYTHONHASHSEED=123456
CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
    --max-model-len 20000 \
    --tensor-parallel-size 1 \
    --gpu_memory_utilization 0.87 \
    --trust-remote-code \
    --enforce-eager \
    --no-enable-prefix-caching \
    --port 7801 \
    --block-size 128 \
    --kv-transfer-config \
    '{
        "kv_connector": "UnifiedCacheConnectorV1",
        "kv_connector_module_path": "unifiedcache.integration.vllm.uc_connector",
        "kv_role": "kv_consumer",
        "kv_connector_extra_config": {
            "ucm_connector_name": "UcmNfsStore",
            "ucm_connector_config": {
                "storage_backends": "/mnt/test1",
                "kv_block_size": 33554432
            }
        }
    }'
```

### Run proxy server
Make sure prefill nodes and decode nodes can connect to each other.
```bash
cd /vllm-workspace/unified-cache-management/test/
python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
```

## Testing and Benchmarking
### Basic Test
After running all servers, you can test with a simple curl command:
```bash
curl http://localhost:7802/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/home/models/Qwen2.5-7B-Instruct",
        "prompt": "What date is today?",
        "max_tokens": 20,
        "temperature": 0
    }'
```

### Benchmark Test
Use the benchmark scripts provided by vLLM.
```bash
cd /vllm-workspace/vllm/benchmarks
python3 benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --random-input-len 4096 \
    --random-output-len 100 \
    --num-prompts 10 \
    --ignore-eos \
    --model /home/models/Qwen2.5-7B-Instruct \
    --tokenizer /home/models/Qwen2.5-7B-Instruct \
    --host localhost \
    --port 7802 \
    --endpoint /v1/completions \
    --request-rate 1
```
````
Lines changed: 8 additions & 0 deletions (new file)

```markdown
# Disaggregated Prefill

:::{toctree}
:maxdepth: 2
1p1d.md
xpyd.md
:::
```
