
Commit 2187a2e

Merge branch 'develop' into develop_mooncake

2 parents: 8d015cb + 4885393

File tree: 87 files changed, +4348 / -1751 lines


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ BEFORE SUBMITTING, PLEASE READ OUR OFFICIAL WEBSITE.
 
 -->
 
-# Prupose
+# Purpose
 
 What this PR does / why we need it?
 <!--
```
Lines changed: 55 additions & 0 deletions (new workflow file)

```yaml
# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage.
# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml
name: ucmnfsstore-ut

on:
  push:
    branches: [ "dev*", "main", "*release" ]
  pull_request:
    branches: [ "dev*", "main", "*release" ]

env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Debug

jobs:
  ci:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    # You can convert this to a matrix build if you need cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Install googletest
        run: |
          git clone https://github.com/google/googletest.git --depth=1 --branch=release-1.11.0
          cd googletest
          mkdir build && cd build
          cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True ..
          sudo make install -j

      - name: Install mockcpp
        run: |
          git clone https://github.com/sinojelly/mockcpp.git --depth=1
          cd mockcpp
          mkdir build && cd build
          cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True -DMOCKCPP_XUNIT="gtest" -DMOCKCPP_XUNIT_HOME=/usr/local/ ..
          sudo make install -j

      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        working-directory: ${{github.workspace}}/unifiedcache/csrc/ucmnfsstore
        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_TESTS=ON -DCOVERAGE_ENABLED=ON -DDOWNLOAD_DEPENDENCE=ON -DRUNTIME_ENVIRONMENT=simu

      - name: Build
        # Build your program with the given configuration
        run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}

      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -C ${{env.BUILD_TYPE}}
```
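
For local debugging, the job can be approximated outside Actions. A sketch, assuming an Ubuntu host with CMake 3.20+ (for `ctest --test-dir`), run from the repository root, with mockcpp installed the same way as googletest:

```bash
# Build-and-test sketch mirroring the ucmnfsstore-ut job above.
git clone https://github.com/google/googletest.git --depth=1 --branch=release-1.11.0
cmake -S googletest -B googletest/build \
      -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" \
      -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True
cmake --build googletest/build -j && sudo cmake --install googletest/build
# (install mockcpp the same way, adding the -DMOCKCPP_XUNIT flags shown above)

# Configure, build, and run the unit tests in a local 'build' directory.
cmake -S unifiedcache/csrc/ucmnfsstore -B build \
      -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTS=ON -DCOVERAGE_ENABLED=ON \
      -DDOWNLOAD_DEPENDENCE=ON -DRUNTIME_ENVIRONMENT=simu
cmake --build build --config Debug
ctest --test-dir build -C Debug
```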

.github/workflows/unifiedcache_test.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -60,7 +60,7 @@ jobs:
           ls -l "$GITHUB_WORKSPACE"
       - name: Apply patch
         run: |
-          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/vllm-adapt.patch
+          git -C /vllm-workspace/vllm apply $GITHUB_WORKSPACE/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 
       - name: Install unified-cache-management
```

.gitignore

Lines changed: 44 additions & 1 deletion
```diff
@@ -1,4 +1,47 @@
-# Development Enviroment
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Linker files
+*.ilk
+
+# Debugger Files
+*.pdb
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Debug information files
+*.dwo
+
+# Development environment files
+*.code-workspace
 .vscode/**
 .idea/**
 .git/**
```

MANIFEST.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+recursive-include unifiedcache/csrc *
```

docker/Dockerfile

Lines changed: 2 additions & 12 deletions
```diff
@@ -3,24 +3,14 @@ FROM vllm/vllm-openai:v0.9.2
 
 WORKDIR /workspace
 
-# ReInstall vLLM for editing
-RUN pip uninstall -y vllm && rm -rf /vllm-workspace/*
-ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
-RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-
-# Set other VLLM_TARGET_DEVICE or other extra-index if needed
-ENV VLLM_USE_PRECOMPILED=1
-RUN VLLM_TARGET_DEVICE=cuda pip install -v -e /vllm-workspace/vllm --extra-index=https://download.pytorch.org/whl/nightly/cu128
-
 # Install unified-cache-management
 COPY . /vllm-workspace/unified-cache-management
 
 RUN export PLATFORM="cuda" && \
     pip install -v -e /vllm-workspace/unified-cache-management
 
 # Apply patch for vLLM
-RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 ENTRYPOINT ["/bin/bash"]
```
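
The revised Dockerfile no longer rebuilds vLLM from source; it patches the pip-installed copy in place, using `pip show vllm` to resolve the site-packages directory the wheel landed in. As a quick sanity check of that resolution (a sketch; the echoed path varies by image and Python version):

```bash
# Resolve the directory pip installed vllm into; the exact path varies by environment.
VLLM_DIR=$(pip show vllm | grep Location | awk '{print $2}')
echo "$VLLM_DIR"   # e.g. /usr/local/lib/python3.12/dist-packages

# Dry-run the patch before baking it into the image for real:
git -C "$VLLM_DIR" apply --check \
    /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
```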

docker/Dockerfile-NPU

Lines changed: 2 additions & 2 deletions
```diff
@@ -12,11 +12,11 @@ RUN export PLATFORM="ascend" && \
 
 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-adapt.patch
 
 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/vllm-ascend-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/unifiedcache/patch/0.9.2/vllm-ascend-adapt.patch
 
 
 CMD ["/bin/bash"]
```

docs/source/feature/sparse_attn.md

Lines changed: 40 additions & 0 deletions
```diff
@@ -1 +1,41 @@
 # Sparse Attention
+## Motivations
+Attention mechanisms, especially in LLMs, are often the latency bottleneck during inference due to their computational complexity. Despite their importance in capturing contextual relationships, traditional attention requires processing all token interactions, leading to significant delays.
+
+<p align="center">
+<img alt="UCM" src="../images/attention_overhead.png" width="80%">
+</p>
+
+Researchers have found that attention in LLMs is highly dispersed:
+<p align="center">
+<img alt="UCM" src="../images/attention_sparsity.png" width="80%">
+</p>
+
+This motivates the active development of sparse attention algorithms to address the latency issue. These algorithms reduce the number of token interactions by attending only to the most relevant parts of the input, thereby lowering compute and memory requirements.
+While promising, the gap between theoretical prototypes and practical implementations in inference frameworks remains a significant challenge.
+
+Many existing frameworks, like vLLM, are optimized for traditional attention mechanisms. Adapting them for sparse attention can be complex and may require substantial modifications to the underlying architecture.
+Issues such as maintaining compatibility with existing model architectures, ensuring efficient memory usage, and leveraging hardware acceleration must be addressed to facilitate the adoption of sparse attention in real-world applications.
+
+We present a **unified sparse attention framework** under UCM. A unified framework streamlines the integration of various sparse attention algorithms into inference engines like vLLM by providing **standardized interfaces and utilities** that simplify the implementation process.
+With UCM, researchers can rapidly prototype and test different sparse attention algorithms without extensive re-engineering of the inference engine, and shared optimizations within the framework help ensure that the performance gains of sparse attention are realized in real-world scenarios.
+
+## Architecture
+### Overview
+The core concept of our UCMSparse attention framework is to offload the complete Key-Value (KV) cache to a dedicated KV cache store. We then identify the crucial KV pairs relevant to the current context, as determined by our sparse attention algorithms, and selectively load only the necessary portions of the KV cache from storage into High Bandwidth Memory (HBM). This design significantly reduces the HBM footprint while accelerating generation.
+<p align="center">
+<img alt="UCM" src="../images/sparse_attn_arch.png" width="80%">
+</p>
+
+
+### Key Concepts
+- UCMSparse in the scheduler: this instance lives in the same process as the `EngineCore` and acts as a sparse attention budget controller. It estimates the number of slots required by a specific sparse attention algorithm, and `KVCacheManager` then allocates the necessary blocks based on `num_slots_sparse`. For example, `ESA` needs only 20%~30% of the blocks of normal attention.
+- UCMSparse in the model_runner: this instance lives in the same process as the `Worker`.
+  A typical sparse attention algorithm works like this:
+  1. In prefill, it dumps the full KV cache from HBM to storage.
+  2. In decode, it retrieves the most relevant blocks for the current context and loads them from the store into HBM.
+  3. In decode, it also dumps newly generated blocks to keep the latest context accessible.
+- With fine-grained task scheduling, retrieval and loading run asynchronously and overlap with model execution, so UCMSparse introduces no extra overhead while generation speed improves thanks to less computation and fewer memory accesses (see the sketch below).
+
+
+See `ESA` for more details.
```
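
To make the dump/retrieve/load flow above concrete, here is a minimal Python sketch. It is illustrative only: the store methods (`dump`, `retrieve_top_k`, `load`, `num_blocks`) are assumed stand-ins for the operations the document describes, not UCM's actual API.

```python
# Hypothetical sketch of the UCMSparse prefill/decode flow described above.
# All names below are illustrative assumptions, not UCM's actual API.

class UcmSparseSketch:
    def __init__(self, store, budget_ratio=0.25):
        self.store = store                # offloaded KV cache storage (e.g. NFS-backed)
        self.budget_ratio = budget_ratio  # e.g. ESA keeps ~20%-30% of blocks in HBM

    def on_prefill(self, request_id, kv_blocks):
        # 1. Prefill: dump the full KV cache from HBM to the store.
        self.store.dump(request_id, kv_blocks)

    def on_decode_step(self, request_id, context, new_block):
        # 2. Decode: retrieve the ids of the blocks most relevant to the
        #    current context, staying within the sparse block budget...
        budget = max(1, int(self.store.num_blocks(request_id) * self.budget_ratio))
        relevant_ids = self.store.retrieve_top_k(request_id, context, budget)
        # ...and load only those blocks from the store into HBM.
        hbm_blocks = self.store.load(request_id, relevant_ids)
        # 3. Decode: dump the newly generated block so the latest
        #    context remains accessible for later retrieval.
        self.store.dump(request_id, [new_block])
        return hbm_blocks
```

In the real framework these steps are scheduled asynchronously so they overlap with model execution, which is what keeps retrieval and loading off the critical path.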
Lines changed: 105 additions & 0 deletions (new file)

````markdown
# 1p1d

## Overview
This example demonstrates how to run unified-cache-management with disaggregated prefill using the NFS connector on a single node with a 1 prefiller + 1 decoder setup.

## Prerequisites
- UCM: installed per the Installation documentation.
- Hardware: at least 2 GPUs

## Start disaggregated service
For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instruct.

### Run prefill server
Prefiller launch command:
```bash
export PYTHONHASHSEED=123456
CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
    --max-model-len 20000 \
    --tensor-parallel-size 1 \
    --gpu_memory_utilization 0.87 \
    --trust-remote-code \
    --enforce-eager \
    --no-enable-prefix-caching \
    --port 7800 \
    --block-size 128 \
    --kv-transfer-config \
    '{
        "kv_connector": "UnifiedCacheConnectorV1",
        "kv_connector_module_path": "unifiedcache.integration.vllm.uc_connector",
        "kv_role": "kv_producer",
        "kv_connector_extra_config": {
            "ucm_connector_name": "UcmNfsStore",
            "ucm_connector_config": {
                "storage_backends": "/mnt/test1",
                "kv_block_size": 33554432
            }
        }
    }'
```

### Run decode server
Decoder launch command:
```bash
export PYTHONHASHSEED=123456
CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
    --max-model-len 20000 \
    --tensor-parallel-size 1 \
    --gpu_memory_utilization 0.87 \
    --trust-remote-code \
    --enforce-eager \
    --no-enable-prefix-caching \
    --port 7801 \
    --block-size 128 \
    --kv-transfer-config \
    '{
        "kv_connector": "UnifiedCacheConnectorV1",
        "kv_connector_module_path": "unifiedcache.integration.vllm.uc_connector",
        "kv_role": "kv_consumer",
        "kv_connector_extra_config": {
            "ucm_connector_name": "UcmNfsStore",
            "ucm_connector_config": {
                "storage_backends": "/mnt/test1",
                "kv_block_size": 33554432
            }
        }
    }'
```

### Run proxy server
Make sure prefill nodes and decode nodes can connect to each other.
```bash
cd /vllm-workspace/unified-cache-management/test/
python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
```

## Testing and Benchmarking
### Basic Test
After running all servers, you can test with a simple curl command:
```bash
curl http://localhost:7802/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/home/models/Qwen2.5-7B-Instruct",
        "prompt": "What date is today?",
        "max_tokens": 20,
        "temperature": 0
    }'
```

### Benchmark Test
Use the benchmark scripts provided by vLLM.
```bash
cd /vllm-workspace/vllm/benchmarks
python3 benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --random-input-len 4096 \
    --random-output-len 100 \
    --num-prompts 10 \
    --ignore-eos \
    --model /home/models/Qwen2.5-7B-Instruct \
    --tokenizer /home/models/Qwen2.5-7B-Instruct \
    --host localhost \
    --port 7802 \
    --endpoint /v1/completions \
    --request-rate 1
```
````
Lines changed: 8 additions & 0 deletions (new file)

```markdown
# Disaggregated Prefill

:::{toctree}
:maxdepth: 2
1p1d.md
xpyd.md
:::
```
