@@ -290,6 +290,9 @@ def __init__(
         self.split_compress_window = split_compress_window_fn
         self.compress_window_size = compress_window_size
 
+        assert compress_block_overlap_len < compress_block_size
+        self.compress_block_overlap_len = compress_block_overlap_len
+
         # compression attention related parameters
 
         self.num_mem_compress_kv = num_compressed_mem_kv
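Note: the snippet below is an illustrative sketch, not code from this commit. Assuming each compression window covers compress_block_size new tokens plus compress_block_overlap_len tokens carried over from the previous block (which is why the overlap must be strictly smaller than the block size), overlapping windows could be carved out of a key tensor roughly like this:

# hypothetical illustration of overlapping compression windows
import torch

compress_block_size = 4
compress_block_overlap_len = 2          # must be < compress_block_size
window = compress_block_size + compress_block_overlap_len

k = torch.randn(1, 2, 16, 8)            # (batch, heads, seq, dim)

# left-pad the sequence so the first window also has an overlap region
k_padded = torch.nn.functional.pad(k, (0, 0, compress_block_overlap_len, 0))

# window length = block + overlap, stride = block size
windows = k_padded.unfold(-2, window, compress_block_size)  # (b, h, w, d, window)
windows = windows.movedim(-1, -2)                           # (b, h, w, window, d)

print(windows.shape)                    # torch.Size([1, 2, 4, 6, 8])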
@@ -382,6 +385,7 @@ def forward_inference(
 
         sliding_window = self.sliding_window_size
         compress_divisible_seq_len = round_down_mult(seq_len, self.compress_block_size)
+        compress_overlap_len = self.compress_block_overlap_len
 
         fine_divisible_seq_len = round_up_mult(seq_len, self.selection_block_size)
         num_fine_blocks = fine_divisible_seq_len // self.selection_block_size
@@ -439,19 +443,20 @@ def forward_inference(
 
         running_compress_seq_len = run_k.shape[-2]
 
-        if divisible_by(running_compress_seq_len, self.compress_block_size):
-
-            k_compress_input = self.split_compress_window(run_k)
-            v_compress_input = self.split_compress_window(run_v)
+        if divisible_by(running_compress_seq_len, self.compress_block_size + compress_overlap_len):
+            k_compress_input = rearrange(run_k, 'b h n d -> b h 1 n d')
+            v_compress_input = rearrange(run_v, 'b h n d -> b h 1 n d')
 
             k_compress_input = einx.add('b h w n d, h n d', k_compress_input, self.k_intrablock_positions)
             v_compress_input = einx.add('b h w n d, h n d', v_compress_input, self.v_intrablock_positions)
 
             next_ck = self.k_compress(k_compress_input)
             next_cv = self.v_compress(v_compress_input)
 
-            run_k = run_k[..., 0:0, :]
-            run_v = run_v[..., 0:0, :]
+            run_kv_slice = slice(-compress_overlap_len, None) if compress_overlap_len > 0 else slice(0, 0)
+
+            run_k = run_k[..., run_kv_slice, :]
+            run_v = run_v[..., run_kv_slice, :]
 
             ck = cat((ck, next_ck), dim = -2)
             cv = cat((cv, next_cv), dim = -2)
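Note: the snippet below is an illustrative sketch of the slicing edge case handled above, not code from this commit. A negative start of -overlap keeps the last overlap tokens of the running keys/values, but when the overlap is zero, a slice starting at -0 would keep everything, so an explicitly empty slice(0, 0) is needed:

# hypothetical demonstration of the run_kv_slice edge case
import torch

run_k = torch.randn(1, 2, 6, 8)          # (batch, heads, seq, dim)

for overlap in (2, 0):
    sl = slice(-overlap, None) if overlap > 0 else slice(0, 0)
    kept = run_k[..., sl, :]
    print(overlap, kept.shape[-2])        # 2 -> 2 tokens kept, 0 -> 0 tokens kept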
@@ -593,6 +598,8 @@ def forward(
         compress_divisible_seq_len = round_down_mult(seq_len, self.compress_block_size)
         num_compress_blocks = compress_divisible_seq_len // self.compress_block_size
 
+        compress_overlap_len = self.compress_block_overlap_len
+
         fine_divisible_seq_len = round_up_mult(seq_len, self.selection_block_size)
         num_fine_blocks = fine_divisible_seq_len // self.selection_block_size
 
@@ -622,8 +629,14 @@ def forward(
         k_compress_input = einx.add('b h w n d, h n d', k_compress_input, self.k_intrablock_positions)
         v_compress_input = einx.add('b h w n d, h n d', v_compress_input, self.v_intrablock_positions)
 
-        run_k = k[..., compress_divisible_seq_len:, :]
-        run_v = v[..., compress_divisible_seq_len:, :]
+        run_k, run_v = k, v
+
+        if return_cache and compress_overlap_len > 0:
+            run_k = F.pad(run_k, (0, 0, compress_overlap_len, 0), value = 0.)
+            run_v = F.pad(run_v, (0, 0, compress_overlap_len, 0), value = 0.)
+
+        run_k = run_k[..., compress_divisible_seq_len:, :]
+        run_v = run_v[..., compress_divisible_seq_len:, :]
 
         cq = q
         ck = self.k_compress(k_compress_input) # Equation (7) of the Native Sparse Attention paper
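Note: the snippet below is an illustrative sketch of the padding call used above, not code from this commit. In F.pad the pad tuple is read from the last dimension backwards, so (0, 0, compress_overlap_len, 0) leaves the feature dimension alone and left-pads the sequence dimension (dim -2) with overlap-many zero positions:

# hypothetical demonstration of left-padding the sequence dimension
import torch
import torch.nn.functional as F

overlap = 2
k = torch.randn(1, 2, 5, 8)               # (batch, heads, seq, dim)

k_padded = F.pad(k, (0, 0, overlap, 0), value = 0.)
print(k_padded.shape)                                   # torch.Size([1, 2, 7, 8])
print(k_padded[..., :overlap, :].abs().sum().item())    # 0.0 -> left padding is zeros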