fix an off-by-one error in inference for the block causal sliding window in fine attention; also fix another edge case where, if fine selection is turned off, the regular forward was not using block causal attention

lucidrains · lucidrains · commit d1f5d411b01f · 2025-03-19T07:16:37.000-07:00
diff --git a/native_sparse_attention_pytorch/native_sparse_attention.py b/native_sparse_attention_pytorch/native_sparse_attention.py
@@ -2,6 +2,7 @@
 
 from copy import deepcopy
 from math import ceil
+from functools import partial
 
 import torch
 import torch.nn.functional as F
@@ -483,7 +484,7 @@ def forward_inference(
 
         # block causal diagonal
 
-        fine_sliding_window = (seq_len % self.selection_block_size) + 1
+        fine_sliding_window = ((seq_len - 1) % self.selection_block_size) + 1
         fk = k[..., -fine_sliding_window:, :]
         fv = v[..., -fine_sliding_window:, :]
 
@@ -721,6 +722,9 @@ def forward(
         num_selected = min(num_selected, importance_scores.shape[-1])
         has_selected_kv_for_fine_attn = num_selected > 0
 
+        remainder = fine_divisible_seq_len - seq_len
+        pad_to_multiple = partial(pad_at_dim, pad = (0, remainder), dim = -2)
+
         if has_selected_kv_for_fine_attn:
 
             # get the top-n kv segments for fine attention
@@ -760,10 +764,9 @@ def forward(
                 fmask = selected_importance_values > 1e-10
 
                 if seq_len < fine_divisible_seq_len:
-                    remainder = fine_divisible_seq_len - seq_len
-                    fk = pad_at_dim(fk, (0, remainder), value = 0., dim = -2)
-                    fv = pad_at_dim(fv, (0, remainder), value = 0., dim = -2)
-                    fq = pad_at_dim(fq, (0, remainder), value = 0., dim = -2)
+                    fk = pad_to_multiple(fk)
+                    fv = pad_to_multiple(fv)
+                    fq = pad_to_multiple(fq)
 
                     fmask = pad_at_dim(fmask, (0, remainder), value = False, dim = -2)
 
@@ -845,11 +848,20 @@ def forward(
             seq_len = fk.shape[-2]
             fmask = None
 
+            fk = pad_to_multiple(fk)
+            fv = pad_to_multiple(fv)
+            fq = pad_to_multiple(fq)
+
+            fq, fk, fv = tuple(rearrange(t, 'b h (w n) d -> (b w) h n d', n = self.selection_block_size) for t in (fq, fk, fv))
+
             if self.causal:
-                fmask = causal_mask = torch.ones((seq_len, seq_len), device = device, dtype = torch.bool).tril()
+                fmask = causal_mask = torch.ones((self.selection_block_size, self.selection_block_size), device = device, dtype = torch.bool).tril()
 
             fine_attn_out = attend(fq, fk, fv, mask = fmask)
 
+            fine_attn_out = rearrange(fine_attn_out, '(b w) h n d -> b h (w n) d', b = batch)
+            fine_attn_out = fine_attn_out[..., :seq_len, :]
+
         # 3. overlapping sliding window, this is unsurprising and expected - `s` for sliding
 
         sq = q
diff --git a/tests/test_sparse_attn.py b/tests/test_sparse_attn.py
@@ -50,7 +50,7 @@ def test_sparse_attn(
 
     assert tokens.shape == attended.shape
 
-@pytest.mark.parametrize('seq_len', (8,))
+@pytest.mark.parametrize('seq_len', (2, 8, 16))
 def test_inference(seq_len):
 
     attn = SparseAttention(
@@ -61,7 +61,7 @@ def test_inference(seq_len):
         sliding_window_size = 2,
         compress_block_size = 5,
         selection_block_size = 10,
-        num_selected_blocks = 2
+        num_selected_blocks = 0
     )
 
     tokens = torch.randn(2, seq_len, 512)