Commit aa359ea (1 parent: 9a038d1)

allow for different compress to fine block sizes during inference, and also make sure interpolated scores work

File tree: 4 files changed (+21, -7 lines)

native_sparse_attention_pytorch/native_sparse_attention.py
Lines changed: 17 additions & 5 deletions

@@ -171,7 +171,7 @@ def attend(
     mask_value = max_neg_value(sim)
 
     if exists(mask):
-        sim = sim.masked_fill(~mask, mask_value)
+        sim = sim.masked_fill(~mask, mask_value // 10)
 
     attn = sim.softmax(dim = -1)
 
@@ -425,13 +425,25 @@ def forward_inference(
 
         # 2. fine attention inference (todo - compress and fine diff block sizes)
 
-        assert self.compress_block_size == self.selection_block_size
-
         importance_scores = csim[..., self.num_mem_compress_kv:]
-        importance_scores += torch.randn_like(importance_scores) * 100
 
         num_compress_blocks = importance_scores.shape[-1]
-        num_selected = min(self.num_selected_blocks, num_compress_blocks)
+
+        if self.compress_block_size != self.selection_block_size:
+            compress_seq_len = num_compress_blocks * self.compress_block_size
+
+            if self.interpolated_importance_score:
+                importance_scores = interpolate_1d(importance_scores, compress_seq_len)
+            else:
+                importance_scores = repeat(importance_scores, '... j -> ... (bsz j)', bsz = self.compress_block_size)
+
+            fine_seq_len = round_down_mult(compress_seq_len, self.selection_block_size)
+
+            importance_scores = importance_scores[..., :fine_seq_len]
+            importance_scores = reduce(importance_scores, '... (bsz j) -> ... j', 'mean', bsz = self.selection_block_size)
+
+        num_fine_blocks = importance_scores.shape[-1]
+        num_selected = min(self.num_selected_blocks, num_fine_blocks)
         has_selected_kv_for_fine_attn = num_selected > 0
 
         # block causal diagonal
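
The core of this change is remapping compression-level importance scores onto fine selection blocks when the two block sizes differ: the scores are upsampled to token resolution (linear interpolation when interpolated scores are enabled, otherwise repeating each compress block's score), truncated to a multiple of the selection block size, then mean-pooled per selection block. Below is a minimal standalone sketch of that idea, not the repo's exact code: the helper name `remap_importance_scores` is hypothetical, `F.interpolate` stands in for the repo's `interpolate_1d` helper, and the einops patterns use a consecutive-token `(j b)` ordering convention for illustration.

```python
import torch
import torch.nn.functional as F
from einops import repeat, reduce

def remap_importance_scores(
    scores,                  # (batch, heads, num_compress_blocks)
    compress_block_size,
    selection_block_size,
    interpolated = False
):
    num_compress_blocks = scores.shape[-1]
    compress_seq_len = num_compress_blocks * compress_block_size

    if interpolated:
        # smooth upsample to per-token resolution (stand-in for the repo's interpolate_1d)
        scores = F.interpolate(scores, size = compress_seq_len, mode = 'linear', align_corners = False)
    else:
        # nearest-neighbor: every token inherits the score of its compress block
        scores = repeat(scores, '... j -> ... (j b)', b = compress_block_size)

    # truncate to a multiple of the selection block size, then mean-pool per selection block
    fine_seq_len = compress_seq_len // selection_block_size * selection_block_size
    scores = scores[..., :fine_seq_len]
    return reduce(scores, '... (j b) -> ... j', 'mean', b = selection_block_size)

scores = torch.randn(1, 8, 6)                  # 6 compress blocks of size 8 -> 48 tokens
fine = remap_importance_scores(scores, 8, 16)  # -> 3 selection blocks of size 16
print(fine.shape)                              # torch.Size([1, 8, 3])
```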

native_sparse_attention_pytorch/triton_native_sparse_attention.py
Lines changed: 2 additions & 0 deletions

@@ -1707,6 +1707,8 @@ def native_sparse_attend(
     assert divisible_by(q_heads, kv_heads)
     assert sel_heads in (q_heads, kv_heads)
 
+    assert block_size >= 16, 'fine selection block size must be 16 or greater for now'
+
     # query heads within each group to attend to different segments
 
     if kv_heads != sel_heads:

pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.1.0"
+version = "0.1.1"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

train.py
Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@
 # sparse attention related
 
 SLIDING_WINDOW_SIZE = 64
-COMPRESS_BLOCK_SIZE = 16
+COMPRESS_BLOCK_SIZE = 8
 
 FINE_BLOCK_SIZE = 16
 NUM_FINE_SELECTED = 4
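
The training script now exercises the new capability directly: compression runs at block size 8 while fine selection stays at 16 (the Triton kernel above still requires a fine selection block size of at least 16). A hedged usage sketch follows; the constructor keywords shown (dim, dim_head, heads, sliding_window_size, compress_block_size, selection_block_size, num_selected_blocks) are assumed to match the package's SparseAttention module, so verify them against the repo README.

```python
import torch
from native_sparse_attention_pytorch import SparseAttention

# assumed constructor keywords - verify against the repo README
attn = SparseAttention(
    dim = 512,
    dim_head = 64,
    heads = 8,
    sliding_window_size = 64,
    compress_block_size = 8,     # coarse compression granularity, as in train.py above
    selection_block_size = 16,   # fine selection granularity, now allowed to differ
    num_selected_blocks = 4,
    # interpolated_importance_score = True,  # per this commit; verify the exact init kwarg name
)

tokens = torch.randn(1, 1024, 512)
out = attn(tokens)               # expected shape: (1, 1024, 512)
```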
