
Commit 36837b2

when doing interpolation of importance score, remask to 0 for illegal positions
1 parent: 463963b · commit: 36837b2

File tree: 4 files changed, +15 −8 lines


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -370,6 +370,9 @@ def forward(
 
         importance_scores = cattn[..., num_mem_compress_kv:]
 
+        num_selected = min(self.num_selected_blocks, importance_scores.shape[-1])
+        has_selected_kv_for_fine_attn = num_selected > 0
+
         # maybe average the compressed attention across each grouped queries (per key / values)
 
         if self.query_heads_share_selected_kv:
@@ -383,13 +386,16 @@ def forward(
             # cannot parse their equation, so will just improvise
             # first we expand all the compressed scores to the full sequence length, then average within each fine / selection block size - pad on the right to 0s, which should be fine as sliding window convers the local anyways
 
-        if self.compress_block_size != self.selection_block_size:
+        if has_selected_kv_for_fine_attn and self.compress_block_size != self.selection_block_size:
 
             score_len = importance_scores.shape[-1]
             compress_seq_len = score_len * self.compress_block_size
 
             if self.interpolated_importance_score:
+                mask = importance_scores > 1e-10
+                mask = repeat(mask, '... j -> ... (j block_size)', block_size = self.compress_block_size)
                 importance_scores = interpolate_1d(importance_scores, compress_seq_len)
+                importance_scores = importance_scores.masked_fill(~mask, 0.)
             else:
                 importance_scores = repeat(importance_scores, '... j -> ... (j block_size)', block_size = self.compress_block_size)
 
@@ -400,13 +406,11 @@ def forward(
 
         # handle if number of total blocks is less than number to select for fine attention
 
-        num_selected = min(self.num_selected_blocks, importance_scores.shape[-1])
-
         fq = rotated_q
         fk = rotated_k
         fv = v
 
-        if num_selected > 0:
+        if has_selected_kv_for_fine_attn:
             selected_importance_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)
 
             if self.use_diff_topk:
```
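For readers who want to see the remasking behaviour in isolation, here is a minimal, self-contained sketch of the idea behind the change above: per-block importance scores are interpolated up to the full sequence length, and every position whose source compressed block carried (effectively) zero attention mass is forced back to 0, so interpolation cannot bleed importance into illegal positions that would then win the later top-k selection. The helper name `interpolate_importance_scores` is made up for illustration, and `torch.nn.functional.interpolate` stands in for the repo's `interpolate_1d`; this is a sketch of the technique, not the library's actual implementation.

```python
import torch
import torch.nn.functional as F
from einops import repeat

def interpolate_importance_scores(importance_scores, compress_block_size):
    # importance_scores: (..., num_compressed_blocks) attention mass per compressed block
    *lead, num_blocks = importance_scores.shape
    seq_len = num_blocks * compress_block_size

    # positions are "legal" only if their originating compressed block carries real mass
    legal = importance_scores > 1e-10
    legal = repeat(legal, '... j -> ... (j block)', block = compress_block_size)

    # linearly interpolate the per-block scores up to the uncompressed sequence length
    # (F.interpolate stands in for the repo's interpolate_1d - an assumption)
    scores = F.interpolate(
        importance_scores.reshape(-1, 1, num_blocks),
        size = seq_len,
        mode = 'linear',
        align_corners = False
    ).reshape(*lead, seq_len)

    # remask illegal positions to 0 so they can never be selected by the fine-attention top-k
    return scores.masked_fill(~legal, 0.)

# quick check: the zeroed (illegal) block stays exactly zero after interpolation
scores = torch.tensor([[0.6, 0.4, 0.0]])   # last compressed block carries no mass
out = interpolate_importance_scores(scores, compress_block_size = 4)
assert (out[:, -4:] == 0.).all()
```

Without the final `masked_fill`, linear interpolation would smear nonzero values from the second block into the positions of the third, empty block; the remask restores exact zeros there.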

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.43"
+version = "0.0.44"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
```

tests/test_sparse_attn.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -12,13 +12,15 @@
 @pytest.mark.parametrize('selection_block_size', (8, 4, 2))
 @pytest.mark.parametrize('num_selected_block', (0, 2))
 @pytest.mark.parametrize('query_heads_share_selected_kv', (False, True))
+@pytest.mark.parametrize('interpolated_importance_score', (False, True))
 def test_sparse_attn(
     use_diff_topk,
     seq_len,
     kv_heads,
     selection_block_size,
     num_selected_block,
-    query_heads_share_selected_kv
+    query_heads_share_selected_kv,
+    interpolated_importance_score
 ):
     attn = SparseAttention(
         dim = 512,
@@ -30,7 +32,8 @@ def test_sparse_attn(
         selection_block_size = selection_block_size,
         num_selected_blocks = num_selected_block,
         use_diff_topk = use_diff_topk,
-        query_heads_share_selected_kv = query_heads_share_selected_kv
+        query_heads_share_selected_kv = query_heads_share_selected_kv,
+        interpolated_importance_score = interpolated_importance_score
     )
 
     tokens = torch.randn(2, seq_len, 512)
```
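To exercise the new flag outside of pytest, a usage sketch along the lines of the test is below. Only `dim = 512`, `selection_block_size`, `num_selected_blocks`, `use_diff_topk`, `query_heads_share_selected_kv` and `interpolated_importance_score` come from the diff; the remaining constructor arguments and the sequence length are illustrative assumptions, not values from the test file.

```python
import torch
from native_sparse_attention_pytorch import SparseAttention

# mirrors one point of the test grid; interpolated_importance_score = True
# exercises the remasked interpolation path touched by this commit
attn = SparseAttention(
    dim = 512,                              # from the test
    dim_head = 64,                          # assumption
    heads = 8,                              # assumption
    sliding_window_size = 2,                # assumption
    compress_block_size = 4,                # assumption
    selection_block_size = 8,               # from the test grid
    num_selected_blocks = 2,                # from the test grid
    use_diff_topk = True,                   # from the test grid
    query_heads_share_selected_kv = True,   # from the test grid
    interpolated_importance_score = True    # new test parameter
)

tokens = torch.randn(2, 31, 512)            # (batch, seq_len, dim); seq_len is an assumption
out = attn(tokens)
assert out.shape == tokens.shape
```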

train.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -43,7 +43,7 @@
 FINE_BLOCK_SIZE = 32
 NUM_FINE_SELECTED = 0
 
-INTERPOLATED_IMPORTANCE_SCORE = True
+INTERPOLATED_IMPORTANCE_SCORE = False
 USE_DIFF_TOPK = True
 
 # experiment related
```
