
Commit 1798a2c

committed
remove interpolated importance score, and for now only focus on selection block sizes greater than compress block
1 parent cb34259 commit 1798a2c

4 files changed (+10, -28 lines)


native_sparse_attention_pytorch/compress_networks.py

Lines changed: 5 additions & 4 deletions
@@ -135,8 +135,9 @@ def forward(
     ):
         return self.compress(kv)

+# simple transformer compressor, pull requested by Eric Pasewark

-class SimpleMultiheadSelfAttention(nn.Module):
+class SimpleMultiheadSelfAttention(Module):
     def __init__(self, dim, num_heads, dropout=0.0):
         super().__init__()
         assert dim % num_heads == 0, "Hidden dimension must be divisible by number of heads"

@@ -167,7 +168,7 @@ def forward(self, x):
         attn_out = attn_out.transpose(1, 2).reshape(B, L, D)
         return self.out_proj(attn_out)

-class SimpleTransformerFeedForward(nn.Module):
+class SimpleTransformerFeedForward(Module):
     def __init__(self, dim, hidden_dim, dropout=0.0):
         """Two-layer feed-forward network with GELU activation."""
         super().__init__()

@@ -183,7 +184,7 @@ def forward(self, x):
         out = self.dropout(out)
         return out

-class SimpleTransformerLayer(nn.Module):
+class SimpleTransformerLayer(Module):
     def __init__(self, dim, num_heads, ff_hidden_dim=None, dropout=0.0):
         """Single Transformer layer: RMSNorm + Multi-head attention + RMSNorm + FeedForward."""
         super().__init__()

@@ -201,7 +202,7 @@ def forward(self, x):
         x = x + f
         return x

-class CompressTransformer(nn.Module):
+class CompressTransformer(Module):
     def __init__(self, num_layers, dim, num_heads, ff_hidden_dim=None, dropout=0.0):
         """
         Stacked Transformer encoder layers.
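
The change to compress_networks.py is a mechanical rename of the base class from nn.Module to the bare Module on the transformer compressor classes. A minimal sketch of the import this presumably relies on (the actual import block of compress_networks.py is not part of this diff), with a hypothetical Example class standing in for the real ones:

# assumed import style; subclassing `Module` is equivalent to subclassing `nn.Module`
from torch import nn
from torch.nn import Module

class Example(Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        return self.proj(x)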

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 4 additions & 18 deletions
@@ -145,13 +145,6 @@ def pad_at_dim(t, pad, dim = -1, value = 0.):
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)

-def interpolate_1d(x, length, mode = 'bilinear'):
-    x, inverse_pack = pack_one_with_inverse(x, '* n')
-    x = rearrange(x, 'b n -> b 1 n 1')
-    x = F.interpolate(x, (length, 1), mode = mode)
-    x = rearrange(x, 'b 1 n 1 -> b n')
-    return inverse_pack(x)
-
 def straight_through(t, target):
     return t + (target - t).detach()

@@ -209,7 +202,6 @@ def __init__(
         norm = True,
         use_diff_topk = False,
         use_triton_kernel = False,
-        interpolated_importance_score = False,
         query_heads_share_selected_kv = True, # if set to True, importance score is averaged across query heads to select top-n buckets of kv per kv head - but can be set to False for each query head within a group to look at different sets of kv buckets. will be more memory and compute of course
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,

@@ -319,10 +311,10 @@ def __init__(

         self.use_diff_topk = use_diff_topk

-        self.interpolated_importance_score = interpolated_importance_score # in the case fine block size < compressed block size, will weigh space better when selecting
-
         self.query_heads_share_selected_kv = query_heads_share_selected_kv

+        assert divisible_by(selection_block_size, compress_block_size), f'selection block size {selection_block_size} must be greater than or equal to compress block size {compress_block_size}, as well as divisible by the compress block size'
+
         self.selection_block_size = selection_block_size

         assert num_selected_blocks >= 0

@@ -473,10 +465,7 @@ def forward_inference(
         if self.compress_block_size != self.selection_block_size:
             compress_seq_len = num_compress_blocks * self.compress_block_size

-            if self.interpolated_importance_score:
-                importance_scores = interpolate_1d(importance_scores, compress_seq_len)
-            else:
-                importance_scores = repeat(importance_scores, '... j -> ... (bsz j)', bsz = self.compress_block_size)
+            importance_scores = repeat(importance_scores, '... j -> ... (bsz j)', bsz = self.compress_block_size)

             fine_seq_len = round_down_mult(compress_seq_len, self.selection_block_size)

@@ -702,10 +691,7 @@ def forward(

             compress_seq_len = num_compress_blocks * self.compress_block_size

-            if self.interpolated_importance_score:
-                importance_scores = interpolate_1d(importance_scores, compress_seq_len)
-            else:
-                importance_scores = repeat(importance_scores, '... j -> ... (j block_size)', block_size = self.compress_block_size)
+            importance_scores = repeat(importance_scores, '... j -> ... (j block_size)', block_size = self.compress_block_size)

             padding = fine_divisible_seq_len - compress_seq_len
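
With interpolate_1d removed, compressed-block importance scores are always expanded with einops repeat, and the new constructor assertion guarantees the expansion lines up with whole selection blocks. A minimal sketch of that path with made-up shapes (the real tensors carry their own batch, head and query dimensions, and the library's downstream pooling may differ from the simple mean used here):

import torch
from einops import repeat, reduce

compress_block_size = 4
selection_block_size = 16          # must now be a multiple of compress_block_size
num_compress_blocks = 8

# one importance score per compressed block: (batch, heads, query_len, num_compress_blocks)
importance_scores = torch.randn(2, 8, 32, num_compress_blocks)

# the surviving branch: copy each compressed-block score across the tokens of its block
per_token = repeat(importance_scores, '... j -> ... (j block_size)', block_size = compress_block_size)
assert per_token.shape[-1] == num_compress_blocks * compress_block_size   # 32 token positions

# each selection block now covers a whole number of compress blocks, so the per-token
# scores regroup cleanly (mean pooling here is just for illustration)
per_selection_block = reduce(per_token, '... (n w) -> ... n', 'mean', w = selection_block_size)
print(per_selection_block.shape)   # torch.Size([2, 8, 32, 2])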

tests/test_sparse_attn.py

Lines changed: 1 addition & 4 deletions
@@ -10,12 +10,11 @@
 @pytest.mark.parametrize('causal', (False, True))
 @pytest.mark.parametrize('seq_len', (1, 4, 31, 32, 120))
 @pytest.mark.parametrize('kv_heads', (8, 4))
-@pytest.mark.parametrize('selection_block_size', (8, 4, 2))
+@pytest.mark.parametrize('selection_block_size', (8, 16, 32))
 @pytest.mark.parametrize('compress_block_size', (8, 4))
 @pytest.mark.parametrize('compress_block_overlap_len', (0, 2))
 @pytest.mark.parametrize('num_selected_block', (0, 2))
 @pytest.mark.parametrize('query_heads_share_selected_kv', (False, True))
-@pytest.mark.parametrize('interpolated_importance_score', (False, True))
 def test_sparse_attn(
     use_diff_topk,
     causal,

@@ -26,7 +25,6 @@ def test_sparse_attn(
     compress_block_overlap_len,
     num_selected_block,
     query_heads_share_selected_kv,
-    interpolated_importance_score
 ):
     attn = SparseAttention(
         dim = 512,

@@ -41,7 +39,6 @@ def test_sparse_attn(
         num_selected_blocks = num_selected_block,
         use_diff_topk = use_diff_topk,
         query_heads_share_selected_kv = query_heads_share_selected_kv,
-        interpolated_importance_score = interpolated_importance_score
     )

     tokens = torch.randn(2, seq_len, 512)
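
The selection_block_size grid moves from (8, 4, 2) to (8, 16, 32) so that every combination with compress_block_size in (8, 4) satisfies the new assertion. A quick check of the updated grid, with a stand-in for the library's divisible_by helper:

# stand-in for the divisible_by helper the new assertion calls
def divisible_by(num, den):
    return (num % den) == 0

compress_block_sizes = (8, 4)
selection_block_sizes = (8, 16, 32)   # previously (8, 4, 2)

for compress in compress_block_sizes:
    for selection in selection_block_sizes:
        # every pair now passes; the old selection sizes 4 and 2 would fail for compress = 8
        assert selection >= compress and divisible_by(selection, compress)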

train.py

Lines changed: 0 additions & 2 deletions
@@ -45,7 +45,6 @@
 FINE_BLOCK_SIZE = 16
 NUM_FINE_SELECTED = 4

-INTERPOLATED_IMPORTANCE_SCORE = False
 USE_DIFF_TOPK = True

 USE_EFFICIENT_INFERENCE = True # needs validation still

@@ -106,7 +105,6 @@ def decode_tokens(tokens):
         selection_block_size = FINE_BLOCK_SIZE,
         num_selected_blocks = NUM_FINE_SELECTED,
         use_diff_topk = USE_DIFF_TOPK,
-        interpolated_importance_score = INTERPOLATED_IMPORTANCE_SCORE,
         query_heads_share_selected_kv = QUERY_HEADS_SHARE_SELECTION
     )
 ).cuda()
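
With INTERPOLATED_IMPORTANCE_SCORE gone, the training config no longer toggles interpolation; FINE_BLOCK_SIZE (passed as selection_block_size) must now be a whole multiple of the compress block size, which this hunk does not show. A small sanity check one could add, with the compress constant's name and value assumed purely for illustration:

FINE_BLOCK_SIZE = 16
COMPRESS_BLOCK_SIZE = 4   # assumed name and value; the real constant is outside this hunk

# mirrors the assertion SparseAttention now raises at construction time
assert FINE_BLOCK_SIZE % COMPRESS_BLOCK_SIZE == 0, 'selection (fine) block size must be a whole multiple of the compress block size'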
