
Commit e901e73

allow for single projection "mlp", for @Mr-Grin to experiment around with
1 parent 855c7f8 commit e901e73

5 files changed: +60 −4 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ This will be my last open sourced project under Meta

 - [Flex Attention](https://pytorch.org/blog/flexattention/) for allowing for rapid prototyping

-- <a href="https://github.com/Mr-Grin">@Mr-Grin</a> for the code review and pointing out a few inaccuracies in the implementation
+- <a href="https://github.com/Mr-Grin">@Mr-Grin</a> for the code review and pointing out an inaccuracy with the implementation

 ## Install

native_sparse_attention_pytorch/compress_networks.py

Lines changed: 36 additions & 1 deletion
@@ -3,7 +3,7 @@
 from torch.nn import Module, ModuleList

 from einops import einsum, rearrange
-from einops.layers.torch import EinMix as Mix
+from einops.layers.torch import EinMix as Mix, Rearrange

 # helpers

@@ -98,3 +98,38 @@ def forward(
         compressed = self.net(kv)

         return compressed
+
+# single projection "mlp"
+
+class SingleProjection(Module):
+    def __init__(
+        self,
+        dim_head,
+        compress_window_size,
+        heads = 1
+    ):
+        super().__init__()
+        dim = dim_head * compress_window_size
+        dim_out = dim_head
+
+        is_grouped = heads > 1
+
+        if not is_grouped:
+            self.compress = nn.Sequential(
+                Rearrange('b h w n d -> b h w (n d)'),
+                nn.Linear(dim, dim_out, bias = False)
+            )
+        else:
+            self.compress = Mix(
+                'b h w n i -> b h w o',
+                weight_shape = 'h i o',
+                h = heads,
+                i = dim_head,
+                o = dim_head
+            )
+
+    def forward(
+        self,
+        kv
+    ):
+        return self.compress(kv)
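
For reference, a minimal usage sketch of how the new SingleProjection could be dropped in as the compression network. It mirrors the new test added in this commit below; the top-level `from native_sparse_attention_pytorch import SparseAttention` import path is an assumption, and the shape comments are inferred from the Rearrange/EinMix patterns above.

# sketch only - mirrors tests/test_custom_compress_mlp.py from this commit
# the top-level SparseAttention import path is an assumption
import torch
from native_sparse_attention_pytorch import SparseAttention
from native_sparse_attention_pytorch.compress_networks import SingleProjection

# one linear projection per compressed window:
# (batch, heads, windows, compress_window_size, dim_head) -> (batch, heads, windows, dim_head)
compress_mlp = SingleProjection(
    dim_head = 64,
    compress_window_size = 4,
    heads = 1    # pass heads = 8 instead to get the grouped, per-head EinMix projection
)

attn = SparseAttention(
    dim = 512,
    dim_head = 64,
    heads = 8,
    sliding_window_size = 2,
    compress_block_size = 4,    # should match compress_window_size above
    selection_block_size = 4,
    num_selected_blocks = 2,
    compress_mlp = compress_mlp
)

tokens = torch.randn(2, 31, 512)
attended = attn(tokens)    # output has the same shape as the input tokens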

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.1.15"
+version = "0.1.16"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

tests/test_custom_compress_mlp.py

Lines changed: 21 additions & 0 deletions
@@ -77,3 +77,24 @@ def test_group_mlp():
     attended = attn(tokens)

     assert tokens.shape == attended.shape
+
+@pytest.mark.parametrize('grouped', (False, True))
+def test_single_projection_mlp(grouped):
+    from native_sparse_attention_pytorch.compress_networks import SingleProjection
+
+    attn = SparseAttention(
+        dim = 512,
+        dim_head = 64,
+        heads = 8,
+        sliding_window_size = 2,
+        compress_block_size = 4,
+        selection_block_size = 4,
+        num_selected_blocks = 2,
+        compress_mlp = SingleProjection(64, 4, 8 if grouped else 1)
+    )
+
+    tokens = torch.randn(2, 31, 512)
+
+    attended = attn(tokens)
+
+    assert tokens.shape == attended.shape

train.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
 INTERPOLATED_IMPORTANCE_SCORE = False
 USE_DIFF_TOPK = True

-USE_EFFICIENT_INFERENCE = True # needs validation still
+USE_EFFICIENT_INFERENCE = False # needs validation still

 # experiment related
