Commit f77444e

allow for the strategy combine "mlp" to be customized as well, but this portion is not that critical imo
1 parent d531216 commit f77444e

2 files changed: +7 −4 lines changed


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 6 additions & 3 deletions
@@ -88,7 +88,7 @@ def __init__(
         use_diff_topk = False,
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,
-
+        strategy_combine_mlp: Module | None = None
     ):
         super().__init__()
         self.heads = heads
@@ -138,7 +138,7 @@ def __init__(
             compress_mlp = nn.Sequential(
                 Rearrange('b h w n d -> b h w (n d)'),
                 nn.Linear(compress_dim, compress_mlp_dim_hidden),
-                nn.SiLU(),
+                nn.ReLU(),
                 nn.Linear(compress_mlp_dim_hidden, dim_head),
             )

@@ -154,8 +154,11 @@ def __init__(

         # they combine the three sparse branches through a learned combine with sigmoid activation

+        if not exists(strategy_combine_mlp):
+            strategy_combine_mlp = nn.Linear(dim, 3 * heads)
+
         self.to_strategy_combine = nn.Sequential(
-            nn.Linear(dim, 3 * heads),
+            strategy_combine_mlp,
             nn.Sigmoid(),
             Rearrange('b n (h s) -> b h n s', h = heads)
         )
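
With this change, the learned gate that combines the three sparse branches can be swapped out via the new strategy_combine_mlp argument; when nothing is passed in, it falls back to the previous single nn.Linear(dim, 3 * heads). A minimal usage sketch, assuming the SparseAttention constructor arguments shown in the repository README; the two-layer custom_combine module here is purely illustrative, and any replacement only needs to map dim to 3 * heads so the downstream sigmoid and head rearrange still line up:

import torch
from torch import nn

from native_sparse_attention_pytorch import SparseAttention

dim = 512
heads = 8

# illustrative (hypothetical) custom combine module - whatever is passed in
# must map the model dimension `dim` to 3 * heads gate logits, since its
# output goes through nn.Sigmoid() and is rearranged to (b, h, n, 3)
custom_combine = nn.Sequential(
    nn.Linear(dim, dim),
    nn.SiLU(),
    nn.Linear(dim, 3 * heads)
)

attn = SparseAttention(
    dim = dim,
    dim_head = 64,
    heads = heads,
    sliding_window_size = 2,
    compress_block_size = 4,
    selection_block_size = 4,
    num_selected_blocks = 2,
    strategy_combine_mlp = custom_combine
)

tokens = torch.randn(2, 31, dim)
attended = attn(tokens)  # same shape as the input tokens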

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.11"
+version = "0.0.12"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
