Commit cfa6e08

committed
do the differential topk gating in a less optimal way, but one that accommodates fused attention better
1 parent 671534a commit cfa6e08

File tree

2 files changed (+12, -13 lines)


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 11 additions & 12 deletions
@@ -398,8 +398,9 @@ def forward(
             selected_importance_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)

             if self.use_diff_topk:
-                assert not exists(fine_selection_flex_mask)
                 gates = straight_through(selected_importance_values, 1.)
+                gates = gates.cumsum(dim = -1)[..., -1]
+                gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = self.num_grouped_queries)

             if exists(fine_selection_flex_mask):
                 # flex attention for the selection for fine attention
@@ -422,7 +423,7 @@ def forward(
                     selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)

                     if self.use_diff_topk:
-                        gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
+                        gates = pad_at_dim(gates, (0, remainder), value = 1.)

                 # handle block causal diagonal in the diagram, but run experiments without to see

@@ -453,16 +454,7 @@ def forward(
                 fk = fk.gather(3, selected_block_indices)
                 fv = fv.gather(3, selected_block_indices)

-                # handle maybe gating
-
-                if self.use_diff_topk:
-                    gates = F.pad(gates, (0, 1), value = 1.)
-
-                    fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
-                    fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)
-
-                fk = rearrange(fk, 'b h i w j d -> b h i (w j) d')
-                fv = rearrange(fv, 'b h i w j d -> b h i (w j) d')
+                fk, fv = tuple(rearrange(t, 'b h i w j d -> b h i (w j) d') for t in (fk, fv))

                 # fine attention

@@ -483,6 +475,13 @@ def forward(
                 fine_attn_out = rearrange(fine_attn_out, 'b h qh ... -> b (h qh) ...')

                 fine_attn_out = fine_attn_out[..., :seq_len, :]
+
+                # handle maybe gating
+
+                if self.use_diff_topk:
+                    gates = gates[..., :seq_len]
+                    fine_attn_out = einx.multiply('b h n, b h n d -> b h n d', gates, fine_attn_out)
+
         else:
             # if only first block, just do a simple block causal

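The substance of the change: previously, when use_diff_topk was enabled, a straight-through gate with forward value 1. was multiplied into every gathered fine key / value block (and the flex-attention path was asserted away entirely). Now the per-block gates are collapsed into a single gate per query position (cumsum over the selection dimension, last element), repeated across the grouped query heads, and multiplied onto the fine attention output afterwards, so the attention computation itself stays untouched and can run inside a fused kernel. Below is a minimal, self-contained sketch of the pattern, not the library's actual code: it assumes straight_through(t, target) is the usual t + (target - t).detach() trick, uses toy shapes, and omits the repeat across grouped query heads.

import torch

def straight_through(t, target):
    # assumed definition: forward pass behaves like `target`, gradients flow into `t`
    return t + (target - t).detach()

# toy shapes standing in for the real ones
batch, heads, seq_len, num_blocks, dim_head, num_selected = 1, 2, 8, 6, 16, 3

importance_scores = torch.randn(batch, heads, seq_len, num_blocks, requires_grad = True)
fine_attn_out = torch.randn(batch, heads, seq_len, dim_head)  # stand-in for the fused fine attention output

selected_importance_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)

# one gate per selected block, forward value exactly 1.
gates = straight_through(selected_importance_values, 1.)

# collapse to one gate per query position: the forward value is the constant
# num_selected, but gradients still reach every selected importance score
gates = gates.cumsum(dim = -1)[..., -1]

# gate the attention output instead of the gathered keys / values
gated_out = fine_attn_out * gates.unsqueeze(-1)

gated_out.sum().backward()

# gradient lands only on the top-k entries of the importance scores
assert (importance_scores.grad != 0).sum().item() == batch * heads * seq_len * num_selected

In this sketch the collapsed gate is purely a device for routing gradients from the attention output back into the top-k importance scores; the trade-off, presumably the "less optimal" part the commit message concedes, is that every selected block at a given query position now receives the same gradient through the shared gate rather than a per-block signal through its own keys and values.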
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.38"
+version = "0.0.39"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
