@@ -7,8 +7,6 @@
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList
 
-from colt5_attention import topk as differentiable_topk
-
 from local_attention import LocalAttention
 
 from rotary_embedding_torch import RotaryEmbedding
@@ -87,7 +85,6 @@ def __init__(
         num_compressed_mem_kv = 4,
         norm = True,
         use_diff_topk = False,
-        diff_topk_coor_descent_iters = 10.
     ):
         super().__init__()
         self.heads = heads
@@ -142,7 +139,6 @@ def __init__(
         # selection related
 
         self.use_diff_topk = use_diff_topk
-        self.diff_topk_coor_descent_iters = diff_topk_coor_descent_iters
 
         self.selection_block_size = selection_block_size
         self.num_selected_blocks = num_selected_blocks
@@ -222,12 +218,12 @@ def forward(
 
         # 2. fine attention over selected based on compressed attention logits
 
-        importance_scores = csim[..., num_mem_compress_kv:]
+        importance_scores = cattn[..., num_mem_compress_kv:]
+
+        selected_importance_values, selected_block_indices = importance_scores.topk(self.num_selected_blocks, dim = -1)
 
         if self.use_diff_topk:
-            selected_importance_values, selected_block_indices, _, gates = differentiable_topk(importance_scores, self.num_selected_blocks, fused = True)
-        else:
-            selected_importance_values, selected_block_indices = importance_scores.topk(self.num_selected_blocks, dim = -1)
+            gates = selected_importance_values + (1. - selected_importance_values).detach()
 
         fmask = selected_importance_values > mask_value
 
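Note: the new gating path above is a straight-through estimator replacing the coordinate-descent top-k from colt5_attention. A minimal standalone sketch of the trick, with an illustrative helper name and toy shapes that are not part of the repository:

import torch

def straight_through_topk_gates(importance_scores, k):
    # hard top-k selection of block importance scores
    selected_values, selected_indices = importance_scores.topk(k, dim = -1)

    # forward value of each gate is exactly 1., while the gradient of anything
    # multiplied by the gate flows back through the selected (soft) values
    gates = selected_values + (1. - selected_values).detach()
    return gates, selected_indices

# toy usage: scores of shape (batch, heads, num blocks)
scores = torch.rand(2, 4, 16, requires_grad = True)
gates, indices = straight_through_topk_gates(scores, k = 3)

gates.sum().backward()
assert torch.allclose(gates, torch.ones_like(gates))   # identity gating in the forward pass
assert scores.grad is not None                          # but gradients still reach the scores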
@@ -247,6 +243,9 @@ def forward(
 
         selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)
 
+        if self.use_diff_topk:
+            gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
+
         # handle block causal diagonal in the diagram, but run experiments without to see
 
         fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
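pad_at_dim here is a repository helper; a rough equivalent, assuming it pads the chosen dimension by (left, right) amounts with a fill value, would look like the sketch below. The shapes are toy values for illustration only.

import torch
import torch.nn.functional as F

def pad_at_dim(t, pad, dim = -1, value = 0.):
    # assumed equivalent of the repo helper, for illustration only
    dims_from_right = (t.ndim - dim - 1) if dim >= 0 else (-dim - 1)
    zeros = (0, 0) * dims_from_right
    return F.pad(t, (*zeros, *pad), value = value)

# pad the per-block gates along the sequence dimension (dim = -2) up to the
# fine-divisible length, filling with 1. so the padded positions pass through ungated
gates = torch.ones(1, 2, 7, 5)   # (batch, heads, seq, selected blocks) - toy shape
remainder = 1
gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
assert gates.shape == (1, 2, 8, 5)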
@@ -272,7 +271,7 @@ def forward(
         # handle maybe gating
 
         if self.use_diff_topk:
-            gates = F.pad(gates, (0, 1, 0, remainder), value = 1.)
+            gates = F.pad(gates, (0, 1), value = 1.)
 
             fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
             fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)
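The einx.multiply calls broadcast one gate per selected block over that block's tokens and feature dimension; the extra gate of 1. appended by F.pad presumably lines up with the block-causal diagonal block mentioned earlier in the diff, so it passes through ungated. A plain-PyTorch equivalent under assumed shapes:

import torch

b, h, i, w, j, d = 1, 2, 16, 5, 4, 32    # toy sizes, not from the repository
gates = torch.ones(b, h, i, w)           # (batch, heads, seq, selected blocks + 1)
fk = torch.randn(b, h, i, w, j, d)       # fine keys gathered per selected block

# same effect as einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
gated_fk = fk * gates[..., None, None]
assert gated_fk.shape == fk.shape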