Commit 9e884df

fix intermittent issue with triton nsa dk

1 parent: 860517c

3 files changed: +12 −7 lines

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 8 additions & 3 deletions
@@ -822,20 +822,25 @@ def backward_kernel_one_col_block_sparse(
     # ds

     ds = (p * (dp - Di[:, :, None]) * softmax_scale)
-    ds = ds.to(q.dtype)

     # block dk

-    block_dk = ds[:, :, :, None] * q[:, :, None, :]
+    block_dk = ds[:, :, :, None] * q[:, :, None, :].to(ds.dtype)
     block_dk = tl.sum(block_dk, 1)

-    tl.atomic_add(block_dk_ptrs, block_dk, mask = block_masks[:, None, None], sem = 'relaxed')
+    tl.atomic_add(
+        block_dk_ptrs,
+        block_dk,
+        mask = block_masks[:, None, None] & (blocks_offs_n[:, :, None] < seqlen_k),
+        sem = 'relaxed'
+    )

     # block dq

     ds_expanded = tl.expand_dims(ds, 2)
     ds_expanded = tl.broadcast_to(ds_expanded, (BLOCK, QUERY_HEAD_GROUPS, QUERY_EXPAND_DIM, BLOCK))
     ds_expanded = ds_expanded.reshape(BLOCK, 16, BLOCK)
+    ds_expanded = ds_expanded.to(block_k.dtype)

     block_dq = tl.dot(ds_expanded, block_k)

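The substance of the fix is in the `tl.atomic_add` for `block_dk`: its mask now also requires `blocks_offs_n[:, :, None] < seqlen_k`, so a fine block that straddles the end of the key sequence no longer accumulates gradient into rows past the true length, which is a plausible source of the intermittent `dk` mismatches when the sequence length is not a multiple of the block size. The dtype handling is also tightened: `q` is cast to `ds.dtype` before forming `block_dk`, and `ds_expanded` is cast to `block_k.dtype` before the `tl.dot`. Below is a minimal PyTorch sketch of the boundary-masked accumulation only, with illustrative names and shapes (it is not the kernel's memory layout):

import torch

def accumulate_block_dk(dk, block_dk, block_offs_n, block_mask, seqlen_k):
    # dk:           (padded_seqlen_k, dim) global key-gradient buffer
    # block_dk:     (block, dim)           this key block's contribution
    # block_offs_n: (block,)               absolute key positions of the block
    # block_mask:   (block,)               bool, rows the sparse pattern selected
    # only accumulate rows that are both selected and inside the real sequence
    valid = block_mask & (block_offs_n < seqlen_k)
    dk[block_offs_n[valid]] += block_dk[valid]
    return dk

# example: a block of 16 key rows starting at position 496 of a 507-token sequence
dk = torch.zeros(512, 64)
block_offs_n = torch.arange(496, 512)
block_mask = torch.ones(16, dtype = torch.bool)
dk = accumulate_block_dk(dk, torch.randn(16, 64), block_offs_n, block_mask, seqlen_k = 507)
assert dk[507:].abs().sum() == 0   # nothing written past the true length
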
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.65"
+version = "0.0.66"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

test_triton_nsa.py

Lines changed: 3 additions & 3 deletions
@@ -100,8 +100,8 @@ def regular_attend(

 # mock inputs

-batch = 2
-seq_len = 511
+batch = 4
+seq_len = 507
 q_heads = 4
 kv_heads = 2
 fine_block_size = 16
@@ -135,7 +135,7 @@ def regular_attend(
 assert torch.allclose(rlse, nlse, atol = 1e-2)

 assert torch.allclose(nv.grad, rv.grad, atol = 1e-2)
-assert torch.allclose(nk.grad, rk.grad, atol = 1e-2)
 assert torch.allclose(nq.grad, rq.grad, atol = 1e-2)
+assert torch.allclose(nk.grad, rk.grad, atol = 1e-2)

 print('✅ outputs and gradients are same between pytorch native sparse attn and triton native sparse attn')
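
Because the original failure was intermittent, a regression is easier to surface by sweeping the gradient comparison over several seeds and ragged sequence lengths rather than one fixed shape. The loop below is a hypothetical stress harness, not part of this repository's test file; `run_triton_nsa` and `run_reference` stand in for the two attention paths that `test_triton_nsa.py` compares, and are assumed to return the key gradient for a given sequence length:

import torch

def check_dk_close(run_triton_nsa, run_reference, seeds = (0, 1, 2, 3), seq_lens = (507, 511, 513)):
    # hypothetical callables: each returns k.grad for one configuration
    for seed in seeds:
        for seq_len in seq_lens:
            torch.manual_seed(seed)
            nk_grad = run_triton_nsa(seq_len)
            torch.manual_seed(seed)
            rk_grad = run_reference(seq_len)
            assert torch.allclose(nk_grad, rk_grad, atol = 1e-2), (seed, seq_len)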
