fix a tricky bug with lse: the lse sequence length is rounded up to a multiple of 128, but the padded positions need to remain -inf

lucidrains · lucidrains · commit 97aa4ae80372 · 2025-03-05T18:10:42.000Z
diff --git a/native_sparse_attention_pytorch/triton_native_sparse_attention.py b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
@@ -401,7 +401,7 @@ def forward_kernel_causal_and_sparse(
     # write back lse
 
     lse_i = lse_i.reshape(BLOCK, QUERY_HEAD_GROUPS)
-    tl.store(lse_ptrs, lse_i, mask = offs_m[:, None] < seqlen_q)
+    tl.store(lse_ptrs, lse_i)
 
     # write to output
 
@@ -1362,12 +1362,12 @@ def backward_kernel(
 
     D += (
         off_b * stride_D_b +
-        off_h * QUERY_HEAD_GROUPS * seqlen_q_rounded
+        off_qh * seqlen_q_rounded
     )
 
     LSE += (
         off_b * stride_lse_b +
-        off_h * QUERY_HEAD_GROUPS * seqlen_q_rounded
+        off_qh * seqlen_q_rounded
     )
 
     num_block_n = tl.cdiv(seqlen_k, BLOCK)
@@ -1719,5 +1719,5 @@ def native_sparse_attend(
 
     if not return_lse:
         return out
-
+    
     return out, lse[..., :seq_len]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.78"
+version = "0.1.0"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }