
Commit 7ebebca

fix maximum tracking in triton
1 parent e0c3bd3 commit 7ebebca

2 files changed: +3 −3 lines

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 2 additions & 2 deletions
@@ -278,7 +278,7 @@ def forward_kernel_causal_and_sparse(
 qk += tl.where(causal_mask, 0, float("-inf"))

-m_ij = tl.maximum(tl.max(qk, 2) * softmax_scale, lse_i)
+m_ij = tl.maximum(tl.max(qk, 2) * softmax_scale, m_i)
 p = tl.exp(qk * softmax_scale - m_ij[:, :, None])

 l_ij = tl.sum(p, 2)
@@ -408,7 +408,7 @@ def forward_kernel_causal_and_sparse(
 # attention

-m_ij = tl.maximum(tl.max(sel_qk, 2) * softmax_scale, lse_i)
+m_ij = tl.maximum(tl.max(sel_qk, 2) * softmax_scale, m_i)
 block_p = tl.exp(sel_qk * softmax_scale - m_ij[:, :, None])

 l_ij = tl.sum(block_p, 2)
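For context on the change: in flash-attention-style online softmax, the per-block maximum m_ij should be taken against the previous running maximum m_i rather than the running log-sum-exp lse_i, which is what this commit switches to. Below is a minimal NumPy sketch of that streaming bookkeeping for a single query row; the variable names (m_i, m_ij, l_ij, lse_i) mirror the diff, but this is an illustration under those assumptions, not the Triton kernel itself.

import numpy as np

def online_logsumexp(score_blocks):
    # streaming log-sum-exp over blocks of attention scores for one query row
    m_i = -np.inf   # running maximum seen so far
    l_i = 0.0       # running sum of exp(score - m_i)
    for qk in score_blocks:
        # the new maximum comes from the previous running max m_i, not from lse_i
        m_ij = max(qk.max(), m_i)
        p = np.exp(qk - m_ij)                    # block weights, stabilized by m_ij
        l_ij = p.sum()
        l_i = l_i * np.exp(m_i - m_ij) + l_ij    # rescale the old sum to the new max
        m_i = m_ij
    lse_i = m_i + np.log(l_i)                    # log-sum-exp over all blocks seen
    return lse_i

# sanity check: the streamed result matches a direct log-sum-exp over all scores
blocks = [np.array([0.1, 2.0]), np.array([3.0, -1.0])]
direct = np.log(np.exp(np.concatenate(blocks)).sum())
assert np.isclose(online_logsumexp(blocks), direct)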

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.2.0"
+version = "0.2.1"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
