
Commit 4290e30

last commit before a week of jury duty

1 parent: 68fd8ee

2 files changed: +9 −2 lines

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 8 additions & 1 deletion
@@ -452,6 +452,7 @@ def forward_kernel(
     Out,
     SlidingOut,
     Lse,
+    SlidingLse,
     softmax_scale,
     stride_qb,
     stride_qh,
@@ -490,9 +491,13 @@ def forward_kernel(
     if RETURN_SLIDING_OUT:
         sliding = tl.program_id(2) == 0
         out_ptr = SlidingOut if sliding else Out
+        lse_ptr = SlidingLse if sliding else Lse
+        num_sel_kv_blocks = 0 if sliding else NUM_SEL_KV_BLOCKS
     else:
         sliding = False
         out_ptr = Out
+        lse_ptr = Lse
+        num_sel_kv_blocks = NUM_SEL_KV_BLOCKS
 
     forward_kernel_causal_and_sparse(
         Q,
@@ -533,7 +538,7 @@ def forward_kernel(
         BLOCK,
         QUERY_HEAD_GROUPS,
         QUERY_EXPAND_DIM,
-        NUM_SEL_KV_BLOCKS,
+        num_sel_kv_blocks,
         INCLUDE_BLOCK_CAUSAL,
         sliding
     )
@@ -570,6 +575,7 @@ def native_sparse_attn_forward(
     seqlen_q_rounded = round_up_multiple(seqlen_q, TRITON_BLOCK_SIZE)
 
     lse = torch.empty((batch, nheads, seqlen_q_rounded), device = device, dtype = torch.float32)
+    sliding_lse = torch.empty((batch, nheads, seqlen_q_rounded), device = device, dtype = torch.float32)
 
     o = torch.empty_like(q)
     slide_o = torch.empty_like(q)
@@ -592,6 +598,7 @@ def native_sparse_attn_forward(
         o,
         slide_o,
         lse,
+        sliding_lse,
         softmax_scale,
         q.stride(0),
         q.stride(1),
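Note: with this change the sliding-window program instance writes its log-sum-exp into its own SlidingLse buffer instead of sharing Lse, and it skips the selected-KV loop entirely (num_sel_kv_blocks = 0). Keeping a per-branch LSE is what allows each branch's softmax to be renormalized later, e.g. in the backward pass or when merging partial attention outputs. A minimal PyTorch sketch of such a merge (illustrative only, not code from this commit; combine_attention_branches is a made-up name, and it assumes each branch output is already normalized by its own denominator over a disjoint set of keys):

import torch

def combine_attention_branches(o_a, lse_a, o_b, lse_b):
    # o_*:   (batch, heads, seq, dim), each normalized by its own softmax denominator
    # lse_*: (batch, heads, seq), log-sum-exp of each branch's attention logits
    lse = torch.logaddexp(lse_a, lse_b)        # combined softmax denominator, in log space
    w_a = (lse_a - lse).exp().unsqueeze(-1)    # fraction of total probability mass in branch a
    w_b = (lse_b - lse).exp().unsqueeze(-1)    # fraction of total probability mass in branch b
    return o_a * w_a + o_b * w_b, lse          # merged output and its combined lse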

test_triton_nsa.py

Lines changed: 1 addition & 1 deletion

@@ -141,7 +141,7 @@ def regular_attend(
 fine_block_size = 16
 num_sel = 6
 dim_head = 64
-fused_sliding_window = False
+fused_sliding_window = True
 block_dk_dv_use_dot = False # need sufficient shared memory, A100 works
 
 q = torch.randn(batch, q_heads, seq_len, dim_head).cuda()
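With fused_sliding_window = True the test now exercises the fused path, where the sliding-window and selected-block branches run in one kernel launch and tl.program_id(2) picks the branch (the RETURN_SLIDING_OUT check in the kernel above). A sketch of the assumed launch shape (fused_grid and its arguments are illustrative, not taken from the repo):

import triton

def fused_grid(seqlen_q, batch, nheads, block, return_sliding_out):
    # a third grid axis doubles the program count on the fused path:
    # program_id(2) == 0 -> sliding-window branch, else -> selected-block branch
    return (
        triton.cdiv(seqlen_q, block),       # query blocks
        batch * nheads,                     # (batch, head) pairs
        2 if return_sliding_out else 1,     # branch selector read via tl.program_id(2)
    )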
