
Commit 4b357af

fixes for triton pathway
1 parent 1145e37 commit 4b357af

2 files changed (+3, -3 lines changed)


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 2 additions & 2 deletions
@@ -533,7 +533,7 @@ def forward(
             assert inp.shape[1] == 1, 'input must be single tokens if inferencing with cache key values'
             return self.forward_inference(inp, cache, return_cache = return_cache)
 
-        assert not (self.causal and return_cache)
+        assert not (not self.causal and return_cache)
 
         batch, seq_len, scale, heads, device = *inp.shape[:2], self.scale, self.heads, inp.device
 
@@ -683,7 +683,7 @@ def forward(
                 selected_block_indices,
                 fmask,
                 sel_scale = gates,
-                include_block_diagonal = self.causal
+                include_block_causal = self.causal
             )
 
         elif exists(fine_selection_flex_mask):
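
The first hunk flips which combination the guard rejects: cached key / values are only useful for causal (autoregressive) decoding, so return_cache should be refused when the module is not causal, rather than when it is. Below is a minimal standalone sketch of that corrected guard logic; it is plain Python, not code from the repository, and the bare causal / return_cache variables stand in for the module attribute and the forward argument.

causal = True          # stands in for self.causal on the attention module
return_cache = True    # caller requests cached key / values for later inference steps

# old guard (pre-commit) rejected the valid causal + return_cache combination:
#   assert not (causal and return_cache)

# corrected guard: only reject return_cache when the attention is not causal
assert not (not causal and return_cache)

The second hunk appears to be a keyword rename at the fine attention call site, from include_block_diagonal to include_block_causal, presumably to match the argument name expected on the triton kernel pathway referenced in the commit message.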

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.76"
+version = "0.0.77"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
