Commit 0fad316

initial forward needs to return cache with rotated keys
1 parent b77f9d0 commit 0fad316

File tree

2 files changed: 8 additions, 7 deletions

    native_sparse_attention_pytorch/native_sparse_attention.py
    pyproject.toml

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 7 additions & 6 deletions

@@ -532,11 +532,6 @@ def forward(
 
         q, k, v = map(self.split_heads, (q, k, v))
 
-        # handle cache
-
-        if return_cache:
-            cache_kv = (k, v)
-
         # compressed key / values - variables prepended with `c` stands for compressed
 
         k_pos = repeat(self.k_intrablock_positions, 'h n d -> h (r n) d', r = num_compress_blocks)
@@ -573,7 +568,13 @@
         compressed_attn_out, csim = attend(cq, ck, cv, mask = cmask, return_sim = True)
 
         # for 2. and 3., will give them relative positions with rotary - compressed needs to be handled separately (even if they already have intra block absolute positions)
-        rotated_q, rotated_k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+        q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+        # handle cache
+
+        if return_cache:
+            cache_kv = (k, v)
 
         # 2. fine attention over selected based on compressed attention logits - variables prepended with `f` stands for the fine attention pathway
 
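
The substance of the change: the kv cache is now captured after `rotate_queries_with_cached_keys`, so the keys handed back when `return_cache = True` already carry their rotary positions, and a later decoding step only has to rotate the tokens it adds. Below is a minimal sketch of that bookkeeping, not the library's API: `toy_rotate` is a stand-in rather than real RoPE, `toy_step` is a hypothetical prefill/decode step rather than the actual `forward`, and causal masking is omitted.

import torch

def toy_rotate(t, offset = 0):
    # stand-in for a rotary embedding: add a position-dependent shift so that
    # "rotated at position p" is distinguishable from "unrotated" (not real RoPE)
    seq_len = t.shape[-2]
    positions = torch.arange(offset, offset + seq_len, dtype = t.dtype).unsqueeze(-1)
    return t + positions

def toy_step(q, k, v, cache = None):
    # hypothetical prefill / decode step returning a cache of *rotated* keys,
    # mirroring the moved `cache_kv = (k, v)` in this commit
    if cache is not None:
        cached_k, cached_v = cache
        offset = cached_k.shape[-2]
    else:
        cached_k = cached_v = None
        offset = 0

    # rotate only the newly arrived queries / keys, at their absolute positions
    q = toy_rotate(q, offset)
    k = toy_rotate(k, offset)

    if cached_k is not None:
        k = torch.cat((cached_k, k), dim = -2)
        v = torch.cat((cached_v, v), dim = -2)

    # the cache is captured *after* rotation, so the next step can append its
    # own rotated keys without touching the old ones again
    cache = (k, v)

    attn = (q @ k.transpose(-1, -2)).softmax(dim = -1)  # masking omitted for brevity
    return attn @ v, cache

# prefill on 4 tokens, then decode 1 more token reusing the cache
q = k = v = torch.randn(1, 4, 8)
out, cache = toy_step(q, k, v)

q1 = k1 = v1 = torch.randn(1, 1, 8)
out1, cache = toy_step(q1, k1, v1, cache)

The design point is that re-rotating previously cached keys on the next call would assign them new absolute positions; storing them post-rotation keeps their positions stable across decoding steps.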

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.71"
+version = "0.0.72"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
