Skip to content

Commit f4d28f8

Browse files
committed
resolve mem_ck and mem_cv not being used during inference
1 parent e901e73 commit f4d28f8

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,8 +419,17 @@ def forward_inference(
419419
ck = cache_ck
420420
cv = cache_cv
421421

422-
repeated_ck = repeat(ck, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
423-
repeated_cv = repeat(cv, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
422+
ck_for_attn = cache_ck
423+
cv_for_attn = cache_cv
424+
425+
if not is_empty(ck):
426+
mem_ck, mem_cv = repeat(self.compress_mem_kv, 'kv ... -> kv b ...', b = batch)
427+
428+
ck_for_attn = cat((mem_ck, ck_for_attn), dim = -2)
429+
cv_for_attn = cat((mem_cv, cv_for_attn), dim = -2)
430+
431+
repeated_ck = repeat(ck_for_attn, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
432+
repeated_cv = repeat(cv_for_attn, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
424433

425434
csim = einsum(q, repeated_ck, 'b h i d, b h j d -> b h i j') * scale
426435
cattn = csim.softmax(dim = -1)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "native-sparse-attention-pytorch"
3-
version = "0.1.16"
3+
version = "0.1.18"
44
description = "Native Sparse Attention"
55
authors = [
66
{ name = "Phil Wang", email = "lucidrains@gmail.com" }

0 commit comments

Comments
 (0)