@@ -315,13 +315,121 @@ def __init__(

        self.combine_heads = nn.Linear(dim_inner, dim, bias = False)

+    def forward_inference(
+        self,
+        inp,
+        cache,
+        return_cache = True
+    ):
+        # destruct cache
+
+        (cache_k, cache_v), (cache_ck, cache_cv) = cache
+
+        # variables
+
+        batch, scale, heads, device = inp.shape[0], self.scale, self.heads, inp.device
+        seq_len = cache_k.shape[-2] + 1
+
+        sliding_window = self.sliding_window_size
+        compress_divisible_seq_len = round_down_mult(seq_len, self.compress_block_size)
+        num_compress_blocks = compress_divisible_seq_len // self.compress_block_size
+
+        fine_divisible_seq_len = round_up_mult(seq_len, self.selection_block_size)
+        num_fine_blocks = fine_divisible_seq_len // self.selection_block_size
+
+        # maybe prenorm
+
+        inp = self.norm(inp)
+
+        # queries, keys, values
+
+        q, k, v = self.to_qkv(inp).split(self.qkv_split, dim = -1)
+
+        q, k, v = map(self.split_heads, (q, k, v))
+
+        # handle cache
+
+        k = cat((cache_k, k), dim = -2)
+        v = cat((cache_v, v), dim = -2)
+
+        if return_cache:
+            cache_kv = (k, v)
+
+        # 1. compressed attn inference
+
+        cq = q
+        ck = cache_ck
+        cv = cache_cv
+
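+        # a new compressed key / value pair is appended only when the cached length plus the incoming token completes another full compress block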
+        if divisible_by(seq_len, self.compress_block_size):
+            k_compress_input = self.split_compress_window(k[..., -self.compress_block_size:, :] + self.k_intrablock_positions)
+            v_compress_input = self.split_compress_window(v[..., -self.compress_block_size:, :] + self.v_intrablock_positions)
+
+            next_ck = self.k_compress(k_compress_input)
+            next_cv = self.v_compress(v_compress_input)
+
+            ck = cat((ck, next_ck), dim = -2)
+            cv = cat((cv, next_cv), dim = -2)
+
+        if return_cache:
+            cache_compressed_kv = (ck, cv)
+
+        ck = repeat(ck, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
+        cv = repeat(cv, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
+
+        csim = einsum(q, ck, 'b h i d, b h j d -> b h i j') * scale
+        cattn = csim.softmax(dim = -1)
+
+        compressed_attn_out = einsum(cattn, cv, 'b h i j, b h j d -> b h i d')
+
+        # 2. fine attention inference (todo)
+
+        # not implemented
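+        # for now the sliding window branch output also fills the fine branch slot when the three strategies are combined below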
+
+        # 3. sliding window
+
+        k = repeat(k, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
+        v = repeat(v, 'b h ... -> b (h gh) ...', gh = self.num_grouped_queries)
+
+        sliding_slice = (Ellipsis, slice(-(sliding_window + 1), None), slice(None))
+        rotated_q, rotated_k = self.rotary_emb.rotate_queries_with_cached_keys(q, k[sliding_slice])
+
+        sim = einsum(rotated_q, rotated_k, 'b h i d, b h j d -> b h i j') * scale
+        attn = sim.softmax(dim = -1)
+        sliding_window_attn_out = einsum(attn, v[sliding_slice], 'b h i j, b h j d -> b h i d')
+
+        # combine strategies
+
+        strategy_weighted_combine = self.to_strategy_combine(inp)
+
+        out = einsum(strategy_weighted_combine, stack([compressed_attn_out, sliding_window_attn_out, sliding_window_attn_out]), 'b h n s, s b h n d -> b h n d')
+
+        # merge heads and combine them
+
+        out = self.merge_heads(out)
+
+        out = self.combine_heads(out)
+
+        if not return_cache:
+            return out
+
+        return out, (cache_kv, cache_compressed_kv)
+
    def forward(
        self,
        inp,
+        cache = None,
        disable_triton_kernel = False,
        sliding_window_flex_mask = None,
-        fine_selection_flex_mask = None
+        fine_selection_flex_mask = None,
+        return_cache = False
    ):
+        is_inferencing = exists(cache)
+
+        if is_inferencing:
+            assert inp.shape[1] == 1, 'input must be single tokens if inferencing with cache key values'
+            return self.forward_inference(inp, cache, return_cache = return_cache)
+
        batch, seq_len, scale, heads, device = *inp.shape[:2], self.scale, self.heads, inp.device

        compress_divisible_seq_len = round_down_mult(seq_len, self.compress_block_size)
@@ -340,6 +448,11 @@ def forward(

        q, k, v = map(self.split_heads, (q, k, v))

+        # handle cache
+
+        if return_cache:
+            cache_kv = (k, v)
+
        # compressed key / values - variables prepended with `c` stands for compressed

        k_pos = repeat(self.k_intrablock_positions, 'h n d -> h (r n) d', r = num_compress_blocks)
@@ -352,6 +465,9 @@ def forward(
        ck = self.k_compress(k_compress_input) # Equation (7) of the Native Sparse Attention paper
        cv = self.v_compress(v_compress_input)

+        if return_cache:
+            cache_compressed_kv = (ck, cv)
+
        # 1. coarse attention over compressed

        mem_ck, mem_cv = repeat(self.compress_mem_kv, 'kv ... -> kv b ...', b = batch)
@@ -570,4 +686,9 @@ def forward(

        out = self.merge_heads(out)

-        return self.combine_heads(out)
+        out = self.combine_heads(out)
+
+        if not return_cache:
+            return out
+
+        return out, (cache_kv, cache_compressed_kv)
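For reference, a minimal sketch of how the cache round-trip introduced in this diff could be driven for incremental decoding. The import path, class name, and hyperparameters below are assumptions for illustration, not taken from the diff, and the fine selection branch is still a todo during cached inference:

import torch
from native_sparse_attention_pytorch import SparseAttention  # assumed import path / class name

attn = SparseAttention(
    dim = 512,
    dim_head = 64,
    heads = 8,
    sliding_window_size = 32,
    compress_block_size = 16,
    selection_block_size = 16,
    num_selected_blocks = 4
)

prompt = torch.randn(1, 128, 512)

# prime the cache with one full forward pass over the prompt
out, cache = attn(prompt, return_cache = True)

# then feed back single tokens along with the cache - forward() asserts a length of 1 and dispatches to forward_inference()
for _ in range(16):
    next_token = torch.randn(1, 1, 512)
    out, cache = attn(next_token, cache = cache, return_cache = True)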