
Commit 232f4eb

wire up flex attention for sliding windows
1 parent f77444e

4 files changed (+62, −11 lines)

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 8 additions & 2 deletions
@@ -118,6 +118,8 @@ def __init__(
            autopad = True
        )

+        self.sliding_window_size = sliding_window_size
+
        # compress strategy

        self.compress_block_size = compress_block_size
@@ -174,7 +176,8 @@ def __init__(

    def forward(
        self,
-        inp
+        inp,
+        sliding_window_flex_mask = None
    ):
        batch, seq_len, scale, heads, device = *inp.shape[:2], self.scale, self.heads, inp.device

@@ -315,7 +318,10 @@ def forward(

        # 3. overlapping sliding window, this is unsurprising and expected

-        sliding_window_attn_out = self.sliding_window(q, k, v)
+        if exists(sliding_window_flex_mask):
+            sliding_window_attn_out = flex_attention(q, k, v, block_mask = sliding_window_flex_mask)
+        else:
+            sliding_window_attn_out = self.sliding_window(q, k, v)

        # combine strategies
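The flex attention branch above expects a sliding window BlockMask. The `create_sliding_mask` helper it relies on (imported in transformer.py below) sits outside these hunks, so the following is only a minimal sketch of how such a mask could be built, assuming the standard flex attention recipe from https://pytorch.org/blog/flexattention/. The signature and the device argument here are illustrative, not the package's own implementation.

import torch
from torch.nn.attention.flex_attention import flex_attention, create_block_mask

def create_sliding_mask(seq_len, window_size, device = 'cuda'):
    # keep key/value positions that are causal and at most `window_size` tokens behind the query
    def sliding_mask(b, h, q_idx, kv_idx):
        causal = q_idx >= kv_idx
        windowed = (q_idx - kv_idx) <= window_size
        return causal & windowed

    # B = None and H = None broadcast the same mask over batch and heads
    return create_block_mask(sliding_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len, device = device)

if torch.cuda.is_available():
    # q, k, v are (batch, heads, seq_len, dim_head), the layout flex_attention expects
    q = k = v = torch.randn(1, 8, 512, 64, device = 'cuda')

    block_mask = create_sliding_mask(512, window_size = 32)
    out = flex_attention(q, k, v, block_mask = block_mask)   # (1, 8, 512, 64)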

native_sparse_attention_pytorch/transformer.py

Lines changed: 46 additions & 3 deletions
@@ -8,7 +8,19 @@

 from rotary_embedding_torch import RotaryEmbedding

-from native_sparse_attention_pytorch.native_sparse_attention import SparseAttention
+from native_sparse_attention_pytorch.native_sparse_attention import SparseAttention, create_sliding_mask
+
+# flex attention
+# https://pytorch.org/blog/flexattention/
+
+flex_attention = None
+
+try:
+    from torch.nn.attention.flex_attention import flex_attention, create_block_mask
+    if torch.cuda.is_available():
+        flex_attention = torch.compile(flex_attention)
+except ImportError:
+    pass

 # functions

@@ -96,6 +108,7 @@ def __init__(
        heads = 8,
        ff_expansion_factor = 4.,
        use_sparse_attn = False,
+        use_flex_sliding_window = False,
        sparse_attn_kwargs: dict = dict(
            sliding_window_size = 32,
            compress_block_size = 4,
@@ -106,6 +119,12 @@ def __init__(
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)

+        if use_flex_sliding_window:
+            assert exists(flex_attention), 'flex attention is not available on your current version of pytorch'
+
+        self.use_sparse_attn = use_sparse_attn
+        self.use_flex_sliding_window = use_flex_sliding_window
+
        layers = []
        for _ in range(depth):

@@ -123,6 +142,8 @@ def __init__(

            layers.append(ModuleList([attn, ff]))

+        self.attn_sliding_window_size = attn.sliding_window_size
+
        self.layers = ModuleList(layers)

        self.norm = RMSNorm(dim)
@@ -131,15 +152,37 @@
    def forward(
        self,
        ids,
-        return_loss = False
+        return_loss = False,
+        disable_flex = False
    ):
        if return_loss:
            ids, labels = ids[:, :-1], ids[:, 1:]

+        seq_len = ids.shape[-1]
+
+        # token embedding
+
        tokens = self.token_emb(ids)

+        # prepare maybe flex attention masks
+
+        attn_kwargs = dict()
+
+        if not disable_flex and self.use_sparse_attn and self.use_flex_sliding_window:
+
+            attn_kwargs.update(
+                sliding_window_flex_mask = create_sliding_mask(seq_len, self.attn_sliding_window_size)
+            )
+
+        # layers
+
        for attn, ff in self.layers:
-            tokens = attn(tokens) + tokens
+            attn_out = attn(
+                tokens,
+                **attn_kwargs
+            )
+
+            tokens = attn_out + tokens
            tokens = ff(tokens) + tokens

        embed = self.norm(tokens)
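Put together, the transformer can now route its sliding window branch through flex attention. A hypothetical usage sketch follows; the class name `Transformer`, its import path, and the default `sparse_attn_kwargs` are assumed from the surrounding repository code rather than shown in this diff, and the example only takes the flex path when a CUDA device is available.

import torch
from native_sparse_attention_pytorch.transformer import Transformer  # assumed class name / import path

model = Transformer(
    num_tokens = 256,
    dim = 512,
    depth = 2,
    use_sparse_attn = True,
    use_flex_sliding_window = True   # asserts that flex attention imported successfully
)

ids = torch.randint(0, 256, (1, 512))

if torch.cuda.is_available():
    model, ids = model.cuda(), ids.cuda()
    logits = model(ids)                       # sliding window branch goes through the flex attention path
else:
    logits = model(ids, disable_flex = True)  # falls back to the local attention sliding window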

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.12"
+version = "0.0.14"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -27,7 +27,7 @@ dependencies = [
     "einops>=0.8.0",
     "local-attention>=1.11.1",
     "rotary-embedding-torch",
-    "torch>=2.2",
+    "torch>=2.5",
 ]

 [project.urls]

train.py

Lines changed: 6 additions & 4 deletions
@@ -1,7 +1,7 @@
 import math
 import gzip
 import random
-import tqdm
+from tqdm import tqdm
 import numpy as np

 import torch
@@ -76,8 +76,9 @@ def base_decoding(
    prompt_seq_len, out = prompt.shape[-1], prompt.clone()
    sample_num_times = max(0, seq_len - prompt_seq_len)

-    for _ in range(sample_num_times):
-        logits = net(out)
+    for _ in tqdm(range(sample_num_times)):
+        logits = net(out, disable_flex = True)
+
        logits = logits[:, -1]
        logits = top_k(logits, thres = filter_thres)
        sample = gumbel_sample(logits, temperature = temperature, dim = -1)
@@ -93,6 +94,7 @@ def base_decoding(
    dim = 512,
    depth = 6,
    use_sparse_attn = USE_SPARSE_ATTN,
+    use_flex_sliding_window = True,
    sparse_attn_kwargs = dict(
        sliding_window_size = 32,
        compress_block_size = 32,
@@ -144,7 +146,7 @@ def __getitem__(self, index):

 # training

-for i in tqdm.tqdm(range(NUM_BATCHES), mininterval = 10.0, desc = "training"):
+for i in tqdm(range(NUM_BATCHES), mininterval = 10.0, desc = "training"):
    model.train()

    for _ in range(GRAD_ACCUM_EVERY):
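Note that sampling in base_decoding passes disable_flex = True, presumably because the block mask is rebuilt from the current sequence length on every forward, so token-by-token decoding would otherwise reconstruct it at each step. As a standalone sanity check, not part of this commit, one can confirm that flex attention under a sliding window block mask matches ordinary scaled dot product attention under the equivalent dense boolean mask:

import torch
import torch.nn.functional as F
from torch.nn.attention.flex_attention import flex_attention, create_block_mask

if torch.cuda.is_available():
    seq_len, window_size = 128, 32
    q, k, v = (torch.randn(1, 4, seq_len, 64, device = 'cuda') for _ in range(3))

    def sliding_mask(b, h, q_idx, kv_idx):
        return (q_idx >= kv_idx) & ((q_idx - kv_idx) <= window_size)

    block_mask = create_block_mask(sliding_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len, device = 'cuda')
    flex_out = flex_attention(q, k, v, block_mask = block_mask)

    # same sliding window, expressed as a dense boolean mask for F.scaled_dot_product_attention
    i = torch.arange(seq_len, device = 'cuda')
    bool_mask = (i[:, None] >= i[None, :]) & ((i[:, None] - i[None, :]) <= window_size)
    sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask = bool_mask)

    assert torch.allclose(flex_out, sdpa_out, atol = 1e-4)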
