
Commit 5a61460

fix an issue with mask, make sure it converges for enwik8

1 parent 949e716

File tree: 7 files changed (+372 −5 lines)

README.md
Lines changed: 14 additions & 0 deletions

````diff
@@ -33,6 +33,20 @@ attended = attn(tokens)
 assert tokens.shape == attended.shape
 ```
 
+## Example
+
+Enwik8 language modeling
+
+```bash
+$ pip install .[examples]
+```
+
+Then
+
+```bash
+$ python train.py
+```
+
 ## Citations
 
 ```bibtex
````

data/README.md
Lines changed: 3 additions & 0 deletions (new file)

```markdown
# Data source

The enwik8 data was downloaded from the Hutter prize page: http://prize.hutter1.net/
```

data/enwik8.gz
34.9 MB binary file (contents not shown)
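The training script that consumes this archive is not part of this view, so purely as an illustration, here is one common way to read enwik8 into byte-level tensors. The 95M read length and the 90M/5M train/validation split below are assumptions, not taken from the commit:

```python
import gzip

import numpy as np
import torch

# decompress the archive and view it as raw bytes (enwik8 is modeled byte-level)
with gzip.open('./data/enwik8.gz', 'rb') as f:
    data = np.frombuffer(f.read(int(95e6)), dtype = np.uint8).copy()

# illustrative split: first 90M bytes for training, remainder for validation
train_np, valid_np = np.split(data, [int(90e6)])
train_data = torch.from_numpy(train_np)
valid_data = torch.from_numpy(valid_np)

print(train_data.shape, valid_data.shape)  # torch.Size([90000000]) torch.Size([5000000])
```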

native_sparse_attention_pytorch/native_sparse_attention.py
Lines changed: 10 additions & 3 deletions

```diff
@@ -9,6 +9,8 @@
 
 from local_attention import LocalAttention
 
+from rotary_embedding_torch import RotaryEmbedding
+
 # einstein notation
 
 import einx
@@ -92,6 +94,10 @@ def __init__(
 
         self.norm = nn.RMSNorm(dim) if norm else nn.Identity()
 
+        # rotary
+
+        self.rotary_emb = RotaryEmbedding(dim_head)
+
         # qkv
 
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias = False)
@@ -193,14 +199,14 @@ def forward(
 
         cq_seq = arange(seq_len, device = device)
 
-        ck_seq = ((arange(num_compress_blocks) + 1) * self.compress_block_size) - 1
+        ck_seq = ((arange(num_compress_blocks, device = device) + 1) * self.compress_block_size) - 1
         ck_seq = F.pad(ck_seq, (num_mem_compress_kv, 0), value = -1)
 
         cmask = einx.less('j, i -> i j', ck_seq, cq_seq)
 
         mask_value = -torch.finfo(csim.dtype).max
 
-        csim = csim.masked_fill(cmask, mask_value)
+        csim = csim.masked_fill(~cmask, mask_value)
 
         cattn = csim.softmax(dim = -1)
 
@@ -218,6 +224,7 @@ def forward(
         fk = k
         fv = v
 
+        fq, fk = self.rotary_emb.rotate_queries_with_cached_keys(fq, fk)
 
         if seq_len < fine_divisible_seq_len:
             remainder = fine_divisible_seq_len - seq_len
@@ -255,7 +262,7 @@ def forward(
 
         fsim = einsum(fq, fk, 'b h i d, b h i j d -> b h i j') * self.scale
 
-        fsim = fsim.masked_fill(fmask, mask_value)
+        fsim = fsim.masked_fill(~fmask, mask_value)
 
         fattn = fsim.softmax(dim = -1)
 
```
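The substance of the fix is the flipped fill condition: `cmask` and `fmask` are built so that `True` marks positions a query may attend to, so it is the complement (`~cmask`, `~fmask`) that must be filled with the large negative value before the softmax; the earlier code masked the visible positions instead. The `arange(..., device = device)` change in the same hunk keeps `ck_seq` on the same device as the queries. A minimal sketch of this masking convention in plain `torch`, with illustrative tensor names not taken from the module:

```python
import torch

# toy attention scores: 4 queries over 6 keys
sim = torch.randn(4, 6)

# allowed[i, j] is True where query i may look at key j (simple causal pattern here)
allowed = torch.arange(6)[None, :] <= torch.arange(4)[:, None]

mask_value = -torch.finfo(sim.dtype).max

# fill the *disallowed* positions, i.e. the complement of the mask
sim = sim.masked_fill(~allowed, mask_value)

attn = sim.softmax(dim = -1)

# disallowed positions now receive (near) zero attention weight
assert attn.masked_select(~allowed).max() < 1e-6
```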

New file
Lines changed: 152 additions & 0 deletions

```python
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Module, ModuleList, Linear, RMSNorm

from einops import rearrange
from einops.layers.torch import Rearrange

from rotary_embedding_torch import RotaryEmbedding

from native_sparse_attention_pytorch.native_sparse_attention import SparseAttention

# functions

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

# attention

class Attention(Module):
    def __init__(
        self,
        dim,
        dim_head = 64,
        heads = 8
    ):
        super().__init__()
        self.norm = RMSNorm(dim)

        self.heads = heads
        dim_inner = heads * dim_head

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.to_q = nn.Linear(dim, dim_inner, bias = False)
        self.to_k = nn.Linear(dim, dim_inner, bias = False)
        self.to_v = nn.Linear(dim, dim_inner, bias = False)

        self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)
        self.merge_heads = Rearrange('b h n d -> b n (h d)')

        self.to_out = nn.Linear(dim_inner, dim, bias = False)

    def forward(
        self,
        x
    ):

        x = self.norm(x)

        q = self.to_q(x)
        k = self.to_k(x)
        v = self.to_v(x)

        q, k, v = map(self.split_heads, (q, k, v))

        # relative positions

        q, k = self.rotary_embed.rotate_queries_with_cached_keys(q, k)

        # attention branch

        out = F.scaled_dot_product_attention(
            q, k, v,
            is_causal = True
        )

        out = self.merge_heads(out)

        return self.to_out(out)

# feedforward

def FeedForward(dim, expansion_factor = 4.):
    dim_hidden = int(dim * expansion_factor)

    return nn.Sequential(
        RMSNorm(dim),
        Linear(dim, dim_hidden),
        nn.GELU(),
        Linear(dim_hidden, dim)
    )

# classes

class Transformer(Module):
    def __init__(
        self,
        num_tokens,
        dim,
        depth,
        dim_head = 64,
        heads = 8,
        ff_expansion_factor = 4.,
        use_sparse_attn = False,
        sparse_attn_kwargs: dict = dict(
            sliding_window_size = 32,
            compress_block_size = 4,
            selection_block_size = 4,
            num_selected_blocks = 4,
        )
    ):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)

        layers = []
        for _ in range(depth):

            if use_sparse_attn:
                attn = SparseAttention(
                    dim = dim,
                    dim_head = dim_head,
                    heads = heads,
                    **sparse_attn_kwargs
                )
            else:
                attn = Attention(dim = dim, dim_head = dim_head, heads = heads)

            ff = FeedForward(dim = dim, expansion_factor = ff_expansion_factor)

            layers.append(ModuleList([attn, ff]))

        self.layers = ModuleList(layers)

        self.norm = RMSNorm(dim)
        self.to_logits = Linear(dim, num_tokens, bias = False)

    def forward(
        self,
        ids,
        return_loss = False
    ):
        if return_loss:
            ids, labels = ids[:, :-1], ids[:, 1:]

        tokens = self.token_emb(ids)

        for attn, ff in self.layers:
            tokens = attn(tokens) + tokens
            tokens = ff(tokens) + tokens

        embed = self.norm(tokens)

        logits = self.to_logits(embed)

        if not return_loss:
            return logits

        return F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
```
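A quick usage sketch for the Transformer defined above. The dimensions and batch shape below are illustrative, and since the file name is not shown in this view, the classes are assumed to already be in scope rather than imported:

```python
import torch

# byte-level vocab (e.g. for enwik8), small illustrative dimensions
model = Transformer(
    num_tokens = 256,
    dim = 512,
    depth = 6,
    use_sparse_attn = True   # use SparseAttention blocks instead of full attention
)

ids = torch.randint(0, 256, (2, 512))  # (batch, seq) of byte ids

loss = model(ids, return_loss = True)  # internally shifts ids to form input / target pairs
loss.backward()

logits = model(ids)                    # (2, 512, 256) when return_loss = False
```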

pyproject.toml
Lines changed: 7 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.2"
+version = "0.0.3"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -26,6 +26,7 @@ dependencies = [
     "einx>=0.3.0",
     "einops>=0.8.0",
     "local-attention>=1.11.1",
+    "rotary-embedding-torch",
     "torch>=2.2",
 ]
 
@@ -34,7 +35,11 @@ Homepage = "https://pypi.org/project/native-sparse-attention-pytorch/"
 Repository = "https://github.com/lucidrains/native-sparse-attention-pytorch"
 
 [project.optional-dependencies]
-examples = []
+
+examples = [
+    "tqdm",
+    "wandb"
+]
 test = [
     "pytest"
 ]
```
