
Commit 56586dd

release new compression block hparams

1 parent 832050a

4 files changed: +11 −11 lines

README.md

Lines changed: 2 additions & 0 deletions

````diff
@@ -16,6 +16,8 @@ This will be my last open sourced project under Meta
 
 - <a href="https://github.com/Pasewark">Eric Pasewark</a> for submitting a simple transformer based compression network
 
+- <a href="https://github.com/Mr-Grin">@Mr-Grin</a> for a pull request that fixes compression block hyperparameters
+
 ## Install
 
 ```bash
````

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -439,7 +439,7 @@ def forward_inference(
         if return_cache:
             cache_compressed_kv = ((ck, cv), (run_k, run_v))
 
-        # 2. fine attention inference (todo - compress and fine diff block sizes)
+        # 2. fine attention inference
 
         importance_scores = csim[..., self.num_mem_compress_kv:]
 
@@ -628,7 +628,7 @@ def forward(
         # compressed masking
 
         cmask = None
-        # TODO
+
         if self.causal:
             cq_seq = arange(seq_len, device = device)
             ck_seq = ((arange(num_compress_blocks, device = device) + 1) * self.compress_block_sliding_stride) - 1
```
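The two index sequences in the second hunk enumerate query positions and the last source position covered by each compressed block. Below is a minimal standalone sketch of how a causal mask over compressed blocks could be derived from such sequences; the `>=` comparison, the concrete `seq_len` and stride values, and the use of plain tensors instead of the module's internals are assumptions for illustration, not the library's exact masking code.

```python
from torch import arange

# Hypothetical standalone illustration, not the library's code: with a sliding
# compression stride S, the i-th compressed block covers source positions up to
# (i + 1) * S - 1, so a causal query at position q may only attend to compressed
# blocks that end at or before q.
seq_len = 8
compress_block_sliding_stride = 4     # assumed value for illustration
num_compress_blocks = seq_len // compress_block_sliding_stride
device = 'cpu'

cq_seq = arange(seq_len, device = device)
ck_seq = ((arange(num_compress_blocks, device = device) + 1) * compress_block_sliding_stride) - 1

# cmask[q, k] is True where query position q is allowed to attend to compressed block k
cmask = cq_seq[:, None] >= ck_seq[None, :]
print(cmask)
```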

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.1.27"
+version = "0.2.0"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
```

tests/test_sparse_attn.py

Lines changed: 6 additions & 8 deletions

```diff
@@ -6,8 +6,6 @@
 
 from native_sparse_attention_pytorch import SparseAttention
 
-device = 'cpu'
-
 @pytest.mark.parametrize('use_diff_topk', (False, True))
 @pytest.mark.parametrize('causal', (False, True))
 @pytest.mark.parametrize('seq_len', (1, 4, 31, 32, 120))
@@ -41,9 +39,9 @@ def test_sparse_attn(
         num_selected_blocks = num_selected_block,
         use_diff_topk = use_diff_topk,
         query_heads_share_selected_kv = query_heads_share_selected_kv,
-    ).to(device)
+    )
 
-    tokens = torch.randn(2, seq_len, 512).to(device)
+    tokens = torch.randn(2, seq_len, 512)
 
     attended = attn(tokens)
 
@@ -70,9 +68,9 @@ def test_inference(
         selection_block_size = selection_block_size,
         num_selected_blocks = num_selected_blocks,
         compress_block_sliding_stride = compress_block_sliding_stride
-    ).to(device)
+    )
 
-    tokens = torch.randn(2, seq_len, 512).to(device)
+    tokens = torch.randn(2, seq_len, 512)
 
     parallel_out = attn(tokens)
 
@@ -106,9 +104,9 @@ def test_transformer_inference(
             selection_block_size = selection_block_size,
             num_selected_blocks = 2
         )
-    ).to(device)
+    )
 
-    prompt = torch.randint(0, 256, (1, 1)).to(device)
+    prompt = torch.randint(0, 256, (1, 1))
 
     sampled = model.sample(prompt, 128, temperature = 0., use_cache_kv = False)
     sampled_cached = model.sample(prompt, 128, temperature = 0., use_cache_kv = True)
```
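For reference, the updated tests construct `SparseAttention` and run it on the default device with no explicit `.to(device)` calls. Below is a minimal usage sketch modeled on those tests; the constructor arguments not visible in this diff (`dim`, `dim_head`, `heads`, `sliding_window_size`, `compress_block_size`) and all concrete values are assumptions drawn from the project README, not part of this commit.

```python
import torch
from native_sparse_attention_pytorch import SparseAttention

# assumed hyperparameter values - only selection_block_size, num_selected_blocks,
# and compress_block_sliding_stride appear in this commit's diff
attn = SparseAttention(
    dim = 512,                            # matches the (2, seq_len, 512) tokens used in the tests
    dim_head = 64,
    heads = 8,
    sliding_window_size = 2,
    compress_block_size = 4,
    compress_block_sliding_stride = 4,    # one of the new compression block hparams
    selection_block_size = 4,
    num_selected_blocks = 2
)

tokens = torch.randn(2, 32, 512)          # no .to(device), as in the updated tests

attended = attn(tokens)
assert attended.shape == tokens.shape
```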
