add test for gdn

v0i0 · v0i0 · commit ae2b1b365752 · 2025-11-20T10:31:11.000-08:00
diff --git a/test/test_examples.expected b/test/test_examples.expected
@@ -1708,6 +1708,130 @@ def fused_linear_jsd_kernel(beta: float, ignore_index: int, temperature: float,
     # src[fused_linear_jsd.py:N]: return (loss / student_logits.shape[0]).sum()
     return (loss / student_logits.shape[0]).sum()
 
+--- assertExpectedJournal(TestExamples.test_gdn_fwd_h)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_helion_gdn_fwd_h(h, w, u, g, k, _BLOCK_SIZE_0: tl.constexpr, _RDIM_SIZE_4: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
+    # src[gdn_fwd_h.py:N]: for tile_b, tile_h, tile_v in hl.tile(
+    # src[gdn_fwd_h.py:N]:     [batch, nheads, dstate], block_size=[1, 1, block_v]
+    # src[gdn_fwd_h.py:N]: ):
+    num_blocks_0 = 8
+    num_blocks_1 = 80
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
+    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
+    offset_1 = pid_0
+    offset_2 = pid_1
+    offset_0 = pid_2 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    indices_5 = tl.arange(0, _RDIM_SIZE_4).to(tl.int32)
+    # src[gdn_fwd_h.py:N]: b_h = hl.zeros([dhead, tile_v], dtype=acc_dtype)
+    b_h = tl.full([64, _BLOCK_SIZE_0], 0.0, tl.float32)
+    # src[gdn_fwd_h.py:N]: for t_i in hl.tile(seqlen, block_size=chunk_size):
+    # src[gdn_fwd_h.py:N]:     h[tile_b.begin, t_i.id, tile_h.begin, :, tile_v] = b_h.to(dtype)
+    # src[gdn_fwd_h.py:N]:     b_w = w[tile_b.begin, t_i, tile_h.begin, :]
+    # src[gdn_fwd_h.py:N-N]: ...
+    for offset_4 in tl.range(0, 4096, _BLOCK_SIZE_3):
+        indices_4 = offset_4 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
+        b_h_copy = b_h
+        b_h_copy_0 = b_h_copy
+        # src[gdn_fwd_h.py:N]: h[tile_b.begin, t_i.id, tile_h.begin, :, tile_v] = b_h.to(dtype)
+        v_0 = tl.cast(b_h_copy_0, tl.bfloat16)
+        tile_id = offset_4 // _BLOCK_SIZE_3
+        tl.store(h + (offset_1 * 10485760 + tile_id * 655360 + offset_2 * 8192 + indices_5[:, None] * 128 + indices_0[None, :] * 1), v_0, None)
+        # src[gdn_fwd_h.py:N]: b_w = w[tile_b.begin, t_i, tile_h.begin, :]
+        b_w = tl.load(w + (offset_1 * 20971520 + indices_4[:, None] * 5120 + offset_2 * 64 + indices_5[None, :] * 1), None)
+        # src[gdn_fwd_h.py:N]: c_h = b_h.to(dtype)
+        v_1 = tl.cast(b_h_copy_0, tl.bfloat16)
+        # src[gdn_fwd_h.py:N]: b_v = hl.dot(b_w, c_h, out_dtype=acc_dtype)
+        b_v = tl.dot(tl.cast(b_w, tl.bfloat16), tl.cast(v_1, tl.bfloat16), input_precision='tf32', out_dtype=tl.float32)
+        # src[gdn_fwd_h.py:N]: p_v = u[tile_b.begin, t_i, tile_h.begin, tile_v].to(acc_dtype)
+        load_1 = tl.load(u + (offset_1 * 41943040 + indices_4[:, None] * 10240 + offset_2 * 128 + indices_0[None, :] * 1), None)
+        v_2 = tl.cast(load_1, tl.float32)
+        # src[gdn_fwd_h.py:N]: b_v = p_v - b_v
+        v_3 = v_2 - b_v
+        # src[gdn_fwd_h.py:N]: m_t = t_i.index < seqlen
+        v_4 = tl.full([], 4096, tl.int32)
+        v_5 = indices_4 < v_4
+        # src[gdn_fwd_h.py:N]: t_i_last = min(t_i.begin + chunk_size, seqlen) - 1
+        sub_1 = -1 + (4096 * (4096 <= 256 + offset_4) + (256 + offset_4) * (256 + offset_4 < 4096))
+        # src[gdn_fwd_h.py:N]: b_g_last = g[tile_b.begin, t_i_last, tile_h.begin].to(acc_dtype)
+        b_g_last = tl.load(g + (offset_1 * 327680 + sub_1 * 80 + offset_2 * 1), None)
+        # src[gdn_fwd_h.py:N]: b_g = g[tile_b.begin, t_i, tile_h.begin].to(acc_dtype)
+        b_g = tl.load(g + (offset_1 * 327680 + indices_4 * 80 + offset_2 * 1), None)
+        # src[gdn_fwd_h.py:N]: b_v *= torch.where(m_t, torch.exp(b_g_last - b_g), 0)[:, None]
+        v_6 = b_g_last[None]
+        v_7 = v_6 - b_g
+        v_8 = libdevice.exp(v_7)
+        v_9 = 0.0
+        v_10 = v_9[None]
+        v_11 = tl.where(v_5, v_8, v_10)
+        subscript = v_11[:, None]
+        v_12 = v_3 * subscript
+        # src[gdn_fwd_h.py:N]: b_g_last = torch.exp(b_g_last)
+        v_13 = libdevice.exp(b_g_last)
+        # src[gdn_fwd_h.py:N]: b_h *= b_g_last
+        v_14 = v_13[None, None]
+        v_15 = b_h_copy_0 * v_14
+        # src[gdn_fwd_h.py:N]: b_v = b_v.to(dtype)
+        v_16 = tl.cast(v_12, tl.bfloat16)
+        # src[gdn_fwd_h.py:N]: p_k = k[tile_b.begin, t_i, tile_h.begin, :]
+        p_k = tl.load(k + (offset_1 * 20971520 + indices_4[:, None] * 5120 + offset_2 * 64 + indices_5[None, :] * 1), None)
+        # src[gdn_fwd_h.py:N]: b_h = hl.dot(p_k.T, b_v, acc=b_h)
+        permute = tl.permute(p_k, [1, 0])
+        b_h = tl.dot(tl.cast(permute, tl.bfloat16), tl.cast(v_16, tl.bfloat16), acc=v_15, input_precision='tf32', out_dtype=tl.float32)
+
+def helion_gdn_fwd_h(k: torch.Tensor, w: torch.Tensor, u: torch.Tensor, g: torch.Tensor, chunk_size: int, *, _launcher=_default_launcher):
+    """
+    Argument:
+        k: (batch, seqlen, nheads, dhead)
+        w: (batch, seqlen, nheads, dhead)
+        u: (batch, seqlen, nheads, expand_v*dhead)
+        g: (batch, seqlen, nheads)
+        chunk_size: int
+    Return:
+        h: (batch, nchunks, nheads, dhead, expand_v*dhead)
+    """
+    # src[gdn_fwd_h.py:N]: batch, seqlen, nheads, dhead = k.shape
+    batch, seqlen, nheads, dhead = k.shape
+    # src[gdn_fwd_h.py:N]: dhead = hl.specialize(dhead)
+    dhead = 64
+    # src[gdn_fwd_h.py:N]: chunk_size = hl.specialize(chunk_size)
+    chunk_size = 256
+    # src[gdn_fwd_h.py:N]: dstate = u.shape[-1]
+    dstate = u.shape[-1]
+    # src[gdn_fwd_h.py:N]: acc_dtype = torch.float32
+    acc_dtype = torch.float32
+    # src[gdn_fwd_h.py:N]: dtype = k.dtype
+    dtype = k.dtype
+    # src[gdn_fwd_h.py:N]: nchunks = (seqlen + chunk_size - 1) // chunk_size
+    nchunks = (seqlen + chunk_size - 1) // chunk_size
+    # src[gdn_fwd_h.py:N]: h = torch.empty(batch, nchunks, nheads, dhead, dstate, dtype=dtype, device=k.device)
+    h = torch.empty(batch, nchunks, nheads, dhead, dstate, dtype=dtype, device=k.device)
+    # src[gdn_fwd_h.py:N]: for tile_b, tile_h, tile_v in hl.tile(
+    # src[gdn_fwd_h.py:N]:     [batch, nheads, dstate], block_size=[1, 1, block_v]
+    # src[gdn_fwd_h.py:N]: ):
+    _BLOCK_SIZE_0 = 32
+    _RDIM_SIZE_4 = 64
+    # src[gdn_fwd_h.py:N]: for t_i in hl.tile(seqlen, block_size=chunk_size):
+    # src[gdn_fwd_h.py:N]:     h[tile_b.begin, t_i.id, tile_h.begin, :, tile_v] = b_h.to(dtype)
+    # src[gdn_fwd_h.py:N]:     b_w = w[tile_b.begin, t_i, tile_h.begin, :]
+    # src[gdn_fwd_h.py:N-N]: ...
+    _BLOCK_SIZE_3 = 256
+    # src[gdn_fwd_h.py:N]: for tile_b, tile_h, tile_v in hl.tile(
+    # src[gdn_fwd_h.py:N]:     [batch, nheads, dstate], block_size=[1, 1, block_v]
+    # src[gdn_fwd_h.py:N]: ):
+    # src[gdn_fwd_h.py:N-N]: ...
+    _launcher(_helion_helion_gdn_fwd_h, (8 * 80 * triton.cdiv(128, _BLOCK_SIZE_0),), h, w, u, g, k, _BLOCK_SIZE_0, _RDIM_SIZE_4, _BLOCK_SIZE_3, num_warps=4, num_stages=1)
+    # src[gdn_fwd_h.py:N]: return h
+    return h
+
 --- assertExpectedJournal(TestExamples.test_geglu)
 from __future__ import annotations
 
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -1822,6 +1822,63 @@ def test_grpo_loss_bwd(self):
             )
         )
 
+    def test_gdn_fwd_h(self):
+        """Test gated delta net forward h kernel."""
+        import math
+
+        batch = 8
+        nheads = 80
+        seqlen = 4096
+        chunk_size = 256
+        dhead = 64
+        dstate = 128
+
+        k = torch.randn(
+            batch, seqlen, nheads, dhead, dtype=torch.bfloat16, device=DEVICE
+        )
+        k = torch.nn.functional.rms_norm(k, (dhead,))
+        w = torch.randn(
+            batch,
+            seqlen // chunk_size,
+            chunk_size,
+            nheads,
+            dhead,
+            dtype=torch.float32,
+            device=DEVICE,
+        )
+        wu, ws, wv = torch.linalg.svd(w.permute(0, 1, 3, 2, 4), full_matrices=False)
+        w = torch.einsum("bnhik,bnhkj->bnhij", wu, wv)
+        w = (
+            w.permute(0, 1, 3, 2, 4)
+            .reshape(batch, seqlen, nheads, dhead)
+            .to(torch.bfloat16)
+        )
+        u = torch.randn(
+            batch, seqlen, nheads, dstate, dtype=torch.bfloat16, device=DEVICE
+        )
+        u = torch.nn.functional.rms_norm(u, (dstate,))
+        g = torch.cumsum(
+            0.5
+            * math.log(1 / dhead)
+            * torch.rand(batch, seqlen, nheads, dtype=torch.float32, device=DEVICE),
+            dim=1,
+        )
+
+        args = (k, w, u, g, chunk_size)
+
+        # Import and use the reference implementation
+        mod = import_path(EXAMPLES_DIR / "gdn_fwd_h.py")
+        expected = mod.ref_gdn_fwd_h(*args)
+
+        self.assertExpectedJournal(
+            check_example(
+                "gdn_fwd_h",
+                args,
+                expected,
+                fn_name="helion_gdn_fwd_h",
+            )
+        )
+
 
 if __name__ == "__main__":
     unittest.main()