Commit feb755c

Merge OpenAI Triton commit 4734af3 (#5460)
This PR changes the Triton base from 00cf53f to 4734af3 (Oct 24). Pass rate: 94.59%
2 parents (ef7d239 + f1e5aad) · commit feb755c

14 files changed: +366 -398 lines changed

python/test/unit/language/test_matmul.py

Lines changed: 4 additions & 1 deletion
@@ -779,8 +779,11 @@ def generate_gemm_input(dim0, dim1, dtype):
     triton_out = triton_out.to(torch.float32)
     torch.testing.assert_close(torch_out, triton_out, atol=2e-5, rtol=1e-4)
     if is_hip() and preshuffle:
-        assert "tilesPerWarp = [2, 2]" in k.asm["ttgir"]
         assert "ds_read_u8" not in k.asm["amdgcn"]
+        if mfma_nonkdim == 16:
+            assert "tilesPerWarp = [2, 2]" in k.asm["ttgir"]
+        elif mfma_nonkdim == 32:  # default tilesPerWarp = [1, 1]
+            assert "tilesPerWarp" not in k.asm["ttgir"]
 
 
 @pytest.mark.parametrize("M, N, K", [(1024, 512, 512), (998, 111, 512), (63, 128, 512)])
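
Note on the pattern above: launching a @triton.jit kernel returns a compiled-kernel handle whose .asm dictionary exposes the intermediate representations, which is what the test greps for "tilesPerWarp" and "ds_read_u8". A minimal sketch of that inspection flow (the trivial kernel below is hypothetical; only the .asm lookups mirror the test):

import torch
import triton
import triton.language as tl

@triton.jit
def _store_one(x_ptr):
    tl.store(x_ptr, 1.0)

if torch.cuda.is_available():  # also true on ROCm/HIP builds of PyTorch
    x = torch.zeros(1, device="cuda")
    k = _store_one[(1,)](x)        # the launch returns the compiled kernel
    ttgir = k.asm["ttgir"]         # TritonGPU IR, where tilesPerWarp appears
    print("tilesPerWarp" in ttgir)
    # k.asm["amdgcn"] is only present when compiling for an AMD (HIP) target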

python/test/unit/language/test_tensor_descriptor.py

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 @pytest.mark.interpreter
 def test_tensor_descriptor_padding(device):
     if is_xpu():
-        pytest.skip("padding is unsupported")
+        pytest.skip("FIXME: issue #5400")
 
     @triton.jit
     def device_tma_load(in_ptr, out_ptr, IM, IN, YM, YN, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr,

python/triton_kernels/reduce.py

Lines changed: 0 additions & 282 deletions
This file was deleted.

python/triton_kernels/triton_kernels/reduce.py

Lines changed: 2 additions & 0 deletions
@@ -147,6 +147,8 @@ def reduce(
     Returns:
     - output: torch.Tensor
         The reduced tensor with `dim` removed.
+    - output_mxscale: Optional[torch.Tensor]
+        The output mx scale if input is micro-scaled, else None.
     """
     if x.ndim != 3:
         raise NotImplementedError("reduce only supports 3D inputs in this implementation")
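
For reference, a hypothetical call showing the two return values (the exact reduce() signature is not visible in this diff, so the argument list here is an assumption based on the docstring):

import torch
from triton_kernels.reduce import reduce

x = torch.randn(4, 128, 256, device="cuda")  # reduce() only accepts 3D inputs
out, out_mxscale = reduce(x, dim=1)          # assumed call shape: (x, dim, ...)
# `out` has `dim` removed, i.e. shape (4, 256); `out_mxscale` is None for a
# plain (non micro-scaled) input, per the docstring above.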
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx1250 | FileCheck %s
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 8, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32} {
+  // CHECK-LABEL: async_copy_with_swizzle
+  tt.func public @async_copy_with_swizzle(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                                          %arg2: !ttg.memdesc<32x32xf32, #shared, #smem, mutable>) {
+    // We need the splat to allow the AxisAnalysis to work during lowering
+    %1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x32x!tt.ptr<f32>, #blocked>
+    // Each thread needs to load 8 elements and we load 1 (sizePerThread) per global.load.lds
+    // CHECK-COUNT-8: llvm.amdgcn.global.load.async.to.lds.b32
+    // CHECK-NOT: llvm.amdgcn.global.load.async.to.lds
+    %2 = ttg.async_copy_global_to_local %1, %arg2 : tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32} {
+  // CHECK-LABEL: async_load_strided_into_lds_with_swizzle
+  tt.func public @async_load_strided_into_lds_with_swizzle(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked> {tt.divisibility = dense<[16, 16]> : tensor<2xi32>, tt.contiguity = dense<[16, 16]> : tensor<2xi32>, tt.constancy = dense<[1, 1]> : tensor<2xi32>},
+                                                           %arg1: !ttg.memdesc<32x32xf32, #shared, #smem, mutable>) {
+    // Each thread loads 256 contiguous bits so we split into 2 128bit loads. This was not possible on GFX9
+    // CHECK-COUNT-2: llvm.amdgcn.global.load.async.to.lds.b128
+    // CHECK-NOT: llvm.amdgcn.global.load.async.to.lds
+    %6 = ttg.async_copy_global_to_local %arg0, %arg1 : tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
+    tt.return
+  }
+}
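
The instruction counts checked in these two cases follow directly from the layouts. A standalone sanity check of the arithmetic (a sketch, not part of the lit test):

# Derive per-thread work from the #ttg.blocked layouts used above.
def elems_per_thread(shape, threads_per_warp, warps_per_cta):
    threads = 1
    for t, w in zip(threads_per_warp, warps_per_cta):
        threads *= t * w
    total = 1
    for s in shape:
        total *= s
    return total // threads

# Case 1: 32x32xf32, sizePerThread = [1, 1] -> 8 elements per thread, one
# 32-bit element per instruction => 8x global.load.async.to.lds.b32.
assert elems_per_thread((32, 32), (1, 32), (4, 1)) == 8

# Case 2: sizePerThread = [1, 8] -> 8 contiguous f32 = 256 bits per thread,
# emitted as two 128-bit loads => 2x global.load.async.to.lds.b128.
assert elems_per_thread((32, 32), (8, 4), (4, 1)) == 8
assert (8 * 32) // 128 == 2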

test/TritonGPU/amd/accelerate-amd-matmul-mfma-gfx950.mlir

Lines changed: 36 additions & 0 deletions
@@ -320,3 +320,39 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked5 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked6 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 4], order = [1, 0]}>
+#blocked7 = #ttg.blocked<{sizePerThread = [1, 1, 1, 1, 2, 2, 1], threadsPerWarp = [1, 1, 4, 16, 1, 1, 1], warpsPerCTA = [4, 1, 1, 1, 1, 1, 1], order = [6, 5, 4, 3, 2, 1, 0]}>
+#blocked8 = #ttg.blocked<{sizePerThread = [1, 2, 1, 1, 2, 1, 1], threadsPerWarp = [1, 1, 16, 1, 1, 4, 1], warpsPerCTA = [4, 1, 1, 1, 1, 1, 1], order = [6, 1, 4, 2, 5, 3, 0]}>
+#linear = #ttg.linear<{register = [[16, 0], [0, 4]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[32, 0], [64, 0]], block = []}>
+
+// MFMA16: [[$linear1:#.*]] = #ttg.linear<{register = {{\[\[}}0, 4{{]]}}, lane = {{\[\[}}1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2{{]]}}, warp = {{\[\[}}0, 0], [0, 0{{]]}}, block = []}>
+// MFMA16: [[$linear2:#.*]] = #ttg.linear<{register = {{\[\[}}0, 4], [16, 0{{]]}}, lane = {{\[\[}}1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2{{]]}}, warp = {{\[\[}}32, 0], [64, 0{{]]}}, block = []}>
+// MFMA16: [[$mma:#.*]] = #ttg.amd_mfma<{version = 4, warpsPerCTA = [1, 4], instrShape = [16, 16, 128], isTransposed = true, tilesPerWarp = [1, 2]}>
+// MFMA16-LABEL: mfma_dot_scaled_fp8_mxfp4
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_scaled_fp8_mxfp4(
+      %arg0: tensor<16x256xf8E4M3FN, #blocked6>,
+      %arg1: tensor<4x256x!tt.ptr<i8>, #blocked5>,
+      %arg2: tensor<128x128xi8, #blocked1>,
+      %arg3: tensor<16x128x!tt.ptr<f32>, #blocked1>
+  ) {
+    // MFMA16: [[SCALE0:%.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<16x8xi8, [[$linear1]]>
+    // MFMA16: [[SCALE1:%.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<128x8xi8, [[$linear2]]>
+    // MFMA16: tt.dot_scaled {{.*}} scale [[SCALE0]], {{.*}} scale [[SCALE1]], {{.*}} -> tensor<16x128xf32, [[$mma]]>
+    %cst0 = arith.constant dense<127> : tensor<16x8xi8, #blocked>
+    %cst1 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked1>
+    %load = tt.load %arg1 : tensor<4x256x!tt.ptr<i8>, #blocked5>
+    %reshape0 = tt.reshape %load : tensor<4x256xi8, #blocked5> -> tensor<4x1x4x16x2x2x1xi8, #blocked7>
+    %trans = tt.trans %reshape0 {order = array<i32: 0, 5, 3, 1, 4, 2, 6>} : tensor<4x1x4x16x2x2x1xi8, #blocked7> -> tensor<4x2x16x1x2x4x1xi8, #blocked8>
+    %reshape1 = tt.reshape %trans : tensor<4x2x16x1x2x4x1xi8, #blocked8> -> tensor<128x8xi8, #linear>
+    %scale = ttg.convert_layout %reshape1 : tensor<128x8xi8, #linear> -> tensor<128x8xi8, #blocked>
+    %1 = tt.dot_scaled %arg0 scale %cst0, %arg2 scale %scale, %cst1 lhs = e4m3 rhs = e2m1 {fastMath = true} : tensor<16x256xf8E4M3FN, #blocked6>, tensor<16x8xi8, #blocked> * tensor<128x128xi8, #blocked1>, tensor<128x8xi8, #blocked> -> tensor<16x128xf32, #blocked1>
+    tt.store %arg3, %1 : tensor<16x128x!tt.ptr<f32>, #blocked1>
+    tt.return
+  }
+}
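
The scale-tensor shapes in this test are consistent with MX micro-scaling, which uses one scale per 32-element group along K; e2m1 packs two fp4 values per byte, so the 128x128xi8 operand spans a logical K of 256, matching the fp8 operand. A short check of that arithmetic (a sketch, not part of the test):

K_FP8 = 256          # columns of tensor<16x256xf8E4M3FN>
K_BYTES_MXFP4 = 128  # columns of tensor<128x128xi8>, two e2m1 values per byte
GROUP = 32           # elements covered by one mx scale (MX block size, assumed)

assert 2 * K_BYTES_MXFP4 == K_FP8  # packed fp4 K matches the fp8 operand's K
assert K_FP8 // GROUP == 8         # -> the 16x8 and 128x8 scale tensors above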
