@@ -1022,3 +1022,165 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
+
+// ASYNC-NOT: ttg.swizzled_shared
+// ASYNC: [[PADDED_ENC:#.*]] = #ttg.padded_shared
+// ASYNC-SAME{LITERAL}: {offset = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16], [0, 32], [0, 64], [32, 0], [16, 0], [1, 0], [2, 0], [4, 0], [8, 0], [64, 0]], block = []}
+// ASYNC-NOT: ttg.padded_shared
+// ASYNC-NOT: ttg.swizzled_shared
+
+// SYNC-NOT: ttg.padded_shared
+
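+// With async copies (ASYNC run) the pass is expected to choose the padded
+// shared encoding checked above rather than a swizzled one; with synchronous
+// copies (SYNC run) no padded encoding should appear.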
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_expect_padded_layouts
+  tt.func public @loop_expect_padded_layouts(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x128xf16, #mma>
+      scf.yield %3 : tensor<128x128xf16, #mma>
+    }
+    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
+    tt.return
+  }
+}
+
+// -----
+// Negative tests for padded encodings on gfx950
+
+// Unsupported kWidth
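+// With kWidth = 1 the dot operands are presumably read from shared memory in
+// single-element accesses, too narrow for padding to help, so no padded
+// encoding should be emitted.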
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
+
+// COMMON-NOT: ttg.padded_shared
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_padding_too_small_vector
+  tt.func public @loop_padding_too_small_vector(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<128x128xf16, #mma>
+      scf.yield %3 : tensor<128x128xf16, #mma>
+    }
+    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// Unsupported instrShape
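+// instrShape = [64, 4, 16] does not appear to be one of the MFMA shapes this
+// optimization handles, so no padded encoding should be emitted.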
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [64, 4, 16], isTransposed = true}>
+
+// COMMON-NOT: ttg.padded_shared
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_padding_invalid_instr_shape
+  tt.func public @loop_padding_invalid_instr_shape(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<128x128xf16, #mma>
+      scf.yield %3 : tensor<128x128xf16, #mma>
+    }
+    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// Block size too small
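+// A 16x128 operand tile feeding a 16x16 accumulator is presumably below the
+// minimum tile size at which padding pays off, so no padded encoding should be
+// emitted.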
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
+
+// COMMON-NOT: ttg.padded_shared
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_padding_block_size_too_small
+  tt.func public @loop_padding_block_size_too_small(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f16>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf16, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<16x128xf16, #blocked> -> tensor<16x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf16, #mma>
+      scf.yield %3 : tensor<16x16xf16, #mma>
+    }
+    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f16>, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// dtype > 2 bytes
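+// f32 elements are 4 bytes; the padded encoding appears to target 2-byte
+// element types only, so none should be emitted here.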
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
+// COMMON-NOT: ttg.padded_shared
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_padding_dtype_too_large
+  tt.func public @loop_padding_dtype_too_large(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f32>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f32>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf32, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f32>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<16x128xf32, #blocked> -> tensor<16x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma>
+      scf.yield %3 : tensor<16x16xf32, #mma>
+    }
+    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f32>, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// dtype < 2 bytes
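+// f8E5M2 elements are a single byte; as above, the padded encoding appears to
+// target 2-byte element types only, so none should be emitted here.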
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
+// COMMON-NOT: ttg.padded_shared
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: loop_padding_dtype_too_small
+  tt.func public @loop_padding_dtype_too_small(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f8E5M2>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f8E5M2>, #mma>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf8E5M2, #mma>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf8E5M2, #mma>)  : i32 {
+      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f8E5M2>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<16x128xf8E5M2, #blocked> -> tensor<16x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf8E5M2, #mma>
+      scf.yield %3 : tensor<16x16xf8E5M2, #mma>
+    }
+    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f8E5M2>, #mma>
+    tt.return
+  }
+}
+
+// End of negative tests for padding on gfx950