
Commit 09b5efe

AlexAUT authored and ita9naiwa committed
[AMD] Add lit tests for pipelining with padded layouts on gfx950 (triton-lang#8399)
Adds lit tests for pipelining with padded shared layouts introduced by triton-lang#8365.
1 parent f8b4b53 commit 09b5efe

1 file changed: +162 -0 lines

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 162 additions & 0 deletions
@@ -1022,3 +1022,165 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>

// ASYNC-NOT: ttg.swizzled_shared
// ASYNC: [[PADDED_ENC:#.*]] = #ttg.padded_shared
// ASYNC-SAME{LITERAL}: {offset = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16], [0, 32], [0, 64], [32, 0], [16, 0], [1, 0], [2, 0], [4, 0], [8, 0], [64, 0]], block = []}
// ASYNC-NOT: ttg.padded_shared
// ASYNC-NOT: ttg.swizzled_shared

// SYNC-NOT: ttg.padded_shared

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_expect_padded_layouts
  tt.func public @loop_expect_padded_layouts(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x128xf16, #mma>
      scf.yield %3 : tensor<128x128xf16, #mma>
    }
    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
    tt.return
  }
}

// -----

// Negative tests for padded encodings on gfx950

// Unsupported kWidth

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>

// COMMON-NOT: ttg.padded_shared
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_padding_too_small_vector
  tt.func public @loop_padding_too_small_vector(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<128x128xf16, #mma>
      scf.yield %3 : tensor<128x128xf16, #mma>
    }
    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
    tt.return
  }
}

// -----

// Unsupported instrShape

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [64, 4, 16], isTransposed = true}>

// COMMON-NOT: ttg.padded_shared
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_padding_invalid_instr_shape
  tt.func public @loop_padding_invalid_instr_shape(%arg0: i32, %arg1: tensor<128x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<128x128x!tt.ptr<f16>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<128x128xf16, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<128x128x!tt.ptr<f16>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<128x128xf16, #mma>
      scf.yield %3 : tensor<128x128xf16, #mma>
    }
    tt.store %arg2, %0 : tensor<128x128x!tt.ptr<f16>, #mma>
    tt.return
  }
}

// -----

// Block size too small

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>

// COMMON-NOT: ttg.padded_shared
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_padding_block_size_too_small
  tt.func public @loop_padding_block_size_too_small(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f16>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f16>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf16, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f16>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<16x128xf16, #blocked> -> tensor<16x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf16, #mma>
      scf.yield %3 : tensor<16x16xf16, #mma>
    }
    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f16>, #mma>
    tt.return
  }
}

// -----

// dtype > 2 bytes

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>

// COMMON-NOT: ttg.padded_shared
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_padding_dtype_too_large
  tt.func public @loop_padding_dtype_too_large(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f32>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f32>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf32, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f32>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<16x128xf32, #blocked> -> tensor<16x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma>
      scf.yield %3 : tensor<16x16xf32, #mma>
    }
    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f32>, #mma>
    tt.return
  }
}

// -----

// dtype < 2 bytes

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>

// COMMON-NOT: ttg.padded_shared
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // COMMON-LABEL: loop_padding_dtype_too_small
  tt.func public @loop_padding_dtype_too_small(%arg0: i32, %arg1: tensor<16x128x!tt.ptr<f8E5M2>, #blocked> {tt.constancy = dense<1> : tensor<2xi32>, tt.contiguity = dense<[1, 8]> : tensor<2xi32>, tt.divisibility = dense<[1, 16]> : tensor<2xi32>}, %arg2: tensor<16x16x!tt.ptr<f8E5M2>, #mma>) {
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf8E5M2, #mma>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %0 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %cst) -> (tensor<16x16xf8E5M2, #mma>) : i32 {
      %1 = tt.load %arg1 : tensor<16x128x!tt.ptr<f8E5M2>, #blocked>
      %2 = ttg.convert_layout %1 : tensor<16x128xf8E5M2, #blocked> -> tensor<16x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
      %3 = tt.dot %2, %cst_0, %arg4 : tensor<16x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x16xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf8E5M2, #mma>
      scf.yield %3 : tensor<16x16xf8E5M2, #mma>
    }
    tt.store %arg2, %0 : tensor<16x16x!tt.ptr<f8E5M2>, #mma>
    tt.return
  }
}

// End of negative tests for padding on gfx950
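
The new checks are keyed to the COMMON, ASYNC, and SYNC FileCheck prefixes; the RUN lines that drive them live at the top of loop-pipeline-hip.mlir and are not part of this hunk. The sketch below only illustrates the shape of such invocations: ASYNC stands for the async-copy pipelining path and SYNC for the default one, and the pipeliner pass option spelling (use_async_copy) is an assumption, not text copied from the file.

// Hypothetical RUN lines -- illustrative only, not taken from this commit.
// The option name passed to the AMD stream pipeliner below is an assumption;
// see the actual header of test/TritonGPU/loop-pipeline-hip.mlir.
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline | FileCheck %s --check-prefixes=COMMON,SYNC
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="use_async_copy=1" | FileCheck %s --check-prefixes=COMMON,ASYNC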
