@@ -124,21 +124,28 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
124124#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
125125module attributes {" ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
126126 // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers
127- tt.func public @prefetch_tensor_of_pointers (%tensor_of_ptr: tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>) {
127+ tt.func public @prefetch_tensor_of_pointers (%arg0: !tt.ptr <f16 >) {
128+ %0 = tt.make_range {end = 64 : i32 , start = 0 : i32 } : tensor <64 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
129+ %1 = tt.expand_dims %0 {axis = 1 : i32 } : tensor <64 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
130+ %2 = arith.constant dense <64 > : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
131+ %3 = arith.muli %1 , %2 : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
132+ %4 = tt.make_range {end = 32 : i32 , start = 0 : i32 } : tensor <32 xi32 , #ttg.slice <{dim = 0 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
133+ %5 = tt.expand_dims %4 {axis = 0 : i32 } : tensor <32 xi32 , #ttg.slice <{dim = 0 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <1 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
134+ %6 = tt.broadcast %3 : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
135+ %7 = tt.broadcast %5 : tensor <1 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
136+ %8 = arith.addi %6 , %7 : tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
137+ %9 = tt.splat %arg0 : !tt.ptr <f16 > -> tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
138+ %tensor_of_ptr = tt.addptr %9 , %8 : tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>, tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
139+
140+ // CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
128141 // CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
129142 // CHECK: %[[ADDR_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
130143 // CHECK: %[[ADDR_16:.*]] = llvm.extractvalue {{.*}}[16] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
131144 // CHECK: %[[ADDR_32:.*]] = llvm.extractvalue {{.*}}[32] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
132145 // CHECK: %[[ADDR_48:.*]] = llvm.extractvalue {{.*}}[48] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
133146 // CHECK: %[[BASE_WIDTH:.*]] = llvm.mlir.constant(64 : i32) : i32
134- // CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64
135- // CHECK: %[[VAL_14:.*]] = llvm.ptrtoint %[[ADDR_1]] : !llvm.ptr<1> to i64
136- // CHECK: %[[PITCH:.*]] = llvm.sub %[[VAL_14]], %[[VAL_13]] : i64
137- // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
138- // CHECK: %[[UNIFIED_PITCH:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[PITCH]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
139- // CHECK: %[[UNIFIED_PITCH_I32:.*]] = llvm.trunc %[[UNIFIED_PITCH]] : i64 to i32
140- // CHECK: %[[PITCH_IN_BYTES_I32:.*]] = llvm.intr.umax(%[[UNIFIED_PITCH_I32]], %[[BASE_WIDTH]]) : (i32, i32) -> i32
141- // CHECK-DAG: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32
147+ // CHECK: %[[PITCH:.*]] = llvm.mlir.constant(128 : i32) : i32
148+ // CHECK: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32
142149 // CHECK: %[[CST_0_:.*]] = llvm.mlir.constant(0 : i32) : i32
143150 // CHECK: llvm.mlir.constant(0 : i32) : i32
144151
@@ -151,7 +158,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
151158 // CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64
152159 // CHECK: %[[UNIFIED_BASE:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_13]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
153160 // CHECK: %[[VAL_26:.*]] = llvm.inttoptr %[[UNIFIED_BASE]] : i64 to !llvm.ptr<1>
154- // CHECK: triton_gen.2Dblockprefetch %[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
161+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH ]], %[[CST_0_]], %[[OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
155162
156163 // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
157164 // CHECK: %[[VAL_29:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -162,7 +169,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
162169 // CHECK: %[[VAL_32:.*]] = llvm.ptrtoint %[[ADDR_16]] : !llvm.ptr<1> to i64
163170 // CHECK: %[[VAL_33:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_32]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
164171 // CHECK: %[[VAL_34:.*]] = llvm.inttoptr %[[VAL_33]] : i64 to !llvm.ptr<1>
165- // CHECK: triton_gen.2Dblockprefetch %[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_31]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
172+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH ]], %[[CST_0_]], %[[VAL_31]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
166173
167174 // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
168175 // CHECK: %[[VAL_36:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -173,7 +180,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
173180 // CHECK: %[[VAL_39:.*]] = llvm.ptrtoint %[[ADDR_32]] : !llvm.ptr<1> to i64
174181 // CHECK: %[[VAL_40:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_39]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
175182 // CHECK: %[[VAL_41:.*]] = llvm.inttoptr %[[VAL_40]] : i64 to !llvm.ptr<1>
176- // CHECK: triton_gen.2Dblockprefetch %[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_38]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
183+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH ]], %[[CST_0_]], %[[VAL_38]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
177184
178185 // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
179186 // CHECK: %[[VAL_43:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -184,7 +191,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
184191 // CHECK: %[[VAL_46:.*]] = llvm.ptrtoint %[[ADDR_48]] : !llvm.ptr<1> to i64
185192 // CHECK: %[[VAL_47:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_46]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
186193 // CHECK: %[[VAL_48:.*]] = llvm.inttoptr %[[VAL_47]] : i64 to !llvm.ptr<1>
187- // CHECK: triton_gen.2Dblockprefetch %[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_45]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
194+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH ]], %[[CST_0_]], %[[VAL_45]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
188195
189196 %mask_tensor = arith.constant dense <1 > : tensor <64 x32 xi1 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
190197 ttig.prefetch %tensor_of_ptr , %mask_tensor {boundaryCheck = array<i32 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , operandSegmentSizes = array<i32 : 1 , 1 , 1 >, ttig.block_io = " row_major" } : tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
@@ -199,34 +206,6 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
199206
200207// -----
201208
202- // COM: Check that the pitch is a constant computed by AxisInfo analysis instead of being calculated dynamically.
203- #dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
204- module attributes {" ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
205- // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers
206- tt.func public @prefetch_tensor_of_pointers (%arg0: i32 , %arg1: !tt.ptr <bf16 >) {
207- %cst_0 = arith.constant dense <512 > : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
208- %cst_1 = arith.constant dense <512 > : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
209- %c128_i32 = arith.constant 128 : i32
210- %0 = arith.muli %arg0 , %c128_i32 : i32
211- %1 = tt.splat %0 : i32 -> tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
212- %2 = tt.make_range {end = 128 : i32 , start = 0 : i32 } : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
213- %3 = arith.addi %1 , %2 : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
214- %4 = arith.remsi %3 , %cst_1 : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
215- %5 = tt.expand_dims %4 {axis = 1 : i32 } : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
216- %6 = arith.muli %5 , %cst_0 : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
217- %7 = tt.broadcast %6 : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <128 x64 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
218- %8 = tt.splat %arg1 : !tt.ptr <bf16 > -> tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
219- %9 = tt.addptr %8 , %7 : tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>, tensor <128 x64 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
220-
221- // CHECK-DAG: %[[PITCH:.*]] = llvm.mlir.constant(1024 : i32) : i32
222- // CHECK-COUNT-4: triton_gen.2Dblockprefetch {{.*}}, %[[PITCH]], {{.*}} {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
223- ttig.prefetch %9 {boundaryCheck = array<i32 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , operandSegmentSizes = array<i32 : 1 , 1 , 1 >, ttig.block_io = " row_major" } : tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
224- tt.return
225- }
226- }
227-
228- // -----
229-
230209// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and that no 2D block prefetch operation is generated.
231210#mma = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 1 , threadsPerWarp = 16 , warpsPerCTA = [4 , 1 ], repCluster = [4 , 1 ], A = [32 , 8 ], B = [8 , 16 ], C = [32 , 16 ]}>
232211module attributes {ttig.min_sg_size = 16 : i32 , ttig.support_sg_2d_block , ttig.target_arch = " spir64" , " ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , ttg.target = " xpu" , " ttg.threads-per-warp" = 16 : i32 } {
0 commit comments