Skip to content

Commit d917ea5

Browse files
authored
[GPU] enable simd16 version for convolution_gpu_mmad_b_fs_yx_fsv32 (#32501)
### Details: - New platforms (LNL, BMG) do not support simd8 ### Tickets: - 174772
1 parent 3e4bcd3 commit d917ea5

File tree

14 files changed

+412
-86
lines changed

14 files changed

+412
-86
lines changed

src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ struct format {
200200
os_is_zyx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD
201201
os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
202202
os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
203+
os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2, ///< format for weights for MMAD fsv32 convolution
204+
os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, ///< format for weights for MMAD fsv32 convolution
203205
os_is_zyx_osa4_isa8_osv8_isv4, ///< format for weights for MMAD fsv32 convolution
204206
os_is_yx_osa4_isa8_osv8_isv4, ///< format for weights for MMAD fsv32 convolution
205207
os_is_yx_osv16_isv4, ///< format for weights for IMAD convolutions

src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,10 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
538538
return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
539539
case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
540540
return kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
541+
case format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2:
542+
return kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
543+
case format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2:
544+
return kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
541545
case format::os_is_yx_osv16_isv4:
542546
return kernel_selector::weights_layout::os_is_yx_osv16_isv4;
543547
case format::os_is_yx_osv32_isv4_swizzled_by_2:
@@ -728,6 +732,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
728732
return cldnn::format::os_is_zyx_isa8_osv16_isv4;
729733
case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
730734
return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
735+
case kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2:
736+
return cldnn::format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
737+
case kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2:
738+
return cldnn::format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
731739
case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
732740
return cldnn::format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
733741
case kernel_selector::weights_layout::os_is_yx_osv32_isv4_swizzled_by_2:

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl

Lines changed: 127 additions & 65 deletions
Large diffs are not rendered by default.

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,65 @@ inline uint get_os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i,
794794
return idx;
795795
}
796796

797+
// Computes the linear offset of weight element (o, i, y, x) for the
// os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2 layout.
//
// Weights are grouped into tiles of 32 output x 32 input features. Inside a
// tile the order, innermost first, is: isv (4) -> osv (16) -> isa (8) -> osa (2).
// The "swizzled by 2" part means that, within a group of 32 output features,
// the low bit of `o` selects the osa half and the remaining bits select osv.
//
// The input-feature count is rounded up to a multiple of 32 when computing the
// stride between output-feature blocks. `size_ofm` does not take part in the
// computation. All arithmetic is performed in 32-bit `uint`.
inline uint get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index(
    uint o, uint i, uint y, uint x,
    uint size_x, uint size_y,
    uint size_ifm, uint size_ofm,
    uint offset)
{
    // Elements per (os, is, y, x) tile: 32 output features x 32 input features.
    const uint tile_elems = 32 * 32;

    // Output-feature decomposition (swizzle by 2 within each block of 32).
    const uint o_in_block = o % 32;
    const uint osa_idx = o_in_block % 2;
    const uint osv_idx = o_in_block / 2;
    const uint os_idx = o / 32;

    // Input-feature decomposition.
    const uint isv_idx = i % 4;
    const uint isa_idx = (i / 4) % 8;
    const uint is_idx = i / 32;

    // Number of 32-wide input-feature blocks (rounded up).
    const uint ifm_blocks = (size_ifm + 31) / 32;

    // Position inside the 32x32 tile.
    const uint in_tile = isv_idx
                       + osv_idx * 4
                       + isa_idx * (16 * 4)
                       + osa_idx * (16 * 32);

    // Index of the tile itself: os -> is -> y -> x, with x varying fastest.
    const uint tile_idx = ((os_idx * ifm_blocks + is_idx) * size_y + y) * size_x + x;

    return offset + in_tile + tile_idx * tile_elems;
}
825+
826+
// Computes the linear offset of weight element (o, i, z, y, x) for the
// os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2 layout (3D spatial variant).
//
// Weights are grouped into tiles of 32 output x 32 input features. Inside a
// tile the order, innermost first, is: isv (4) -> osv (16) -> isa (8) -> osa (2).
// The "swizzled by 2" part means that, within a group of 32 output features,
// the low bit of `o` selects the osa half and the remaining bits select osv.
//
// The input-feature count is rounded up to a multiple of 32 when computing the
// stride between output-feature blocks. `size_ofm` does not take part in the
// computation. All arithmetic is performed in 32-bit `uint`.
inline uint get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index(
    uint o, uint i, uint z, uint y, uint x,
    uint size_x, uint size_y, uint size_z,
    uint size_ifm, uint size_ofm,
    uint offset)
{
    // Elements per (os, is, z, y, x) tile: 32 output features x 32 input features.
    const uint tile_elems = 32 * 32;

    // Output-feature decomposition (swizzle by 2 within each block of 32).
    const uint o_in_block = o % 32;
    const uint osa_idx = o_in_block % 2;
    const uint osv_idx = o_in_block / 2;
    const uint os_idx = o / 32;

    // Input-feature decomposition.
    const uint isv_idx = i % 4;
    const uint isa_idx = (i / 4) % 8;
    const uint is_idx = i / 32;

    // Number of 32-wide input-feature blocks (rounded up).
    const uint ifm_blocks = (size_ifm + 31) / 32;

    // Position inside the 32x32 tile.
    const uint in_tile = isv_idx
                       + osv_idx * 4
                       + isa_idx * (16 * 4)
                       + osa_idx * (16 * 32);

    // Index of the tile itself: os -> is -> z -> y -> x, with x varying fastest.
    const uint tile_idx =
        (((os_idx * ifm_blocks + is_idx) * size_z + z) * size_y + y) * size_x + x;

    return offset + in_tile + tile_idx * tile_elems;
}
855+
797856
inline uint get_os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i, uint z, uint y, uint x,
798857
uint size_x, uint size_y, uint size_z,
799858
uint size_ifm, uint size_ofm, uint offset)
@@ -985,6 +1044,25 @@ inline uint get_g_is_os_yx_isa4_osa8_isv8_osv4(uint g, uint o, uint i, uint z, u
9851044
CAT(prefix, _OFM_NUM), \
9861045
CAT(prefix, _OFFSET))
9871046

1047+
// Convenience wrapper around get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index().
// Forwards (o, i, y, x) and fills in the remaining arguments from the
// <prefix>_SIZE_X, <prefix>_SIZE_Y, <prefix>_IFM_NUM, <prefix>_OFM_NUM and
// <prefix>_OFFSET constants, which are built by token-pasting with CAT().
#define GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, y, x) \
1048+
get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index( \
1049+
o, i, y, x, \
1050+
CAT(prefix, _SIZE_X), \
1051+
CAT(prefix, _SIZE_Y), \
1052+
CAT(prefix, _IFM_NUM), \
1053+
CAT(prefix, _OFM_NUM), \
1054+
CAT(prefix, _OFFSET))
1055+
1056+
// Convenience wrapper around get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index().
// Forwards (o, i, z, y, x) and fills in the remaining arguments from the
// <prefix>_SIZE_X, <prefix>_SIZE_Y, <prefix>_SIZE_Z, <prefix>_IFM_NUM,
// <prefix>_OFM_NUM and <prefix>_OFFSET constants, which are built by
// token-pasting with CAT().
#define GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, z, y, x) \
1057+
get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index( \
1058+
o, i, z, y, x, \
1059+
CAT(prefix, _SIZE_X), \
1060+
CAT(prefix, _SIZE_Y), \
1061+
CAT(prefix, _SIZE_Z), \
1062+
CAT(prefix, _IFM_NUM), \
1063+
CAT(prefix, _OFM_NUM), \
1064+
CAT(prefix, _OFFSET))
1065+
9881066
inline uint get_is_o32_yx_isv32_swizzled_by_4_index(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size)
9891067
{
9901068
const uint o_aligned_to_32 = ((o_size + 31) / 32) * 32;

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
431431
return GET_FILTER_OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, y, x);
432432
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4
433433
return GET_FILTER_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, z, y, x);
434+
#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2
435+
return GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, y, x);
436+
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2
437+
return GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, z, y, x);
434438
#elif defined OUTPUT_LAYOUT_OS_IS_YX_ISV16_OSV16
435439
return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(OUTPUT, o, i, y, x, 16, 16);
436440
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISV16_OSV16

src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,8 @@ std::string toString(WeightsLayout layout) {
347347
case WeightsLayout::os_is_zyx_isa8_osv16_isv4: return "OS_IS_ZYX_ISA8_OSV16_ISV4";
348348
case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
349349
case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
350+
case WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2";
351+
case WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2";
350352
case WeightsLayout::os_is_yx_osv16_isv4: return "OS_IS_YX_OSV16_ISV4";
351353
case WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2: return "OS_IS_YX_OSV32_ISV4_SWIZZLED_BY_2";
352354
case WeightsLayout::os_is_yx_osv32_isv4: return "OS_IS_YX_OSV32_ISV4";

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,8 +410,8 @@ Datatype ConvolutionKernelBase::GetPackedInputType(const convolution_params& par
410410
return GetPackedType(params.inputs[0].GetDType());
411411
}
412412

413-
Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params) const {
414-
return GetPackedType(params.outputs[0].GetDType());
413+
// Returns the result of GetPackedType() for the data type of the first output
// tensor, forwarding `pack_size` unchanged.
// NOTE(review): `pack_size` presumably is the number of elements packed into
// one value (the SIMD16 kernel path passes 2) — confirm against GetPackedType.
Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params, size_t pack_size) const {
414+
return GetPackedType(params.outputs[0].GetDType(), pack_size);
415415
}
416416

417417
Datatype ConvolutionKernelBase::GetActivationType(const convolution_params& params) const {

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class ConvolutionKernelBase : public WeightBiasKernelBase {
7777

7878
Datatype GetPackedType(Datatype dt, size_t pack_size = 4) const;
7979
Datatype GetPackedInputType(const convolution_params& params) const;
80-
Datatype GetPackedOutputType(const convolution_params& params) const;
80+
Datatype GetPackedOutputType(const convolution_params& params, size_t pack_size = 4) const;
8181
Datatype GetActivationType(const convolution_params& params) const;
8282
Datatype GetAccumulatorType(const convolution_params& params) const;
8383
void GetUpdateDispatchDataFunc(KernelData& kd) const override;

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ bool ConvolutionKernel_mmad_b_fs_yx_fsv32::Validate(const Params& p) const {
6363
DO_NOT_USE_THIS_KERNEL(p.layerID);
6464
}
6565

66-
if (!IsSIMDSizeSupported(params.engineInfo, 8))
66+
if (!IsSIMDSizeSupported(params.engineInfo, 8) && !IsSIMDSizeSupported(params.engineInfo, 16))
6767
DO_NOT_USE_THIS_KERNEL(p.layerID);
6868

6969
if (params.groups > 1)
@@ -109,12 +109,18 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32::SetDef
109109
break;
110110
ow_group--;
111111
}
112-
113-
dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4;
112+
if (IsSIMDSizeSupported(cp.engineInfo, 8)) {
113+
dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4;
114+
} else {
115+
dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 2;
116+
}
114117
dispatchData.gws[1] = Align(CeilDiv(cp.outputs[0].X().v, dispatchData.cldnnStyle.blockWidth), ow_group) * cp.outputs[0].Y().v * cp.outputs[0].Z().v;
115118
dispatchData.gws[2] = cp.outputs[0].Batch().v;
116-
117-
dispatchData.lws[0] = 8;
119+
if (IsSIMDSizeSupported(cp.engineInfo, 8)) {
120+
dispatchData.lws[0] = 8;
121+
} else {
122+
dispatchData.lws[0] = 16;
123+
}
118124
dispatchData.lws[1] = ow_group;
119125
dispatchData.lws[2] = 1;
120126

@@ -144,7 +150,13 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu
144150
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
145151

146152
jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED_IN"));
147-
jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT"));
153+
if (IsSIMDSizeSupported(params.engineInfo, 8)) {
154+
jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT"));
155+
jit.AddConstant(MakeJitConstant("OF_TO_DO", 4));
156+
} else {
157+
jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params, 2), "PACKED_OUT"));
158+
jit.AddConstant(MakeJitConstant("OF_TO_DO", 2));
159+
}
148160
if (params.weights.GetDType() == WeightsType::INT8) {
149161
jit.AddConstant(MakeJitConstant("FILTER_TYPE_CHAR", 1));
150162
} else if (params.weights.GetDType() == WeightsType::UINT8) {
@@ -159,22 +171,26 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu
159171
std::vector<std::string> idx_order2;
160172
std::vector<std::string> idx_order3;
161173
if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) {
162-
idx_order0 = {"b", "(fg*32 + 4*lid+0)", "y", "(x+i)"};
163-
idx_order1 = {"b", "(fg*32 + 4*lid+1)", "y", "(x+i)"};
164-
idx_order2 = {"b", "(fg*32 + 4*lid+2)", "y", "(x+i)"};
165-
idx_order3 = {"b", "(fg*32 + 4*lid+3)", "y", "(x+i)"};
174+
idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "y", "(x+i)"};
175+
idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "y", "(x+i)"};
176+
idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "y", "(x+i)"};
177+
idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "y", "(x+i)"};
166178
} else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) {
167-
idx_order0 = {"b", "(fg*32 + 4*lid+0)", "z", "y", "(x+i)"};
168-
idx_order1 = {"b", "(fg*32 + 4*lid+1)", "z", "y", "(x+i)"};
169-
idx_order2 = {"b", "(fg*32 + 4*lid+2)", "z", "y", "(x+i)"};
170-
idx_order3 = {"b", "(fg*32 + 4*lid+3)", "z", "y", "(x+i)"};
179+
idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "z", "y", "(x+i)"};
180+
idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "z", "y", "(x+i)"};
181+
idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "z", "y", "(x+i)"};
182+
idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "z", "y", "(x+i)"};
171183
}
172184

173185
FusedOpsConfiguration conf0 = {"_0", idx_order0, "res0", input_dt, 1 };
174186
FusedOpsConfiguration conf1 = {"_1", idx_order1, "res1", input_dt, 1 };
175187
FusedOpsConfiguration conf2 = {"_2", idx_order2, "res2", input_dt, 1 };
176188
FusedOpsConfiguration conf3 = {"_3", idx_order3, "res3", input_dt, 1 };
177-
jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3}));
189+
if (IsSIMDSizeSupported(params.engineInfo, 8)) {
190+
jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3}));
191+
} else {
192+
jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
193+
}
178194
}
179195

180196
return jit;

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,18 @@ class ConvolutionKernel_mmad_b_fs_yx_fsv32 : public ConvolutionKernelBase {
2929
bool NeedPaddedInput() const override { return false; }
3030

3131
WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
32-
if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {
33-
return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
32+
if (IsSIMDSizeSupported(p.engineInfo, 8)) {
33+
if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {
34+
return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
35+
} else {
36+
return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
37+
}
3438
} else {
35-
return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
39+
if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {
40+
return WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
41+
} else {
42+
return WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
43+
}
3644
}
3745
}
3846
std::vector<FusedOpType> GetSupportedFusedOps() const override {

0 commit comments

Comments
 (0)