 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
 #define UNUSED(x) (void)(x)
 
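For reference, a minimal standalone sketch (not part of the patch) of what the new CEIL_DIV macro computes; the dimension value below is hypothetical and is only meant to illustrate the round-up behaviour that the work-group sizing in ggml_cl_mul_mat relies on.

#include <stdio.h>

#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

int main(void) {
    // Hypothetical row count that is not a multiple of the 64-wide block tile.
    int ne01 = 100;
    printf("%d\n", CEIL_DIV(ne01, 64)); // prints 2: one full tile plus one partial tile
    return 0;
}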

@@ -396,6 +397,8 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
+    cl_program program_mul_mm_f32_f32_l4_lm;
+    cl_program program_mul_mm_f16_f32_l4_lm;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;

@@ -450,6 +453,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
+    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
 
     std::vector<ProfilingInfo> profiling_info;
 

@@ -1040,6 +1045,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mm_f32_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_f32_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
+#endif
+        backend_ctx->program_mul_mm_f32_f32_l4_lm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mm_f16_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_f16_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
+#endif
+        backend_ctx->program_mul_mm_f16_f32_l4_lm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -5297,18 +5334,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
-    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
-        src0->ne[1] > 32 && // M > 32
-        src1->ne[1] > 32 && // N > 32
-        src0->ne[0] > 32 && // K > 32
-        src0->ne[2] == 1 && src0->ne[3] == 1 &&
-        src1->ne[2] == 1 && src1->ne[3] == 1 &&
-        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
-        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
-        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
-        return;
-    }
-
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

@@ -5655,6 +5680,101 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     } // if (ne01 && ne1)
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
+    // GEMM using local memory
+    // Current BK = 16, so ne00 % 16 == 0
+    if (ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+        src1t == GGML_TYPE_F32 &&
+        ne00 % 16 == 0 &&
+        ne11 > 1) {
+        switch(src0t) {
+            case GGML_TYPE_F32: {
+                kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
+            case GGML_TYPE_F16: {
+                kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
+            default:
+                break;
+        }
+    }
+
+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
+        src0->ne[1] > 32 && // M > 32
+        src1->ne[1] > 32 && // N > 32
+        src0->ne[0] > 32 && // K > 32
+        src0->ne[2] == 1 && src0->ne[3] == 1 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1 &&
+        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
+        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
+        return;
+    }
+
     if (!ggml_is_transposed(src0) &&
         !ggml_is_transposed(src1) &&
         src1t == GGML_TYPE_F32 &&
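
A hedged worked example (assumed shapes, not taken from the patch) of the NDRange that the new l4_lm path launches, using the BM = BN = 64 block tile and nth0 = 128 stated in the diff above:

#include <stdio.h>

#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

int main(void) {
    // Hypothetical tensor shapes: M = ne01, N = ne11, no batching (ne12 = ne13 = 1).
    int ne01 = 4096, ne11 = 128, ne12 = 1, ne13 = 1;
    int nth0 = 128; // work-group size, (BM*BN)/(TM*TN) with BM = BN = 64

    size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)(ne12*ne13)};
    size_t local_work_size[]  = {(size_t)nth0, 1, 1};

    // 64 x 2 x 1 = 128 work-groups, one per 64x64 tile of the output matrix.
    printf("work-groups: %zu\n",
           (global_work_size[0] / local_work_size[0]) * global_work_size[1] * global_work_size[2]);
    return 0;
}

Each work-group covers one 64x64 output tile and, per the "Current BK = 16" comment, presumably steps through K in 16-wide slices staged in local memory, which is why this path requires ne00 % 16 == 0.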
|