33 | 33 | #undef MAX
34 | 34 | #define MIN(a, b) ((a) < (b) ? (a) : (b))
35 | 35 | #define MAX(a, b) ((a) > (b) ? (a) : (b))
| 36 | +#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
36 | 37 |
37 | 38 | #define UNUSED(x) (void)(x)
38 | 39 |
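The new `CEIL_DIV` macro is plain rounding-up integer division; the dispatch code further down uses it to count 64-wide output tiles. A tiny sanity check, with arbitrary illustrative values (not part of the patch):

```c
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

// Hypothetical values, only to show the rounding-up behavior (C11 static asserts).
_Static_assert(CEIL_DIV(128, 64) == 2, "exact multiple");
_Static_assert(CEIL_DIV(129, 64) == 3, "one extra element still needs a full tile");
_Static_assert(CEIL_DIV(1,   64) == 1, "anything non-zero needs at least one tile");
```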
@@ -395,6 +396,8 @@ struct ggml_backend_opencl_context {
395 | 396 | cl_program program_conv_2d_f16_f32;
396 | 397 | cl_program program_tsembd;
397 | 398 | cl_program program_mul_mv_id_q4_0_f32_8x_flat;
| 399 | + cl_program program_mul_mm_f32_f32_l4_lm;
| 400 | + cl_program program_mul_mm_f16_f32_l4_lm;
398 | 401 |
399 | 402 | cl_kernel kernel_add, kernel_add_row;
400 | 403 | cl_kernel kernel_mul, kernel_mul_row;
@@ -449,6 +452,8 @@ struct ggml_backend_opencl_context {
449 | 452 | cl_kernel kernel_conv_2d_f16_f32;
450 | 453 | cl_kernel kernel_timestep_embedding;
451 | 454 | cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
| 455 | + cl_kernel kernel_mul_mm_f32_f32_l4_lm;
| 456 | + cl_kernel kernel_mul_mm_f16_f32_l4_lm;
452 | 457 |
453 | 458 | std::vector<ProfilingInfo> profiling_info;
454 | 459 |
@@ -1039,6 +1044,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1039 | 1044 | GGML_LOG_CONT(".");
1040 | 1045 | }
1041 | 1046 |
| 1047 | + // mul_mm_f32_f32_l4_lm
| 1048 | + {
| 1049 | +#ifdef GGML_OPENCL_EMBED_KERNELS
| 1050 | + const std::string kernel_src {
| 1051 | + #include "mul_mm_f32_f32_l4_lm.cl.h"
| 1052 | + };
| 1053 | +#else
| 1054 | + const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
| 1055 | +#endif
| 1056 | + backend_ctx->program_mul_mm_f32_f32_l4_lm =
| 1057 | + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
| 1058 | +
| 1059 | + CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
| 1060 | + GGML_LOG_CONT(".");
| 1061 | + }
| 1062 | +
| 1063 | + // mul_mm_f16_f32_l4_lm
| 1064 | + {
| 1065 | +#ifdef GGML_OPENCL_EMBED_KERNELS
| 1066 | + const std::string kernel_src {
| 1067 | + #include "mul_mm_f16_f32_l4_lm.cl.h"
| 1068 | + };
| 1069 | +#else
| 1070 | + const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
| 1071 | +#endif
| 1072 | + backend_ctx->program_mul_mm_f16_f32_l4_lm =
| 1073 | + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
| 1074 | +
| 1075 | + CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
| 1076 | + GGML_LOG_CONT(".");
| 1077 | + }
| 1078 | +
1042 | 1079 | // mul
1043 | 1080 | {
1044 | 1081 | #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -5139,18 +5176,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5139 | 5176 |
5140 | 5177 | ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5141 | 5178 |
5142 | | - if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
5143 | | - src0->ne[1] > 32 && // M > 32
5144 | | - src1->ne[1] > 32 && // N > 32
5145 | | - src0->ne[0] > 32 && // K > 32
5146 | | - src0->ne[2] == 1 && src0->ne[3] == 1 &&
5147 | | - src1->ne[2] == 1 && src1->ne[3] == 1 &&
5148 | | - ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
5149 | | - backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
5150 | | - ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
5151 | | - return;
5152 | | - }
5153 | | -
5154 | 5179 | ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5155 | 5180 | ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5156 | 5181 | ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5497,6 +5522,101 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5497 | 5522 | } // if (ne01 && ne1)
5498 | 5523 | #endif // GGML_OPENCL_USE_ADRENO_KERNELS
5499 | 5524 |
| 5525 | + // GEMM using local memory
| 5526 | + // Current BK = 16, so ne00 % 16 == 0
| 5527 | + if (ggml_is_contiguous(src0) &&
| 5528 | + ggml_is_contiguous(src1) &&
| 5529 | + src1t == GGML_TYPE_F32 &&
| 5530 | + ne00 % 16 == 0 &&
| 5531 | + ne11 > 1) {
| 5532 | + switch(src0t) {
| 5533 | + case GGML_TYPE_F32: {
| 5534 | + kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
| 5535 | + nth0 = 128; // calculated as (BM*BN)/(TM*TN)
| 5536 | +
| 5537 | + int batch_stride_a = ne00*ne01;
| 5538 | + int batch_stride_b = ne10*ne11;
| 5539 | + int batch_stride_d = ne0*ne1;
| 5540 | +
| 5541 | + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
| 5542 | + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
| 5543 | + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
| 5544 | + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
| 5545 | + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
| 5546 | + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
| 5547 | + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
| 5548 | + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
| 5549 | + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
| 5550 | + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
| 5551 | + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
| 5552 | + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
| 5553 | + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
| 5554 | + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
| 5555 | + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
| 5556 | + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
| 5557 | + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
| 5558 | + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
| 5559 | + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
| 5560 | +
| 5561 | + // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
| 5562 | + size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
| 5563 | + size_t local_work_size[] = {(size_t)nth0, 1, 1};
| 5564 | +
| 5565 | + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
| 5566 | + return;
| 5567 | + }
| 5568 | + case GGML_TYPE_F16: {
| 5569 | + kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
| 5570 | + nth0 = 128; // calculated as (BM*BN)/(TM*TN)
| 5571 | +
| 5572 | + int batch_stride_a = ne00*ne01;
| 5573 | + int batch_stride_b = ne10*ne11;
| 5574 | + int batch_stride_d = ne0*ne1;
| 5575 | +
| 5576 | + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
| 5577 | + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
| 5578 | + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
| 5579 | + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
| 5580 | + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
| 5581 | + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
| 5582 | + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
| 5583 | + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
| 5584 | + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
| 5585 | + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
| 5586 | + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
| 5587 | + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
| 5588 | + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
| 5589 | + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
| 5590 | + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
| 5591 | + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
| 5592 | + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
| 5593 | + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
| 5594 | + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
| 5595 | +
| 5596 | + // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
| 5597 | + size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
| 5598 | + size_t local_work_size[] = {(size_t)nth0, 1, 1};
| 5599 | +
| 5600 | + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
| 5601 | + return;
| 5602 | + }
| 5603 | + default:
| 5604 | + break;
| 5605 | + }
| 5606 | + }
| 5607 | +
| 5608 | + if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
| 5609 | + src0->ne[1] > 32 && // M > 32
| 5610 | + src1->ne[1] > 32 && // N > 32
| 5611 | + src0->ne[0] > 32 && // K > 32
| 5612 | + src0->ne[2] == 1 && src0->ne[3] == 1 &&
| 5613 | + src1->ne[2] == 1 && src1->ne[3] == 1 &&
| 5614 | + ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
| 5615 | + backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
| 5616 | + ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
| 5617 | + return;
| 5618 | + }
| 5619 | +
5500 | 5620 | if (!ggml_is_transposed(src0) &&
5501 | 5621 | !ggml_is_transposed(src1) &&
5502 | 5622 | src1t == GGML_TYPE_F32 &&
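For context on the dispatch added above: with BM = BN = 64 and a 128-thread work-group (nth0 = (BM*BN)/(TM*TN)), each work-group produces one 64x64 tile of the ne01 x ne11 output, and dimension 2 of the NDRange covers the ne12*ne13 batches. Below is a minimal standalone sketch of that launch arithmetic only; the tensor extents are made up for illustration, while the real values come from the ggml tensors at runtime.

```c
#include <stdio.h>

#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

int main(void) {
    // Hypothetical sizes, chosen only to make the numbers concrete.
    const int ne01 = 512;             // M: rows of src0 / dst
    const int ne11 = 192;             // N: columns of src1 / dst
    const int ne12 = 4, ne13 = 1;     // batch dimensions
    const int nth0 = 128;             // work-group size = (BM*BN)/(TM*TN)

    size_t global_work_size[] = {
        (size_t)(CEIL_DIV(ne01, 64)*nth0),   // 8 tiles along M -> 1024 work-items
        (size_t)(CEIL_DIV(ne11, 64)),        // 3 tiles along N
        (size_t)(ne12*ne13)                  // 4 batches
    };
    size_t local_work_size[] = {(size_t)nth0, 1, 1};

    // 8 * 3 * 4 = 96 work-groups, each computing one 64x64 tile of dst.
    size_t groups = (global_work_size[0]/local_work_size[0])
                  * global_work_size[1] * global_work_size[2];
    printf("work-groups: %zu\n", groups);
    return 0;
}
```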