From 23b9e0572e2196c691762dcfe935733998a5decd Mon Sep 17 00:00:00 2001 From: Akarshan Date: Sat, 26 Jul 2025 11:55:55 +0530 Subject: [PATCH 1/7] SYCL: Add set_rows support for quantized types This commit adds support for GGML_OP_SET_ROWS operation for various quantized tensor types (Q8_0, Q5_1, Q5_0, Q4_1, Q4_0, IQ4_NL) and BF16 type in the SYCL backend. The quantization/dequantization copy kernels were moved from cpy.cpp to cpy.hpp to make them available for set_rows.cpp. This addresses part of the TODOs mentioned in the code. --- ggml/src/ggml-sycl/cpy.cpp | 212 ------------------------------ ggml/src/ggml-sycl/cpy.hpp | 216 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 15 ++- ggml/src/ggml-sycl/set_rows.cpp | 88 +++++++++++++ 4 files changed, 315 insertions(+), 216 deletions(-) diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 1ffd7f1226724..3d321b58ac6c9 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -1,31 +1,12 @@ #include "cpy.hpp" #include -#include #include "dequantize.hpp" #include "ggml-sycl/common.hpp" #include "ggml-sycl/presets.hpp" #include "ggml.h" -static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { - if (x <= val[0]) { - return 0; - } - if (x >= val[n - 1]) { - return n - 1; - } - int ml = 0, mu = n - 1; - while (mu - ml > 1) { - int mav = (ml + mu) / 2; - if (x < val[mav]) { - mu = mav; - } else { - ml = mav; - } - } - return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; -} static void cpy_1_f32_f32(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; @@ -97,28 +78,6 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne cpy_1(cx + x_offset, cdst + dst_offset); } -static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = sycl::fmax(amax, sycl::fabs((float) v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j] * id; - - dsti->qs[j] = sycl::round((float) x0); - } -} /* quantized type same copy */ template @@ -140,178 +99,7 @@ static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { } } -static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0 / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK4_0 / 2 + j] * id; - - const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) { - vmin = v; - } - if (v > vmax) { - vmax = v; - } - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f / d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = vmin; - - for (int j = 0; j < QK4_1 / 2; ++j) { - const float x0 = (xi[0 + j] - vmin) * id; - const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; - - const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_0 * dsti = (block_q5_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK5_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - const float d = vmax / -16; - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0 / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK5_0 / 2 + j] * id; - - const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f)); - const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f)); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_1 * dsti = (block_q5_1 *) cdsti; - - float min = xi[0]; - float max = xi[0]; - - for (int j = 1; j < QK5_1; ++j) { - const float v = xi[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f / d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1 / 2; ++j) { - const float x0 = (xi[0 + j] - min) * id; - const float x1 = (xi[QK5_1 / 2 + j] - min) * id; - - const uint8_t xi0 = (uint8_t) (x0 + 0.5f); - const uint8_t xi1 = (uint8_t) (x1 + 0.5f); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_iq4_nl * dsti = (block_iq4_nl *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_NL; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - float d = vmax / kvalues_iq4nl[0]; - const float id = d ? 1.0f / d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK4_NL / 2 + j] * id; - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); - dsti->qs[j] = xi0 | (xi1 << 4); - const float v0 = kvalues_iq4nl[xi0]; - const float v1 = kvalues_iq4nl[xi1]; - const float w0 = xi[0 + j] * xi[0 + j]; - const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j]; - sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j]; - sumq2 += w0 * v0 * v0 + w1 * v1 * v1; - } - - dsti->d = sumq2 > 0 ? sumqx / sumq2 : d; -} template static void cpy_blck_q_f32(const char * cxi, char * cdsti) { float * cdstf = (float *) (cdsti); diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index 0a0f561d2309a..d3be454e79e5a 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -2,9 +2,225 @@ #define GGML_SYCL_CPY_HPP #include "common.hpp" +#include +#include typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) { + return 0; + } + if (x >= val[n - 1]) { + return n - 1; + } + int ml = 0, mu = n - 1; + while (mu - ml > 1) { + int mav = (ml + mu) / 2; + if (x < val[mav]) { + mu = mav; + } else { + ml = mav; + } + } + return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; +} + + +inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float) v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j] * id; + + dsti->qs[j] = sycl::round((float) x0); + } +} + + +inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + vmin = sycl::min(v, vmin); + vmax = sycl::max(v, vmax); + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1 / 2; ++j) { + const float x0 = (xi[0 + j] - vmin) * id; + const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_0 * dsti = (block_q5_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK5_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -16; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK5_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f)); + const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f)); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + +inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_1 * dsti = (block_q5_1 *) cdsti; + + float min = xi[0]; + float max = xi[0]; + + for (int j = 1; j < QK5_1; ++j) { + const float v = xi[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1 / 2; ++j) { + const float x0 = (xi[0 + j] - min) * id; + const float x1 = (xi[QK5_1 / 2 + j] - min) * id; + + const uint8_t xi0 = (uint8_t) (x0 + 0.5f); + const uint8_t xi1 = (uint8_t) (x1 + 0.5f); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + + +inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_iq4_nl * dsti = (block_iq4_nl *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_NL; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + float d = vmax / kvalues_iq4nl[0]; + const float id = d ? 1.0f / d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_NL / 2 + j] * id; + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); + dsti->qs[j] = xi0 | (xi1 << 4); + const float v0 = kvalues_iq4nl[xi0]; + const float v1 = kvalues_iq4nl[xi1]; + const float w0 = xi[0 + j] * xi[0 + j]; + const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j]; + sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j]; + sumq2 += w0 * v0 * v0 + w1 * v1 * v1; + } + + dsti->d = sumq2 > 0 ? sumqx / sumq2 : d; +} + void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a023d6fb4525b..47189b462933a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4385,10 +4385,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_SET_ROWS: { - // TODO: add support - // ref: https://github.com/ggml-org/llama.cpp/pull/14274 -#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") - return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64)); + return (( + op->type == GGML_TYPE_F32 || + op->type == GGML_TYPE_F16 || + op->type == GGML_TYPE_BF16 || + op->type == GGML_TYPE_Q8_0 || + op->type == GGML_TYPE_Q5_1 || + op->type == GGML_TYPE_Q5_0 || + op->type == GGML_TYPE_Q4_1 || + op->type == GGML_TYPE_Q4_0 || + op->type == GGML_TYPE_IQ4_NL + ) && (op->src[1]->type == GGML_TYPE_I64)); } break; case GGML_OP_CPY: { diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 3091fab39958d..9107b6bc75928 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -1,4 +1,8 @@ #include "set_rows.hpp" +#include "ggml-sycl/common.hpp" +#include "cpy.hpp" +#include "ggml.h" +#include namespace utils { template @@ -15,6 +19,59 @@ convert (const char* src, char* dst) { *reinterpret_cast(dst) = dst_val; } + +template +static void set_rows_sycl_q( + const char * __restrict__ src0_d, + const int64_t * __restrict__ src1_d, + blockType * __restrict__ dst_d, + // tensor dimensions src0 and src1 + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, int64_t ne11, const int64_t ne12, const int64_t ne13, + // strides for src0 + const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03, + // strides for src1 + const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, + // strides for dst + const size_t nb1, const size_t nb2, const size_t nb3, + queue_ptr stream) { + const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk; + constexpr int block_size = 64; + const int64_t grid_size = ceil_div(total_blocks, block_size); + + sycl_parallel_for( + stream, + sycl::nd_range<1>(grid_size * block_size, block_size), + [=](sycl::nd_item<1> item_ct1) { + const int64_t i = item_ct1.get_global_id(0); + if (i >= total_blocks) return; + const int64_t i_base = i * qk; + const int64_t i03 = i_base / (ne00 * ne01 * ne02); + const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02); + const int64_t i02 = rem1 / (ne00 * ne01); + const int64_t rem2 = rem1 - i02 * ne00 * ne01; + const int64_t i01 = rem2 / ne00; + const int64_t i00 = rem2 - i01 * ne00; + const int64_t i12 = i03 % ne12; + const int64_t i11 = i02 % ne11; + const int64_t i10 = i01; + size_t src_offset = calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); + const char * src_block = src0_d + src_offset + i00 * sizeof(float); + size_t src1_offset = calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}); + const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)]; + size_t dst_offset = calculate_offset<3>({nb1, nb2, nb3}, {dst_row, i02, i03}) + (i00 / qk) * sizeof(blockType); + char * dst_block = reinterpret_cast(reinterpret_cast(dst_d) + dst_offset); + cpyblck(src_block, dst_block); + } + ); + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); + GGML_UNUSED(nb00); + GGML_UNUSED(nb13); + +} + + template static void k_set_rows( const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, @@ -124,6 +181,37 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { stream ); break; + case GGML_TYPE_BF16: + set_rows_sycl( + (const char *)src0->data, src1_dd, (char *)dst->data, + ne00, ne01, ne02, ne03, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + sizeof(float), sizeof(sycl::ext::oneapi::bfloat16), + stream + ); + break; + case GGML_TYPE_Q8_0: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_1: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_0: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_q5_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_1: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_q4_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_0: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_q4_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_IQ4_NL: + set_rows_sycl_q((const char *)src0->data, src1_dd, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + default: GGML_ABORT("Unsupported tensor type!"); break; From 0afd0733451ceddb7cb26b13b9d65ed0572da14a Mon Sep 17 00:00:00 2001 From: Akarshan Date: Sat, 26 Jul 2025 12:01:37 +0530 Subject: [PATCH 2/7] Use get_global_linear_id() instead ggml-ci --- ggml/src/ggml-sycl/set_rows.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 9107b6bc75928..b7f4932686413 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -43,7 +43,7 @@ static void set_rows_sycl_q( stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) { - const int64_t i = item_ct1.get_global_id(0); + const int64_t i = item_ct1.get_global_linear_id(); if (i >= total_blocks) return; const int64_t i_base = i * qk; const int64_t i03 = i_base / (ne00 * ne01 * ne02); From 9539e37c03467703cebf91baf396b0cf93fca419 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Sat, 26 Jul 2025 18:26:43 +0530 Subject: [PATCH 3/7] Fix formatting ggml-ci --- ggml/src/ggml-sycl/cpy.hpp | 6 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 18 ++---- ggml/src/ggml-sycl/set_rows.cpp | 97 +++++++++++++++++--------------- 3 files changed, 61 insertions(+), 60 deletions(-) diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index d3be454e79e5a..b0d574f940b3a 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -2,6 +2,7 @@ #define GGML_SYCL_CPY_HPP #include "common.hpp" + #include #include @@ -26,7 +27,6 @@ __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; } - inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q8_0 * dsti = (block_q8_0 *) cdsti; @@ -50,7 +50,6 @@ inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { } } - inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q4_0 * dsti = (block_q4_0 *) cdsti; @@ -184,7 +183,6 @@ inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { memcpy(dsti->qh, &qh, sizeof(qh)); } - inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_iq4_nl * dsti = (block_iq4_nl *) cdsti; @@ -224,4 +222,4 @@ inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_CPY_HPP +#endif // GGML_SYCL_CPY_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 47189b462933a..3935ce67de47d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4385,18 +4385,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_SET_ROWS: { - return (( - op->type == GGML_TYPE_F32 || - op->type == GGML_TYPE_F16 || - op->type == GGML_TYPE_BF16 || - op->type == GGML_TYPE_Q8_0 || - op->type == GGML_TYPE_Q5_1 || - op->type == GGML_TYPE_Q5_0 || - op->type == GGML_TYPE_Q4_1 || - op->type == GGML_TYPE_Q4_0 || - op->type == GGML_TYPE_IQ4_NL - ) && (op->src[1]->type == GGML_TYPE_I64)); - } break; + return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || + op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 || + op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) && + (op->src[1]->type == GGML_TYPE_I64)); + } + break; case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index b7f4932686413..acf18b8872a42 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -19,59 +19,68 @@ convert (const char* src, char* dst) { *reinterpret_cast(dst) = dst_val; } - -template -static void set_rows_sycl_q( - const char * __restrict__ src0_d, - const int64_t * __restrict__ src1_d, - blockType * __restrict__ dst_d, - // tensor dimensions src0 and src1 - const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, - const int64_t ne10, int64_t ne11, const int64_t ne12, const int64_t ne13, - // strides for src0 - const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03, - // strides for src1 - const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, - // strides for dst - const size_t nb1, const size_t nb2, const size_t nb3, - queue_ptr stream) { +template +static void set_rows_sycl_q(const char * __restrict__ src0_d, + const int64_t * __restrict__ src1_d, + blockType * __restrict__ dst_d, + // tensor dimensions src0 and src1 + const int64_t ne00, + const int64_t ne01, + const int64_t ne02, + const int64_t ne03, + const int64_t ne10, + int64_t ne11, + const int64_t ne12, + const int64_t ne13, + // strides for src0 + const size_t nb00, + const size_t nb01, + const size_t nb02, + const size_t nb03, + // strides for src1 + const size_t nb10, + const size_t nb11, + const size_t nb12, + const size_t nb13, + // strides for dst + const size_t nb1, + const size_t nb2, + const size_t nb3, + queue_ptr stream) { const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk; - constexpr int block_size = 64; - const int64_t grid_size = ceil_div(total_blocks, block_size); + constexpr int block_size = 64; + const int64_t grid_size = ceil_div(total_blocks, block_size); - sycl_parallel_for( - stream, - sycl::nd_range<1>(grid_size * block_size, block_size), - [=](sycl::nd_item<1> item_ct1) { - const int64_t i = item_ct1.get_global_linear_id(); - if (i >= total_blocks) return; - const int64_t i_base = i * qk; - const int64_t i03 = i_base / (ne00 * ne01 * ne02); - const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02); - const int64_t i02 = rem1 / (ne00 * ne01); - const int64_t rem2 = rem1 - i02 * ne00 * ne01; - const int64_t i01 = rem2 / ne00; - const int64_t i00 = rem2 - i01 * ne00; - const int64_t i12 = i03 % ne12; - const int64_t i11 = i02 % ne11; - const int64_t i10 = i01; - size_t src_offset = calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); - const char * src_block = src0_d + src_offset + i00 * sizeof(float); - size_t src1_offset = calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}); - const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)]; - size_t dst_offset = calculate_offset<3>({nb1, nb2, nb3}, {dst_row, i02, i03}) + (i00 / qk) * sizeof(blockType); - char * dst_block = reinterpret_cast(reinterpret_cast(dst_d) + dst_offset); - cpyblck(src_block, dst_block); + sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) { + const int64_t i = item_ct1.get_global_linear_id(); + if (i >= total_blocks) { + return; } - ); + const int64_t i_base = i * qk; + const int64_t i03 = i_base / (ne00 * ne01 * ne02); + const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02); + const int64_t i02 = rem1 / (ne00 * ne01); + const int64_t rem2 = rem1 - i02 * ne00 * ne01; + const int64_t i01 = rem2 / ne00; + const int64_t i00 = rem2 - i01 * ne00; + const int64_t i12 = i03 % ne12; + const int64_t i11 = i02 % ne11; + const int64_t i10 = i01; + size_t src_offset = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 }); + const char * src_block = src0_d + src_offset + i00 * sizeof(float); + size_t src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 }); + const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)]; + size_t dst_offset = + calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType); + char * dst_block = reinterpret_cast(reinterpret_cast(dst_d) + dst_offset); + cpyblck(src_block, dst_block); + }); GGML_UNUSED(ne10); GGML_UNUSED(ne13); GGML_UNUSED(nb00); GGML_UNUSED(nb13); - } - template static void k_set_rows( const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, From d1e09ef7d26c41271cf912432b81ec14a4e6abb8 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Sat, 26 Jul 2025 18:39:13 +0530 Subject: [PATCH 4/7] Use const for ne11 and size_t variables in set_rows_sycl_q ggml-ci --- ggml/src/ggml-sycl/set_rows.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index acf18b8872a42..e70c5d5588f27 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -29,7 +29,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d, const int64_t ne02, const int64_t ne03, const int64_t ne10, - int64_t ne11, + const int64_t ne11, const int64_t ne12, const int64_t ne13, // strides for src0 @@ -66,11 +66,11 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d, const int64_t i12 = i03 % ne12; const int64_t i11 = i02 % ne11; const int64_t i10 = i01; - size_t src_offset = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 }); + const size_t src_offset = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 }); const char * src_block = src0_d + src_offset + i00 * sizeof(float); - size_t src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 }); + const size_t src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 }); const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)]; - size_t dst_offset = + const size_t dst_offset = calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType); char * dst_block = reinterpret_cast(reinterpret_cast(dst_d) + dst_offset); cpyblck(src_block, dst_block); From 34b8f0251455e0616f12e239cb7fe1844395af3a Mon Sep 17 00:00:00 2001 From: Akarshan Date: Sun, 27 Jul 2025 11:56:13 +0530 Subject: [PATCH 5/7] Increase block size for q kernel to 256 ggml-ci --- ggml/src/ggml-sycl/set_rows.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index e70c5d5588f27..55f753eeb3dff 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -48,7 +48,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d, const size_t nb3, queue_ptr stream) { const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk; - constexpr int block_size = 64; + constexpr int block_size = 256; const int64_t grid_size = ceil_div(total_blocks, block_size); sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) { From e5818d47d23b059ca3747c0d6d30885c3ff75497 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Mon, 28 Jul 2025 17:56:47 +0530 Subject: [PATCH 6/7] Cleanup imports --- ggml/src/ggml-sycl/cpy.hpp | 3 --- ggml/src/ggml-sycl/set_rows.cpp | 3 --- 2 files changed, 6 deletions(-) diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index b0d574f940b3a..f81cb6f1fd3d7 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -3,9 +3,6 @@ #include "common.hpp" -#include -#include - typedef void (*cpy_kernel_t)(const char * cx, char * cdst); __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 55f753eeb3dff..7a8e1410b7040 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -1,8 +1,5 @@ #include "set_rows.hpp" -#include "ggml-sycl/common.hpp" #include "cpy.hpp" -#include "ggml.h" -#include namespace utils { template From 3ad45b1efad5fbaaf034039bb7b8e22ceae40935 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Mon, 28 Jul 2025 18:28:08 +0530 Subject: [PATCH 7/7] Add float.h to cpy.hpp --- ggml/src/ggml-sycl/cpy.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index f81cb6f1fd3d7..3c331f1ef27b9 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -2,6 +2,7 @@ #define GGML_SYCL_CPY_HPP #include "common.hpp" +#include typedef void (*cpy_kernel_t)(const char * cx, char * cdst);