From 23b9e0572e2196c691762dcfe935733998a5decd Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Sat, 26 Jul 2025 11:55:55 +0530
Subject: [PATCH 1/7] SYCL: Add set_rows support for quantized types

This commit adds support for GGML_OP_SET_ROWS operation for various
quantized tensor types (Q8_0, Q5_1, Q5_0, Q4_1, Q4_0, IQ4_NL) and BF16
type in the SYCL backend.

The quantization/dequantization copy kernels were moved from cpy.cpp
to cpy.hpp to make them available for set_rows.cpp.

This addresses part of the TODOs mentioned in the code.
---
 ggml/src/ggml-sycl/cpy.cpp       | 212 ------------------------------
 ggml/src/ggml-sycl/cpy.hpp       | 216 +++++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/ggml-sycl.cpp |  15 ++-
 ggml/src/ggml-sycl/set_rows.cpp  |  88 +++++++++++++
 4 files changed, 315 insertions(+), 216 deletions(-)
diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index 1ffd7f1226724..3d321b58ac6c9 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -1,31 +1,12 @@
 #include "cpy.hpp"
 
 #include <float.h>
-#include <string>
 
 #include "dequantize.hpp"
 #include "ggml-sycl/common.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 
-static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
-    if (x <= val[0]) {
-        return 0;
-    }
-    if (x >= val[n - 1]) {
-        return n - 1;
-    }
-    int ml = 0, mu = n - 1;
-    while (mu - ml > 1) {
-        int mav = (ml + mu) / 2;
-        if (x < val[mav]) {
-            mu = mav;
-        } else {
-            ml = mav;
-        }
-    }
-    return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
-}
 
 static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
     const float * xi   = (const float *) cxi;
@@ -97,28 +78,6 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi   = (const float *) cxi;
-    block_q8_0 *  dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f;  // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax          = sycl::fmax(amax, sycl::fabs((float) v));
-    }
-
-    const float d  = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f / d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j] * id;
-
-        dsti->qs[j] = sycl::round((float) x0);
-    }
-}
 
 /* quantized type same copy */
 template<typename T>
@@ -140,178 +99,7 @@ static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     }
 }
 
-static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi   = (const float *) cxi;
-    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float) v)) {
-            amax = sycl::fabs((float) v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f / d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0 / 2; ++j) {
-        const float x0 = xi[0 + j] * id;
-        const float x1 = xi[QK4_0 / 2 + j] * id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
-
-        dsti->qs[j] = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi   = (const float *) cxi;
-    block_q4_1 *  dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) {
-            vmin = v;
-        }
-        if (v > vmax) {
-            vmax = v;
-        }
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f / d : 0.0f;
-
-    dsti->dm.x() = d;
-    dsti->dm.y() = vmin;
-
-    for (int j = 0; j < QK4_1 / 2; ++j) {
-        const float x0 = (xi[0 + j] - vmin) * id;
-        const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
 
-        dsti->qs[j] = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
-    const float * xi   = (const float *) cxi;
-    block_q5_0 *  dsti = (block_q5_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK5_0; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float) v)) {
-            amax = sycl::fabs((float) v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -16;
-    const float id = d ? 1.0f / d : 0.0f;
-
-    dsti->d = d;
-
-    uint32_t qh = 0;
-    for (int j = 0; j < QK5_0 / 2; ++j) {
-        const float x0 = xi[0 + j] * id;
-        const float x1 = xi[QK5_0 / 2 + j] * id;
-
-        const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
-        const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
-
-        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
-    }
-    memcpy(dsti->qh, &qh, sizeof(qh));
-}
-
-static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
-    const float * xi   = (const float *) cxi;
-    block_q5_1 *  dsti = (block_q5_1 *) cdsti;
-
-    float min = xi[0];
-    float max = xi[0];
-
-    for (int j = 1; j < QK5_1; ++j) {
-        const float v = xi[j];
-        min           = v < min ? v : min;
-        max           = v > max ? v : max;
-    }
-
-    const float d  = (max - min) / 31;
-    const float id = d ? 1.0f / d : 0.0f;
-
-    dsti->dm.x() = d;
-    dsti->dm.y() = min;
-
-    uint32_t qh = 0;
-    for (int j = 0; j < QK5_1 / 2; ++j) {
-        const float x0 = (xi[0 + j] - min) * id;
-        const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
-
-        const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
-        const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
-
-        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
-    }
-    memcpy(dsti->qh, &qh, sizeof(qh));
-}
-
-static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
-    const float *  xi   = (const float *) cxi;
-    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_NL; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float) v)) {
-            amax = sycl::fabs((float) v);
-            vmax = v;
-        }
-    }
-
-    float       d  = vmax / kvalues_iq4nl[0];
-    const float id = d ? 1.0f / d : 0.0f;
-
-    float sumqx = 0, sumq2 = 0;
-    for (int j = 0; j < QK4_NL / 2; ++j) {
-        const float   x0  = xi[0 + j] * id;
-        const float   x1  = xi[QK4_NL / 2 + j] * id;
-        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
-        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
-        dsti->qs[j]       = xi0 | (xi1 << 4);
-        const float v0    = kvalues_iq4nl[xi0];
-        const float v1    = kvalues_iq4nl[xi1];
-        const float w0    = xi[0 + j] * xi[0 + j];
-        const float w1    = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
-        sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
-        sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
-    }
-
-    dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
-}
 
 template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *) (cdsti);
diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp
index 0a0f561d2309a..d3be454e79e5a 100644
--- a/ggml/src/ggml-sycl/cpy.hpp
+++ b/ggml/src/ggml-sycl/cpy.hpp
@@ -2,9 +2,225 @@
 #define GGML_SYCL_CPY_HPP
 
 #include "common.hpp"
+#include <algorithm>
+#include <cfloat>
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
+__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) {
+        return 0;
+    }
+    if (x >= val[n - 1]) {
+        return n - 1;
+    }
+    int ml = 0, mu = n - 1;
+    while (mu - ml > 1) {
+        int mav = (ml + mu) / 2;
+        if (x < val[mav]) {
+            mu = mav;
+        } else {
+            ml = mav;
+        }
+    }
+    return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
+}
+
+
+inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q8_0 *  dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f;  // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax          = sycl::fmax(amax, sycl::fabs((float) v));
+    }
+
+    const float d  = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j] * id;
+
+        dsti->qs[j] = sycl::round((float) x0);
+    }
+}
+
+
+inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float) v)) {
+            amax = sycl::fabs((float) v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0 / 2; ++j) {
+        const float x0 = xi[0 + j] * id;
+        const float x1 = xi[QK4_0 / 2 + j] * id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q4_1 *  dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        vmin = sycl::min(v, vmin);
+        vmax = sycl::max(v, vmax);
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1 / 2; ++j) {
+        const float x0 = (xi[0 + j] - vmin) * id;
+        const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q5_0 *  dsti = (block_q5_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK5_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float) v)) {
+            amax = sycl::fabs((float) v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->d = d;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_0 / 2; ++j) {
+        const float x0 = xi[0 + j] * id;
+        const float x1 = xi[QK5_0 / 2 + j] * id;
+
+        const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
+        const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
+
+        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q5_1 *  dsti = (block_q5_1 *) cdsti;
+
+    float min = xi[0];
+    float max = xi[0];
+
+    for (int j = 1; j < QK5_1; ++j) {
+        const float v = xi[j];
+        min           = v < min ? v : min;
+        max           = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = min;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_1 / 2; ++j) {
+        const float x0 = (xi[0 + j] - min) * id;
+        const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
+
+        const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
+        const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
+
+        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+
+inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+    const float *  xi   = (const float *) cxi;
+    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_NL; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float) v)) {
+            amax = sycl::fabs((float) v);
+            vmax = v;
+        }
+    }
+
+    float       d  = vmax / kvalues_iq4nl[0];
+    const float id = d ? 1.0f / d : 0.0f;
+
+    float sumqx = 0, sumq2 = 0;
+    for (int j = 0; j < QK4_NL / 2; ++j) {
+        const float   x0  = xi[0 + j] * id;
+        const float   x1  = xi[QK4_NL / 2 + j] * id;
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+        dsti->qs[j]       = xi0 | (xi1 << 4);
+        const float v0    = kvalues_iq4nl[xi0];
+        const float v1    = kvalues_iq4nl[xi1];
+        const float w0    = xi[0 + j] * xi[0 + j];
+        const float w1    = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
+        sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
+        sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
+    }
+
+    dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
+}
+
 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
 void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index a023d6fb4525b..47189b462933a 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4385,10 +4385,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             }
         case GGML_OP_SET_ROWS:
             {
-                // TODO: add support
-                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
-                return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
+                return ((
+                            op->type == GGML_TYPE_F32 ||
+                            op->type == GGML_TYPE_F16 ||
+                            op->type == GGML_TYPE_BF16 ||
+                            op->type == GGML_TYPE_Q8_0 ||
+                            op->type == GGML_TYPE_Q5_1 ||
+                            op->type == GGML_TYPE_Q5_0 ||
+                            op->type == GGML_TYPE_Q4_1 ||
+                            op->type == GGML_TYPE_Q4_0 ||
+                            op->type == GGML_TYPE_IQ4_NL
+                            ) && (op->src[1]->type == GGML_TYPE_I64));
             } break;
         case GGML_OP_CPY:
             {
diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index 3091fab39958d..9107b6bc75928 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -1,4 +1,8 @@
 #include "set_rows.hpp"
+#include "ggml-sycl/common.hpp"
+#include "cpy.hpp"
+#include "ggml.h"
+#include <cstddef>
 
 namespace utils {
 template<typename T>
@@ -15,6 +19,59 @@ convert (const char* src, char* dst) {
    *reinterpret_cast<TOut*>(dst) = dst_val;
 }
 
+
+template<typename blockType, int qk, cpy_kernel_t cpyblck>
+static void set_rows_sycl_q(
+        const char * __restrict__ src0_d,
+        const int64_t * __restrict__ src1_d,
+        blockType  * __restrict__ dst_d,
+        // tensor dimensions src0 and src1
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, int64_t ne11, const int64_t ne12, const int64_t ne13,
+        // strides for src0
+        const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
+        // strides for src1
+        const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13,
+        // strides for dst
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        queue_ptr stream) {
+    const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
+    constexpr int block_size = 64;
+    const int64_t grid_size = ceil_div(total_blocks, block_size);
+
+    sycl_parallel_for(
+            stream,
+            sycl::nd_range<1>(grid_size * block_size, block_size),
+            [=](sycl::nd_item<1> item_ct1) {
+            const int64_t i = item_ct1.get_global_id(0);
+            if (i >= total_blocks) return;
+            const int64_t i_base = i * qk;
+            const int64_t i03 = i_base / (ne00 * ne01 * ne02);
+            const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02);
+            const int64_t i02 = rem1 / (ne00 * ne01);
+            const int64_t rem2 = rem1 - i02 * ne00 * ne01;
+            const int64_t i01 = rem2 / ne00;
+            const int64_t i00 = rem2 - i01 * ne00;
+            const int64_t i12 = i03 % ne12;
+            const int64_t i11 = i02 % ne11;
+            const int64_t i10 = i01;
+            size_t src_offset = calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
+            const char * src_block = src0_d + src_offset + i00 * sizeof(float);
+            size_t src1_offset = calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12});
+            const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)];
+            size_t dst_offset = calculate_offset<3>({nb1, nb2, nb3}, {dst_row, i02, i03}) + (i00 / qk) * sizeof(blockType);
+            char * dst_block = reinterpret_cast<char*>(reinterpret_cast<char*>(dst_d) + dst_offset);
+            cpyblck(src_block, dst_block);
+        }
+    );
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+    GGML_UNUSED(nb00);
+    GGML_UNUSED(nb13);
+
+}
+
+
 template<typename TIn, typename TOut>
 static void k_set_rows(
         const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
@@ -124,6 +181,37 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
                 stream
             );
             break;
+        case GGML_TYPE_BF16:
+            set_rows_sycl<float, sycl::ext::oneapi::bfloat16>(
+                (const char *)src0->data, src1_dd, (char *)dst->data,
+                ne00, ne01, ne02, ne03,
+                ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                sizeof(float), sizeof(sycl::ext::oneapi::bfloat16),
+                stream
+            );
+            break;
+        case GGML_TYPE_Q8_0:
+            set_rows_sycl_q<block_q8_0, QK8_0, cpy_blck_f32_q8_0>((const char *)src0->data, src1_dd, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            set_rows_sycl_q<block_q5_1, QK5_1, cpy_blck_f32_q5_1>((const char *)src0->data, src1_dd, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            set_rows_sycl_q<block_q5_0, QK5_0, cpy_blck_f32_q5_0>((const char *)src0->data, src1_dd, (block_q5_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            set_rows_sycl_q<block_q4_1, QK4_1, cpy_blck_f32_q4_1>((const char *)src0->data, src1_dd, (block_q4_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            set_rows_sycl_q<block_q4_0, QK4_0, cpy_blck_f32_q4_0>((const char *)src0->data, src1_dd, (block_q4_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            set_rows_sycl_q<block_iq4_nl, QK4_NL, cpy_blck_f32_iq4_nl>((const char *)src0->data, src1_dd, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
+            break;
+
         default:
             GGML_ABORT("Unsupported tensor type!");
             break;

From 0afd0733451ceddb7cb26b13b9d65ed0572da14a Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Sat, 26 Jul 2025 12:01:37 +0530
Subject: [PATCH 2/7] Use get_global_linear_id() instead

ggml-ci
---
 ggml/src/ggml-sycl/set_rows.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index 9107b6bc75928..b7f4932686413 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -43,7 +43,7 @@ static void set_rows_sycl_q(
             stream,
             sycl::nd_range<1>(grid_size * block_size, block_size),
             [=](sycl::nd_item<1> item_ct1) {
-            const int64_t i = item_ct1.get_global_id(0);
+            const int64_t i = item_ct1.get_global_linear_id();
             if (i >= total_blocks) return;
             const int64_t i_base = i * qk;
             const int64_t i03 = i_base / (ne00 * ne01 * ne02);

From 9539e37c03467703cebf91baf396b0cf93fca419 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Sat, 26 Jul 2025 18:26:43 +0530
Subject: [PATCH 3/7] Fix formatting

ggml-ci
---
 ggml/src/ggml-sycl/cpy.hpp       |  6 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp | 18 ++----
 ggml/src/ggml-sycl/set_rows.cpp  | 97 +++++++++++++++++---------------
 3 files changed, 61 insertions(+), 60 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp
index d3be454e79e5a..b0d574f940b3a 100644
--- a/ggml/src/ggml-sycl/cpy.hpp
+++ b/ggml/src/ggml-sycl/cpy.hpp
@@ -2,6 +2,7 @@
 #define GGML_SYCL_CPY_HPP
 
 #include "common.hpp"
+
 #include <algorithm>
 #include <cfloat>
 
@@ -26,7 +27,6 @@ __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
     return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
 }
 
-
 inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     const float * xi   = (const float *) cxi;
     block_q8_0 *  dsti = (block_q8_0 *) cdsti;
@@ -50,7 +50,6 @@ inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
-
 inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
     const float * xi   = (const float *) cxi;
     block_q4_0 *  dsti = (block_q4_0 *) cdsti;
@@ -184,7 +183,6 @@ inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
     memcpy(dsti->qh, &qh, sizeof(qh));
 }
 
-
 inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     const float *  xi   = (const float *) cxi;
     block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
@@ -224,4 +222,4 @@ inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
 void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
-#endif // GGML_SYCL_CPY_HPP
+#endif  // GGML_SYCL_CPY_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 47189b462933a..3935ce67de47d 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4385,18 +4385,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             }
         case GGML_OP_SET_ROWS:
             {
-                return ((
-                            op->type == GGML_TYPE_F32 ||
-                            op->type == GGML_TYPE_F16 ||
-                            op->type == GGML_TYPE_BF16 ||
-                            op->type == GGML_TYPE_Q8_0 ||
-                            op->type == GGML_TYPE_Q5_1 ||
-                            op->type == GGML_TYPE_Q5_0 ||
-                            op->type == GGML_TYPE_Q4_1 ||
-                            op->type == GGML_TYPE_Q4_0 ||
-                            op->type == GGML_TYPE_IQ4_NL
-                            ) && (op->src[1]->type == GGML_TYPE_I64));
-            } break;
+                return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
+                         op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
+                         op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                        (op->src[1]->type == GGML_TYPE_I64));
+            }
+            break;
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;
diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index b7f4932686413..acf18b8872a42 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -19,59 +19,68 @@ convert (const char* src, char* dst) {
    *reinterpret_cast<TOut*>(dst) = dst_val;
 }
 
-
-template<typename blockType, int qk, cpy_kernel_t cpyblck>
-static void set_rows_sycl_q(
-        const char * __restrict__ src0_d,
-        const int64_t * __restrict__ src1_d,
-        blockType  * __restrict__ dst_d,
-        // tensor dimensions src0 and src1
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-        const int64_t ne10, int64_t ne11, const int64_t ne12, const int64_t ne13,
-        // strides for src0
-        const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
-        // strides for src1
-        const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13,
-        // strides for dst
-        const size_t nb1, const size_t nb2, const size_t nb3,
-        queue_ptr stream) {
+template <typename blockType, int qk, cpy_kernel_t cpyblck>
+static void set_rows_sycl_q(const char * __restrict__ src0_d,
+                            const int64_t * __restrict__ src1_d,
+                            blockType * __restrict__ dst_d,
+                            // tensor dimensions src0 and src1
+                            const int64_t ne00,
+                            const int64_t ne01,
+                            const int64_t ne02,
+                            const int64_t ne03,
+                            const int64_t ne10,
+                            int64_t       ne11,
+                            const int64_t ne12,
+                            const int64_t ne13,
+                            // strides for src0
+                            const size_t  nb00,
+                            const size_t  nb01,
+                            const size_t  nb02,
+                            const size_t  nb03,
+                            // strides for src1
+                            const size_t  nb10,
+                            const size_t  nb11,
+                            const size_t  nb12,
+                            const size_t  nb13,
+                            // strides for dst
+                            const size_t  nb1,
+                            const size_t  nb2,
+                            const size_t  nb3,
+                            queue_ptr     stream) {
     const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
-    constexpr int block_size = 64;
-    const int64_t grid_size = ceil_div(total_blocks, block_size);
+    constexpr int block_size   = 64;
+    const int64_t grid_size    = ceil_div(total_blocks, block_size);
 
-    sycl_parallel_for(
-            stream,
-            sycl::nd_range<1>(grid_size * block_size, block_size),
-            [=](sycl::nd_item<1> item_ct1) {
-            const int64_t i = item_ct1.get_global_linear_id();
-            if (i >= total_blocks) return;
-            const int64_t i_base = i * qk;
-            const int64_t i03 = i_base / (ne00 * ne01 * ne02);
-            const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02);
-            const int64_t i02 = rem1 / (ne00 * ne01);
-            const int64_t rem2 = rem1 - i02 * ne00 * ne01;
-            const int64_t i01 = rem2 / ne00;
-            const int64_t i00 = rem2 - i01 * ne00;
-            const int64_t i12 = i03 % ne12;
-            const int64_t i11 = i02 % ne11;
-            const int64_t i10 = i01;
-            size_t src_offset = calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
-            const char * src_block = src0_d + src_offset + i00 * sizeof(float);
-            size_t src1_offset = calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12});
-            const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)];
-            size_t dst_offset = calculate_offset<3>({nb1, nb2, nb3}, {dst_row, i02, i03}) + (i00 / qk) * sizeof(blockType);
-            char * dst_block = reinterpret_cast<char*>(reinterpret_cast<char*>(dst_d) + dst_offset);
-            cpyblck(src_block, dst_block);
+    sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
+        const int64_t i = item_ct1.get_global_linear_id();
+        if (i >= total_blocks) {
+            return;
         }
-    );
+        const int64_t i_base      = i * qk;
+        const int64_t i03         = i_base / (ne00 * ne01 * ne02);
+        const int64_t rem1        = i_base - i03 * (ne00 * ne01 * ne02);
+        const int64_t i02         = rem1 / (ne00 * ne01);
+        const int64_t rem2        = rem1 - i02 * ne00 * ne01;
+        const int64_t i01         = rem2 / ne00;
+        const int64_t i00         = rem2 - i01 * ne00;
+        const int64_t i12         = i03 % ne12;
+        const int64_t i11         = i02 % ne11;
+        const int64_t i10         = i01;
+        size_t        src_offset  = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
+        const char *  src_block   = src0_d + src_offset + i00 * sizeof(float);
+        size_t        src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
+        const int64_t dst_row     = src1_d[src1_offset / sizeof(int64_t)];
+        size_t        dst_offset =
+            calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType);
+        char * dst_block = reinterpret_cast<char *>(reinterpret_cast<char *>(dst_d) + dst_offset);
+        cpyblck(src_block, dst_block);
+    });
     GGML_UNUSED(ne10);
     GGML_UNUSED(ne13);
     GGML_UNUSED(nb00);
     GGML_UNUSED(nb13);
-
 }
 
-
 template<typename TIn, typename TOut>
 static void k_set_rows(
         const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,

From d1e09ef7d26c41271cf912432b81ec14a4e6abb8 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Sat, 26 Jul 2025 18:39:13 +0530
Subject: [PATCH 4/7] Use const for ne11 and size_t variables in
 set_rows_sycl_q

ggml-ci
---
 ggml/src/ggml-sycl/set_rows.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index acf18b8872a42..e70c5d5588f27 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -29,7 +29,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d,
                             const int64_t ne02,
                             const int64_t ne03,
                             const int64_t ne10,
-                            int64_t       ne11,
+                            const int64_t ne11,
                             const int64_t ne12,
                             const int64_t ne13,
                             // strides for src0
@@ -66,11 +66,11 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d,
         const int64_t i12         = i03 % ne12;
         const int64_t i11         = i02 % ne11;
         const int64_t i10         = i01;
-        size_t        src_offset  = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
+        const size_t  src_offset  = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
         const char *  src_block   = src0_d + src_offset + i00 * sizeof(float);
-        size_t        src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
+        const size_t  src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
         const int64_t dst_row     = src1_d[src1_offset / sizeof(int64_t)];
-        size_t        dst_offset =
+        const size_t  dst_offset =
             calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType);
         char * dst_block = reinterpret_cast<char *>(reinterpret_cast<char *>(dst_d) + dst_offset);
         cpyblck(src_block, dst_block);

From 34b8f0251455e0616f12e239cb7fe1844395af3a Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Sun, 27 Jul 2025 11:56:13 +0530
Subject: [PATCH 5/7] Increase block size for q kernel to 256

ggml-ci
---
 ggml/src/ggml-sycl/set_rows.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index e70c5d5588f27..55f753eeb3dff 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -48,7 +48,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d,
                             const size_t  nb3,
                             queue_ptr     stream) {
     const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
-    constexpr int block_size   = 64;
+    constexpr int block_size   = 256;
     const int64_t grid_size    = ceil_div(total_blocks, block_size);
 
     sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {

From e5818d47d23b059ca3747c0d6d30885c3ff75497 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Mon, 28 Jul 2025 17:56:47 +0530
Subject: [PATCH 6/7] Cleanup imports

---
 ggml/src/ggml-sycl/cpy.hpp      | 3 ---
 ggml/src/ggml-sycl/set_rows.cpp | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp
index b0d574f940b3a..f81cb6f1fd3d7 100644
--- a/ggml/src/ggml-sycl/cpy.hpp
+++ b/ggml/src/ggml-sycl/cpy.hpp
@@ -3,9 +3,6 @@
 
 #include "common.hpp"
 
-#include <algorithm>
-#include <cfloat>
-
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
 __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index 55f753eeb3dff..7a8e1410b7040 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -1,8 +1,5 @@
 #include "set_rows.hpp"
-#include "ggml-sycl/common.hpp"
 #include "cpy.hpp"
-#include "ggml.h"
-#include <cstddef>
 
 namespace utils {
 template<typename T>

From 3ad45b1efad5fbaaf034039bb7b8e22ceae40935 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Mon, 28 Jul 2025 18:28:08 +0530
Subject: [PATCH 7/7] Add float.h to cpy.hpp

---
 ggml/src/ggml-sycl/cpy.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp
index f81cb6f1fd3d7..3c331f1ef27b9 100644
--- a/ggml/src/ggml-sycl/cpy.hpp
+++ b/ggml/src/ggml-sycl/cpy.hpp
@@ -2,6 +2,7 @@
 #define GGML_SYCL_CPY_HPP
 
 #include "common.hpp"
+#include <float.h>
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);