@@ -4559,7 +4559,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
4559
4559
4560
4560
// heuristic to choose workgroup size
4561
4561
uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
4562
- if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA || ctx->device->vendor_id == VK_VENDOR_ID_INTEL ) {
4562
+ if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA) {
4563
4563
// Prefer larger workgroups when M is small, to spread the work out more
4564
4564
// and keep more SMs busy.
4565
4565
// q6_k seems to prefer small workgroup size even for "medium" values of M.
@@ -4575,7 +4575,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
4575
4575
}
4576
4576
4577
4577
if (b_type == GGML_TYPE_Q8_1) {
4578
- return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_SUBGROUP ][a_type][num_cols-1];
4578
+ return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg ][a_type][num_cols-1];
4579
4579
}
4580
4580
4581
4581
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1];
@@ -6036,7 +6036,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
6036
6036
}
6037
6037
}
6038
6038
if (quantize_y) {
6039
- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
6039
+ if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
6040
+ ctx->prealloc_y_last_tensor_used != src1) {
6041
+ ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
6042
+ ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
6043
+ ctx->prealloc_y_last_tensor_used = src1;
6044
+ }
6040
6045
}
6041
6046
6042
6047
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
0 commit comments