@@ -2792,7 +2792,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_16 : (subgroup_size_16 * 4);
         uint32_t wg_size_subgroup   = (w == DMMV_WG_SIZE_SUBGROUP) ? device->subgroup_size : (device->subgroup_size * 4);

-        const bool s = device->subgroup_add && device->architecture != vk_device_architecture::AMD_GCN;
+        const bool s = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;

         for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[s], arr_dmmv_f32_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
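A note on the hunk above: the selector `s`, which indexes the subgroup-optimized dmmv shader variants, now checks `device->subgroup_arithmetic` instead of `device->subgroup_add`, gating those variants on the full subgroup arithmetic feature rather than on add support alone (the AMD GCN exclusion is unchanged). As a hedged sketch, not the file's actual initialization code, a flag like this is typically derived from `VkPhysicalDeviceSubgroupProperties`:

    // Sketch only: how a subgroup_arithmetic-style capability flag is commonly
    // derived in Vulkan 1.1+. The function name is illustrative, not from ggml-vulkan.cpp.
    #include <vulkan/vulkan.h>

    bool supports_subgroup_arithmetic(VkPhysicalDevice phys_dev) {
        VkPhysicalDeviceSubgroupProperties subgroup_props = {};
        subgroup_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;

        VkPhysicalDeviceProperties2 props2 = {};
        props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
        props2.pNext = &subgroup_props;

        vkGetPhysicalDeviceProperties2(phys_dev, &props2);

        // Require arithmetic subgroup ops (subgroupAdd etc.) in compute shaders.
        return (subgroup_props.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT) &&
               (subgroup_props.supportedOperations & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT);
    }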
@@ -2843,8 +2843,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[s], arr_dmmv_iq4_nl_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[s], arr_dmmv_mxfp4_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
         }
+    }

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
+    for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
     if (device->integer_dot_product) {
         const uint32_t subgroup_size = (device->subgroup_size_control && device->vendor_id == VK_VENDOR_ID_INTEL) ? device->subgroup_min_size : device->subgroup_size;
         if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
@@ -2861,8 +2863,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_q8_1_f32_len, mul_mat_vec_q8_0_q8_1_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {subgroup_size, 1*rm_stdq, i+1}, 1, true);
         }
     }
-#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
     }
+#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT

     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
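A note on the two hunks above: the added `}` closes the preceding dmmv loop nest before the integer-dot section begins, and the added `for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i)` wraps the q8_1 pipeline creation so those pipelines (whose names embed `std::to_string(i+1)`) are built once per supported column count, like the other dmmv pipelines. Moving `#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT` past the closing brace lets that brace terminate the new loop; since the `for` and its brace now compile in or out together, the braces stay balanced whether or not the define is set.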
@@ -5622,7 +5624,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub

     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
-        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (quantize_y) {
+            y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
+        }
         const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
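On the new 144-byte rounding: ggml's `block_q8_1` holds 32 int8 quants plus a two-half `ds` field, 36 bytes per block, so 144 bytes is exactly four blocks (128 values). The dryrun now rounds the quantized-y size up to that granularity itself, which is why the `CEIL_DIV(y_sz_upd, 128) * 128` padding at the `prealloc_size_y` assignment becomes redundant in the next hunk. A minimal sketch of the arithmetic (assumption: `CEIL_DIV` is ggml's usual round-up macro):

    #include <cstdint>
    #include <cassert>

    // Round-up division as used above (assumed equivalent to ggml's CEIL_DIV macro).
    constexpr uint64_t ceil_div(uint64_t a, uint64_t b) { return (a + b - 1) / b; }

    int main() {
        constexpr uint64_t q8_1_block = 4 + 32;          // half2 ds + int8 qs[32] = 36 bytes
        constexpr uint64_t group4     = 4 * q8_1_block;  // 144 bytes = 4 blocks = 128 values
        static_assert(group4 == 144, "padding constant matches four q8_1 blocks");

        uint64_t y_sz_upd = 1000;                        // example: raw quantized-y size in bytes
        y_sz_upd = ceil_div(y_sz_upd, group4) * group4;  // rounds 1000 up to 1008 (7 * 144)
        assert(y_sz_upd == 1008 && y_sz_upd % 144 == 0);
    }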
@@ -5634,7 +5639,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             ctx->prealloc_size_x = x_sz_upd;
         }
         if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
-            ctx->prealloc_size_y = CEIL_DIV(y_sz_upd, 128) * 128;
+            ctx->prealloc_size_y = y_sz_upd;
         }
         if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
             ctx->prealloc_size_split_k = split_k_size;
@@ -5688,7 +5693,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         GGML_ASSERT(d_Y->size >= y_sz * ne12 * ne13);
     } else if (quantize_y) {
         d_Y = ctx->prealloc_y;
-        GGML_ASSERT(d_Y->size >= y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1));
+        GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 144) * 144);
     } else {
         d_Y = d_Qy;
         y_buf_offset = qy_buf_offset;
@@ -5730,10 +5735,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }

+    uint32_t y_sz_total = y_sz * ne12 * ne13;
+    if (quantize_y) {
+        y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
+    }
+
     // compute
     ggml_vk_matmul(
         ctx, subctx, pipeline,
-        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
+        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
        { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
        ne01, ne11, ne10,
        ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
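In the hunk above, the subbuffer range passed for `d_Y` switches from the raw `y_sz * ne12 * ne13` to the padded `y_sz_total`, matching both the `prealloc_size_y` reservation and the `GGML_ASSERT` in the earlier hunk; presumably the quantize-to-q8_1 pass writes whole 144-byte (four-block) groups, so the bound range has to include the padding. The mul_mat_vec path below repeats the same pattern.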
@@ -5844,7 +5854,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&

     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
-        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (quantize_y) {
+            y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
+        }
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
             (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
@@ -5854,7 +5867,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             ctx->prealloc_size_x = x_sz_upd;
         }
         if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
-            ctx->prealloc_size_y = CEIL_DIV(y_sz_upd, 128) * 128;
+            ctx->prealloc_size_y = y_sz_upd;
         }

     // Request descriptor sets
@@ -5899,7 +5912,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         d_Y = ctx->prealloc_y;
     } else if (quantize_y) {
         d_Y = ctx->prealloc_y;
-        GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 128) * 128);
+        GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 144) * 144);
     } else {
         d_Y = d_Qy;
         y_buf_offset = qy_buf_offset;
@@ -5946,6 +5959,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         groups_x = CEIL_DIV(groups_x, groups_z);
     }

+    // TODO: Clean up this whole sz * ne_2 * ne_3 thing, it hasn't been necessary for a long time
+    uint32_t y_sz_total = y_sz * ne12 * ne13;
+    if (quantize_y) {
+        y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
+    }
+
     // compute
     const vk_mat_vec_push_constants pc = {
         (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
@@ -5954,7 +5973,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz_total }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
         pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }