Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2026-05-04 16:14:06 +00:00)

Compare commits: b8962...0cc4m/vulk (3 commits: df488da9ac, 63f85ed27e, a4901b0477)
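
The three commits teach the Vulkan backend to repack GGML_TYPE_Q4_0 tensors at upload time: the interleaved block_q4_0 stream (a 2-byte fp16 delta followed by 16 quant bytes per 32-element block) is split into a contiguous plane of quant bytes followed by a plane of fp16 deltas that starts at a 256-byte-aligned offset. A new deltas_offset push constant is threaded through the mat-mat, mat-vec, and mul_mat_id pipelines below so that shaders compiled with A_TYPE_REPACKED can find the delta plane.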
@@ -62,6 +62,8 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #define YIELD()
 #endif
 
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
@@ -975,6 +977,7 @@ struct vk_mat_mat_push_constants {
     uint32_t k_split;
     uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
     uint32_t padded_N;
+    uint32_t deltas_offset;
 };
 
 #define MAT_VEC_FUSION_FLAGS_BIAS0 0x1
@@ -996,6 +999,7 @@ struct vk_mat_vec_push_constants {
     uint32_t ne12;
     uint32_t broadcast2;
     uint32_t broadcast3;
+    uint32_t deltas_offset;
 };
 
 struct vk_mat_vec_p021_push_constants {
@@ -1030,6 +1034,7 @@ struct vk_mat_mat_id_push_constants {
     uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
     uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
     uint32_t padded_N;
+    uint32_t deltas_offset;
 };
 struct vk_mat_vec_id_push_constants {
     uint32_t ncols;
@@ -1044,6 +1049,7 @@ struct vk_mat_vec_id_push_constants {
     uint32_t ne11;
     uint32_t expert_i1;
     uint32_t nbi1;
+    uint32_t deltas_offset;
 };
 
 struct vk_flash_attn_push_constants {
@@ -1942,7 +1948,7 @@ static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
 
 static uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t)
 {
-    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
+    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
 }
 
 template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
@@ -6411,6 +6417,8 @@ static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffe
     }
 }
 
+static size_t ggml_vk_repack_size_tensor(const ggml_tensor * tensor);
+
 static vk_subbuffer ggml_vk_tensor_subbuffer(
     const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false) {
 
@@ -6426,7 +6434,7 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
     }
     GGML_ASSERT(buffer != nullptr);
 
-    size_t size = ggml_nbytes(tensor);
+    size_t size = ggml_vk_repack_size_tensor(tensor);
 
     size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
     // The shader must support misaligned offsets when indexing into the buffer
@@ -6985,6 +6993,33 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
     ggml_vk_queue_command_pools_cleanup(dst->device);
 }
 
+constexpr uint32_t VULKAN_REPACK_ALIGNMENT = 256;
+
+static size_t ggml_vk_get_num_blocks(const ggml_tensor * tensor) {
+    const size_t num_blocks_per_row = tensor->ne[0] / ggml_blck_size(tensor->type);
+    return num_blocks_per_row * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
+}
+
+static size_t ggml_vk_repack_q4_0_delta_offset(size_t n_blocks) {
+    return GGML_PAD(n_blocks * 16, VULKAN_REPACK_ALIGNMENT);
+}
+
+static size_t ggml_vk_repack_q4_0_size(size_t n_blocks) {
+    return ggml_vk_repack_q4_0_delta_offset(n_blocks) + n_blocks * 2;
+}
+
+static size_t ggml_vk_repack_q4_0_delta_offset_tensor(const ggml_tensor * tensor) {
+    return ggml_vk_repack_q4_0_delta_offset(ggml_vk_get_num_blocks(tensor));
+}
+
+static size_t ggml_vk_repack_q4_0_size_tensor(const ggml_tensor * tensor) {
+    return ggml_vk_repack_q4_0_size(ggml_vk_get_num_blocks(tensor));
+}
+
+static size_t ggml_vk_repack_size_tensor(const ggml_tensor * tensor) {
+    return tensor->type == GGML_TYPE_Q4_0 ? ggml_vk_repack_q4_0_size_tensor(tensor) : ggml_nbytes(tensor);
+}
+
 static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, bool disable_split_k, const vk_pipeline& pipeline) {
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << disable_split_k << ")");
 
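
To make the helpers above concrete, here is a small host-side sketch (my illustration, not part of the patch) that mirrors ggml_vk_repack_q4_0_delta_offset and ggml_vk_repack_q4_0_size, assuming QK4_0 = 32, i.e. 16 quant bytes and one 2-byte fp16 delta per block:

    #include <cstddef>
    #include <cstdio>

    // GGML_PAD(x, n): round x up to the next multiple of n.
    static size_t pad(size_t x, size_t n) { return (x + n - 1) / n * n; }

    int main() {
        // A 4096 x 4096 Q4_0 tensor: 4096/32 = 128 blocks per row.
        const size_t n_blocks     = 128 * 4096;
        const size_t delta_offset = pad(n_blocks * 16, 256); // quants plane, padded
        const size_t total        = delta_offset + n_blocks * 2;

        // Shaders index data_a_deltas as float16_t elements, hence the "/ 2"
        // when this offset is loaded into the deltas_offset push constant.
        printf("quants: %zu B, deltas at %zu B, total %zu B (in-place: %zu B)\n",
               n_blocks * 16, delta_offset, total, n_blocks * 18);
        printf("pc.deltas_offset = %zu\n", delta_offset / 2);
    }

For this shape the quants plane is already a multiple of 256 bytes, so the repacked footprint (9437184 bytes) equals ggml_nbytes exactly; shapes whose quant plane is not 256-byte aligned pay a small padding cost, which is why the get_alloc_size hook below has to report the padded size.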
@@ -7079,7 +7114,7 @@ static void ggml_vk_matmul(
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
-        uint32_t padded_n) {
+        uint32_t padded_n, uint32_t deltas_offset) {
     VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
     if (split_k == 1) {
         ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
@@ -7088,7 +7123,7 @@ static void ggml_vk_matmul(
         while (base_work_group_z < batch) {
             uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
 
-            const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n };
+            const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n, deltas_offset };
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z });
             base_work_group_z += groups_z;
         }
@@ -7111,7 +7146,7 @@ static void ggml_vk_matmul(
         while (base_work_group_z < batch) {
             uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
 
-            const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
+            const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n, deltas_offset };
             // Make sure enough workgroups get assigned for split k to work
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z });
             base_work_group_z += groups_z;
@@ -7159,13 +7194,13 @@ static void ggml_vk_matmul_id(
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11,
-        uint32_t padded_n) {
+        uint32_t padded_n, uint32_t deltas_offset) {
     VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), expert_count: (" << expert_count_buf.buffer->buffer << ", " << expert_count_buf.offset << ", " << expert_count_buf.size << "), " <<
         "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
         "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
         "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
-                                              nei0, nei1, nbi1, ne11, padded_n };
+                                              nei0, nei1, nbi1, ne11, padded_n, deltas_offset };
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids, expert_count_buf }, pc, { m, nei1, n_as });
 }
 
@@ -7462,7 +7497,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
 
     const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, disable_split_k, pipeline);
 
-    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
+    const uint64_t qx_sz = ggml_vk_repack_size_tensor(src0);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
     const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
@@ -7610,6 +7645,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
+    const uint32_t deltas_offset = src0->type == GGML_TYPE_Q4_0 ? ggml_vk_repack_q4_0_delta_offset_tensor(src0) / 2 : 0;
+
     // compute
     ggml_vk_matmul(
         ctx, subctx, pipeline,
@@ -7617,7 +7654,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * split_k },
         ne01, ne11, ne10,
         ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d,
-        split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
+        split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n, deltas_offset
     );  // NOLINT
 
     if (x_non_contig || qx_needs_dequant) {
@@ -7784,7 +7821,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t x_ne = ggml_nelements(src0);
     const uint64_t y_ne = ggml_nelements(src1);
 
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_vk_repack_size_tensor(src0), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
     const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) :
                                        (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
@@ -7910,6 +7947,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1]));
 
+    const uint32_t deltas_offset = src0->type == GGML_TYPE_Q4_0 ? ggml_vk_repack_q4_0_delta_offset_tensor(src0) / 2 : 0;
+
     uint32_t base_work_group_y = 0;
     while (base_work_group_y < ne12 * ne13) {
 
@@ -7919,6 +7958,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             stride_batch_x, stride_batch_y, stride_batch_d,
             fusion_flags, base_work_group_y,
             (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
+            deltas_offset,
         };
         ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
             {
@@ -8293,7 +8333,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t y_ne = padded_n * ne10 * ne12 * ne13;
     const uint64_t d_ne = ggml_nelements(dst);
 
-    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
+    const uint64_t qx_sz = ggml_vk_repack_size_tensor(src0);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
     const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
@@ -8461,6 +8501,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
+    const uint32_t deltas_offset = src0->type == GGML_TYPE_Q4_0 ? ggml_vk_repack_q4_0_delta_offset_tensor(src0) / 2 : 0;
+
     // compute
     ggml_vk_matmul_id(
         ctx, subctx, pipeline,
@@ -8468,7 +8510,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         { d_D, d_buf_offset, d_sz }, { d_ids, ids_buf_offset, ids_sz }, expert_count_buf,
         ne01, ne21, ne10, ne10, ne10, ne01,
         stride_batch_x, stride_batch_y, ne20*ne21,
-        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n
+        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n, deltas_offset
     );  // NOLINT
 
     if (x_non_contig || qx_needs_dequant) {
@@ -8560,7 +8602,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t x_ne = ggml_nelements(src0);
     const uint64_t y_ne = ggml_nelements(src1);
 
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_vk_repack_size_tensor(src0), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
     const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) :
                                        (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
@@ -8684,13 +8726,16 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE1;
     }
 
+    const uint32_t deltas_offset = src0->type == GGML_TYPE_Q4_0 ? ggml_vk_repack_q4_0_delta_offset_tensor(src0) / 2 : 0;
+
     // Loop over the batch dimension
     for (uint32_t expert_i1 = 0; expert_i1 < nei1; ++expert_i1) {
         const vk_mat_vec_id_push_constants pc = {
             (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
             (uint32_t)(ne00 * ne01), stride_batch_y, (uint32_t)(ne20 * ne21),
             fusion_flags,
-            (uint32_t)nei0, (uint32_t)ne11, expert_i1, nbi1
+            (uint32_t)nei0, (uint32_t)ne11, expert_i1, nbi1,
+            deltas_offset,
         };
         ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
             {
@@ -12002,7 +12047,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k),
             m, n, k,
             k, k, m, k*m, k*n, m*n,
-            split_k, batch, batch, batch, 1, 1, n
+            split_k, batch, batch, batch, 1, 1, n, 0
         );
     }
     ggml_vk_ctx_end(subctx);
@@ -12479,7 +12524,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
             ctx, subctx, p, { qx_buf, 0, qx_sz }, { qy_buf, 0, qy_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k },
             m, n, k,
             k, k, m, k*m, k*n, m*n,
-            split_k, batch, batch, batch, 1, 1, n
+            split_k, batch, batch, batch, 1, 1, n, 0
        );
     }
    } else {
@@ -12488,7 +12533,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
            ctx, subctx, p, { qx_buf, 0, qx_sz }, { y_buf, 0, y_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k },
            m, n, k,
            k, k, m, k*m, k*n, m*n,
-            split_k, batch, batch, batch, 1, 1, n
+            split_k, batch, batch, batch, 1, 1, n, 0
        );
     }
 }
@@ -13443,6 +13488,27 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml
         return;
     }
 
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        const size_t repacked_size = ggml_vk_repack_q4_0_size_tensor(tensor);
+        const size_t deltas_offset = ggml_vk_repack_q4_0_delta_offset_tensor(tensor);
+
+        void * data_repacked = malloc(repacked_size);
+        uint8_t * quants = (uint8_t *)data_repacked;
+        ggml_fp16_t * deltas = (ggml_fp16_t *)((uint8_t *)data_repacked + deltas_offset);
+
+        const block_q4_0 * src = (const block_q4_0 *)data;
+
+        for (size_t i = 0; i < ggml_vk_get_num_blocks(tensor); i++) {
+            memcpy(quants + 16 * i, src[i].qs, 16);
+            deltas[i] = src[i].d;
+        }
+
+        ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data_repacked, repacked_size);
+
+        free(data_repacked);
+        return;
+    }
+
     ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
@@ -13456,6 +13522,27 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons
 
     vk_buffer buf = buf_ctx->dev_buffer;
 
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        const size_t repacked_size = ggml_vk_repack_q4_0_size_tensor(tensor);
+        const size_t deltas_offset = ggml_vk_repack_q4_0_delta_offset_tensor(tensor);
+
+        void * data_repacked = malloc(repacked_size);
+        uint8_t * quants = (uint8_t *)data_repacked;
+        ggml_fp16_t * deltas = (ggml_fp16_t *)((uint8_t *)data_repacked + deltas_offset);
+
+        ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data_repacked, repacked_size);
+
+        block_q4_0 * dst = (block_q4_0 *)data;
+
+        for (size_t i = 0; i < ggml_vk_get_num_blocks(tensor); i++) {
+            memcpy(dst[i].qs, quants + 16 * i, 16);
+            dst[i].d = deltas[i];
+        }
+
+        free(data_repacked);
+        return;
+    }
+
     ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
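
The set_tensor and get_tensor paths above are exact inverses. A standalone round-trip check (my sketch; block_q4_0_host is a hypothetical stand-in for ggml-common.h's block_q4_0, with the fp16 delta kept as raw bits):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct block_q4_0_host {    // 18 bytes, like block_q4_0
        uint16_t d;             // fp16 delta (raw bits)
        uint8_t  qs[16];        // 32 x 4-bit quants
    };

    int main() {
        const size_t n = 1000;  // 16000 quant bytes, padded to 16128
        std::vector<block_q4_0_host> src(n), back(n);
        for (size_t i = 0; i < n; i++) {
            src[i].d     = uint16_t(0x3C00 + i);
            src[i].qs[0] = uint8_t(i);
        }

        const size_t delta_off = (n * 16 + 255) / 256 * 256;  // GGML_PAD(n*16, 256)
        std::vector<uint8_t> repacked(delta_off + n * 2);
        uint8_t  * quants = repacked.data();
        uint16_t * deltas = reinterpret_cast<uint16_t *>(repacked.data() + delta_off);

        // set_tensor direction: split each block into the two planes ...
        for (size_t i = 0; i < n; i++) {
            memcpy(quants + 16 * i, src[i].qs, 16);
            deltas[i] = src[i].d;
        }
        // ... get_tensor direction: re-interleave into blocks.
        for (size_t i = 0; i < n; i++) {
            memcpy(back[i].qs, quants + 16 * i, 16);
            back[i].d = deltas[i];
        }

        assert(memcmp(src.data(), back.data(), n * sizeof(block_q4_0_host)) == 0);
    }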
@@ -13532,6 +13619,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        const size_t num_blocks_per_row = tensor->ne[0] / ggml_blck_size(tensor->type);
+
+        return ggml_vk_repack_q4_0_size(num_blocks_per_row * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]);
+    }
+
     return ggml_nbytes(tensor);
 
     UNUSED(buft);
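
This hook is what makes the allocator reserve the padded footprint: ggml-backend sizes device allocations through the buffer type's get_alloc_size callback (ggml_backend_buft_get_alloc_size) rather than through ggml_nbytes, so the padding between the quants and deltas planes is accounted for before set_tensor repacks into it.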
@@ -13658,6 +13751,28 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
     }
 
     ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        const size_t repacked_size = ggml_vk_repack_q4_0_size_tensor(tensor);
+        const size_t deltas_offset = ggml_vk_repack_q4_0_delta_offset_tensor(tensor);
+
+        void * data_repacked = malloc(repacked_size);
+        uint8_t * quants = (uint8_t *)data_repacked;
+        ggml_fp16_t * deltas = (ggml_fp16_t *)((uint8_t *)data_repacked + deltas_offset);
+
+        const block_q4_0 * src = (const block_q4_0 *)data;
+
+        for (size_t i = 0; i < ggml_vk_get_num_blocks(tensor); i++) {
+            memcpy(quants + 16 * i, src[i].qs, 16);
+            deltas[i] = src[i].d;
+        }
+
+        ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data_repacked, repacked_size);
+
+        free(data_repacked);
+        return;
+    }
+
     vk_context cpy_ctx;
 
@@ -13674,8 +13789,6 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
         cpy_ctx = ggml_vk_get_compute_ctx(ctx);
     }
 
-    vk_buffer buf = buf_ctx->dev_buffer;
-
     auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
 
     bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);
@@ -13705,11 +13818,31 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
     }
 
     ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        const size_t repacked_size = ggml_vk_repack_q4_0_size_tensor(tensor);
+        const size_t deltas_offset = ggml_vk_repack_q4_0_delta_offset_tensor(tensor);
+
+        void * data_repacked = malloc(repacked_size);
+        uint8_t * quants = (uint8_t *)data_repacked;
+        ggml_fp16_t * deltas = (ggml_fp16_t *)((uint8_t *)data_repacked + deltas_offset);
+
+        ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data_repacked, repacked_size);
+
+        block_q4_0 * dst = (block_q4_0 *)data;
+
+        for (size_t i = 0; i < ggml_vk_get_num_blocks(tensor); i++) {
+            memcpy(dst[i].qs, quants + 16 * i, 16);
+            dst[i].d = deltas[i];
+        }
+
+        free(data_repacked);
+        return;
+    }
+
     vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
 
-    vk_buffer buf = buf_ctx->dev_buffer;
-
     auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
     bool ret = ggml_vk_buffer_read_async(compute_ctx, buf, src_offset, data, size);
 
@@ -16220,6 +16353,25 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
                 for (int i = 2; i < GGML_MAX_DIMS; i++) {
                     srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
                 }
+            } else if (srci->type == GGML_TYPE_Q4_0) {
+                const size_t repacked_size = ggml_vk_repack_q4_0_size_tensor(srci);
+                const size_t deltas_offset = ggml_vk_repack_q4_0_delta_offset_tensor(srci);
+
+                void * data_repacked = malloc(repacked_size);
+                uint8_t * quants = (uint8_t *)data_repacked;
+                ggml_fp16_t * deltas = (ggml_fp16_t *)((uint8_t *)data_repacked + deltas_offset);
+
+                ggml_vk_buffer_read(buffer_gpu, offset, data_repacked, repacked_size);
+
+                block_q4_0 * dst = (block_q4_0 *)srci_clone->data;
+
+                for (size_t i = 0; i < ggml_vk_get_num_blocks(srci); i++) {
+                    memcpy(dst[i].qs, quants + 16 * i, 16);
+                    dst[i].d = deltas[i];
+                }
+
+                free(data_repacked);
+                memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
             } else {
                 if (offset + srci_size >= buffer_gpu->size) {
                     srci_size = buffer_gpu->size - offset;
 
@@ -23,6 +23,16 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 #endif
 
 #if defined(DATA_A_Q4_0)
+#if defined(A_TYPE_REPACKED)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a_quants[(a_offset + ib) * 16 + iqs]);
+    return (vec2(vui & 0xF, vui >> 4) - 8.0f);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a_quants16[(a_offset + ib) * 8 + iqs/2]);
+    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
+}
+#else
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
     return (vec2(vui & 0xF, vui >> 4) - 8.0f);
@@ -32,6 +42,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
     return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
 }
 #endif
+#endif
 
 #if defined(DATA_A_Q4_1)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
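
The repacked dequantize4 pulls four 4-bit fields out of one 16-bit load. A host-side check of that arithmetic (my sketch; note that in Q4_0 the low and high nibbles of a byte belong to different halves of the block, which the shader's iqs indexing accounts for):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint16_t vui = 0x4321;  // four 4-bit fields: 1, 2, 3, 4
        const float v0 = float( vui        & 0xF) - 8.0f;  // -7
        const float v1 = float((vui >>  4) & 0xF) - 8.0f;  // -6
        const float v2 = float((vui >>  8) & 0xF) - 8.0f;  // -5
        const float v3 = float( vui >> 12       ) - 8.0f;  // -4
        assert(v0 == -7.0f && v1 == -6.0f && v2 == -5.0f && v3 == -4.0f);
        // Multiplying by the block delta, fetched separately from the deltas
        // plane via get_dm below, completes the Q4_0 dequantization.
    }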
@@ -450,7 +461,11 @@ vec2 get_dm(uint ib, uint a_offset) {
 
 #if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
 vec2 get_dm(uint ib, uint a_offset) {
+#if defined(DATA_A_Q4_0) && defined(A_TYPE_REPACKED)
+    return vec2(float(data_a_deltas[a_offset + p.deltas_offset + ib]), 0);
+#else
     return vec2(float(data_a[a_offset + ib].d), 0);
+#endif
 }
 #endif
 
@@ -13,16 +13,28 @@ float16_t dequantFuncF32(const in decodeBufF32 bl, const in uint blockCoords[2],
     return vf16[idx];
 }
 
+#ifdef A_TYPE_REPACKED
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_0 {
+    uint16_t qs[8];
+};
+#else
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
     block_q4_0_packed16 block;
 };
+#endif
 
 float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
-    const float16_t d = bl.block.d;
     const uint idx = coordInBlock[1];
-    const uint shift = (idx & 0x10) >> 2;
+#ifdef A_TYPE_REPACKED
+    const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
+    const float16_t d = data_a_deltas[p.deltas_offset + ib];
+    uint32_t qs = uint32_t(bl.qs[(idx & 0xE) >> 1]);
+#else
+    const float16_t d = bl.block.d;
+    uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
+#endif
+    const uint shift = (idx & 0x10) >> 2;
     qs >>= shift;
     qs &= 0x0F0F;
     qs = unpack8(qs)[idx & 1];
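
Note the structural change in this coopmat2 decode path: with the repacked layout the delta can no longer be read through the buffer_reference block (decodeBufQ4_0 now holds only the 16 quant bytes), so dequantFuncQ4_0 rebuilds the absolute block index from blockCoords and reads the global data_a_deltas view instead. This is also why a later hunk promotes pos_a from a local variable in main() to a shader global: the decode callback needs the same batch/expert base offset that main() computes.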
@@ -38,6 +38,8 @@ layout (push_constant) uniform parameter
     uint broadcast2;
     uint broadcast3;
 #endif
+
+    uint deltas_offset;
 } p;
 
 #ifdef MUL_MAT_ID
@@ -15,6 +15,12 @@ layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16
 #if defined(A_TYPE_PACKED32)
 layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
 #endif
+#if defined(A_TYPE_REPACKED)
+layout (binding = 0) readonly buffer A_QUANTS {uint8_t data_a_quants[];};
+layout (binding = 0) readonly buffer A_QUANTS16 {uint16_t data_a_quants16[];};
+layout (binding = 0) readonly buffer A_QUANTS32 {uint32_t data_a_quants32[];};
+layout (binding = 0) readonly buffer A_DELTAS {float16_t data_a_deltas[];};
+#endif
 
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 #ifdef B_TYPE_VEC2
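
All four new declarations share binding 0: like the existing data_a / data_a_packed16 pair, they are aliased views of one buffer at different scalar widths, giving the shader 1-, 2-, and 4-byte loads from the quants plane plus float16_t loads from the deltas plane. Roughly the host-side picture (my sketch, illustration only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // Toy repacked buffer: 16 blocks -> 256 quant bytes, deltas right after.
        const size_t n_blocks = 16, delta_off_bytes = 256;
        std::vector<uint8_t> repacked(delta_off_bytes + n_blocks * 2, 0);

        const uint8_t  * data_a_quants = repacked.data();   // A_QUANTS view
        const uint16_t * data_a_deltas =                    // A_DELTAS view
            reinterpret_cast<const uint16_t *>(repacked.data());
        const size_t deltas_offset = delta_off_bytes / 2;   // in float16 elements

        printf("block 3: first quant byte %u, delta bits 0x%04x\n",
               data_a_quants[16 * 3], data_a_deltas[deltas_offset + 3]);
    }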
@@ -6,7 +6,11 @@
 
 #if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
 FLOAT_TYPE get_dm(uint ib) {
+#if defined(DATA_A_Q4_0) && defined(A_TYPE_REPACKED)
+    return FLOAT_TYPE(data_a_deltas[p.deltas_offset + ib]);
+#else
     return FLOAT_TYPE(data_a[ib].d);
+#endif
 }
 #endif
 
@@ -33,9 +37,13 @@ FLOAT_TYPE_VEC2 get_dm(uint ib) {
 #if defined(DATA_A_Q4_0)
 // 2-byte loads for Q4_0 blocks (18 bytes)
 i32vec2 repack(uint ib, uint iqs) {
+#if defined(DATA_A_Q4_0) && defined(A_TYPE_REPACKED)
+    const uint32_t vui = data_a_quants32[ib * 4 + iqs];
+#else
     const u16vec2 quants = u16vec2(data_a_packed16[ib].qs[iqs * 2    ],
                                    data_a_packed16[ib].qs[iqs * 2 + 1]);
     const uint32_t vui = pack32(quants);
+#endif
     return i32vec2( vui & 0x0F0F0F0F,
                    (vui >> 4) & 0x0F0F0F0F);
 }
 
@@ -62,6 +62,12 @@ layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16
 #if defined(A_TYPE_PACKED32)
 layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
 #endif
+#if defined(A_TYPE_REPACKED)
+layout (binding = 0) readonly buffer A_QUANTS {uint8_t data_a_quants[];};
+layout (binding = 0) readonly buffer A_QUANTS16 {uint16_t data_a_quants16[];};
+layout (binding = 0) readonly buffer A_QUANTS32 {uint32_t data_a_quants32[];};
+layout (binding = 0) readonly buffer A_DELTAS {float16_t data_a_deltas[];};
+#endif
 
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
@@ -98,6 +104,9 @@ layout (push_constant) uniform parameter
     uint broadcast2;
     uint broadcast3;
 #endif
+
+    uint padded_N;
+    uint deltas_offset;
 } p;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 64;
@@ -63,13 +63,22 @@ layout (push_constant) uniform parameter
 #endif
     // N dimension for the B matrix can be >= p.N
     uint padded_N;
+    uint deltas_offset;
 } p;
 
 
+#ifdef A_TYPE_REPACKED
+struct block_q4_0_quants { uint16_t qs[8]; };
+layout (binding = 0) readonly buffer A {block_q4_0_quants data_a[];};
+layout (binding = 0) readonly buffer A_DELTAS {float16_t data_a_deltas[];};
+#else
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+#endif
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
+uint pos_a;
+
 #if QUANT_K > 1
 #define DECODEFUNCA , dequantFuncA
 
@@ -254,10 +263,10 @@ void main() {
 #endif
 
 #ifdef MUL_MAT_ID
-    uint pos_a = expert_idx * (p.batch_stride_a / QUANT_K);
+    pos_a = expert_idx * (p.batch_stride_a / QUANT_K);
     uint pos_b = 0;
 #else
-    uint pos_a = batch_idx_a * (p.batch_stride_a / QUANT_K);
+    pos_a = batch_idx_a * (p.batch_stride_a / QUANT_K);
     uint pos_b = batch_idx * p.batch_stride_b;
     uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
 #endif
 
@@ -52,8 +52,13 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
         const uint ib = idx / 4;
         const uint iqs = idx & 0x03;
 
+#if defined(A_TYPE_REPACKED)
+        const float d = float(data_a_deltas[p.deltas_offset + ib]);
+        const uint vui = data_a_quants32[ib * 4 + iqs];
+#else
         const float d = float(data_a_packed16[ib].d);
         const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
+#endif
         const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
         const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
 
@@ -30,6 +30,13 @@ layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16
 #if defined(A_TYPE_PACKED32)
 layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
 #endif
+#if defined(A_TYPE_REPACKED)
+layout (binding = 0) readonly buffer A_QUANTS {uint8_t data_a_quants[];};
+layout (binding = 0) readonly buffer A_QUANTS16 {uint16_t data_a_quants16[];};
+layout (binding = 0) readonly buffer A_QUANTS32 {uint32_t data_a_quants32[];};
+layout (binding = 0) readonly buffer A_DELTAS {float16_t data_a_deltas[];};
+#endif
+
 layout (binding = 1) readonly buffer B {block_q8_1_x4_packed128 data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
@@ -65,6 +72,9 @@ layout (push_constant) uniform parameter
     uint broadcast2;
     uint broadcast3;
 #endif
+
+    uint padded_N;
+    uint deltas_offset;
 } p;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 64;
@@ -11,11 +11,19 @@
 // 4-byte loads for Q4_1 blocks (20 bytes)
 void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
 #ifdef DATA_A_Q4_0
+#if defined(A_TYPE_REPACKED)
+    buf_a[buf_ib].qs[iqs] = data_a_quants32[ib * 4 + iqs];
+#else
     buf_a[buf_ib].qs[iqs] = pack32(u16vec2(data_a_packed16[ib].qs[iqs * 2],
                                            data_a_packed16[ib].qs[iqs * 2 + 1]));
+#endif
 
     if (iqs == 0) {
+#if defined(A_TYPE_REPACKED)
+        buf_a[buf_ib].dm = FLOAT_TYPE(data_a_deltas[p.deltas_offset + ib]);
+#else
         buf_a[buf_ib].dm = FLOAT_TYPE(data_a_packed16[ib].d);
+#endif
     }
 #else // DATA_A_Q4_1
     buf_a[buf_ib].qs[iqs] = data_a_packed32[ib].qs[iqs];
 
@@ -561,6 +561,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
             continue;
         }
 
+        std::map<std::string, std::string> mm_base_dict = base_dict;
+        if (tname == "q4_0") {
+            mm_base_dict["A_TYPE_REPACKED"] = "1";
+        }
+
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
         std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? "1" : load_vec_quant;
@@ -576,19 +581,19 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
 
         // don't generate f32 variants for coopmat2
         if (!coopmat2) {
-            string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
-            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(merge_maps(mm_base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(merge_maps(mm_base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
         }
 
         if (tname != "f16" && tname != "f32") {
-            string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
-            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(merge_maps(mm_base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(merge_maps(mm_base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
         }
 
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
         // Integer dot mmq performs better with f32 accumulators
         if (!f16acc && !coopmat && !coopmat2 && (is_legacy_quant(tname) || is_k_quant(tname) || tname == "mxfp4")) {
-            string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(mm_base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
         }
 #endif
     }
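
Only the q4_0 variants are compiled with A_TYPE_REPACKED, so every other type keeps the interleaved block layout. A minimal stand-in for the define plumbing (my sketch; assumption: merge_maps in vulkan-shaders-gen.cpp combines define maps roughly like this, and its exact precedence may differ):

    #include <cstdio>
    #include <map>
    #include <string>

    using dict_t = std::map<std::string, std::string>;

    static dict_t merge_maps(const dict_t & a, const dict_t & b) {
        dict_t out = a;
        out.insert(b.begin(), b.end());  // with std::map::insert, keys in a win
        return out;
    }

    int main() {
        const std::string tname = "q4_0";
        dict_t mmv_base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
        if (tname == "q4_0") {
            mmv_base_dict["A_TYPE_REPACKED"] = "1";  // only for q4_0
        }
        const dict_t defs = merge_maps(mmv_base_dict, {{"DATA_A_Q4_0", "1"}, {"B_TYPE", "float"}});
        for (const auto & kv : defs) {
            printf("-D%s=%s\n", kv.first.c_str(), kv.second.c_str());
        }
    }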
@@ -678,33 +683,38 @@ void process_shaders() {
     std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
 
     for (const auto& tname : type_names) {
+        std::map<std::string, std::string> mmv_base_dict = base_dict;
+        if (tname == "q4_0") {
+            mmv_base_dict["A_TYPE_REPACKED"] = "1";
+        }
+
         // mul mat vec
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
 
-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
 
-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup_no_shmem", shader, merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
 
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32", shader, merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup", shader, merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
 
         // mul mat vec with integer dot product
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
         if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname) || tname == "iq1_s" || tname == "iq1_m") {
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
 
-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(mmv_base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
         }
 #endif