Mirror of https://github.com/ggml-org/llama.cpp.git
opencl: add q4_0 MoE GEMM for Adreno (#22731)
* Q4_0 MoE CLC pass sanity check
* release program
* opencl: fix whitespace
* opencl: remove unused cl_program
* opencl: break #if block to make it more clear
* opencl: adjust format

---------

Co-authored-by: Li He <lih@qti.qualcomm.com>
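For orientation before the diff: ggml's Q4_0 format stores 32 weights per block as one fp16 scale plus 16 bytes of packed 4-bit values, and the kernels below repeatedly split those bytes into low and high nibbles. A minimal scalar reference sketch, assuming the standard ggml layout (fp16 decode elided; ggml_half shown as raw uint16_t):

    #include <stdint.h>

    #define QK4_0 32

    // Standard ggml Q4_0 block: one fp16 scale + 32 packed 4-bit weights.
    typedef struct {
        uint16_t d;               // fp16 scale, raw bits (decode omitted here)
        uint8_t  qs[QK4_0 / 2];   // low nibbles = weights 0..15, high nibbles = weights 16..31
    } block_q4_0;

    // Scalar dequantization: each 4-bit value is biased by 8, then scaled.
    static void dequant_q4_0(const block_q4_0 *b, float d, float out[QK4_0]) {
        for (int j = 0; j < QK4_0 / 2; ++j) {
            out[j]             = (float)((b->qs[j] & 0x0F) - 8) * d; // low nibble
            out[j + QK4_0 / 2] = (float)((b->qs[j] >> 4)   - 8) * d; // high nibble
        }
    }

    int main(void) {
        block_q4_0 b = {0};
        b.qs[0] = 0x2A;              // weight 0 = 0xA, weight 16 = 0x2
        float out[QK4_0];
        dequant_q4_0(&b, 0.5f, out); // out[0] = (10-8)*0.5 = 1.0, out[16] = (2-8)*0.5 = -3.0
        return (out[0] == 1.0f && out[16] == -3.0f) ? 0 : 1;
    }

The Adreno kernels in this change call this in-memory order "shuffled" and prefer the weights unshuffled into natural order, which is what the new trans4_ns convert/restore kernels below do.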
@@ -102,6 +102,8 @@ set(GGML_OPENCL_KERNELS
    mul_mv_id_q8_0_f32_flat
    mul_mv_id_mxfp4_f32
    mul_mv_id_mxfp4_f32_flat
    gemm_moe_q4_0_f32_ns
    gemv_moe_q4_0_f32_ns
    gemm_moe_mxfp4_f32
    gemv_moe_mxfp4_f32
    gemm_moe_mxfp4_f32_ns
@@ -542,6 +542,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mm_f16_f32_kq;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
    cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
    cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
@@ -600,6 +601,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
    cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
    cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
    cl_kernel kernel_moe_reorder_b;
@@ -950,6 +952,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
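A note on the CL_CHECK idiom above: clCreateKernel returns the kernel object and reports status through its last argument, so each call wraps assignment and error code in one comma expression — the inner expression evaluates to err, which is what the macro actually checks. A minimal self-contained sketch of the same pattern (CHECK and make_thing are illustrative stand-ins, not the backend's actual macro):

    #include <stdio.h>
    #include <stdlib.h>

    // Illustrative stand-in for CL_CHECK: aborts on a nonzero status code.
    #define CHECK(expr)                                 \
        do {                                            \
            int status_ = (expr);                       \
            if (status_ != 0) {                         \
                fprintf(stderr, "error %d\n", status_); \
                exit(1);                                \
            }                                           \
        } while (0)

    // Illustrative stand-in for clCreateKernel: returns a value, reports status via out-param.
    static int make_thing(int *status) { *status = 0; return 42; }

    int main(void) {
        int err;
        int thing;
        // Comma expression: run the assignment first, then hand `err` to CHECK.
        CHECK((thing = make_thing(&err), err));
        printf("thing = %d\n", thing);
        return 0;
    }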
@@ -2884,6 +2888,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

    // gemv_moe_q4_0_f32_ns
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemv_moe_q4_0_f32_ns.cl.h"
        };
#else
        const std::string kernel_src = read_file("gemv_moe_q4_0_f32_ns.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);

        CL_CHECK((backend_ctx->kernel_gemv_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_0_f32_ns", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }

    // gemm_moe_q4_0_f32_ns
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemm_moe_q4_0_f32_ns.cl.h"
        };
#else
        const std::string kernel_src = read_file("gemm_moe_q4_0_f32_ns.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);

        CL_CHECK((backend_ctx->kernel_gemm_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_0_f32_ns", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }

    // gemv_moe_mxfp4_f32_ns
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3657,11 +3695,14 @@ struct ggml_tensor_extra_cl_q4_0 {
        CL_CHECK(clReleaseMemObject(d));
        d = nullptr;
    }
    if (q_img != nullptr) {
        CL_CHECK(clReleaseMemObject(q_img));
        q_img = nullptr;
    }
    // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
    // enabled. They point to the images in ggml_backend_opencl_buffer_context.
    // So, there is no need to release them here.
    // TODO: initialize them for non SMALL_PATH path, or remove them.
    q_img = nullptr;
    d_img = nullptr;
    size_q = 0;
    size_d = 0;
@@ -4926,17 +4967,53 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

        //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        // Adreno moe q4_0 kernel needs special transpose and unshuffling
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0_trans4_ns;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clReleaseMemObject(data_device));

            // Create image for Q
            cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32};
            cl_image_desc img_desc_q = {
                CL_MEM_OBJECT_IMAGE1D_BUFFER,
                static_cast<size_t>(ggml_nelements(tensor) / 8),
                0, 0, 0, 0, 0, 0, 0,
                { extra->q }
            };
            extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
            tensor->extra = extra;

            return;
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;

        // The optimized kernels need weights in natural order, so unshuffle.
        if (use_adreno_kernels(backend_ctx, tensor)) {
            kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
        }
#else
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
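The global work size above pads ne01 up to a multiple of the 64-wide workgroup: ((ne01 + 63) / 64) * 64 is the usual integer round-up, and the out-of-range fibers are cut off by the boundary checks inside the kernels. A quick standalone check of that arithmetic:

    #include <assert.h>

    // Integer round-up of n to a multiple of m, as used for the NDRange sizes.
    static int round_up(int n, int m) {
        return ((n + m - 1) / m) * m;
    }

    int main(void) {
        assert(round_up(64, 64) == 64);   // already aligned: unchanged
        assert(round_up(65, 64) == 128);  // one extra workgroup of padding
        assert(round_up(100, 64) == 128); // ne01 = 100 launches 128 fibers along dim 0
        return 0;
    }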
@@ -4952,7 +5029,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        tensor->extra = extra;

        // transpose the weights and scales
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        // Only do transpose for large, non batched matrix
        // TODO: use preallocated images instead of sub-buffer then image
        if (use_adreno_kernels(backend_ctx, tensor)) {
@@ -4966,10 +5043,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            // Transpose d as ushort
            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

        return;
    }
    if (tensor->type == GGML_TYPE_Q4_1) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
@@ -5689,6 +5764,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_int err;
            cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_trans4_ns;

            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
                ggml_nbytes(tensor), NULL, &err);
            CL_CHECK(err);

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
        if (use_adreno_kernels(backend_ctx, tensor)) {
            ggml_cl_buffer buf_trans_q;
            ggml_cl_buffer buf_trans_d;
@@ -12811,6 +12916,179 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
    // subgroup mat vec
    switch (src0->type) {
        case GGML_TYPE_Q4_0: {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
            if (use_adreno_moe_kernels(backend_ctx, src0)) {
                cl_int status;

                size_t local_size[3] = {64, 2, 1};
                size_t global_size[3] = {64, 2, 1};

                if (ne12 == 1) { // for gemv
                    kernel = backend_ctx->kernel_gemv_moe_q4_0_f32_ns;

                    cl_mem src1_sub_buffer, buf_src1_image, buf_src2;

                    // create a sub_buffer for src2
                    cl_buffer_region region;
                    region.origin = offset2;
                    region.size = ne20 * ne21 * sizeof(int);
                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    // set thread grid
                    global_size[0] = static_cast<size_t>(ne01);
                    global_size[1] = 4;
                    global_size[2] = static_cast<size_t>(ne20);
                    local_size[1] = 4;

                    // create a sub_buffer for src1
                    region.origin = offset1;
                    region.size = ne10 * ne11 * ne12 * sizeof(float);
                    src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    // create image for src1
                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
                    buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
                    CL_CHECK(status);

                    // Set kernel args
                    int arg_idx = 0;
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));

                    // launch kernel
                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);

                    // deallocate sub buffers and images
                    CL_CHECK(clReleaseMemObject(src1_sub_buffer));
                    CL_CHECK(clReleaseMemObject(buf_src1_image));
                    CL_CHECK(clReleaseMemObject(buf_src2));

                } else { // for gemm
                    kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns;

                    // Reorder router if called from test-backend-ops or when new router is generated.
                    // Otherwise reuse the reordered result from previous mul_mat_id call.
                    if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
                        moe_router_reoerder(backend, src2, ne20);
                        backend_ctx->toggle_reorder = false;
                    }

                    cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
                    cl_mem buf_src2, buf_src2_emap;

                    cl_buffer_region region;
                    region.origin = 0;
                    region.size = sizeof(int) * max_post_router_tile * n_tile_size;
                    buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    region.origin = 0;
                    region.size = sizeof(short) * max_post_router_tile;
                    buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    // Reorder activations
                    // create a sub_buffer for src1
                    region.origin = offset1;
                    region.size = ne10 * ne11 * ne12 * sizeof(float);
                    sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    // Create image for reordered src1
                    // Use pre-allocated placeholder
                    region.origin = 0;
                    region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
                    backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
                    buf_src1_reordered = clCreateSubBuffer(
                        backend_ctx->prealloc_act_trans.buffer,
                        0,
                        CL_BUFFER_CREATE_TYPE_REGION,
                        &region,
                        &status);
                    CL_CHECK(status);
                    cl_image_format image_format_buf_src1;
                    cl_image_desc image_desc_buf_src1;
                    image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
                    image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
                    image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
                    CL_CHECK(status);

                    unsigned short map_ratio = ne20 / ne11;
                    GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio));
                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size));

                    size_t reorder_b_local_size[3] = {256, 1, 1};
                    size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};

                    // Dispatch reorder kernel
                    backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);

                    // MoE kernel prepare
                    // Create sub buffer for dst
                    region.origin = offsetd;
                    region.size = ne0 * ne1 * ne2 * sizeof(float);
                    sub_buf_dst = clCreateSubBuffer(
                        extrad->data_device,
                        0,
                        CL_BUFFER_CREATE_TYPE_REGION,
                        &region,
                        &status);
                    CL_CHECK(status);
                    // Create image for dst
                    cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
                    cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
                    buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
                    CL_CHECK(status);

                    // Set kernel args
                    int arg_idx = 0;
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q_img));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));

                    // set thread grid
                    global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
                    global_size[2] = static_cast<size_t>(max_post_router_tile);
                    local_size[1] = 1;
                    local_size[2] = 1;

                    // Dispatch kernel
                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);

                    clReleaseMemObject(sub_buf_src1_pre);
                    clReleaseMemObject(buf_src1_reordered);
                    clReleaseMemObject(image_src1_reordered);
                    clReleaseMemObject(buf_src2);
                    clReleaseMemObject(buf_src2_emap);
                    clReleaseMemObject(sub_buf_dst);
                    clReleaseMemObject(buf_dst_image);
                }
                return;
            } // fallback to generic Q4_0 MoE kernel
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;

            if (backend_ctx->gpu_family == INTEL) {
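For intuition, the router reorder that the gemm path above depends on can be modeled on the host: group the routed output slots by expert into fixed-size tiles, record one expert id per tile (the emap), and pad unfilled slots with 0xFFFFFFFF, which the gemm kernel later redirects to the tile's first entry. A simplified sketch under those assumptions (names and tile bookkeeping are illustrative, not the backend's actual moe_router_reoerder kernel):

    #include <stdint.h>
    #include <string.h>

    // Illustrative host-side model of the router reorder: for each expert, collect
    // the output slots routed to it into tiles of tile_n entries, padding with
    // 0xFFFFFFFF. Returns the number of tiles written (the "total_tiles" count).
    static int reorder_router(const uint32_t *expert_ids, int n_out, int n_expert,
                              int tile_n, uint32_t *tiles, uint16_t *emap) {
        int n_tiles = 0;
        for (int e = 0; e < n_expert; ++e) {
            int fill = 0;
            for (int i = 0; i < n_out; ++i) {
                if ((int)expert_ids[i] != e) continue;
                if (fill == 0) {                        // open a new tile for this expert
                    memset(&tiles[n_tiles * tile_n], 0xFF, tile_n * sizeof(uint32_t));
                    emap[n_tiles] = (uint16_t)e;
                    ++n_tiles;
                }
                tiles[(n_tiles - 1) * tile_n + fill] = (uint32_t)i;
                fill = (fill + 1) % tile_n;             // spill into a fresh tile when full
            }
        }
        return n_tiles;
    }

    int main(void) {
        uint32_t ids[6] = {1, 0, 1, 2, 0, 1};   // per-output expert assignments
        uint32_t tiles[8 * 4]; uint16_t emap[8];
        int n = reorder_router(ids, 6, 3, 4, tiles, emap);
        // expert 0 -> 1 tile, expert 1 -> 1 tile, expert 2 -> 1 tile
        return n == 3 ? 0 : 1;
    }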
@@ -190,6 +190,92 @@ kernel void kernel_restore_block_q4_0_noshuffle(
    }
}

kernel void kernel_convert_block_q4_0_trans4_ns(
    global struct block_q4_0 * src0,
    __global uint * dst_q,
    __global half * dst_d,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK4_0;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_q4_0 * b = src0 + src_blk_offset;
    dst_d[dst_blk_offset] = b->d;

    // extract quantization and unshuffle
    ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];

    ushort8 post_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK4_0 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(post_block);

    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_q[offset] = q_block.x;
    dst_q[offset + ne01] = q_block.y;
    dst_q[offset + ne01 * 2] = q_block.z;
    dst_q[offset + ne01 * 3] = q_block.w;
}

kernel void kernel_restore_block_q4_0_trans4_ns(
    __global uint * src_q,
    __global half * src_d,
    __global struct block_q4_0 * dst0,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK4_0;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_q4_0 * b = dst0 + dst_blk_offset;
    b->d = src_d[src_d_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_q[src_q_offset];
    q_block.y = src_q[src_q_offset + ne01];
    q_block.z = src_q[src_q_offset + ne01 * 2];
    q_block.w = src_q[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    ushort8 pre_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK4_0 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK4_0 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}

//------------------------------------------------------------------------------
// kernel_convert_block_q4_1
// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
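The two kernels above are exact inverses on the 16 packed bytes of a block: the convert splits each byte pair into a low-nibble byte and a high-nibble byte (the unshuffle), and the restore re-interleaves them. A standalone C round-trip check of that byte transform (the OpenCL addressing is omitted; QK4_0 = 32 as in ggml):

    #include <assert.h>
    #include <stdint.h>

    #define QK4_0 32

    // Unshuffle as in kernel_convert_block_q4_0_trans4_ns: byte pair -> (low nibbles, high nibbles).
    static void unshuffle(const uint8_t pre[QK4_0 / 2], uint8_t post[QK4_0 / 2]) {
        for (int i = 0; i < QK4_0 / 4; ++i) {
            uint8_t x0 = pre[2 * i + 0];
            uint8_t x1 = pre[2 * i + 1];
            post[i]             = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
            post[i + QK4_0 / 4] = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
    }

    // Reshuffle as in kernel_restore_block_q4_0_trans4_ns: the exact inverse.
    static void reshuffle(const uint8_t post[QK4_0 / 2], uint8_t pre[QK4_0 / 2]) {
        for (int i = 0; i < QK4_0 / 4; ++i) {
            uint8_t x0 = post[i];
            uint8_t x1 = post[i + QK4_0 / 4];
            pre[2 * i + 0] = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
            pre[2 * i + 1] = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
    }

    int main(void) {
        uint8_t pre[QK4_0 / 2], post[QK4_0 / 2], back[QK4_0 / 2];
        for (int i = 0; i < QK4_0 / 2; ++i) pre[i] = (uint8_t)(i * 37 + 11); // arbitrary pattern
        unshuffle(pre, post);
        reshuffle(post, back);
        for (int i = 0; i < QK4_0 / 2; ++i) assert(back[i] == pre[i]); // round-trip is lossless
        return 0;
    }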
ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl (new file, 252 lines)
@@ -0,0 +1,252 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable

#define TILESIZE_K 16
#define TILESIZE_M 64
#define TILESIZE_N 32

#define dequantize_q4_0(q4, a_f16, scale) \
    a_f16.s0 = (half)((q4.s0 & 0x000F) - 8) * scale; \
    a_f16.s1 = (half)(((q4.s0 & 0x00F0) >> 4) - 8) * scale; \
    a_f16.s2 = (half)(((q4.s0 & 0x0F00) >> 8) - 8) * scale; \
    a_f16.s3 = (half)(((q4.s0 & 0xF000) >> 12) - 8) * scale; \
    a_f16.s4 = (half)((q4.s1 & 0x000F) - 8) * scale; \
    a_f16.s5 = (half)(((q4.s1 & 0x00F0) >> 4) - 8) * scale; \
    a_f16.s6 = (half)(((q4.s1 & 0x0F00) >> 8) - 8) * scale; \
    a_f16.s7 = (half)(((q4.s1 & 0xF000) >> 12) - 8) * scale; \
    a_f16.s8 = (half)((q4.s2 & 0x000F) - 8) * scale; \
    a_f16.s9 = (half)(((q4.s2 & 0x00F0) >> 4) - 8) * scale; \
    a_f16.sa = (half)(((q4.s2 & 0x0F00) >> 8) - 8) * scale; \
    a_f16.sb = (half)(((q4.s2 & 0xF000) >> 12) - 8) * scale; \
    a_f16.sc = (half)((q4.s3 & 0x000F) - 8) * scale; \
    a_f16.sd = (half)(((q4.s3 & 0x00F0) >> 4) - 8) * scale; \
    a_f16.se = (half)(((q4.s3 & 0x0F00) >> 8) - 8) * scale; \
    a_f16.sf = (half)(((q4.s3 & 0xF000) >> 12) - 8) * scale; \

#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
    c_reg.lo += convert_float8(acc.lo); \
    c_reg.hi += convert_float8(acc.hi); \
    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
    c_reg.lo += convert_float8(acc.lo); \
    c_reg.hi += convert_float8(acc.hi); \

__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
kernel void kernel_gemm_moe_q4_0_f32_ns(
        __read_only image1d_buffer_t src0_q,
        __global half * src0_d,
        __read_only image1d_buffer_t src1,
        __global uint * src2,
        __global ushort * src2_emap,
        __write_only image1d_buffer_t dst,
        __global int * total_tiles,
        uint ne00,
        uint ne01
) {
    uint block_id_m = get_global_id(1); // m_tile
    uint block_id_n = get_global_id(2); // n_tile

    // Boundary check
    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
        return;
    }

    __private half16 reg_a;
    __private float32 reg_c = (float32)(0);
    __local half4 shared_b[128];

    const ushort expert_id = src2_emap[block_id_n];

    const uint row = block_id_m * TILESIZE_M;
    const uint col = block_id_n * TILESIZE_N;

    uint sub_block_id_m = get_local_id(0);
    uint2 b_global_offset;
    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
    b_global_offset.y = b_global_offset.x + (16 * ne00);
    uint2 b_local_offset;
    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
    b_local_offset.y = b_local_offset.x + 16;

    // Loop along K axis, 32 elements (one block) per iteration, divided into 2 sub-blocks
    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
        // First sub-block
        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
        uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
        uint b_sub_offset = col * ne00 + step;

        // Load scale for current Q4_0 block
        uint s_offset = s_sub_offset + get_global_id(0);
        half s = src0_d[s_offset];

        // Load 16 q (64 bits) in transposed layout
        uint2 q4x16;
        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;

        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
        float8 bx8_f32;
        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
        // Convert to half and store to LM to share within the subgroup
        half8 bx8_f16 = convert_half8(bx8_f32);
        shared_b[b_local_offset.x] = bx8_f16.lo;
        shared_b[b_local_offset.y] = bx8_f16.hi;

        // Dequantization
        dequantize_q4_0(as_ushort4(q4x16), reg_a, s);

        sub_group_barrier(CLK_LOCAL_MEM_FENCE);

        // 32 16x16 fp16 dot products with 8-element reduction for better precision
        half16 acc;
        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);

        // Repeat for second sub-block
        uint half_step = step + TILESIZE_K;
        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
        b_sub_offset = col * ne00 + half_step;

        // Load next 16 q (64 bits) in transposed layout
        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;

        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
        // Convert to half and store to LM to share within the subgroup
        bx8_f16 = convert_half8(bx8_f32);
        shared_b[b_local_offset.x] = bx8_f16.lo;
        shared_b[b_local_offset.y] = bx8_f16.hi;

        // Dequantization
        dequantize_q4_0(as_ushort4(q4x16), reg_a, s);

        sub_group_barrier(CLK_LOCAL_MEM_FENCE);

        // 32 16x16 fp16 dot products with 3-level reduction for better precision
        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
    }

    // Load post-router indices and share in LM
    __local uint out_idx[TILESIZE_N];

    if (get_local_id(0) < TILESIZE_N) {
        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
        if (idx == 0xFFFFFFFF) {
            idx = src2[block_id_n * TILESIZE_N + 0];
        }
        out_idx[get_local_id(0)] = idx * ne01;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Scatter results back to original position in output grid
    uint m_offset = row + get_local_id(0);

    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));

    // Zero-padding entries were redirected to the first output index in the tile;
    // write the correct value for index 0 last, after the fence, so it overrides them.
    barrier(CLK_GLOBAL_MEM_FENCE);
    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
}
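For orientation, the host code earlier dispatches this kernel with one 64-fiber workgroup per (row tile, expert tile) pair, each computing a TILESIZE_M x TILESIZE_N = 64 x 32 output tile; reg_c holds the 32 outputs of one row. A quick arithmetic check of that geometry (ne01 and max_post_router_tile are example values, not fixed constants):

    #include <stdio.h>

    int main(void) {
        int ne01 = 2048;               // rows of the expert weight matrix (example value)
        int max_post_router_tile = 8;  // expert tiles produced by the router reorder (example value)

        size_t global[3] = { 64, (size_t)((ne01 + 63) / 64), (size_t)max_post_router_tile };
        size_t local[3]  = { 64, 1, 1 };

        // 32 row tiles * 8 expert tiles = 256 workgroups, each computing a 64 x 32 output tile.
        printf("workgroups: %zu\n", (global[0] / local[0]) * global[1] * global[2]);
        return 0;
    }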
ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl (new file, 116 lines)
@@ -0,0 +1,116 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

#define QK_Q4_0 32
#define N_SIMDGROUP 4
#define SIMDGROUP_WIDTH 64

static inline float8 q4_0_to_fp32_packed8(ushort2 q4x8) {
    float8 fp32x8;
    fp32x8.s0 = (float)((q4x8.s0 & 0x000F) - 8);
    fp32x8.s1 = (float)(((q4x8.s0 & 0x00F0) >> 4) - 8);
    fp32x8.s2 = (float)(((q4x8.s0 & 0x0F00) >> 8) - 8);
    fp32x8.s3 = (float)(((q4x8.s0 & 0xF000) >> 12) - 8);
    fp32x8.s4 = (float)((q4x8.s1 & 0x000F) - 8);
    fp32x8.s5 = (float)(((q4x8.s1 & 0x00F0) >> 4) - 8);
    fp32x8.s6 = (float)(((q4x8.s1 & 0x0F00) >> 8) - 8);
    fp32x8.s7 = (float)(((q4x8.s1 & 0xF000) >> 12) - 8);
    return fp32x8;
}

__attribute__((qcom_reqd_sub_group_size("half")))
__kernel void kernel_gemv_moe_q4_0_f32_ns(
        __global uint * src0_q,
        __global half * src0_d,
        __read_only image1d_buffer_t src1,
        __global uint * src2,
        __global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne11
) {
    uint i01 = get_global_id(0);
    uint i20 = get_global_id(2);
    uint sgid = get_local_id(1);
    uint slid = get_sub_group_local_id();

    uint i11 = i20 % ne11;

    uint expert_id = src2[i20];
    uint expert_offset = expert_id * ne00 * ne01 / 32;

    __private float sum = 0.0f; // each thread calculates a partial sum of one output

    // loop along ne00 in block granularity, advancing N_SIMDGROUP blocks per iteration
    for (uint ib00 = sgid; ib00 < (ne00 / QK_Q4_0); ib00 += N_SIMDGROUP) {

        // load one block of q
        uint4 regQ;
        uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;

        regQ.s0 = src0_q[block_offset];
        regQ.s1 = src0_q[block_offset + ne01];
        regQ.s2 = src0_q[block_offset + ne01 * 2];
        regQ.s3 = src0_q[block_offset + ne01 * 3];

        uint offset = i11 * ne00 / 4 + ib00 * 8;

        float8 fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s0));

        float4 shared_y4;
        shared_y4 = read_imagef(src1, (offset + 0));
        float4 acc = shared_y4 * fp32x8.lo;

        shared_y4 = read_imagef(src1, (offset + 1));
        acc += shared_y4 * fp32x8.hi;

        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s1));

        shared_y4 = read_imagef(src1, (offset + 2));
        acc += shared_y4 * fp32x8.lo;

        shared_y4 = read_imagef(src1, (offset + 3));
        acc += shared_y4 * fp32x8.hi;

        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s2));

        shared_y4 = read_imagef(src1, (offset + 4));
        acc += shared_y4 * fp32x8.lo;

        shared_y4 = read_imagef(src1, (offset + 5));
        acc += shared_y4 * fp32x8.hi;

        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s3));

        shared_y4 = read_imagef(src1, (offset + 6));
        acc += shared_y4 * fp32x8.lo;

        shared_y4 = read_imagef(src1, (offset + 7));
        acc += shared_y4 * fp32x8.hi;

        half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
        sum += (float)(regS) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
    }

    // reduction in local memory, assumes #subgroups=4
    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];

    // 1 output per thread in subgroup 0
    if (sgid == 0) {
        dst = dst + (offsetd >> 2);
        dst[i01 + i20 * ne01] = sum;
    }
}
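The final reduction above combines the four subgroup partials through local memory: subgroups 1–3 park their lane sums in reduceLM, and after the barrier each lane of subgroup 0 folds in the three values at its own lane index. A scalar C model of the same scheme (4 subgroups of width 64; names mirror the kernel but the model is illustrative):

    #include <assert.h>

    #define N_SIMDGROUP     4
    #define SIMDGROUP_WIDTH 64

    // Scalar model: partial[sg][lane] holds each fiber's loop sum; the result for
    // a lane is the sum over subgroups at that lane index, as computed by
    // subgroup 0 after the barrier in the kernel.
    static float reduce_lane(float partial[N_SIMDGROUP][SIMDGROUP_WIDTH], int lane) {
        float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
        // subgroups 1..3 write their lane sums to local memory
        for (int sg = 1; sg < N_SIMDGROUP; ++sg) {
            reduceLM[SIMDGROUP_WIDTH * (sg - 1) + lane] = partial[sg][lane];
        }
        // (barrier) then subgroup 0 accumulates the three parked values
        float sum = partial[0][lane];
        for (int sg = 1; sg < N_SIMDGROUP; ++sg) {
            sum += reduceLM[SIMDGROUP_WIDTH * (sg - 1) + lane];
        }
        return sum;
    }

    int main(void) {
        float partial[N_SIMDGROUP][SIMDGROUP_WIDTH] = {0};
        partial[0][5] = 1.0f; partial[1][5] = 2.0f;
        partial[2][5] = 3.0f; partial[3][5] = 4.0f;
        assert(reduce_lane(partial, 5) == 10.0f); // 1+2+3+4
        return 0;
    }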