CANN: Fix failed test cases (#12708 )

* CANN: Fix memory waste in aclnn_tensor * CANN: fix backend ops fail * CANN: fix acl_tensor memory alloc. * CANN: format * CANN: remove trailing whitespace
opencl: use max_alloc_size in backend ctx instead of querying again (#12705 )
2026-05-21 08:24:06 +00:00 · 2025-04-03 08:49:51 +08:00 · 2025-04-02 17:01:42 -07:00 · 2025-04-02 14:25:08 -05:00
11 changed files with 213 additions and 56 deletions
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    // added.
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

-    int64_t acl_storage_len = 0;
    if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            acl_ne[i] = tensor->ne[i];
            // The step size of acl is in elements.
@@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    } else {
        // With bcast
        for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
            acl_ne[i] = ne[i];
            acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }

-    // Reverse ne and stride.
    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < final_dims; i++) {
+        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+    }
+
+    // Reverse ne and stride.
    std::reverse(acl_ne, acl_ne + final_dims);
    std::reverse(acl_stride, acl_stride + final_dims);

--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
        tmp_stride[i] = nb[i] / type_size;
    }

+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
+    }
+
    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
    aclTensor* acl_tensor =
        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
                        format, &acl_storage_len, 1, data_ptr);
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -358,8 +358,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

 void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    float min;
    float max;
@@ -1090,8 +1088,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

-    GGML_ASSERT(eps > 0.0f);
-
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
@@ -3152,7 +3148,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: use ascendc
    // Only test with LLAMA model.
    ggml_tensor* src0 = dst->src[0];  // input
-    ggml_tensor* src2 = dst->src[2];  // freq_factors
+    // ggml_tensor* src2 = dst->src[2];  // freq_factors, not used now.

    // param
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -535,9 +535,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

@@ -566,9 +563,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1458,11 +1458,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeDevice());
    ACL_CHECK(aclrtResetDevice(cann_ctx->device));

-    // finalize when last backend freed.
-    if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
-        ACL_CHECK(aclFinalize());
-    }
-
    delete cann_ctx;
    delete backend;
 }
@@ -1688,11 +1683,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
-                case GGML_TYPE_Q8_0:
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
-                case GGML_TYPE_Q4_0:
                    return true;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+                    // only support contiguous for quantized types.
+                    return ggml_is_contiguous(op->src[0]) &&
+                            ggml_is_contiguous(op->src[1]);
                default:
                    return false;
            }
@@ -1738,13 +1736,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        }
        case GGML_OP_ROPE: {
            // TODO: with ops-test v == 1
-            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float ext_factor = 0.0f;
+            memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
            // TODO: n_dims <= ne0
            if (op->src[0]->ne[0] != op->op_params[1]) {
                return false;
            }
            // TODO: ext_factor != 0
-            if (*ext_factor != 0) {
+            if (ext_factor != 0) {
                return false;
            }

@@ -1766,6 +1765,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
            return true;
        }
+        case GGML_OP_POOL_2D: {
+            const int32_t * opts = (const int32_t *) op->op_params;
+            const int       k0   = opts[1];
+            const int       k1   = opts[2];
+            const int       p0   = opts[5];
+            const int       p1   = opts[6];
+            // value of paddingH should be at most half of kernelH
+            // value of paddingW should be at most half of kernelW
+            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
+        }
        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
@@ -1785,7 +1794,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_CLAMP:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
-        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -924,27 +924,24 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    // TODO: fixme: these sizes are hardcoded for now.
    //  they should be allocated based on the model's size
    //  and the device's max alloc size
-    size_t max_alloc_size;
-    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, NULL));
-
    // Allocate intermediate buffers and images
    size_t required_A_q_d_bytes = 311164928;
    size_t required_A_s_d_bytes = 38895616;
    size_t required_B_d_bytes = 45088768;

    // Ensure buffer sizes do not exceed the maximum allocation size
-    size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, max_alloc_size);
-    size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, max_alloc_size);
-    size_t max_B_d_bytes   = MIN(required_B_d_bytes, max_alloc_size);
-    if (required_A_q_d_bytes > max_alloc_size) {
+    size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
+    size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
+    size_t max_B_d_bytes   = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
+    if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
        GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
                      required_A_q_d_bytes, max_A_q_d_bytes);
    }
-    if (required_A_s_d_bytes > max_alloc_size) {
+    if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
        GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
                      required_A_s_d_bytes, max_A_s_d_bytes);
    }
-    if (required_B_d_bytes > max_alloc_size) {
+    if (required_B_d_bytes > backend_ctx->max_alloc_size) {
        GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
                      required_B_d_bytes, max_B_d_bytes);
    }
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -353,6 +353,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_split_k_reduce;

    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
    std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
@@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants {
    float m1;

    uint32_t gqa_ratio;
+    uint32_t split_kv;
+    uint32_t k_num;
 };

 struct vk_op_push_constants {
@@ -1476,7 +1479,7 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ

    // small rows, large cols
    if (small_rows) {
-        return {flash_attention_num_small_rows, 128};
+        return {flash_attention_num_small_rows, 64};
    }
    // small cols to reduce register count
    if (ggml_is_quantized(type) || D == 256) {
@@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
    ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        workgroups_y /= N;
    }

+    uint32_t split_kv = KV;
+    uint32_t split_k = 1;
+
+    if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
+        GGML_ASSERT(workgroups_x == 1);
+        // Try to run two workgroups per SM.
+        split_k = ctx->device->shader_core_count * 2 / workgroups_y;
+        if (split_k > 1) {
+            // Try to evenly split KV into split_k chunks, but it needs to be a multiple
+            // of "align", so recompute split_k based on that.
+            split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
+            split_k = CEIL_DIV(KV, split_kv);
+            workgroups_x = split_k;
+        }
+    }
+
+    // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
+    // and the per-row m and L values (ne1 rows).
+    const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
+    if (split_k_size > ctx->device->max_memory_allocation_size) {
+        GGML_ABORT("Requested preallocation size is too large");
+    }
+    if (ctx->prealloc_size_split_k < split_k_size) {
+        ctx->prealloc_size_split_k = split_k_size;
+    }
+
    if (dryrun) {
        // Request descriptor sets
        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (split_k > 1) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
+        }
        return;
    }

@@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    ggml_vk_sync_buffers(subctx);
-
    vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
    size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;

@@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                                              v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
                                              nbm1,
                                              scale, max_bias, logit_softcap,
-                                              mask != nullptr, n_head_log2, m0, m1, gqa_ratio };
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
-                                {
-                                    vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-                                    vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-                                    vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-                                    vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-                                    vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
-                                },
-                                sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
+                                              mask != nullptr, n_head_log2, m0, m1,
+                                              gqa_ratio, split_kv, split_k };
+
+    ggml_vk_sync_buffers(subctx);
+
+    if (split_k > 1) {
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+                                    {
+                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+                                    },
+                                    // We only use split_k when group query attention is enabled, which means
+                                    // there's no more than one tile of rows (i.e. workgroups_x would have been
+                                    // one). We reuse workgroups_x to mean the number of splits, so we need to
+                                    // cancel out the divide by wg_denoms[0].
+                                    sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
+
+        ggml_vk_sync_buffers(subctx);
+        const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
+        ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
+                                    {
+                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                    },
+                                    pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
+    } else {
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+                                    {
+                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
+                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                    },
+                                    sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
+    }
 }

 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -63,6 +63,8 @@ layout (push_constant) uniform parameter {
    float m1;

    uint32_t gqa_ratio;
+    uint32_t split_kv;
+    uint32_t k_num;
 } p;

 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
@@ -116,6 +118,16 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
    return elem;
 }

+// Store column zero. This is used to save per-row m and L values for split_k.
+ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    if (r < N && c == 0) {
+        uint32_t offset = iq2 + r;
+        data_o[o_offset + offset] = D_TYPE(elem);
+    }
+    return elem;
+}
+
 // Load the slope matrix, indexed by Q's dimension 2.
 ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
 {
@@ -135,10 +147,18 @@ void main() {
    const uint32_t N = p.N;
    const uint32_t KV = p.KV;

-    const uint32_t Tr = CEIL_DIV(N, Br);
-    const uint32_t Tc = CEIL_DIV(KV, Bc);
+    uint32_t i = gl_WorkGroupID.x;
+    uint32_t split_k_index = 0;

-    const uint32_t i = gl_WorkGroupID.x;
+    if (p.k_num > 1) {
+        i = 0;
+        split_k_index = gl_WorkGroupID.x;
+    }
+
+    const uint32_t Tr = CEIL_DIV(N, Br);
+
+    const uint32_t start_j = split_k_index * p.split_kv / Bc;
+    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);

    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
@@ -218,7 +238,7 @@ void main() {
    }

    [[dont_unroll]]
-    for (uint32_t j = 0; j < Tc; ++j) {
+    for (uint32_t j = start_j; j < end_j; ++j) {

        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);

@@ -312,6 +332,20 @@ void main() {
        O = coopMatMulAdd(P_A, V, O);
    }

+    // If there is split_k, then the split_k resolve shader does the final
+    // division by L. Store the intermediate O value and per-row m and L values.
+    if (p.k_num > 1) {
+        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
+
+        uint32_t o_offset = D * p.ne1 * split_k_index;
+        coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
+
+        o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
+        coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
+        coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
+        return;
+    }
+
    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;

    // resize L by using smear/reduce
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp
@@ -0,0 +1,59 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+
+#define BLOCK_SIZE 32
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 1) writeonly buffer D {float data_d[];};
+
+layout (push_constant) uniform parameter {
+    uint D;
+    uint N;
+    uint k_num;
+} p;
+
+void main() {
+    // Each workgroup handles a row
+    const uint n = gl_WorkGroupID.x;
+    const uint tid = gl_LocalInvocationID.x;
+
+    uint D = p.D;
+    uint N = p.N;
+    uint k_num = p.k_num;
+
+    uint l_offset = D * N * k_num + n;
+    uint m_offset = D * N * k_num + N + n;
+    uint lm_stride = N * 2;
+
+    // Compute the max m value for the row
+    float m_max = -1.0/0.0;
+    [[unroll]] for (uint k = 0; k < k_num; ++k) {
+        float m = data_a[m_offset + k * lm_stride];
+        m_max = max(m_max, m);
+    }
+
+    // Compute L based on m_max
+    float L = 0;
+    [[unroll]] for (uint k = 0; k < k_num; ++k) {
+        float l = data_a[l_offset + k * lm_stride];
+        float m = data_a[m_offset + k * lm_stride];
+        L += exp(m - m_max) * l;
+    }
+
+    L = 1.0 / L;
+
+    // Scale and sum the O contributions based on m_max and store the result to memory
+    for (uint d = tid; d < D; d += BLOCK_SIZE) {
+        float O = 0.0;
+        [[unroll]] for (uint k = 0; k < k_num; ++k) {
+            uint o_offset = D * N * k + D * n + d;
+            float m = data_a[m_offset + k * lm_stride];
+            O += exp(m - m_max) * data_a[o_offset];
+        }
+        O *= L;
+        data_d[D * n + d] = O;
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -465,6 +465,7 @@ void process_shaders() {
    string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

    string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
+    string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {});
    string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});

    string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4516,6 +4516,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        }
    }

+    for (int kv : { 4096, 8192, 16384, }) {
+        for (int hs : { 64, 128, }) {
+            test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
+        }
+    }
+
    return test_cases;
 }