mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-15 05:24:06 +00:00
Fixes for subgroup size to bring AMD and NVIDIA in line with each other for all kernels.
This commit is contained in:
@@ -785,7 +785,8 @@ void ggml_vk_soft_max(kp::Sequence& seq,
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
const uint32_t local_x = ggml_vk_current_device().subgroupSize;
|
||||
// FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
|
||||
const uint32_t local_x = 32;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
@@ -981,8 +982,8 @@ void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
// const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
|
||||
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
s_algo->setTensors({inA, inB, out});
|
||||
|
||||
Reference in New Issue
Block a user