mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 08:24:06 +00:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2a0dc97e56 | ||
|
|
97a20c012b | ||
|
|
f01bd02376 | ||
|
|
6f3bd38640 | ||
|
|
be0a0f8cae | ||
|
|
92e3006bb6 | ||
|
|
833e2b7409 | ||
|
|
e0e912f49b | ||
|
|
a10b36c91a | ||
|
|
83a88bd6af |
@@ -1,6 +1,7 @@
|
||||
#include "gguf.h" // for reading GGUF splits
|
||||
#include "arg.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "chat.h"
|
||||
@@ -848,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (!params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (params.reranking && params.embedding) {
|
||||
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
||||
}
|
||||
@@ -2180,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
throw std::invalid_argument("unknown buffer type");
|
||||
}
|
||||
// FIXME: this leaks memory
|
||||
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
|
||||
@@ -1042,15 +1042,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
if (!params.devices.empty()) {
|
||||
mparams.devices = params.devices.data();
|
||||
}
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
@@ -1058,6 +1061,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
mparams.kv_overrides = params.kv_overrides.data();
|
||||
}
|
||||
|
||||
if (params.tensor_buft_overrides.empty()) {
|
||||
mparams.tensor_buft_overrides = NULL;
|
||||
} else {
|
||||
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
|
||||
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
|
||||
}
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
|
||||
@@ -279,6 +279,7 @@ struct common_params {
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
|
||||
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
|
||||
|
||||
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_params params;
|
||||
|
||||
params.n_predict = 128;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
// added.
|
||||
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
if (ne == nullptr) {
|
||||
acl_storage_len = ggml_nbytes(tensor);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
acl_ne[i] = tensor->ne[i];
|
||||
// The step size of acl is in elements.
|
||||
@@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
} else {
|
||||
// With bcast
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
acl_ne[i] = ne[i];
|
||||
acl_stride[i] = nb[i] / ggml_element_size(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < final_dims; i++) {
|
||||
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
std::reverse(acl_stride, acl_stride + final_dims);
|
||||
|
||||
|
||||
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
tmp_stride[i] = nb[i] / type_size;
|
||||
}
|
||||
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
|
||||
}
|
||||
|
||||
std::reverse(tmp_ne, tmp_ne + dims);
|
||||
std::reverse(tmp_stride, tmp_stride + dims);
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
|
||||
aclTensor* acl_tensor =
|
||||
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
|
||||
format, &acl_storage_len, 1, data_ptr);
|
||||
|
||||
@@ -358,8 +358,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
float min;
|
||||
float max;
|
||||
@@ -1090,8 +1088,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
@@ -3152,7 +3148,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only test with LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
// ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now.
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
|
||||
@@ -535,9 +535,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
@@ -566,9 +563,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
|
||||
@@ -1458,11 +1458,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
ACL_CHECK(aclrtSynchronizeDevice());
|
||||
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
|
||||
|
||||
// finalize when last backend freed.
|
||||
if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
|
||||
ACL_CHECK(aclFinalize());
|
||||
}
|
||||
|
||||
delete cann_ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -1688,11 +1683,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
// only support contiguous for quantized types.
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1738,13 +1736,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
float ext_factor = 0.0f;
|
||||
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (*ext_factor != 0) {
|
||||
if (ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1766,6 +1765,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_POOL_2D: {
|
||||
const int32_t * opts = (const int32_t *) op->op_params;
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
// value of paddingH should be at most half of kernelH
|
||||
// value of paddingW should be at most half of kernelW
|
||||
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
||||
}
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
@@ -1785,7 +1794,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
|
||||
@@ -924,27 +924,24 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// TODO: fixme: these sizes are hardcoded for now.
|
||||
// they should be allocated based on the model's size
|
||||
// and the device's max alloc size
|
||||
size_t max_alloc_size;
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, NULL));
|
||||
|
||||
// Allocate intermediate buffers and images
|
||||
size_t required_A_q_d_bytes = 311164928;
|
||||
size_t required_A_s_d_bytes = 38895616;
|
||||
size_t required_B_d_bytes = 45088768;
|
||||
|
||||
// Ensure buffer sizes do not exceed the maximum allocation size
|
||||
size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, max_alloc_size);
|
||||
size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, max_alloc_size);
|
||||
size_t max_B_d_bytes = MIN(required_B_d_bytes, max_alloc_size);
|
||||
if (required_A_q_d_bytes > max_alloc_size) {
|
||||
size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
|
||||
size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
|
||||
size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
|
||||
if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_A_q_d_bytes, max_A_q_d_bytes);
|
||||
}
|
||||
if (required_A_s_d_bytes > max_alloc_size) {
|
||||
if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_A_s_d_bytes, max_A_s_d_bytes);
|
||||
}
|
||||
if (required_B_d_bytes > max_alloc_size) {
|
||||
if (required_B_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_B_d_bytes, max_B_d_bytes);
|
||||
}
|
||||
|
||||
@@ -23,49 +23,35 @@ if (Vulkan_FOUND)
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
if(NOT DEFINED GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
# Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
# Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
|
||||
message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat is supported by glslc")
|
||||
else()
|
||||
message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat is supported by glslc")
|
||||
endif()
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
|
||||
message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF)
|
||||
else()
|
||||
message(STATUS "GL_KHR_cooperative_matrix support already defined: ${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT}")
|
||||
endif()
|
||||
|
||||
if(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON)
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
# Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
# Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
|
||||
message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat2 is supported by glslc")
|
||||
else()
|
||||
message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat2 is supported by glslc")
|
||||
endif()
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
|
||||
message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF)
|
||||
else()
|
||||
message(STATUS "GL_NV_cooperative_matrix2 support already defined: ${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT}")
|
||||
endif()
|
||||
|
||||
if(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
|
||||
set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON)
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
|
||||
#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
|
||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||
// Returns true when x has exactly one bit set AND is greater than 1.
// Note that 0 and 1 are both rejected, so 2^0 is deliberately not reported as a power of two here.
static bool is_pow2(uint32_t x) {
    if (x <= 1) {
        return false;
    }
    // clearing the lowest set bit leaves zero only when a single bit was set
    return (x & (x - 1)) == 0;
}
|
||||
|
||||
#define VK_VENDOR_ID_AMD 0x1002
|
||||
#define VK_VENDOR_ID_APPLE 0x106b
|
||||
@@ -352,6 +353,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
||||
|
||||
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
||||
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
||||
@@ -501,6 +503,10 @@ struct vk_flash_attn_push_constants {
|
||||
uint32_t n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
|
||||
uint32_t gqa_ratio;
|
||||
uint32_t split_kv;
|
||||
uint32_t k_num;
|
||||
};
|
||||
|
||||
struct vk_op_push_constants {
|
||||
@@ -1473,7 +1479,7 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ
|
||||
|
||||
// small rows, large cols
|
||||
if (small_rows) {
|
||||
return {flash_attention_num_small_rows, 128};
|
||||
return {flash_attention_num_small_rows, 64};
|
||||
}
|
||||
// small cols to reduce register count
|
||||
if (ggml_is_quantized(type) || D == 256) {
|
||||
@@ -2329,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||
@@ -5402,7 +5409,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
const uint32_t nbm1 = mask ? mask->nb[1] : 0;
|
||||
|
||||
const uint32_t D = neq0;
|
||||
const uint32_t N = neq1;
|
||||
uint32_t N = neq1;
|
||||
const uint32_t KV = nek1;
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
@@ -5460,9 +5467,54 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
vk_pipeline pipeline = pipelines[aligned];
|
||||
assert(pipeline);
|
||||
|
||||
uint32_t gqa_ratio = 1;
|
||||
uint32_t qk_ratio = neq2 / nek2;
|
||||
uint32_t workgroups_x = (uint32_t)neq1;
|
||||
uint32_t workgroups_y = (uint32_t)neq2;
|
||||
uint32_t workgroups_z = (uint32_t)neq3;
|
||||
|
||||
if (N == 1 && qk_ratio > 1 && is_pow2(qk_ratio) && gqa_ratio <= flash_attention_num_small_rows &&
|
||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
|
||||
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
||||
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
||||
// and change addressing calculations to index Q's dimension 2.
|
||||
gqa_ratio = qk_ratio;
|
||||
N = gqa_ratio;
|
||||
workgroups_y /= N;
|
||||
}
|
||||
|
||||
uint32_t split_kv = KV;
|
||||
uint32_t split_k = 1;
|
||||
|
||||
if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
|
||||
GGML_ASSERT(workgroups_x == 1);
|
||||
// Try to run two workgroups per SM.
|
||||
split_k = ctx->device->shader_core_count * 2 / workgroups_y;
|
||||
if (split_k > 1) {
|
||||
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
|
||||
// of "align", so recompute split_k based on that.
|
||||
split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
|
||||
split_k = CEIL_DIV(KV, split_kv);
|
||||
workgroups_x = split_k;
|
||||
}
|
||||
}
|
||||
|
||||
// Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows).
|
||||
const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
|
||||
if (split_k_size > ctx->device->max_memory_allocation_size) {
|
||||
GGML_ABORT("Requested preallocation size is too large");
|
||||
}
|
||||
if (ctx->prealloc_size_split_k < split_k_size) {
|
||||
ctx->prealloc_size_split_k = split_k_size;
|
||||
}
|
||||
|
||||
if (dryrun) {
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5483,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
|
||||
vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
|
||||
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
|
||||
|
||||
@@ -5549,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
|
||||
nbm1,
|
||||
scale, max_bias, logit_softcap,
|
||||
mask != nullptr, n_head_log2, m0, m1 };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 });
|
||||
mask != nullptr, n_head_log2, m0, m1,
|
||||
gqa_ratio, split_kv, split_k };
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
|
||||
if (split_k > 1) {
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
},
|
||||
// We only use split_k when group query attention is enabled, which means
|
||||
// there's no more than one tile of rows (i.e. workgroups_x would have been
|
||||
// one). We reuse workgroups_x to mean the number of splits, so we need to
|
||||
// cancel out the divide by wg_denoms[0].
|
||||
sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
|
||||
} else {
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
|
||||
}
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
||||
|
||||
@@ -61,6 +61,10 @@ layout (push_constant) uniform parameter {
|
||||
uint32_t n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
|
||||
uint32_t gqa_ratio;
|
||||
uint32_t split_kv;
|
||||
uint32_t k_num;
|
||||
} p;
|
||||
|
||||
layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
|
||||
@@ -103,6 +107,38 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
||||
#define DECODEFUNC
|
||||
#endif
|
||||
|
||||
// Store the output when doing grouped query attention.
|
||||
// Rows index by Q's dimension 2, and the first N rows are valid.
|
||||
// Per-element callback that writes one output element when doing grouped query attention.
// Element (r, c) is written to data_o at o_offset + (iq2 + r) * D + c, i.e. the row index is
// offset by iq2. Only elements with r < N and c < D are written; the rest are skipped.
// The element is returned unchanged.
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
    const bool in_range = r < N && c < D;
    if (in_range) {
        const uint32_t dst = o_offset + (iq2 + r) * D + c;
        data_o[dst] = D_TYPE(elem);
    }
    return elem;
}
|
||||
|
||||
// Store column zero. This is used to save per-row m and L values for split_k.
|
||||
// Per-element callback that stores only column zero of a matrix, one value per row.
// It is used to save the per-row m and L values for split_k. Row r is written to data_o at
// o_offset + iq2 + r, for rows r < N only. The element is returned unchanged.
ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
    const bool is_valid_col0 = c == 0 && r < N;
    if (is_valid_col0) {
        const uint32_t dst = o_offset + iq2 + r;
        data_o[dst] = D_TYPE(elem);
    }
    return elem;
}
|
||||
|
||||
// Load the slope matrix, indexed by Q's dimension 2.
|
||||
// Per-element callback that computes the slope value for element (r, c), indexed by Q's
// dimension 2. The head index is iq2 plus the row index masked with (gqa_ratio - 1); the
// mask acts as a modulo only when gqa_ratio is a power of two, and yields 0 when it is 1.
// The incoming element value is ignored. The result is base^exph, using m0 for heads below
// n_head_log2 and m1 otherwise.
ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
{
    const uint32_t head = iq2 + (r & (p.gqa_ratio - 1));

    ACC_TYPE base;
    int exph;
    if (head < p.n_head_log2) {
        base = ACC_TYPE(p.m0);
        exph = int(head + 1);
    } else {
        base = ACC_TYPE(p.m1);
        exph = int(2*(head - p.n_head_log2) + 1);
    }

    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
}
|
||||
|
||||
void main() {
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
@@ -111,12 +147,22 @@ void main() {
|
||||
const uint32_t N = p.N;
|
||||
const uint32_t KV = p.KV;
|
||||
|
||||
uint32_t i = gl_WorkGroupID.x;
|
||||
uint32_t split_k_index = 0;
|
||||
|
||||
if (p.k_num > 1) {
|
||||
i = 0;
|
||||
split_k_index = gl_WorkGroupID.x;
|
||||
}
|
||||
|
||||
const uint32_t Tr = CEIL_DIV(N, Br);
|
||||
const uint32_t Tc = CEIL_DIV(KV, Bc);
|
||||
|
||||
const uint32_t i = gl_WorkGroupID.x;
|
||||
const uint32_t start_j = split_k_index * p.split_kv / Bc;
|
||||
const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
|
||||
|
||||
const uint32_t iq2 = gl_WorkGroupID.y;
|
||||
// When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
|
||||
// When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
|
||||
const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
|
||||
const uint32_t iq3 = gl_WorkGroupID.z;
|
||||
|
||||
// broadcast factors
|
||||
@@ -149,8 +195,10 @@ void main() {
|
||||
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
|
||||
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
|
||||
|
||||
// nb?1 are already divided by the type size and are in units of elements
|
||||
uint32_t q_stride = p.nb01;
|
||||
// nb?1 are already divided by the type size and are in units of elements.
|
||||
// When using grouped query attention, Q is indexed by iq2, so the stride
|
||||
// should be nb02 (which is in bytes).
|
||||
uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
|
||||
uint32_t k_stride = p.nb11;
|
||||
uint32_t v_stride = p.nb21;
|
||||
// hint to the compiler that strides are aligned for the aligned variant of the shader
|
||||
@@ -182,20 +230,15 @@ void main() {
|
||||
L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
||||
M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-1.0/0.0);
|
||||
|
||||
ACC_TYPE slope = ACC_TYPE(1.0);
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> slopeMat = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(1.0);
|
||||
|
||||
// ALiBi
|
||||
if (p.max_bias > 0.0f) {
|
||||
const uint32_t h = iq2;
|
||||
|
||||
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
|
||||
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
|
||||
|
||||
slope = pow(base, ACC_TYPE(exph));
|
||||
coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2);
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint32_t j = 0; j < Tc; ++j) {
|
||||
for (uint32_t j = start_j; j < end_j; ++j) {
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
||||
|
||||
@@ -215,12 +258,16 @@ void main() {
|
||||
if (p.mask != 0) {
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
||||
// When using grouped query attention, all rows use the same mask.
|
||||
if (p.gqa_ratio > 1) {
|
||||
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1);
|
||||
}
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
|
||||
coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
|
||||
S += slope*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
}
|
||||
|
||||
// Clear padding elements to -inf, so they don't contribute to rowmax
|
||||
@@ -285,6 +332,20 @@ void main() {
|
||||
O = coopMatMulAdd(P_A, V, O);
|
||||
}
|
||||
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
|
||||
|
||||
uint32_t o_offset = D * p.ne1 * split_k_index;
|
||||
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
||||
|
||||
o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
|
||||
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
|
||||
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
|
||||
return;
|
||||
}
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
|
||||
|
||||
// resize L by using smear/reduce
|
||||
@@ -297,13 +358,18 @@ void main() {
|
||||
|
||||
O = Ldiag*O;
|
||||
|
||||
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
|
||||
|
||||
// permute dimensions
|
||||
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
||||
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
|
||||
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute);
|
||||
if (p.gqa_ratio > 1) {
|
||||
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
||||
} else {
|
||||
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
|
||||
|
||||
// permute dimensions
|
||||
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
|
||||
|
||||
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, D), tensorViewPermute);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
#define BLOCK_SIZE 32
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {float data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {float data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint D;
|
||||
uint N;
|
||||
uint k_num;
|
||||
} p;
|
||||
|
||||
// Split-k resolve pass for flash attention.
//
// data_a holds, for each of the p.k_num splits, a D x N block of partial O
// values followed (after all O blocks) by per-split pairs of N "L" values
// and N "m" values. One workgroup combines the splits for one row and
// writes D floats to data_d.
void main() {
    // one workgroup per row, invocations stride over the D columns
    const uint row  = gl_WorkGroupID.x;
    const uint lane = gl_LocalInvocationID.x;

    const uint dim      = p.D;
    const uint n_rows   = p.N;
    const uint n_splits = p.k_num;

    // the L/m section starts right after the n_splits partial O blocks
    const uint l_base    = dim * n_rows * n_splits + row;
    const uint m_base    = dim * n_rows * n_splits + n_rows + row;
    const uint lm_stride = n_rows * 2;

    // largest m across all splits for this row
    float row_max = -1.0/0.0;
    [[unroll]] for (uint s = 0; s < n_splits; ++s) {
        const float m_s = data_a[m_base + s * lm_stride];
        row_max = max(row_max, m_s);
    }

    // sum of the per-split L values, each rescaled to the common maximum
    float l_sum = 0;
    [[unroll]] for (uint s = 0; s < n_splits; ++s) {
        const float l_s = data_a[l_base + s * lm_stride];
        const float m_s = data_a[m_base + s * lm_stride];
        l_sum += exp(m_s - row_max) * l_s;
    }

    const float inv_l = 1.0 / l_sum;

    // rescale and accumulate the partial O values, then normalize and store
    for (uint col = lane; col < dim; col += BLOCK_SIZE) {
        float acc = 0.0;
        [[unroll]] for (uint s = 0; s < n_splits; ++s) {
            const uint  src = dim * n_rows * s + dim * row + col;
            const float m_s = data_a[m_base + s * lm_stride];
            acc += exp(m_s - row_max) * data_a[src];
        }
        acc *= inv_l;
        data_d[dim * row + col] = acc;
    }
}
|
||||
@@ -234,9 +234,9 @@ void main() {
|
||||
#endif
|
||||
|
||||
#if QUANT_AUXF == 1
|
||||
FLOAT_TYPE cache_a_dm[TM];
|
||||
FLOAT_TYPE cache_a_dm[WMITER * TM];
|
||||
#else
|
||||
FLOAT_TYPE_VEC2 cache_a_dm[TM];
|
||||
FLOAT_TYPE_VEC2 cache_a_dm[WMITER * TM];
|
||||
#endif
|
||||
|
||||
FLOAT_TYPE_VEC2 cache_b_ds[TN];
|
||||
@@ -247,7 +247,6 @@ void main() {
|
||||
const uint iqs = loadr_a;
|
||||
const uint buf_ib = loadc_a + l;
|
||||
|
||||
// Should ds be gated to a single thread?
|
||||
if (iqs == 0) {
|
||||
#if QUANT_AUXF == 1
|
||||
buf_a_dm[buf_ib] = get_d(ib);
|
||||
@@ -276,7 +275,6 @@ void main() {
|
||||
|
||||
const uint buf_ib = loadc_b + l;
|
||||
|
||||
// Should ds be gated to a single thread?
|
||||
if (iqs == 0) {
|
||||
buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds);
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0 * dsb.y));
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0f * dsb.y));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -51,7 +51,7 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0 * dsb.y));
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0f * dsb.y));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -465,6 +465,7 @@ void process_shaders() {
|
||||
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {});
|
||||
string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});
|
||||
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
@@ -1159,6 +1159,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
}
|
||||
|
||||
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
if (tensor->ne[i] <= 0) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t nbytes;
|
||||
const size_t blck_size = ggml_blck_size(tensor->type);
|
||||
if (blck_size == 1) {
|
||||
|
||||
@@ -280,10 +280,18 @@ extern "C" {
|
||||
};
|
||||
};
|
||||
|
||||
// Associates a tensor-name pattern with the backend buffer type to use for matching tensors.
// Lists of these entries are terminated by an element whose `pattern` is NULL.
struct llama_model_tensor_buft_override {
|
||||
const char * pattern; // regular expression that is searched for in the tensor name
|
||||
ggml_backend_buffer_type_t buft; // buffer type given to tensors whose name matches `pattern`
|
||||
};
|
||||
|
||||
struct llama_model_params {
|
||||
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
|
||||
ggml_backend_dev_t * devices;
|
||||
|
||||
// NULL-terminated list of buffer types to use for tensors that match a pattern
|
||||
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
|
||||
@@ -255,7 +255,8 @@ llama_context::llama_context(
|
||||
model.n_devices() > 1 &&
|
||||
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
|
||||
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
cparams.offload_kqv;
|
||||
cparams.offload_kqv &&
|
||||
!model.has_tensor_overrides();
|
||||
|
||||
// pipeline parallelism requires support for async compute and events in all devices
|
||||
if (pipeline_parallel) {
|
||||
@@ -1201,33 +1202,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: remove this stuff
|
||||
class batch_guard {
|
||||
public:
|
||||
batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) {
|
||||
}
|
||||
|
||||
~batch_guard() {
|
||||
if (!is_done) {
|
||||
kv_slot_restorer.restore();
|
||||
}
|
||||
}
|
||||
|
||||
void done() {
|
||||
is_done = true;
|
||||
}
|
||||
|
||||
void save(const llama_kv_cache_slot_info & slot_info) {
|
||||
kv_slot_restorer.save(slot_info);
|
||||
}
|
||||
|
||||
private:
|
||||
bool is_done = false;
|
||||
|
||||
llama_kv_slot_restorer kv_slot_restorer;
|
||||
};
|
||||
|
||||
batch_guard bg(*kv_self);
|
||||
llama_kv_cache_guard kv_guard(kv_self.get());
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
@@ -1280,6 +1255,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
return -2;
|
||||
};
|
||||
|
||||
// handle any pending defrags/shifts
|
||||
kv_self_update();
|
||||
|
||||
int64_t n_outputs_prev = 0;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
@@ -1319,22 +1297,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
// find KV slot
|
||||
{
|
||||
kv_self_update();
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) {
|
||||
kv_self->head = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto slot_info = kv_self->find_slot(ubatch);
|
||||
if (!slot_info) {
|
||||
LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
|
||||
return -3;
|
||||
}
|
||||
|
||||
bg.save(slot_info);
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
@@ -1371,16 +1339,6 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
}
|
||||
|
||||
// update the kv ring buffer
|
||||
{
|
||||
kv_self->head += ubatch.n_tokens;
|
||||
|
||||
// Ensure kv cache head points to a valid index.
|
||||
if (kv_self->head >= kv_self->size) {
|
||||
kv_self->head = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// plot the computation graph in dot format (for debugging purposes)
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
@@ -1467,7 +1425,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
|
||||
// finalize the batch processing
|
||||
bg.done();
|
||||
kv_guard.commit();
|
||||
|
||||
// set output mappings
|
||||
{
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
|
||||
static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
|
||||
|
||||
llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
|
||||
}
|
||||
|
||||
@@ -206,6 +204,8 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
@@ -446,16 +446,66 @@ void llama_kv_cache_unified::defrag() {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::restore() {
|
||||
if (pending.ranges.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: tmp - move to llama_kv_cache_recurrent
|
||||
if (recurrent) {
|
||||
seq_rm(-1, -1, -1);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t new_head = size;
|
||||
|
||||
for (auto & range : pending.ranges) {
|
||||
for (uint32_t i = range.c0; i < range.c1; ++i) {
|
||||
cells[i].seq_id.clear();
|
||||
|
||||
// keep count of the number of used cells
|
||||
if (cells[i].pos >= 0) {
|
||||
used--;
|
||||
}
|
||||
|
||||
cells[i].pos = -1;
|
||||
cells[i].src = -1;
|
||||
}
|
||||
|
||||
new_head = std::min(new_head, range.c0);
|
||||
}
|
||||
|
||||
if (new_head != size && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::commit() {
|
||||
if (pending.ranges.empty()) {
|
||||
LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
|
||||
__func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
|
||||
return;
|
||||
}
|
||||
|
||||
pending.ranges.clear();
|
||||
}
|
||||
|
||||
// Returns the cache's can_shift flag.
bool llama_kv_cache_unified::get_can_shift() const {
|
||||
return can_shift;
|
||||
}
|
||||
|
||||
llama_kv_cache_slot_info llama_kv_cache_unified::find_slot(
|
||||
bool llama_kv_cache_unified::find_slot(
|
||||
const llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (head > used + 2*ubatch.n_tokens) {
|
||||
head = 0;
|
||||
}
|
||||
|
||||
if (recurrent) {
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
// each cache cell can store the state for a whole sequence.
|
||||
@@ -477,7 +527,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot(
|
||||
// too big seq_id
|
||||
// TODO: would it be possible to resize the cache instead?
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
return false;
|
||||
}
|
||||
if (j > 0) {
|
||||
llama_kv_cell & seq = cells[seq_id];
|
||||
@@ -616,14 +666,14 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot(
|
||||
[](const llama_kv_cell& cell){ return !cell.is_empty(); });
|
||||
|
||||
// sanity check
|
||||
return llama_kv_cache_slot_info(n >= n_seqs);
|
||||
return n >= n_seqs;
|
||||
}
|
||||
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
@@ -651,7 +701,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot(
|
||||
|
||||
if (n_tested >= size) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -668,7 +718,9 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot(
|
||||
|
||||
used += n_tokens;
|
||||
|
||||
return llama_kv_cache_slot_info(head, head + n_tokens);
|
||||
pending.ranges.push_back({head, head + n_tokens});
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
|
||||
@@ -1033,6 +1085,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
commit();
|
||||
|
||||
// DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
|
||||
@@ -17,6 +17,9 @@ struct llama_ubatch;
|
||||
struct llama_kv_cache : public llama_memory_i {
|
||||
using llama_memory_i::llama_memory_i;
|
||||
|
||||
virtual void restore() = 0; // call if batch processing fails - restores the cache state
|
||||
virtual void commit() = 0; // call after successful batch processing - clears any pending state
|
||||
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
|
||||
@@ -25,9 +28,24 @@ struct llama_kv_cache : public llama_memory_i {
|
||||
bool get_can_edit() const override { return get_can_shift(); }
|
||||
};
|
||||
|
||||
struct llama_kv_cache_guard {
|
||||
llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
|
||||
|
||||
~llama_kv_cache_guard() {
|
||||
kv->restore();
|
||||
}
|
||||
|
||||
void commit() {
|
||||
kv->commit();
|
||||
}
|
||||
|
||||
private:
|
||||
llama_kv_cache * kv;
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
llama_pos delta = 0;
|
||||
int32_t src = -1; // used by recurrent state models to copy states
|
||||
int32_t tail = -1;
|
||||
|
||||
@@ -46,17 +64,6 @@ struct llama_kv_cell {
|
||||
}
|
||||
};
|
||||
|
||||
// a structure holds information about the slot found in llama_kv_cache_find_slot
|
||||
struct llama_kv_cache_slot_info {
|
||||
std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
|
||||
bool found = false; // the slot was found
|
||||
|
||||
explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
|
||||
llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
|
||||
|
||||
operator bool() const { return found; }
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
// TODO: pimpl
|
||||
// TODO: add notion of max sequences
|
||||
@@ -93,6 +100,9 @@ public:
|
||||
void clear() override;
|
||||
void defrag() override;
|
||||
|
||||
virtual void restore() override;
|
||||
virtual void commit() override;
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
@@ -105,10 +115,9 @@ public:
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
// updates the cache head
|
||||
// returns a structure holding information about the slot found
|
||||
// Note: On success, it's important that cache.head points
|
||||
// to the first cell of the slot.
|
||||
llama_kv_cache_slot_info find_slot(const llama_ubatch & batch);
|
||||
bool find_slot(const llama_ubatch & batch);
|
||||
|
||||
// TODO: maybe not needed
|
||||
uint32_t get_padding(const llama_cparams & cparams) const;
|
||||
@@ -128,7 +137,19 @@ public:
|
||||
// return true if cells have been moved
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
|
||||
// state save/load
|
||||
// commit/restore cache
|
||||
|
||||
// half-open range [c0, c1) of cache cells reserved by one find_slot() call
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0; // one past the last cell of the range
|
||||
};
|
||||
|
||||
// pending cell updates that are not yet committed
|
||||
struct {
|
||||
std::vector<slot_range> ranges; // appended to by find_slot(), emptied by commit(), rolled back by restore()
|
||||
} pending;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
|
||||
@@ -183,59 +204,6 @@ private:
|
||||
// using llama_kv_cache_unified::llama_kv_cache_unified;
|
||||
//};
|
||||
|
||||
//
|
||||
// kv cache restore
|
||||
//
|
||||
|
||||
// saves the kv_cache state for future recovery.
|
||||
// used to rollback llama_kv_cache_find_slot changes.
|
||||
struct llama_kv_slot_restorer {
|
||||
struct llama_kv_cache_state {
|
||||
uint32_t head = 0;
|
||||
uint32_t n = 0;
|
||||
} old_state;
|
||||
|
||||
// for non-recurrent models only
|
||||
// list of slots to restore
|
||||
std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
|
||||
|
||||
bool do_restore = false;
|
||||
|
||||
llama_kv_cache_unified & cache;
|
||||
|
||||
explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) {
|
||||
old_state.head = cache.head;
|
||||
old_state.n = cache.n;
|
||||
}
|
||||
|
||||
// saves a slot information for future restoration
|
||||
void save(const llama_kv_cache_slot_info & slot) {
|
||||
if (slot) {
|
||||
do_restore = true;
|
||||
if (slot.boundaries.first != slot.boundaries.second) {
|
||||
slot_boundaries.push_back(slot.boundaries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// must be explicitly called to restore the kv_cache state
|
||||
// and rollback changes from all llama_kv_cache_find_slot calls
|
||||
void restore() {
|
||||
if (do_restore) {
|
||||
cache.head = old_state.head;
|
||||
cache.n = old_state.n;
|
||||
|
||||
if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
|
||||
cache.seq_rm(-1, -1, -1);
|
||||
} else {
|
||||
for (auto & slot : slot_boundaries) {
|
||||
cache.seq_rm(-1, slot.first, slot.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: maybe become part of the public llama_kv_cache in the future
|
||||
int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv);
|
||||
|
||||
|
||||
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
const struct llama_model_kv_override * param_overrides_p) {
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
int trace = 0;
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
trace = atoi(getenv("LLAMA_TRACE"));
|
||||
@@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader(
|
||||
}
|
||||
}
|
||||
|
||||
tensor_buft_overrides = param_tensor_buft_overrides_p;
|
||||
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
@@ -600,7 +603,9 @@ llama_model_loader::llama_model_loader(
|
||||
|
||||
if (trace > 0) {
|
||||
const uint16_t sid = w.idx;
|
||||
LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
|
||||
LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
|
||||
sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
|
||||
ggml_nbytes(tensor)/1024.0f/1024.0f);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -77,8 +77,9 @@ struct llama_model_loader {
|
||||
|
||||
llama_mmaps mappings;
|
||||
|
||||
std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
|
||||
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
|
||||
std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
|
||||
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
|
||||
const llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
gguf_context_ptr meta;
|
||||
std::vector<ggml_context_ptr> contexts;
|
||||
@@ -95,7 +96,8 @@ struct llama_model_loader {
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
const struct llama_model_kv_override * param_overrides_p);
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
@@ -378,9 +379,12 @@ struct llama_model::impl {
|
||||
layer_dev dev_input = {};
|
||||
layer_dev dev_output = {};
|
||||
std::vector<layer_dev> dev_layer;
|
||||
|
||||
bool has_tensor_overrides;
|
||||
};
|
||||
|
||||
// Stores a copy of the model parameters and creates the private implementation object.
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
|
||||
// overrides count as present when the list pointer is non-null and its first entry has a non-null pattern
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
||||
}
|
||||
|
||||
llama_model::~llama_model() {}
|
||||
@@ -1571,9 +1575,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
// check overrides
|
||||
if (ml.tensor_buft_overrides) {
|
||||
std::string tensor_name = tn.str();
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
if (std::regex_search(tensor_name, pattern)) {
|
||||
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
|
||||
buft = overrides->buft;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!buft) {
|
||||
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
||||
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
||||
if (!buft) {
|
||||
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
// avoid using a host buffer when using mmap
|
||||
@@ -4151,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
|
||||
});
|
||||
}
|
||||
|
||||
// Returns the flag computed in the constructor from llama_model_params::tensor_buft_overrides.
bool llama_model::has_tensor_overrides() const {
|
||||
return pimpl->has_tensor_overrides;
|
||||
}
|
||||
|
||||
const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
||||
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
|
||||
[name](const std::pair<std::string, ggml_tensor *> & it) {
|
||||
@@ -12319,6 +12344,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
llama_model_params llama_model_default_params() {
|
||||
llama_model_params result = {
|
||||
/*.devices =*/ nullptr,
|
||||
/*.tensor_buft_overrides =*/ nullptr,
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
|
||||
@@ -382,6 +382,8 @@ struct llama_model {
|
||||
|
||||
ggml_backend_buffer_type_t select_buft(int il) const;
|
||||
|
||||
bool has_tensor_overrides() const;
|
||||
|
||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
|
||||
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
||||
@@ -411,7 +411,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
// "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
|
||||
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||
// FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
|
||||
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -92,7 +92,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
||||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
||||
@@ -4516,6 +4516,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
}
|
||||
}
|
||||
|
||||
for (int kv : { 4096, 8192, 16384, }) {
|
||||
for (int hs : { 64, 128, }) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
|
||||
}
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user