mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-03 15:44:06 +00:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a249843d89 | ||
|
|
19f4a7b296 | ||
|
|
2a358fb0c4 | ||
|
|
eae597182c | ||
|
|
00b02bb249 | ||
|
|
a876861455 | ||
|
|
385decbd63 | ||
|
|
60a3107ccd | ||
|
|
406c1a32a1 | ||
|
|
9cb9260861 | ||
|
|
202084d31d | ||
|
|
dbbebcab33 | ||
|
|
ba1cf846ed | ||
|
|
d2d3200b38 | ||
|
|
51d964a4ef | ||
|
|
efe6a83e30 | ||
|
|
fbb7fcffbc |
@@ -600,7 +600,9 @@ std::string llama_arg::to_string() {
|
||||
if (args.size() == 1) {
|
||||
ss << arg;
|
||||
} else {
|
||||
ss << format("%-7s", arg) << ", ";
|
||||
// first arg is usually abbreviation, we need padding to make it more beautiful
|
||||
auto tmp = std::string(arg) + ", ";
|
||||
ss << format("%-7s", tmp.c_str());
|
||||
}
|
||||
} else {
|
||||
ss << arg << (arg != args.back() ? ", " : "");
|
||||
@@ -1654,7 +1656,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
|
||||
}
|
||||
));
|
||||
add_opt(llama_arg(
|
||||
{"-ngl", "--gpu-layers"}, "N",
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
[](gpt_params & params, int value) {
|
||||
params.n_gpu_layers = value;
|
||||
@@ -1665,7 +1667,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
||||
add_opt(llama_arg(
|
||||
{"-ngld", "--gpu-layers-draft"}, "N",
|
||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||
"number of layers to store in VRAM for the draft model",
|
||||
[](gpt_params & params, int value) {
|
||||
params.n_gpu_layers_draft = value;
|
||||
|
||||
@@ -577,7 +577,7 @@ int main(int argc, char ** argv) {
|
||||
params.logits_all = true;
|
||||
params.verbosity = 1;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -681,8 +681,8 @@ extern "C" {
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used;
|
||||
struct ggml_tensor ** keys;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
// computation graph
|
||||
@@ -1272,7 +1272,7 @@ extern "C" {
|
||||
size_t nb1,
|
||||
size_t nb2,
|
||||
size_t nb3,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||
GGML_API struct ggml_tensor * ggml_set_inplace(
|
||||
@@ -1282,19 +1282,19 @@ extern "C" {
|
||||
size_t nb1,
|
||||
size_t nb2,
|
||||
size_t nb3,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
// b -> view(a,offset,nb1,nb2,3), return modified a
|
||||
GGML_API struct ggml_tensor * ggml_set_2d(
|
||||
@@ -1302,7 +1302,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
size_t nb1,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
||||
@@ -1310,7 +1310,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
size_t nb1,
|
||||
size_t offset);
|
||||
size_t offset); // in bytes
|
||||
|
||||
// a -> b, return view(b)
|
||||
GGML_API struct ggml_tensor * ggml_cpy(
|
||||
|
||||
@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||
case GGML_OP_ROPE_BACK:
|
||||
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
|
||||
# title of most generated pages and in a few other places.
|
||||
# The default value is: My Project.
|
||||
|
||||
PROJECT_NAME = "llama.cpp"
|
||||
PROJECT_NAME = "ggml"
|
||||
|
||||
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
|
||||
# could be handy for archiving the generated documentation or if some version
|
||||
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
|
||||
# for a project that appears at the top of each page and should give viewer a
|
||||
# quick idea about the purpose of the project. Keep the description short.
|
||||
|
||||
PROJECT_BRIEF = "llama inference engine"
|
||||
PROJECT_BRIEF = "Tensor library for machine learning"
|
||||
|
||||
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
|
||||
# in the documentation. The maximum height of the logo should not exceed 55
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/sum.cuh"
|
||||
#include "ggml-cuda/sumrows.cuh"
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
@@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
ggml_cuda_dup(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1: // TODO: more efficient implementation
|
||||
ggml_cuda_op_add(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SUB:
|
||||
@@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_NEG:
|
||||
ggml_cuda_op_neg(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
ggml_cuda_op_gelu(ctx, dst);
|
||||
break;
|
||||
@@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_cuda_op_pool2d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SUM:
|
||||
ggml_cuda_op_sum(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
ggml_cuda_op_sum_rows(ctx, dst);
|
||||
break;
|
||||
@@ -2748,6 +2756,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
@@ -2877,6 +2886,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
@@ -2887,14 +2897,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_IM2COL:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "common.cuh"
|
||||
#include "cross-entropy-loss.cuh"
|
||||
#include "sumrows.cuh"
|
||||
#include "sum.cuh"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
|
||||
// Combine results from individual blocks:
|
||||
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
|
||||
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
||||
}
|
||||
|
||||
41
ggml/src/ggml-cuda/sum.cu
Normal file
41
ggml/src/ggml-cuda/sum.cu
Normal file
@@ -0,0 +1,41 @@
|
||||
#include "sumrows.cuh"
|
||||
#include "sum.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
|
||||
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
size_t tmp_size = 0;
|
||||
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
|
||||
#else
|
||||
// Use (inefficient) sum_rows implementation as a fallback.
|
||||
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
|
||||
sum_rows_f32_cuda(x, dst, ne, 1, stream);
|
||||
GGML_UNUSED(pool);
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
|
||||
}
|
||||
5
ggml/src/ggml-cuda/sum.cuh
Normal file
5
ggml/src/ggml-cuda/sum.cuh
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
|
||||
|
||||
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,5 +1,15 @@
|
||||
#include "unary.cuh"
|
||||
|
||||
static __global__ void neg_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = -x[i];
|
||||
}
|
||||
|
||||
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
@@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
|
||||
dst[i] = cosf(x[i]);
|
||||
}
|
||||
|
||||
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
|
||||
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
@@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
||||
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_NEG_BLOCK_SIZE 256
|
||||
#define CUDA_GELU_BLOCK_SIZE 256
|
||||
#define CUDA_SILU_BLOCK_SIZE 256
|
||||
#define CUDA_TANH_BLOCK_SIZE 256
|
||||
@@ -12,6 +13,8 @@
|
||||
#define CUDA_SIN_BLOCK_SIZE 256
|
||||
#define CUDA_COS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
|
||||
return ctx->support_simdgroup_reduction;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_IM2COL:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
return false;
|
||||
|
||||
@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
|
||||
look_ahead_size, *qptr)));
|
||||
if (!ptr) {
|
||||
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
|
||||
@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
void * dev_ptr;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
|
||||
size, *stream)));
|
||||
if (!dev_ptr) {
|
||||
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
|
||||
return nullptr;
|
||||
}
|
||||
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
|
||||
}
|
||||
@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
|
||||
size, *stream)));
|
||||
|
||||
if (!buf) {
|
||||
char err_buf[1024];
|
||||
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
|
||||
throw std::runtime_error(err_buf);
|
||||
}
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
/*
|
||||
|
||||
@@ -4616,7 +4616,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
@@ -4626,10 +4626,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
});
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
@@ -4639,7 +4639,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
});
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
@@ -5783,11 +5783,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_vk_sin(ctx, compute_ctx, src0, node);
|
||||
ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_vk_cos(ctx, compute_ctx, src0, node);
|
||||
ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
@@ -6602,6 +6602,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
@@ -6642,7 +6643,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_ARGSORT:
|
||||
|
||||
@@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat(
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad || b->grad) {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
@@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu(
|
||||
bool is_node = false;
|
||||
|
||||
if (!inplace && (a->grad)) {
|
||||
GGML_ABORT("fatal error"); // TODO: not implemented
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
@@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl(
|
||||
// make a view of the destination
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
GGML_ASSERT(offset < (size_t)(1 << 30));
|
||||
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
@@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back(
|
||||
GGML_ASSERT(ggml_is_vector(b));
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
||||
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
|
||||
|
||||
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
is_node = false; // TODO: implement backward
|
||||
GGML_ASSERT(false && "backwards pass not implemented");
|
||||
is_node = false;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||
@@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back(
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
result->src[2] = c;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort(
|
||||
enum ggml_sort_order order) {
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
GGML_ABORT("fatal error"); // TODO: not implemented
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, (int32_t) order);
|
||||
@@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont(
|
||||
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
const size_t nb00 = src0->nb[0];
|
||||
const size_t nb0 = dst->nb[0];
|
||||
const size_t nb0 = ggml_type_size(src0->type);
|
||||
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
@@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont(
|
||||
if (ie0 < ie1) {
|
||||
memcpy(
|
||||
((char *) dst->data + ie0*nb0),
|
||||
((char *) src0->data + ie0*nb00),
|
||||
(ie1 - ie0) * ggml_type_size(src0->type));
|
||||
((char *) src0->data + ie0*nb0),
|
||||
(ie1 - ie0) * nb0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16(
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
||||
ggml_compute_forward_dup_same_cont(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by rows
|
||||
const int nr = ne01;
|
||||
// number of rows per thread
|
||||
@@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16(
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
||||
ggml_compute_forward_dup_same_cont(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by rows
|
||||
const int nr = ne01;
|
||||
// number of rows per thread
|
||||
@@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32(
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
||||
ggml_compute_forward_dup_same_cont(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by rows
|
||||
const int nr = ne01;
|
||||
// number of rows per thread
|
||||
@@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes(
|
||||
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
ggml_compute_forward_dup_same_cont(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
const size_t type_size = ggml_type_size(src0->type);
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
@@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
assert(ggml_is_scalar(dst));
|
||||
|
||||
|
||||
assert(ggml_is_scalar(dst));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
|
||||
@@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
if (src0->grad || src1->grad) {
|
||||
GGML_ASSERT(src0->type == tensor->type);
|
||||
GGML_ASSERT(tensor->grad->type == tensor->type);
|
||||
GGML_ASSERT(tensor->grad->type == src1->grad->type);
|
||||
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
|
||||
|
||||
tensor_grad_view = ggml_view_4d(ctx,
|
||||
tensor->grad,
|
||||
src1->grad->ne[0],
|
||||
src1->grad->ne[1],
|
||||
src1->grad->ne[2],
|
||||
src1->grad->ne[3],
|
||||
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
nb1, nb2, nb3, offset);
|
||||
}
|
||||
|
||||
@@ -18448,9 +18432,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
|
||||
memcpy(&offset, tensor->op_params, sizeof(offset));
|
||||
|
||||
size_t nb1 = tensor->nb[1];
|
||||
size_t nb2 = tensor->nb[2];
|
||||
size_t nb3 = tensor->nb[3];
|
||||
size_t nb1 = tensor->nb[1];
|
||||
size_t nb2 = tensor->nb[2];
|
||||
size_t nb3 = tensor->nb[3];
|
||||
|
||||
if (src0->type != src0->grad->type) {
|
||||
// gradient is typically F32, but src0 could be other type
|
||||
@@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
||||
if (src->visited_hash_set.keys[i]) {
|
||||
// copy all hashset keys (tensors) that are in use
|
||||
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
|
||||
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# Usage:
|
||||
#
|
||||
# $ cd /path/to/llama.cpp
|
||||
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
|
||||
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
|
||||
#
|
||||
|
||||
set -e
|
||||
@@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
|
||||
echo "Syncing ggml changes since commit $lc"
|
||||
|
||||
to_skip=""
|
||||
if [ "$1" == "-skip" ]; then
|
||||
to_skip=$2
|
||||
fi
|
||||
|
||||
# context for git patches in number of lines
|
||||
ctx="8"
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-skip )
|
||||
shift
|
||||
to_skip=$1
|
||||
;;
|
||||
-C )
|
||||
shift
|
||||
ctx=$1
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
cd $SRC_GGML
|
||||
|
||||
@@ -52,7 +66,7 @@ while read c; do
|
||||
fi
|
||||
fi
|
||||
|
||||
git format-patch -k $c~1..$c --stdout -- \
|
||||
git format-patch -U${ctx} -k $c~1..$c --stdout -- \
|
||||
CMakeLists.txt \
|
||||
src/CMakeLists.txt \
|
||||
cmake/FindSIMD.cmake \
|
||||
@@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
> ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
git am ggml-src.patch
|
||||
git am -C${ctx} ggml-src.patch
|
||||
|
||||
rm -v $SRC_LLAMA/ggml-src.patch
|
||||
fi
|
||||
|
||||
@@ -1 +1 @@
|
||||
28b7633d733bbeef0026570fbc61c79c5e9aa5ae
|
||||
10e83a412717c20d57ba19f025248e18e43addf3
|
||||
|
||||
@@ -101,6 +101,10 @@ struct ring_buffer {
|
||||
}
|
||||
|
||||
void push_back(const T & value) {
|
||||
if (capacity == 0) {
|
||||
throw std::runtime_error("ring buffer: capacity is zero");
|
||||
}
|
||||
|
||||
if (sz == capacity) {
|
||||
// advance the start when buffer is full
|
||||
first = (first + 1) % capacity;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,16 +23,6 @@ struct llama_sampler_chain {
|
||||
mutable int32_t n_sample;
|
||||
};
|
||||
|
||||
using llama_token_cnt = std::unordered_map<llama_token, int>;
|
||||
|
||||
// TODO: tmp exposed until test-sampling is fixed
|
||||
void llama_sampler_penalties_impl(
|
||||
llama_token_data_array * cur_p,
|
||||
const llama_token_cnt & token_count,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * grammar_str,
|
||||
|
||||
@@ -6399,6 +6399,11 @@ static void llm_load_vocab(
|
||||
)
|
||||
) {
|
||||
vocab.special_eot_id = t.second;
|
||||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||
__func__, t.first.c_str());
|
||||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -6412,6 +6417,11 @@ static void llm_load_vocab(
|
||||
const auto & t = vocab.token_to_id.find("<|eom_id|>");
|
||||
if (t != vocab.token_to_id.end()) {
|
||||
vocab.special_eom_id = t->second;
|
||||
if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||
__func__, t->first.c_str());
|
||||
vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -16067,7 +16077,7 @@ static int llama_decode_internal(
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch_all.token[i] < 0) {
|
||||
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
||||
return -1;
|
||||
}
|
||||
@@ -16366,7 +16376,7 @@ static int llama_encode_internal(
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0) {
|
||||
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -148,15 +148,17 @@ static void test_penalties(
|
||||
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_cnt token_count;
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
|
||||
|
||||
auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
|
||||
|
||||
for (size_t i = 0; i < last_tokens.size(); i++) {
|
||||
token_count[last_tokens[i]]++;
|
||||
llama_sampler_accept(sampler, last_tokens[i]);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
|
||||
APPLY(llama_sampler_init_softmax(), &cur_p);
|
||||
DUMP(&cur_p);
|
||||
llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
|
||||
APPLY(sampler, &cur_p);
|
||||
APPLY(llama_sampler_init_softmax(), &cur_p);
|
||||
DUMP(&cur_p);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user