Compare commits

...

17 Commits
b3687 ... b3704

Author SHA1 Message Date
slaren
a249843d89 common : restore --n-gpu-layers (#9371) 2024-09-08 16:44:42 +02:00
slaren
19f4a7b296 llama : refactor samplers internal implementation (#9370) 2024-09-08 15:52:07 +02:00
Neo Zhang Jianyu
2a358fb0c4 [SYCL] add check malloc result on device (#9346)
* add check malloc result on device

* update for review comments, check all malloc_device() result

---------

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
2024-09-08 19:05:29 +08:00
slaren
eae597182c llama : sanitize tokens in the upper bound (#9359) 2024-09-08 12:41:51 +02:00
Xuan Son Nguyen
00b02bb249 imatrix : fix arg parser for imatrix (#9366)
* imatrix : fix arg parser

* beautify printing first arg
2024-09-08 12:12:17 +02:00
Georgi Gerganov
a876861455 metal : update support condition for im2col + fix warning (#0) 2024-09-08 11:05:55 +03:00
Georgi Gerganov
385decbd63 sync : ggml 2024-09-08 11:05:55 +03:00
Georgi Gerganov
60a3107ccd scripts : option to increase git patch context 2024-09-08 11:05:55 +03:00
Salvatore Mesoraca
406c1a32a1 vulkan: add dryrun support to sin and cos ops (ggml/947)
sin and cos failed test-backend-ops because they
tried to dereference a context pointer that is null
on dry runs.

This commit prevents that segfault.

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
2024-09-08 11:05:55 +03:00
Salvatore Mesoraca
9cb9260861 vulkan: correctly report support for OP_CONT (ggml/946)
test-backend-ops fails because ggml_cont aborts
when invoked passing an unsupported type.

This commit makes ggml_cont tests pass

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
2024-09-08 11:05:55 +03:00
Johannes Gäßler
202084d31d tests: add gradient tests for all backends (ggml/932)
* tests: add gradient checking to test-backend-ops

* remove old comment

* reorder includes

* adjust SIN/COS parameters

* add documentation, use supports_op if possible
2024-09-08 11:05:55 +03:00
Johannes Gäßler
dbbebcab33 ggml: fix ggml_graph_cpy undefined behavior (ggml/943) 2024-09-08 11:05:55 +03:00
Georgi Gerganov
ba1cf846ed cann : fix doxy (ggml/0) 2024-09-08 11:05:55 +03:00
Mengqing Cao
d2d3200b38 cann : add Ascend NPU support (whisper/2336)
* enable Ascend NPU in src/whisper.cpp
  * sync test-backend-ops with llama.cpp
2024-09-08 11:05:55 +03:00
Georgi Gerganov
51d964a4ef cuda : mark BF16 CONT as unsupported 2024-09-08 11:05:55 +03:00
Salvatore Mesoraca
efe6a83e30 ggml : fix cont with transposed tensors when one dimension is 1 (ggml/934)
* ggml_cont: fix issue with transposed tensors when one dimension is 1

when using multiple threads, it is not enough
to check for the tensors to be contiguous for
ggml_compute_forward_dup_same_cont to work correctly.
The tensors strides also need to match.

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Add ggml_cont tests

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Remove dead code

it isn't possible to reach this code because
all these functions are invoked by ggml_compute_forward_dup
if and only if src0->type != dst->type

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Make ggml_compute_forward_dup_same_cont work with contiguous tensors

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

---------

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-09-08 11:05:55 +03:00
Kevin Gibbons
fbb7fcffbc llama : set attrs of mislabelled EOT/EOM tokens (#9348) 2024-09-08 08:51:00 +03:00
23 changed files with 2046 additions and 866 deletions

View File

@@ -600,7 +600,9 @@ std::string llama_arg::to_string() {
if (args.size() == 1) {
ss << arg;
} else {
ss << format("%-7s", arg) << ", ";
// first arg is usually abbreviation, we need padding to make it more beautiful
auto tmp = std::string(arg) + ", ";
ss << format("%-7s", tmp.c_str());
}
} else {
ss << arg << (arg != args.back() ? ", " : "");
@@ -1654,7 +1656,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
}
));
add_opt(llama_arg(
{"-ngl", "--gpu-layers"}, "N",
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
[](gpt_params & params, int value) {
params.n_gpu_layers = value;
@@ -1665,7 +1667,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(llama_arg(
{"-ngld", "--gpu-layers-draft"}, "N",
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
[](gpt_params & params, int value) {
params.n_gpu_layers_draft = value;
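A quick illustration of the padding fix in the first hunk above: appending the comma before applying the "%-7s" width keeps short abbreviations aligned in the help output. The snippet below is illustrative only (plain printf instead of the repo's format() helper):

    #include <cstdio>
    #include <string>

    int main() {
        // "%-7s" left-justifies within a 7-character field, so the padding now
        // happens after the comma is attached to the abbreviation:
        std::printf("[%-7s]\n", (std::string("-m")   + ", ").c_str()); // prints [-m,    ]
        std::printf("[%-7s]\n", (std::string("-ngl") + ", ").c_str()); // prints [-ngl,  ]
        return 0;
    }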

View File

@@ -577,7 +577,7 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}

View File

@@ -681,8 +681,8 @@ extern "C" {
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used;
struct ggml_tensor ** keys;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
// computation graph
@@ -1272,7 +1272,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1282,19 +1282,19 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1302,7 +1302,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1310,7 +1310,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(

View File

@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}

View File

@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "llama.cpp"
PROJECT_NAME = "ggml"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
PROJECT_BRIEF = "llama inference engine"
PROJECT_BRIEF = "Tensor library for machine learning"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55

View File

@@ -27,6 +27,7 @@
#include "ggml-cuda/rope.cuh"
#include "ggml-cuda/scale.cuh"
#include "ggml-cuda/softmax.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
@@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
@@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG:
ggml_cuda_op_neg(ctx, dst);
break;
case GGML_UNARY_OP_GELU:
ggml_cuda_op_gelu(ctx, dst);
break;
@@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cuda_op_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
@@ -2748,6 +2756,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
@@ -2877,6 +2886,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
@@ -2887,14 +2897,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
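These supports_op additions pair with the new gradient tests from commit 202084d31d ("use supports_op if possible"): a test harness can ask the backend up front whether an op is implemented instead of running into an unimplemented case. A hedged sketch of that gating, using the public ggml_backend_supports_op wrapper (the surrounding test scaffolding is illustrative):

    struct ggml_tensor * out = ggml_sum(ctx, a);   // e.g. the newly wired GGML_OP_SUM
    if (!ggml_backend_supports_op(backend, out)) {
        printf("  %s: not supported by this backend, skipping\n", ggml_op_name(out->op));
        return;
    }
    // ... otherwise build the graph, run it, and compare against the CPU reference ...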

View File

@@ -1,6 +1,6 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include "sum.cuh"
#include <cmath>
#include <cstdint>
@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}

ggml/src/ggml-cuda/sum.cu (new file, 41 lines)
View File

@@ -0,0 +1,41 @@
#include "sumrows.cuh"
#include "sum.cuh"
#include <cstdint>
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
// Use (inefficient) sum_rows implementation as a fallback.
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
const int64_t ne = ggml_nelements(src0);
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
}
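The new sum.cu relies on CUB's two-call reduction protocol: the first DeviceReduce::Sum call with a null workspace pointer only reports how much temporary storage is needed, and the second call performs the reduction. A self-contained sketch of the same pattern (plain cudaMalloc instead of the ggml CUDA pool; error checking omitted):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    static void sum_f32_sketch(const float * d_in, float * d_out, int n, cudaStream_t stream) {
        size_t tmp_size = 0;
        // 1st call: d_temp_storage == nullptr, so CUB only computes tmp_size
        cub::DeviceReduce::Sum(nullptr, tmp_size, d_in, d_out, n, stream);
        void * d_tmp = nullptr;
        cudaMalloc(&d_tmp, tmp_size);
        // 2nd call: same arguments plus the workspace, performs the actual sum
        cub::DeviceReduce::Sum(d_tmp, tmp_size, d_in, d_out, n, stream);
        cudaFree(d_tmp);   // cudaFree synchronizes, so this is safe for a sketch
    }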

View File

@@ -0,0 +1,5 @@
#include "common.cuh"
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,5 +1,15 @@
#include "unary.cuh"
static __global__ void neg_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = -x[i];
}
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
dst[i] = cosf(x[i]);
}
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
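A worked example of the launch-size arithmetic used by neg_f32_cuda above (values are illustrative):

    const int k          = 1000;                              // number of elements
    const int block_size = 256;                               // CUDA_NEG_BLOCK_SIZE
    const int num_blocks = (k + block_size - 1) / block_size; // ceil(1000/256) = 4
    // 4 blocks * 256 threads = 1024 threads; the 24 extra threads hit the
    // `if (i >= k) return;` guard at the top of the kernel and do nothing.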

View File

@@ -1,5 +1,6 @@
#include "common.cuh"
#define CUDA_NEG_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
@@ -12,6 +13,8 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return ctx->support_simdgroup_reduction;
case GGML_OP_NORM:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
return true;
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
case GGML_OP_POOL_2D:
return false;

View File

@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
SYCL_CHECK(
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr)));
if (!ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
return nullptr;
}
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
return nullptr;
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
*/
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
size, *stream)));
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
throw std::runtime_error(err_buf);
}
// set padding to 0 to avoid possible NaN values
if (size > original_size) {
/*
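All three hunks in this file add the same guard; a condensed sketch of the pattern, assuming sycl::malloc_device signals failure by returning a null pointer (which is what the new checks rely on; the error handling is illustrative):

    void * ptr = sycl::malloc_device(nbytes, queue);
    if (ptr == nullptr) {
        fprintf(stderr, "%s: can't malloc %zu Bytes memory on device\n", __func__, nbytes);
        return nullptr;   // or throw std::runtime_error(...), as the split-buffer path does
    }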

View File

@@ -4616,7 +4616,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4626,10 +4626,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4639,7 +4639,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -5783,11 +5783,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node);
ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node);
ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_CLAMP:
@@ -6602,6 +6602,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
return false;
}
} break;
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_DUP:
{
@@ -6642,7 +6643,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:

View File

@@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat(
bool is_node = false;
if (a->grad || b->grad) {
GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu(
bool is_node = false;
if (!inplace && (a->grad)) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
@@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
GGML_ASSERT(offset < (size_t)(1 << 30));
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
@@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back(
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
bool is_node = false;
if (a->grad) {
is_node = false; // TODO: implement backward
GGML_ASSERT(false && "backwards pass not implemented");
is_node = false;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
@@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
@@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;
if (a->grad) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont(
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
const size_t nb00 = src0->nb[0];
const size_t nb0 = dst->nb[0];
const size_t nb0 = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont(
if (ie0 < ie1) {
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * ggml_type_size(src0->type));
((char *) src0->data + ie0*nb0),
(ie1 - ie0) * nb0);
}
}
@@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes(
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
GGML_TENSOR_UNARY_OP_LOCALS;
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32(
return;
}
assert(ggml_is_scalar(dst));
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
@@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
nb1, nb2, nb3, offset);
}
@@ -18448,9 +18432,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
memcpy(&offset, tensor->op_params, sizeof(offset));
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
if (src0->type != src0->grad->type) {
// gradient is typically F32, but src0 could be other type
@@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
}
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
if (src->visited_hash_set.keys[i]) {
// copy all hashset keys (tensors) that are in use
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
}
}
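For context on the ggml_compute_forward_dup_same_cont change above: the function splits a flat element range across threads and issues one memcpy per thread, so the byte stride has to be the element size of the shared type rather than whatever nb[0] happens to hold. A simplified restatement of that per-thread partition (names follow the hunk; MIN is ggml's helper macro):

    const size_t  elem = ggml_type_size(src0->type);      // src0->type == dst->type here
    const int64_t n    = ggml_nelements(dst);
    const int64_t per  = (n + nth - 1) / nth;              // elements per thread, rounded up
    const int64_t ie0  = per * ith;                        // this thread's first element
    const int64_t ie1  = MIN(ie0 + per, n);                // one past its last element
    if (ie0 < ie1) {
        memcpy((char *) dst->data  + ie0*elem,
               (char *) src0->data + ie0*elem,
               (ie1 - ie0) * elem);
    }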

View File

@@ -5,7 +5,7 @@
# Usage:
#
# $ cd /path/to/llama.cpp
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
#
set -e
@@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
echo "Syncing ggml changes since commit $lc"
to_skip=""
if [ "$1" == "-skip" ]; then
to_skip=$2
fi
# context for git patches in number of lines
ctx="8"
while [ "$1" != "" ]; do
case $1 in
-skip )
shift
to_skip=$1
;;
-C )
shift
ctx=$1
;;
esac
shift
done
cd $SRC_GGML
@@ -52,7 +66,7 @@ while read c; do
fi
fi
git format-patch -k $c~1..$c --stdout -- \
git format-patch -U${ctx} -k $c~1..$c --stdout -- \
CMakeLists.txt \
src/CMakeLists.txt \
cmake/FindSIMD.cmake \
@@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
> ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch
git am ggml-src.patch
git am -C${ctx} ggml-src.patch
rm -v $SRC_LLAMA/ggml-src.patch
fi

View File

@@ -1 +1 @@
28b7633d733bbeef0026570fbc61c79c5e9aa5ae
10e83a412717c20d57ba19f025248e18e43addf3

View File

@@ -101,6 +101,10 @@ struct ring_buffer {
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
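For context, a minimal self-contained version of the ring-buffer push that the new guard protects (member names mirror the hunk; the struct itself is an illustrative reconstruction, not the repo's full class):

    #include <stdexcept>
    #include <vector>

    template <typename T>
    struct ring_buffer_sketch {
        std::vector<T> data;   // backing storage, resized to `capacity` up front
        size_t capacity = 0;
        size_t first    = 0;   // index of the oldest element
        size_t pos      = 0;   // index where the next element is written
        size_t sz       = 0;   // number of elements currently stored

        void push_back(const T & value) {
            if (capacity == 0) {
                throw std::runtime_error("ring buffer: capacity is zero");
            }
            if (sz == capacity) {
                first = (first + 1) % capacity;  // buffer full: drop the oldest element
            } else {
                sz++;
            }
            data[pos] = value;
            pos = (pos + 1) % capacity;
        }
    };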

File diff suppressed because it is too large.

View File

@@ -23,16 +23,6 @@ struct llama_sampler_chain {
mutable int32_t n_sample;
};
using llama_token_cnt = std::unordered_map<llama_token, int>;
// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
llama_token_data_array * cur_p,
const llama_token_cnt & token_count,
float penalty_repeat,
float penalty_freq,
float penalty_present);
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,

View File

@@ -6399,6 +6399,11 @@ static void llm_load_vocab(
)
) {
vocab.special_eot_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
break;
}
}
@@ -6412,6 +6417,11 @@ static void llm_load_vocab(
const auto & t = vocab.token_to_id.find("<|eom_id|>");
if (t != vocab.token_to_id.end()) {
vocab.special_eom_id = t->second;
if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t->first.c_str());
vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
@@ -16067,7 +16077,7 @@ static int llama_decode_internal(
}
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
@@ -16366,7 +16376,7 @@ static int llama_encode_internal(
}
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
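Both decode and encode now validate the token ids against the upper bound, not just their sign; anything at or above n_vocab would otherwise index past the vocabulary-sized tables. A condensed sketch of the check (n_vocab stands in for lctx.model.vocab.n_vocab; the rest follows the hunks above):

    for (uint32_t i = 0; i < n_tokens; ++i) {
        const llama_token tok = batch.token[i];
        if (tok < 0 || (uint32_t) tok >= n_vocab) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, tok);
            return -1;   // reject the whole batch rather than read out of range
        }
    }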

File diff suppressed because it is too large.

View File

@@ -148,15 +148,17 @@ static void test_penalties(
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_cnt token_count;
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
for (size_t i = 0; i < last_tokens.size(); i++) {
token_count[last_tokens[i]]++;
llama_sampler_accept(sampler, last_tokens[i]);
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
APPLY(sampler, &cur_p);
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
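The updated test exercises the refactored sampler API end to end instead of the removed llama_sampler_penalties_impl: the penalties sampler is created once, fed the history via llama_sampler_accept, and then applied to the candidate array. A hedged sketch of that flow outside the test harness (the APPLY macro above appears to wrap llama_sampler_apply; the variable setup here is illustrative):

    auto * smpl = llama_sampler_init_penalties(
            n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL,
            last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence,
            /*penalize_nl =*/ false, /*ignore_eos =*/ false);

    for (llama_token tok : last_tokens) {
        llama_sampler_accept(smpl, tok);        // the sampler tracks the history itself now
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
    llama_sampler_apply(smpl, &cur_p);          // penalties are applied in place on cur_p
    llama_sampler_free(smpl);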