Compare commits

...

4 Commits

Author SHA1 Message Date
klosax
f3c3b4b167 Add --rope-scale parameter (#2544)
* common.cpp : Add --rope-scale parameter
* README.md : Add info about using linear rope scaling
2023-08-07 19:07:19 +02:00
Georgi Gerganov
93356bdb7a ggml : mul mat tweaks (#2372)
* ggml : mul mat wip

ggml-ci

* ggml : alternative thread distribution for mul_mat

ggml-ci

* ggml : mul_mat block tiling attempt

* ggml : mul_mat threads yield

ggml-ci
2023-08-07 14:25:58 +03:00
Georgi Gerganov
60baff7c85 ggml : pad result of ggml_nbytes() 2023-08-07 14:24:42 +03:00
Georgi Gerganov
9082b5dfbf ggml : change params pointer (style change) (#2539)
ggml-ci
2023-08-07 13:55:18 +03:00
3 changed files with 106 additions and 68 deletions

View File

@@ -194,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.rope_freq_scale = std::stof(argv[i]);
} else if (arg == "--rope-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
} else if (arg == "--top-p") {
@@ -564,8 +570,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");

View File

@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size
Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
### Keep Prompt
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.

157
ggml.c
View File

@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
//
// is enough, but just in case, adding the second part
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
}
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE,
/*.op_params =*/ {0},
/*.op_params =*/ { 0 },
/*.is_param =*/ false,
/*.grad =*/ NULL,
/*.src =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
}
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
assert(params_size <= GGML_MAX_OP_PARAMS);
memcpy(tensor->op_params, params, params_size);
}
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
result->src[0] = a;
int32_t params[] = { axis0, axis1, axis2, axis3 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
return result;
}
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past, inplace ? 1 : 0 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_INF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past, inplace ? 1 : 0 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_ZERO;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[6] = { n_past, n_dims, mode, n_ctx };
memcpy(params + 4, &freq_base, sizeof(float));
memcpy(params + 4, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float));
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past, n_dims, mode, n_ctx };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
int32_t op_params[3] = { n_past, n_head };
memcpy(op_params + 2, &bias_max, sizeof(float));
ggml_set_op_params(result, &op_params, sizeof(op_params));
ggml_set_op_params(result, op_params, sizeof(op_params));
result->op = GGML_OP_ALIBI;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
float params[] = { min, max };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CLAMP;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6893,7 +6894,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
int32_t params[] = { s0, p0, d0 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6932,7 +6933,7 @@ struct ggml_tensor * ggml_conv_2d(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6985,7 +6986,7 @@ struct ggml_tensor * ggml_pool_1d(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
int32_t params[] = { op, k0, s0, p0 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7022,7 +7023,7 @@ struct ggml_tensor * ggml_pool_2d(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { npx, npy, w };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_WIN_PART;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { w };
ggml_set_op_params(result, &params, sizeof(params));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_WIN_UNPART;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -10730,72 +10731,96 @@ static void ggml_compute_forward_mul_mat(
return;
}
// parallelize by src0 rows
const int64_t dr = (ne01 + nth - 1)/nth;
const int64_t ir10 = dr*ith;
const int64_t ir11 = MIN(ir10 + dr, ne01);
// src1 rows
const int64_t nr1 = ne11*ne12*ne13;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
const int64_t i13 = (ir1/(ne12*ne11));
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
const int64_t i03 = (ir0/(ne02));
// Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
// See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
// GG: this is likely the correct way to broadcast, though need some more thought
// therefore leaving the comments to remind us for now
const int64_t i02 = (i12 / (ne12 / ne02));
// Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
// const int64_t i02 = (ir0 - i03*ne02);
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
// distribute the thread work across the inner or outer loop based on which one is larger
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
for (int64_t ir = ir10; ir < ir11; ++ir) {
vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
}
const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
return;
}
//int64_t t1 = ggml_time_us();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
// printf("\n");
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
//}
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*ne11));
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
// broadcast src0 into src1
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}
// ggml_compute_forward_out_prod
static void ggml_compute_forward_out_prod_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,