Add --rope-scale parameter (#2544 )

* common.cpp : Add --rope-scale parameter * README.md : Add info about using linear rope scaling
ggml : mul mat tweaks (#2372 )
2026-05-07 17:44:09 +00:00 · 2023-08-07 19:07:19 +02:00 · 2023-08-07 14:25:58 +03:00 · 2023-08-07 14:24:42 +03:00 · 2023-08-07 13:55:18 +03:00
3 changed files with 106 additions and 68 deletions
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -194,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
@@ -564,8 +570,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 -   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
--- a/ggml.c
+++ b/ggml.c
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    //
    // is enough, but just in case, adding the second part

-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }

 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        /*.ne           =*/ { 1, 1, 1, 1 },
        /*.nb           =*/ { 0, 0, 0, 0 },
        /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
        /*.is_param     =*/ false,
        /*.grad         =*/ NULL,
        /*.src          =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }

 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
    assert(params_size <= GGML_MAX_OP_PARAMS);
    memcpy(tensor->op_params, params, params_size);
 }
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
    result->src[0] = a;

    int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    return result;
 }
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_DIAG_MASK_INF;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_DIAG_MASK_ZERO;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 4, &freq_base,  sizeof(float));
    memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

    int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE_BACK;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(

    int32_t op_params[3] = { n_past, n_head };
    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));

    result->op   = GGML_OP_ALIBI;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);

    float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_CLAMP;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6893,7 +6894,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_CONV_1D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6932,7 +6933,7 @@ struct ggml_tensor * ggml_conv_2d(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_CONV_2D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6985,7 +6986,7 @@ struct ggml_tensor * ggml_pool_1d(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

    int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_POOL_1D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7022,7 +7023,7 @@ struct ggml_tensor * ggml_pool_2d(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_POOL_2D;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

    int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_WIN_PART;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

    int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_WIN_UNPART;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -10730,72 +10731,96 @@ static void ggml_compute_forward_mul_mat(
        return;
    }

-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
-
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
-
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
-
    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows

-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
+    // distribute the thread work across the inner or outer loop based on which one is larger

-        const char * src0_row = (const char *) src0->data + (  0 + i02*nb02 + i03*nb03     );
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-             : (i11*nb11 + i12*nb12 + i13*nb13));
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;

-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
    }

-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);

-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
+
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
 }

-
 // ggml_compute_forward_out_prod

-
 static void ggml_compute_forward_out_prod_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
Author	SHA1	Message	Date
klosax	f3c3b4b167	Add --rope-scale parameter (#2544 ) * common.cpp : Add --rope-scale parameter * README.md : Add info about using linear rope scaling	2023-08-07 19:07:19 +02:00
Georgi Gerganov	93356bdb7a	ggml : mul mat tweaks (#2372 ) * ggml : mul mat wip ggml-ci * ggml : alternative thread distribution for mul_mat ggml-ci * ggml : mul_mat block tiling attempt * ggml : mul_mat threads yield ggml-ci	2023-08-07 14:25:58 +03:00
Georgi Gerganov	60baff7c85	ggml : pad result of ggml_nbytes()	2023-08-07 14:24:42 +03:00
Georgi Gerganov	9082b5dfbf	ggml : change params pointer (style change) (#2539 ) ggml-ci	2023-08-07 13:55:18 +03:00