Compare commits

...

19 Commits
b3916 ... b3935

Author SHA1 Message Date
Georgi Gerganov
99bd4ac28c llama : infill sampling handle very long tokens (#9924)
* llama : infill sampling handle very long tokens

ggml-ci

* cont : better indices

ggml-ci
2024-10-17 22:32:47 +03:00
Tim Wang
3752217ed5 readme : update bindings list (#9918)
Co-authored-by: Tim Wang <tim.wang@ing.com>
2024-10-17 09:57:14 +03:00
Diego Devesa
f010b77a37 vulkan : add backend registry / device interfaces (#9721)
* vulkan : add backend registry / device interfaces

* llama : print devices used on model load
2024-10-17 02:46:58 +02:00
Gilad S.
2194200278 fix: allocating CPU buffer with size 0 (#9917) 2024-10-17 01:34:22 +02:00
Gilad S.
73afe681aa fix: use vm_allocate to allocate CPU backend buffer on macOS (#9875)
* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages work

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
2024-10-17 00:36:51 +02:00
Daniel Bevenius
9e04102448 llama : suppress conversion from 'size_t' to 'int' (#9046)
* llama : suppress conversion from 'size_t' to 'int'

This commit updates llm_tokenizer_spm.tokenize to suppress/remove the
following warnings that are generated on Windows when using MSVC:

```console
src\llama-vocab.cpp(211,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
src\llama-vocab.cpp(517,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
```

This is done by adding a cast for the size_t returned from
symbols.size(). I believe this is safe as it seems unlikely that
symbols, which stores an entry for each UTF8 character, would become
larger than INT_MAX.

The motivation for this change is to reduce the number of warnings that
are currently generated when building on Windows.

* squash! llama : suppress conversion from 'size_t' to 'int'

Move cast into for loop.
2024-10-16 20:34:28 +03:00
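
To illustrate the kind of change this commit describes, here is a minimal sketch of casting the `size_t` loop bound to `int`; the names are hypothetical and this is not the actual llama.cpp code:

```cpp
#include <vector>

struct llm_symbol { int dummy; };

// cast the size_t loop bound to int to silence MSVC warning C4267,
// assuming the symbol count stays below INT_MAX
static void merge_symbols(const std::vector<llm_symbol> & symbols) {
    for (int i = 1; i < (int) symbols.size(); ++i) {
        // ... compare symbols[i - 1] and symbols[i] ...
        (void) symbols[i];
    }
}
```
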
Daniel Bevenius
dbf18e4de9 llava : fix typo in error message [no ci] (#9884) 2024-10-16 20:24:05 +03:00
Joe Eli McIlvain
66c2c93082 grammar : fix JSON Schema for string regex with top-level alt. (#9903)
Prior to this commit, using a JSON Schema containing a string
with a `pattern` regular expression that uses top-level alternation
(e.g. `"pattern": "^A|B|C|D$"`) would result in invalid JSON
output from the constrained sampling grammar, because it
ended up creating a grammar rule like this for the string:

```
thing ::= "\"" "A" | "B" | "C" | "D" "\"" space
```

Note that this rule will only match a starting quote for the "A" case,
and will only match an ending quote for the "D" case,
so this rule will always produce invalid JSON when used for sampling
(that is, the JSON will always be lacking the starting quote,
the ending quote, or both).

This was fixed in a simple way by adding parentheses to the
generated rule (for all string pattern rules, to keep it simple),
such that the new generated rule looks like this (correct):

```
thing ::= "\"" ("A" | "B" | "C" | "D") "\"" space
```
2024-10-16 19:03:24 +03:00
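
For reference, a schema of the kind described above might look like this (an illustrative example, not taken from the PR):

```json
{
  "type": "object",
  "properties": {
    "thing": { "type": "string", "pattern": "^A|B|C|D$" }
  }
}
```

With the fix, the alternation is wrapped in parentheses, so the opening and closing quotes are required on every branch of the pattern.
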
Molly Sophia
10433e8b45 llama : add tensor name for "result_norm" (#9907)
Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
2024-10-16 13:10:21 +03:00
Alexey Parfenov
1f66b699c4 server : fix the disappearance of the end of the text (#9867)
* server: fix the disappearance of the end of the text when streaming with stop strings

* simplify "send text" checks
2024-10-16 11:35:53 +03:00
Georgi Gerganov
0e41b300ed sync : ggml 2024-10-16 11:28:14 +03:00
Daniel Bevenius
cd60b88bf7 ggml-alloc : remove buffer_id from leaf_alloc (ggml/987)
This commit removes the buffer_id field from the leaf_alloc struct.

The motivation for this is that the field is only written to and never
read/used as far as I can tell. Each tensor_alloc has a buffer_id field
and this is what caused me to look into this more closely, to
understand what the buffer_id in leaf_alloc was used for.
2024-10-16 11:28:01 +03:00
leo-pony
becfd387f6 [CANN] Fix cann compilation error (#9891)
Fix CANN compilation error introduced after merging llama.cpp support for dynamically loadable backends.
2024-10-16 08:51:46 +08:00
Georgi Gerganov
755a9b2bf0 llama : add infill sampler (#9896)
ggml-ci
2024-10-15 16:35:33 +03:00
Georgi Gerganov
223c25a72f server : improve infill context reuse (#9894)
ggml-ci
2024-10-15 16:28:55 +03:00
MaggotHATE
fbc98b748e sampling : add XTC sampler (#9742)
* Initial XTC commit

Adds the XTC sampler; it is not activated by default, but ships with recommended settings as its defaults.

* Cleanup

* Simplified chances calculation

To be more in line with the original implementation, the chance is calculated once at the beginning.

* First fixes by comments

Still need to look into sorting

* Fixed trailing backspaces

* Fixed RNG to be reproducible

Thanks to @slaren for directions

* Fixed forgotten header

* Moved `min_keep` 

Moved from conditions to a simple check at the end.

* Fixed broken randomization

Thanks to @slaren for explanation

* Swapped sorting for a custom algorithm

Shifts tokens to remove the penalized ones, then puts the penalized at the back. Should make `min_keep` still viable.

* Algorithm rework

1. Scan tokens from the top until the first non-penalizable one
2. Remove the last captured token (the least probable one above the threshold)
3. Shift all tokens to overwrite the remaining penalizable ones
4. Penalize them and put them at the bottom.

* Added XTC to `test-sampling`

* Simplified algorithm and more tests

* Updated info in common and args

* Merged back lost commits in common and arg

* Update dump info in common

* Fixed incorrect min_keep check

* Added XTC to README

* Renamed parameters, fixed info and defaults

* probability is at 0 by default, but XTC is included in sampling queue
* threshold higher than 0.5 switches XTC off

* Initial server support

* Added XTC to server UIs

* Fixed labels in old server UI

* Made algorithm safer and more readable

* Removed xtc_threshold_max

* Fixed arg after update

* Quick fixes by comments

* Simplified algorithm since threshold_max is removed

* Renamed random distribution

* Fixed tests and outdated README

* Small fixes
2024-10-15 12:54:55 +02:00
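
The gist of the final algorithm (also visible in the `llama-sampling.cpp` diff further below): with probability `xtc_probability`, find the last token whose probability is still at or above `xtc_threshold` and drop everything above it. A simplified, self-contained sketch of that idea, not the actual implementation:

```cpp
#include <random>
#include <vector>

// probs: token probabilities, assumed sorted in descending order
static void xtc_apply(std::vector<float> & probs, float probability, float threshold,
                      size_t min_keep, std::mt19937 & rng) {
    if (probability <= 0.0f || threshold > 0.5f || probs.size() < 2) {
        return;
    }
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    if (dist(rng) > probability) {
        return; // XTC is only applied with the configured chance
    }
    // index of the last (least probable) token with prob >= threshold
    size_t pos_last = 0;
    for (size_t i = 0; i < probs.size(); ++i) {
        if (probs[i] >= threshold) {
            pos_last = i;
        } else {
            break;
        }
    }
    // drop all tokens above the least probable one that crossed the threshold,
    // but only if enough candidates remain
    if (pos_last > 0 && probs.size() - pos_last >= min_keep) {
        probs.erase(probs.begin(), probs.begin() + pos_last);
    }
}
```

For example, with threshold 0.1 and sorted probabilities {0.40, 0.30, 0.15, 0.10, 0.05}, the last token at or above the threshold is 0.10, so 0.40, 0.30 and 0.15 are removed and sampling proceeds over {0.10, 0.05}.
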
Georgi Gerganov
dcdd535302 server : update preact (#9895) 2024-10-15 12:48:44 +03:00
Michał Tuszyński
4c42f93b22 readme : update bindings list (#9889) 2024-10-15 11:20:34 +03:00
VoidIsVoid
a89f75e1b7 server : handle "logprobs" field with false value (#9871)
Co-authored-by: Gimling <huangjl@ruyi.ai>
2024-10-14 10:04:36 +03:00
33 changed files with 855 additions and 267 deletions

View File

@@ -130,6 +130,8 @@ Typically finetunes of the base models below are supported as well.
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
**UI:**

View File

@@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sparams.tfs_z = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--xtc-probability"}, "N",
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
[](common_params & params, const std::string & value) {
params.sparams.xtc_probability = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--xtc-threshold"}, "N",
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sparams.xtc_threshold = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),

View File

@@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

View File

@@ -90,6 +90,8 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TFS_Z = 4,
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
COMMON_SAMPLER_TYPE_XTC = 7,
COMMON_SAMPLER_TYPE_INFILL = 8,
};
// dimensionality reduction methods, used by cvector-generator
@@ -108,6 +110,8 @@ struct common_sampler_params {
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float xtc_probability = 0.00f; // 0.0 = disabled
float xtc_threshold = 0.10f; // > 0.5 disables XTC
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -124,13 +128,15 @@ struct common_sampler_params {
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TFS_Z,
COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_MIN_P,
COMMON_SAMPLER_TYPE_TEMPERATURE
COMMON_SAMPLER_TYPE_XTC,
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
std::string grammar; // optional BNF-like grammar to constrain sampling

View File

@@ -611,7 +611,7 @@ private:
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
}
/*

View File

@@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
top_k, tfs_z, top_p, min_p, typ_p, temp,
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
mirostat, mirostat_eta, mirostat_tau);
return std::string(result);
@@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
@@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
@@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
default : return '?';
}
}
@@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
default : return "";
}
}
@@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
};
// since samplers names are written multiple ways
@@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
};
std::vector<common_sampler_type> samplers;

View File

@@ -540,7 +540,7 @@ class SchemaConverter:
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
def _resolve_ref(self, ref):

View File

@@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_ERR("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: couldn't embed the image\n", __func__);
return NULL;
}

View File

@@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
### XTC Sampling
- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
### Logit Bias
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.

View File

@@ -569,30 +569,30 @@ int main(int argc, char ** argv) {
if (!params.ctx_shift){
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break;
} else {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
}
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
}
} else {
// context extension via Self-Extend
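
As a concrete (hypothetical) illustration of the context-shift arithmetic above: with `n_past = 4096` and `n_keep = 256`, `n_left = 3840` and `n_discard = 1920`, so tokens in `[256, 2176)` are removed from the KV cache, the range `[2176, 4096)` is shifted back by 1920 positions, and `n_past` drops to 2176, freeing roughly half of the usable context.
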

View File

@@ -524,10 +524,12 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- `input_prefix`: Set the prefix of the code to infill.
- `input_suffix`: Set the suffix of the code to infill.
- `prompt`: Added after the `FIM_MID` token
- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874
- `input_extra`: Additional context inserted before the FIM prefix.
- `prompt`: Added after the `FIM_MID` token
It also accepts all the options of `/completion`.
`input_extra` is an array of `{"filename": string, "text": string}` objects.
The endpoint also accepts all the options of `/completion`.
If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
@@ -545,7 +547,7 @@ If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](
If the tokens are missing, then the extra context is simply prefixed at the start:
```txt
[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
```
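
For illustration, a minimal `/infill` request using these fields might look like the following (the field values are made up):

```json
{
  "input_prefix": "def helper(a, b):\n    return ",
  "input_suffix": "\n\nprint(helper(1, 2))\n",
  "prompt": "",
  "input_extra": [
    { "filename": "utils.py", "text": "# shared helpers used across the project\n" }
  ]
}
```
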
### **GET** `/props`: Get server global properties.

View File

@@ -43,6 +43,8 @@
top_k: 0, // <= 0 to use vocab size
top_p: 1.0, // 1.0 = disabled
min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
xtc_probability: 0.0, // 0 = disabled;
xtc_threshold: 0.1, // > 0.5 disables XTC;
tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled
@@ -836,6 +838,8 @@ return html`
${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
</fieldset>
@@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
const snapSettings = {
temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },

View File

@@ -307,6 +307,8 @@
top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled
xtc_probability: 0.0, // 0 = disabled;
xtc_threshold: 0.1, // > 0.5 disables XTC;
tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled
@@ -1013,6 +1015,8 @@
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
</fieldset>
<hr />
<fieldset class="three">

File diff suppressed because one or more lines are too long

View File

@@ -529,7 +529,7 @@ export class SchemaConverter {
return joinSeq();
};
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
}
_notStrings(strings) {

View File

@@ -136,10 +136,6 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
json extra_context;
};
struct server_slot {
@@ -169,6 +165,10 @@ struct server_slot {
json prompt; // can be either a string, array of strings or array of token ids
json input_prefix;
json input_suffix;
json input_extra;
// when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens;
std::vector<llama_token> extra_tokens;
@@ -863,6 +863,8 @@ struct server_context {
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
@@ -908,12 +910,12 @@ struct server_context {
}
// infill
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
slot.input_prefix = json_value(data, "input_prefix", json());
slot.input_suffix = json_value(data, "input_suffix", json());
slot.input_extra = json_value(data, "input_extra", json());
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
for (const auto & chunk : slot.params.extra_context) {
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
for (const auto & chunk : slot.input_extra) {
// { "text": string, "filename": string }
if (!chunk.contains("text") || !chunk["text"].is_string()) {
send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
@@ -930,7 +932,7 @@ struct server_context {
}
// get prompt
if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
{
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -1088,22 +1090,21 @@ struct server_context {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
bool send_text = true;
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else {
is_stop_full = false;
} else if (slot.has_next_token) {
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
send_text = stop_pos == std::string::npos;
}
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
if (send_text) {
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
@@ -1196,6 +1197,8 @@ struct server_context {
{"top_k", slot.sparams.top_k},
{"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p},
{"xtc_probability", slot.sparams.xtc_probability},
{"xtc_threshold", slot.sparams.xtc_threshold},
{"tfs_z", slot.sparams.tfs_z},
{"typical_p", slot.sparams.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n},
@@ -1954,6 +1957,8 @@ struct server_context {
} break;
case SERVER_TASK_CMPL_TYPE_INFILL:
{
// TODO: optimize this block by reducing memory allocations and movement
// use FIM repo-level pattern:
// ref: https://arxiv.org/pdf/2409.12186
//
@@ -1964,10 +1969,11 @@ struct server_context {
// extra chunk 1
// ...
// [FIM_SEP]filename
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
//
auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
auto tokens_prefix = tokenize(slot.input_prefix, false, false);
auto tokens_suffix = tokenize(slot.input_suffix, false, false);
auto tokens_prompt = tokenize(slot.prompt, false, false);
slot.extra_tokens.clear();
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
@@ -1977,7 +1983,7 @@ struct server_context {
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
}
for (const auto & chunk : slot.params.extra_context) {
for (const auto & chunk : slot.input_extra) {
// { "text": string, "filename": string }
const std::string text = chunk.value("text", "");
const std::string filename = chunk.value("filename", "tmp");
@@ -2008,20 +2014,21 @@ struct server_context {
}
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
const int n_suffix_take = std::min<int>(tokens_suffix.size(), (n_batch/4));
const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
// fill the rest of the context with extra chunks
const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
suffix_tokens.resize(n_suffix_take);
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
tokens_suffix.resize(n_suffix_take);
prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
if (llama_add_bos_token(model)) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
@@ -2136,40 +2143,17 @@ struct server_context {
while (head_c < slot.cache_tokens.size() &&
head_p < prompt_tokens.size()) {
if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
break;
}
if (llama_token_is_control(model, prompt_tokens[head_p]) &&
prompt_tokens[head_p] != llama_token_fim_rep(model) &&
prompt_tokens[head_p] != llama_token_fim_sep(model)) {
break;
}
size_t n_match = 0;
while (head_c + n_match < slot.cache_tokens.size() &&
head_p + n_match < prompt_tokens.size() &&
slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
break;
}
if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
break;
}
n_match++;
}
if (n_match >= (size_t) params.n_cache_reuse) {
SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
//for (size_t i = head_p; i < head_p + n_match; i++) {
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
//}
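
To make the 3:1 prefix/suffix budget in the hunk above concrete (hypothetical numbers): with `n_batch = 2048`, `n_suffix_take` is capped at `2048/4 = 512` tokens and `n_prefix_take` at `3*(2048/4) - 3 = 1533` tokens, which presumably leaves room for the three FIM special tokens within the batch.
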

View File

@@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse(
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
if (body.contains("logprobs")) {
if (json_value(body, "logprobs", false)) {
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs")) {
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
}
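
A request body of the kind this change handles might look like this (illustrative only):

```json
{
  "prompt": "Hello",
  "logprobs": false
}
```

Previously `"logprobs": false` was treated the same as `true`, because only `body.contains("logprobs")` was checked; after the change, `n_probs` is set only when `logprobs` evaluates to true, and a non-null `top_logprobs` without `logprobs: true` is rejected.
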

View File

@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -348,7 +348,6 @@ struct tensor_alloc {
};
struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};
@@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;

View File

@@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -557,6 +561,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
@@ -564,7 +571,7 @@ struct ggml_backend_registry {
register_backend(ggml_backend_rpc_reg());
#endif
// TODO: sycl, vulkan, kompute, cann
// TODO: sycl, kompute, cann
register_backend(ggml_backend_cpu_reg());
}
@@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) {
// backend CPU
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
return "CPU";
@@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
}
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
ggml_aligned_free(buffer->context, buffer->size);
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
}
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
auto alloc_size = size;
if (alloc_size == 0) {
alloc_size = 1;
}
void * data = ggml_aligned_malloc(alloc_size);
if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
}
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {

View File

@@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ nullptr,
/* .context = */
new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)},
@@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{
/* .backend = */ backend,
/* .device = */ nullptr,
/* .context = */ event,
};
}
@@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
*
* @param event Pointer to the event structure to be recorded.
*/
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)event->backend->context;
(ggml_backend_cann_context*)backend->context;
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
}
@@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
if (ggml_backend_is_cann(event->backend)) {
if (ggml_backend_is_cann(backend)) {
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {

View File

@@ -19,6 +19,9 @@ extern "C" {
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
// Memory allocation
void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size);
#ifdef __cplusplus
}
#endif

View File

@@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
if (device->fp16) {
device_extensions.push_back("VK_KHR_shader_float16_int8");
}
device->name = device->properties.deviceName.data();
device->name = GGML_VK_NAME + std::to_string(idx);
device_create_info = {
vk::DeviceCreateFlags(),
@@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->buffer_type = {
/* .iface = */ ggml_backend_vk_buffer_type_interface,
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
};
@@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
/* .context = */ nullptr,
};
@@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
UNUSED(backend);
}
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_vk_guid() {
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
return &guid;
}
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
ggml_vk_init(ctx, dev_num);
ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),
/* .interface = */ ggml_backend_vk_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
/* .context = */ ctx,
};
return vk_backend;
}
bool ggml_backend_is_vk(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}
int ggml_backend_vk_get_device_count() {
return ggml_vk_get_device_count();
}
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
int dev_idx = vk_instance.device_indices[device];
ggml_vk_get_device_description(dev_idx, description, description_size);
}
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
*free = heap.size;
break;
}
}
}
//////////////////////////
struct ggml_backend_vk_device_context {
int device;
std::string name;
std::string description;
};
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ctx->name.c_str();
}
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
ggml_backend_vk_get_device_memory(ctx->device, free, total);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ggml_backend_vk_buffer_type(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
UNUSED(dev);
return ggml_backend_vk_host_buffer_type();
}
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
}
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
props->type = ggml_backend_vk_device_get_type(dev);
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ true,
/* events */ false,
};
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
UNUSED(params);
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ggml_backend_vk_init(ctx->device);
}
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
return false;
}
UNUSED(backend);
UNUSED(dev);
}
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
return false;
}
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
return buft_ctx->device->idx == ctx->device;
}
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
UNUSED(backend);
UNUSED(dev);
}
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
return false;
}
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
return buft_ctx->device == ctx->device;
}
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
/* .supports_buft = */ ggml_backend_vk_supports_buft,
/* .offload_op = */ ggml_backend_vk_offload_op,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
/* .get_name = */ ggml_backend_vk_device_get_name,
/* .get_description = */ ggml_backend_vk_device_get_description,
/* .get_memory = */ ggml_backend_vk_device_get_memory,
/* .get_type = */ ggml_backend_vk_device_get_type,
/* .get_props = */ ggml_backend_vk_device_get_props,
/* .init_backend = */ ggml_backend_vk_device_init,
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_vk_device_supports_op,
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
/* .offload_op = */ ggml_backend_vk_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
static ggml_guid_t ggml_backend_vk_guid() {
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
return &guid;
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
UNUSED(reg);
return GGML_VK_NAME;
}
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
ggml_vk_init(ctx, dev_num);
ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),
/* .interface = */ ggml_backend_vk_interface,
/* .device = */ nullptr,
/* .context = */ ctx,
};
return vk_backend;
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
UNUSED(reg);
return ggml_backend_vk_get_device_count();
}
bool ggml_backend_is_vk(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
static std::vector<ggml_backend_dev_t> devices;
int ggml_backend_vk_get_device_count() {
return ggml_vk_get_device_count();
}
static bool initialized = false;
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
ggml_vk_get_device_description(device, description, description_size);
}
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
*free = heap.size;
break;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
ctx->device = i;
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
devices.push_back(new ggml_backend_device {
/* .iface = */ ggml_backend_vk_device_i,
/* .reg = */ reg,
/* .context = */ ctx,
});
}
initialized = true;
}
}
GGML_ASSERT(device < devices.size());
return devices[device];
}
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
/* .get_name = */ ggml_backend_vk_reg_get_name,
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
/* .get_device = */ ggml_backend_vk_reg_get_device,
/* .get_proc_address = */ NULL,
};
ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = {
/* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr,
};
return &reg;
}
// Extension availability
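
For context, a rough sketch of how the new registry might be consumed from application code; `ggml_backend_reg_dev_count`, `ggml_backend_dev_name` and `ggml_backend_dev_description` are assumed to come from the recently added backend registry API, and only `ggml_backend_reg_dev_get` appears in this diff:

```cpp
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main() {
    // the Vulkan registry exposes one ggml_backend_dev_t per detected GPU
    ggml_backend_reg_t reg = ggml_backend_vk_reg();
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        printf("Vulkan device %zu: %s (%s)\n", i,
               ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return 0;
}
```
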

View File

@@ -35,10 +35,6 @@
#include <omp.h>
#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
#endif
#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
#include <TargetConditionals.h>
#endif
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
//#define GGML_SOFT_MAX_ACCELERATE
#endif
void * ggml_aligned_malloc(size_t size) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
return _aligned_malloc(size, TENSOR_ALIGNMENT);
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
#elif TARGET_OS_OSX
kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
int result = EFAULT;
switch (alloc_status) {
case KERN_SUCCESS:
result = 0;
break;
case KERN_INVALID_ADDRESS:
result = EINVAL;
break;
case KERN_NO_SPACE:
result = ENOMEM;
break;
default:
result = EFAULT;
break;
}
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
const long page_size = sysconf(_SC_PAGESIZE);
int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
#endif
if (result != 0) {
// Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
return NULL;
}
return aligned_memory;
#endif
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
void ggml_aligned_free(void * ptr, size_t size) {
GGML_UNUSED(size);
#if defined(_MSC_VER) || defined(__MINGW32__)
_aligned_free(ptr);
#elif GGML_USE_CPU_HBM
if (ptr != NULL) {
hbw_free(ptr);
}
#elif TARGET_OS_OSX
if (ptr != NULL) {
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
}
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
free(ptr);
#endif
}
inline static void * ggml_malloc(size_t size) {
if (size == 0) {
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}
found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
if (!threadpool) return;
const int n_threads = threadpool->n_threads_max;
#ifndef GGML_USE_OPENMP
struct ggml_compute_state* workers = threadpool->workers;
const int n_threads = threadpool->n_threads_max;
ggml_mutex_lock(&threadpool->mutex);
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
ggml_cond_destroy(&threadpool->cond);
#endif // GGML_USE_OPENMP
GGML_ALIGNED_FREE(threadpool->workers);
GGML_ALIGNED_FREE(threadpool);
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
ggml_aligned_free(threadpool->workers, workers_size);
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
}
#ifndef GGML_USE_OPENMP
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_cplan * cplan) {
struct ggml_threadpool * threadpool =
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
{
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
// Allocate and init workers state
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
memset(workers, 0, workers_size);
for (int j = 0; j < tpp->n_threads; j++) {
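
A minimal sketch of using the new allocation helper pair declared in `ggml-impl.h`; note that `ggml_aligned_free` now takes the size, which the `vm_deallocate` path on macOS requires:

```cpp
#include <cstring>
#include "ggml-impl.h"   // declares ggml_aligned_malloc / ggml_aligned_free

void example_scratch_buffer() {
    const size_t size = 1 << 20;              // 1 MiB scratch buffer (illustrative)
    void * buf = ggml_aligned_malloc(size);   // page- or TENSOR_ALIGNMENT-aligned
    if (buf == NULL) {
        return;                               // allocation failure is logged by ggml
    }
    memset(buf, 0, size);
    // ... use the buffer ...
    ggml_aligned_free(buf, size);             // size must match the allocation
}
```
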

View File

@@ -1101,6 +1101,9 @@ extern "C" {
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1145,6 +1148,28 @@ extern "C" {
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// this sampler is meant to be used for fill-in-the-middle infilling
// it's supposed to be used after top_k + top_p sampling
//
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
// 2. combine probs of tokens that have the same prefix
//
// example:
//
// - before:
// "hel": 0.5
// "hell": 0.2
// "hello": 0.1
// "dummy": 0.1
//
// - after:
// "hel": 0.8
// "dummy": 0.1
//
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
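
To see how the two new samplers slot into the existing chain API, here is a hedged usage sketch; `llama_sampler_chain_init` and `llama_sampler_chain_default_params` are assumed from the pre-existing sampling API and are not part of this diff:

```cpp
#include "llama.h"

// build a sampler chain that applies XTC followed by the infill sampler
static struct llama_sampler * build_chain(const struct llama_model * model, uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // XTC: 50% chance to drop all tokens above the 0.1 probability threshold
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.5f, 0.1f, /*min_keep=*/1, seed));

    // infill sampler: merges same-prefix candidates and prefers EOG when appropriate
    llama_sampler_chain_add(chain, llama_sampler_init_infill(model));

    return chain;
}
```
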

View File

@@ -1 +1 @@
564f42082f858f9674b2a2e06e9e779d9ed2c754
2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b

View File

@@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
};
}
// xtc
struct llama_sampler_xtc {
const float probability;
const float threshold;
const size_t min_keep;
const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng;
};
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
return "xtc";
}
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
if (ctx->probability <= 0.0f
|| ctx->threshold > 0.5f
|| cur_p->size < 2) {
return;
}
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
float chance = distribution(ctx->rng);
if (chance > ctx->probability) return;
// in case it's not sorted/recalculated yet
llama_sampler_softmax_impl(cur_p);
int pos_last = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].p >= ctx->threshold) {
pos_last = i;
} else break;
}
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
cur_p->data += pos_last;
cur_p->size -= pos_last;
}
}
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
// copy the state
{
auto * result_ctx = (llama_sampler_xtc *) result->ctx;
result_ctx->rng = ctx->rng;
}
return result;
}
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
delete (llama_sampler_xtc *) smpl->ctx;
}
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static struct llama_sampler_i llama_sampler_xtc_i = {
/* .name = */ llama_sampler_xtc_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sample_xtc_apply,
/* .reset = */ llama_sampler_xtc_reset,
/* .clone = */ llama_sampler_xtc_clone,
/* .free = */ llama_sampler_xtc_free,
};
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
/* .threshold = */ t,
/* .min_keep = */ min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
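For orientation, the cut performed by llama_sample_xtc_apply above can be summarized as: after softmax, find the last candidate whose probability is still at or above the threshold and drop every candidate before it, provided at least two candidates clear the threshold and min_keep is respected. The following standalone sketch (editor's illustration, not code from this range) mirrors that logic on plain floats; the numbers match the test vectors added further down in this range.

```cpp
// Standalone illustration of the XTC cut (editor's sketch).
// Given probabilities sorted in descending order, drop every entry with
// p >= threshold except the last such entry, unless that would keep fewer
// than min_keep entries or only the top entry clears the threshold.
#include <cstdio>
#include <vector>

static void xtc_cut(std::vector<float> & probs, float threshold, size_t min_keep) {
    size_t pos_last = 0;
    for (size_t i = 0; i < probs.size(); ++i) {
        if (probs[i] >= threshold) {
            pos_last = i;
        } else {
            break;
        }
    }
    if (probs.size() - pos_last >= min_keep && pos_last > 0) {
        probs.erase(probs.begin(), probs.begin() + pos_last);
    }
}

int main() {
    std::vector<float> probs = {0.4f, 0.3f, 0.2f, 0.1f};
    xtc_cut(probs, /*threshold=*/0.19f, /*min_keep=*/1);
    for (float p : probs) {
        printf("%.2f ", p); // prints: 0.20 0.10
    }
    printf("\n");
    return 0;
}
```

Note that, as in the code above, nothing is renormalized after the cut, and the real sampler only performs it with probability `p` drawn from its RNG.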
// mirostat
struct llama_sampler_mirostat {
@@ -1644,6 +1739,229 @@ struct llama_sampler * llama_sampler_init_logit_bias(
};
}
// infill
//#define GGML_DEBUG_SAMPLER_INFILL
struct llama_sampler_infill {
const struct llama_vocab * vocab;
std::vector<char> buf0;
std::vector<char> buf1;
};
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
return "infill";
}
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
#else
#define LOG_DBG_CUR(...)
#endif
for (size_t i = 0; i < cur_p->size; ++i) {
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
float p_txt_sum = 0.0f;
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
}
}
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
// keep just the EOG tokens
const auto size_org = cur_p->size;
cur_p->size = 0;
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
}
return;
}
size_t n_combined = 0; GGML_UNUSED(n_combined);
// combine tokens with common prefix
for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
if (cur_p->data[i0].logit == -INFINITY) {
break;
}
if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
continue;
}
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
// token i0 is a prefix of token i1
if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
int dst = i0;
int src = i1;
// merge into the token with higher probability
if (cur_p->data[i1].p > cur_p->data[i0].p) {
std::swap(dst, src);
}
cur_p->data[dst].p += cur_p->data[src].p;
cur_p->data[src].logit = -INFINITY;
cur_p->data[src].p = 0.0f;
n_combined++;
}
}
}
size_t n_non_eog = 0;
size_t size_org = cur_p->size;
float p_sum = 0.0f;
float thold = 0.2f;
cur_p->size = 0;
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
if (!is_eog) {
++n_non_eog;
}
p_sum += cur_p->data[i].p;
// keep this token
cur_p->data[cur_p->size++] = cur_p->data[i];
}
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].logit = 1.0f;
return;
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
size_org = cur_p->size;
p_sum = 0.0f;
thold = 1.0/(n_non_eog + 1);
cur_p->size = 0;
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
#undef LOG_DBG_CUR
}
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
return llama_sampler_init_infill_impl(*ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
delete (llama_sampler_infill *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_infill_i = {
/* .name = */ llama_sampler_infill_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_infill_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_infill_clone,
/* .free = */ llama_sampler_infill_free,
};
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab) {
return new llama_sampler {
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ &vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
}
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
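The nested loop in llama_sampler_infill_apply is the "combine probs of tokens that have the same prefix" step from the llama.h comment. Below is a standalone editor's sketch of that merge on the "hel"/"hell"/"hello"/"dummy" example; it uses plain strings where the real code detokenizes with llama_token_to_piece_impl into buf0/buf1, and a `dead` flag where the real code sets the logit to -INFINITY.

```cpp
// Editor's sketch of the prefix-combining step (step 2 in the llama.h comment),
// using plain strings in place of detokenized vocab pieces.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct cand {
    std::string piece;
    float       p;
    bool        dead = false; // stands in for logit = -INFINITY in the real code
};

static void combine_common_prefixes(std::vector<cand> & cands) {
    for (size_t i0 = 0; i0 < cands.size(); ++i0) {
        for (size_t i1 = 0; i1 < cands.size(); ++i1) {
            if (cands[i0].dead) break;
            if (i0 == i1 || cands[i1].dead) continue;

            // piece i0 is a prefix of piece i1 -> merge into the higher-probability one
            if (cands[i1].piece.compare(0, cands[i0].piece.size(), cands[i0].piece) == 0) {
                size_t dst = i0;
                size_t src = i1;
                if (cands[i1].p > cands[i0].p) {
                    std::swap(dst, src);
                }
                cands[dst].p   += cands[src].p;
                cands[src].p    = 0.0f;
                cands[src].dead = true;
            }
        }
    }
}

int main() {
    std::vector<cand> cands = {
        {"hel", 0.5f}, {"hell", 0.2f}, {"hello", 0.1f}, {"dummy", 0.1f},
    };
    combine_common_prefixes(cands);
    for (const cand & c : cands) {
        if (!c.dead) {
            printf("\"%s\": %.1f\n", c.piece.c_str(), c.p); // "hel": 0.8, "dummy": 0.1
        }
    }
    return 0;
}
```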


@@ -4,8 +4,6 @@
#include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab;
struct llama_grammar;
@@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);


@@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
}
// seed the work queue with all possible 2-character tokens.
for (size_t i = 1; i < symbols.size(); ++i) {
for (int i = 1; i < (int) symbols.size(); ++i) {
try_add_bigram(i - 1, i);
}
@@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
index++;
symbols.emplace_back(sym);
}
for (size_t i = 1; i < symbols.size(); ++i) {
for (int i = 1; i < (int) symbols.size(); ++i) {
add_new_bigram(i - 1, i);
}


@@ -48,7 +48,7 @@ struct llama_vocab {
id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
@@ -149,6 +149,12 @@ int32_t llama_token_to_piece_impl(
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,


@@ -8,9 +8,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#if defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#if defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
@@ -3418,8 +3416,6 @@ static int llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_SYCL)
count += ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
count += ggml_backend_vk_get_device_count();
#elif defined(GGML_USE_CANN)
count += ggml_backend_cann_get_device_count();
#endif
@@ -3451,10 +3447,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
}
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
if (host_buffer) {
buft = ggml_backend_vk_host_buffer_type();
}
#endif
if (buft == nullptr) {
@@ -3473,9 +3465,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
}
device -= (int)model.devices.size();
#if defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(device);
#elif defined(GGML_USE_SYCL)
#if defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(device);
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(device);
@@ -3535,11 +3525,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
@@ -16095,9 +16080,11 @@ struct llm_build_context {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_norm", -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
@@ -19093,8 +19080,7 @@ bool llama_supports_mlock(void) {
}
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@@ -19225,8 +19211,13 @@ struct llama_model * llama_load_model_from_file(
case GGML_BACKEND_DEVICE_TYPE_GPU:
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
{
size_t free, total; // NOLINT
ggml_backend_dev_memory(dev, &free, &total);
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
model->devices.push_back(dev);
break;
}
}
}
@@ -19421,32 +19412,7 @@ struct llama_context * llama_new_context_with_model(
main_gpu -= (int)model->devices.size();
}
#if defined(GGML_USE_VULKAN)
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_vk_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#elif defined(GGML_USE_SYCL)
#if defined(GGML_USE_SYCL)
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
@@ -21830,6 +21796,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}
struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
return llama_sampler_init_infill_impl(model->vocab);
}
//
// model split
//


@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^abc?d*efg+(hij)?kl$"
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -709,7 +709,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
root ::= "\"" ("[]{}()|+*?") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -722,7 +722,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\"$"
})""",
R"""(
root ::= "\"" "\"" "\"" space
root ::= "\"" ("\"") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
test({
SUCCESS,
"regexp with top-level alternation",
R"""({
"type": "string",
"pattern": "^A|B|C|D$"
})""",
R"""(
root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -736,7 +749,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
root-1 ::= [0-9]
space ::= | " " | "\n" [ \t]{0,20}
)"""


@@ -111,6 +111,28 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
}
}
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
DUMP(&cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
}
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
@@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
}
const int64_t t_end = ggml_time_us();
llama_sampler_free(cnstr);
printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
}
#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@@ -279,12 +301,13 @@ static void test_perf() {
data.emplace_back(llama_token_data{i, logit, 0.0f});
}
BENCH(llama_sampler_init_top_k (40), data, 32);
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
BENCH(llama_sampler_init_softmax (), data, 32);
BENCH(llama_sampler_init_top_k (40), data, 32);
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32);
BENCH(llama_sampler_init_softmax (), data, 32);
}
int main(void) {
@@ -309,6 +332,14 @@ int main(void) {
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
printf("XTC should:\n");
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f);
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f);
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f);
printf("XTC should not:\n");
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);