mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 15:14:06 +00:00
Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
99bd4ac28c | ||
|
|
3752217ed5 | ||
|
|
f010b77a37 | ||
|
|
2194200278 | ||
|
|
73afe681aa | ||
|
|
9e04102448 | ||
|
|
dbf18e4de9 | ||
|
|
66c2c93082 | ||
|
|
10433e8b45 | ||
|
|
1f66b699c4 | ||
|
|
0e41b300ed | ||
|
|
cd60b88bf7 | ||
|
|
becfd387f6 | ||
|
|
755a9b2bf0 | ||
|
|
223c25a72f | ||
|
|
fbc98b748e | ||
|
|
dcdd535302 | ||
|
|
4c42f93b22 | ||
|
|
a89f75e1b7 |
@@ -130,6 +130,8 @@ Typically finetunes of the base models below are supported as well.
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
|
||||
**UI:**
|
||||
|
||||
|
||||
@@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sparams.tfs_z = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--xtc-probability"}, "N",
|
||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.xtc_probability = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--xtc-threshold"}, "N",
|
||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.xtc_threshold = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--typical"}, "N",
|
||||
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
||||
|
||||
@@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
|
||||
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
|
||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||
|
||||
@@ -90,6 +90,8 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_TFS_Z = 4,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
COMMON_SAMPLER_TYPE_XTC = 7,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 8,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -108,6 +110,8 @@ struct common_sampler_params {
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
@@ -124,13 +128,15 @@ struct common_sampler_params {
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TFS_Z,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
COMMON_SAMPLER_TYPE_MIN_P,
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE
|
||||
COMMON_SAMPLER_TYPE_XTC,
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE,
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
@@ -611,7 +611,7 @@ private:
|
||||
}
|
||||
return join_seq();
|
||||
};
|
||||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
||||
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
|
||||
|
||||
snprintf(result, sizeof(result),
|
||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||
top_k, tfs_z, top_p, min_p, typ_p, temp,
|
||||
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
|
||||
mirostat, mirostat_eta, mirostat_tau);
|
||||
|
||||
return std::string(result);
|
||||
@@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TFS_Z:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
break;
|
||||
@@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
@@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
||||
default : return '?';
|
||||
}
|
||||
}
|
||||
@@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
@@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
@@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
||||
};
|
||||
|
||||
std::vector<common_sampler_type> samplers;
|
||||
|
||||
@@ -540,7 +540,7 @@ class SchemaConverter:
|
||||
return self._add_rule(
|
||||
name,
|
||||
to_rule(transform()) if self._raw_pattern \
|
||||
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
|
||||
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
|
||||
|
||||
|
||||
def _resolve_ref(self, ref):
|
||||
|
||||
@@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||
if (!image_embed_result) {
|
||||
clip_image_u8_free(img);
|
||||
LOG_ERR("%s: coulnd't embed the image\n", __func__);
|
||||
LOG_ERR("%s: couldn't embed the image\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
|
||||
|
||||
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
|
||||
|
||||
### XTC Sampling
|
||||
|
||||
- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
|
||||
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
|
||||
|
||||
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
|
||||
|
||||
By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
|
||||
|
||||
Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
|
||||
|
||||
Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
|
||||
|
||||
### Logit Bias
|
||||
|
||||
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
|
||||
|
||||
@@ -569,30 +569,30 @@ int main(int argc, char ** argv) {
|
||||
if (!params.ctx_shift){
|
||||
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
|
||||
break;
|
||||
} else {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
LOG_DBG("clear session path\n");
|
||||
path_session.clear();
|
||||
}
|
||||
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
LOG_DBG("clear session path\n");
|
||||
path_session.clear();
|
||||
}
|
||||
} else {
|
||||
// context extension via Self-Extend
|
||||
|
||||
@@ -524,10 +524,12 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
- `input_prefix`: Set the prefix of the code to infill.
|
||||
- `input_suffix`: Set the suffix of the code to infill.
|
||||
- `prompt`: Added after the `FIM_MID` token
|
||||
- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874
|
||||
- `input_extra`: Additional context inserted before the FIM prefix.
|
||||
- `prompt`: Added after the `FIM_MID` token
|
||||
|
||||
It also accepts all the options of `/completion`.
|
||||
`input_extra` is array of `{"filename": string, "text": string}` objects.
|
||||
|
||||
The endpoint also accepts all the options of `/completion`.
|
||||
|
||||
If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
|
||||
|
||||
@@ -545,7 +547,7 @@ If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](
|
||||
If the tokens are missing, then the extra context is simply prefixed at the start:
|
||||
|
||||
```txt
|
||||
[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
|
||||
[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
|
||||
```
|
||||
|
||||
### **GET** `/props`: Get server global properties.
|
||||
|
||||
@@ -43,6 +43,8 @@
|
||||
top_k: 0, // <= 0 to use vocab size
|
||||
top_p: 1.0, // 1.0 = disabled
|
||||
min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
|
||||
xtc_probability: 0.0, // 0 = disabled;
|
||||
xtc_threshold: 0.1, // > 0.5 disables XTC;
|
||||
tfs_z: 1.0, // 1.0 = disabled
|
||||
typical_p: 1.0, // 1.0 = disabled
|
||||
presence_penalty: 0.0, // 0.0 = disabled
|
||||
@@ -836,6 +838,8 @@ return html`
|
||||
${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
|
||||
${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
||||
${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
||||
${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
|
||||
${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
|
||||
${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
|
||||
</fieldset>
|
||||
|
||||
@@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
|
||||
const snapSettings = {
|
||||
temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
|
||||
min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
|
||||
xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
|
||||
xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
|
||||
top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||
tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||
typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||
|
||||
@@ -307,6 +307,8 @@
|
||||
top_k: 40, // <= 0 to use vocab size
|
||||
top_p: 0.95, // 1.0 = disabled
|
||||
min_p: 0.05, // 0 = disabled
|
||||
xtc_probability: 0.0, // 0 = disabled;
|
||||
xtc_threshold: 0.1, // > 0.5 disables XTC;
|
||||
tfs_z: 1.0, // 1.0 = disabled
|
||||
typical_p: 1.0, // 1.0 = disabled
|
||||
presence_penalty: 0.0, // 0.0 = disabled
|
||||
@@ -1013,6 +1015,8 @@
|
||||
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
||||
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
|
||||
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
||||
${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
|
||||
${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
|
||||
</fieldset>
|
||||
<hr />
|
||||
<fieldset class="three">
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -529,7 +529,7 @@ export class SchemaConverter {
|
||||
return joinSeq();
|
||||
};
|
||||
|
||||
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
||||
return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
|
||||
}
|
||||
|
||||
_notStrings(strings) {
|
||||
|
||||
@@ -136,10 +136,6 @@ struct slot_params {
|
||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
json input_prefix;
|
||||
json input_suffix;
|
||||
json extra_context;
|
||||
};
|
||||
|
||||
struct server_slot {
|
||||
@@ -169,6 +165,10 @@ struct server_slot {
|
||||
|
||||
json prompt; // can be either a string, array of strings or array of token ids
|
||||
|
||||
json input_prefix;
|
||||
json input_suffix;
|
||||
json input_extra;
|
||||
|
||||
// when a task is submitted, we first tokenize the prompt and store it here
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
std::vector<llama_token> extra_tokens;
|
||||
@@ -863,6 +863,8 @@ struct server_context {
|
||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
|
||||
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
|
||||
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
@@ -908,12 +910,12 @@ struct server_context {
|
||||
}
|
||||
|
||||
// infill
|
||||
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
|
||||
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
|
||||
slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
|
||||
slot.input_prefix = json_value(data, "input_prefix", json());
|
||||
slot.input_suffix = json_value(data, "input_suffix", json());
|
||||
slot.input_extra = json_value(data, "input_extra", json());
|
||||
|
||||
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
|
||||
for (const auto & chunk : slot.params.extra_context) {
|
||||
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
|
||||
for (const auto & chunk : slot.input_extra) {
|
||||
// { "text": string, "filename": string }
|
||||
if (!chunk.contains("text") || !chunk["text"].is_string()) {
|
||||
send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -930,7 +932,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
// get prompt
|
||||
if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
|
||||
{
|
||||
const auto & prompt = data.find("prompt");
|
||||
if (prompt == data.end()) {
|
||||
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -1088,22 +1090,21 @@ struct server_context {
|
||||
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||
|
||||
const std::string str_test = slot.generated_text.substr(pos);
|
||||
bool is_stop_full = false;
|
||||
bool send_text = true;
|
||||
|
||||
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
|
||||
if (stop_pos != std::string::npos) {
|
||||
is_stop_full = true;
|
||||
slot.generated_text.erase(
|
||||
slot.generated_text.begin() + pos + stop_pos,
|
||||
slot.generated_text.end());
|
||||
pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||
} else {
|
||||
is_stop_full = false;
|
||||
} else if (slot.has_next_token) {
|
||||
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
|
||||
send_text = stop_pos == std::string::npos;
|
||||
}
|
||||
|
||||
// check if there is any token to predict
|
||||
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
|
||||
if (send_text) {
|
||||
// no send the stop word in the response
|
||||
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
||||
slot.n_sent_text += result.text_to_send.size();
|
||||
@@ -1196,6 +1197,8 @@ struct server_context {
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"xtc_probability", slot.sparams.xtc_probability},
|
||||
{"xtc_threshold", slot.sparams.xtc_threshold},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
@@ -1954,6 +1957,8 @@ struct server_context {
|
||||
} break;
|
||||
case SERVER_TASK_CMPL_TYPE_INFILL:
|
||||
{
|
||||
// TODO: optimize this block by reducing memory allocations and movement
|
||||
|
||||
// use FIM repo-level pattern:
|
||||
// ref: https://arxiv.org/pdf/2409.12186
|
||||
//
|
||||
@@ -1964,10 +1969,11 @@ struct server_context {
|
||||
// extra chunk 1
|
||||
// ...
|
||||
// [FIM_SEP]filename
|
||||
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
|
||||
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
|
||||
//
|
||||
auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
|
||||
auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
|
||||
auto tokens_prefix = tokenize(slot.input_prefix, false, false);
|
||||
auto tokens_suffix = tokenize(slot.input_suffix, false, false);
|
||||
auto tokens_prompt = tokenize(slot.prompt, false, false);
|
||||
|
||||
slot.extra_tokens.clear();
|
||||
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
|
||||
@@ -1977,7 +1983,7 @@ struct server_context {
|
||||
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
|
||||
}
|
||||
|
||||
for (const auto & chunk : slot.params.extra_context) {
|
||||
for (const auto & chunk : slot.input_extra) {
|
||||
// { "text": string, "filename": string }
|
||||
const std::string text = chunk.value("text", "");
|
||||
const std::string filename = chunk.value("filename", "tmp");
|
||||
@@ -2008,20 +2014,21 @@ struct server_context {
|
||||
}
|
||||
|
||||
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
|
||||
const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
|
||||
const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
|
||||
const int n_suffix_take = std::min<int>(tokens_suffix.size(), (n_batch/4));
|
||||
const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
|
||||
|
||||
// fill the rest of the context with extra chunks
|
||||
const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
|
||||
|
||||
prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
|
||||
suffix_tokens.resize(n_suffix_take);
|
||||
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
|
||||
tokens_suffix.resize(n_suffix_take);
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
|
||||
suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
|
||||
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
|
||||
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
|
||||
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
|
||||
|
||||
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
||||
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
||||
auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
|
||||
auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
|
||||
|
||||
if (llama_add_bos_token(model)) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
@@ -2136,40 +2143,17 @@ struct server_context {
|
||||
|
||||
while (head_c < slot.cache_tokens.size() &&
|
||||
head_p < prompt_tokens.size()) {
|
||||
if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
|
||||
slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
|
||||
slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (llama_token_is_control(model, prompt_tokens[head_p]) &&
|
||||
prompt_tokens[head_p] != llama_token_fim_rep(model) &&
|
||||
prompt_tokens[head_p] != llama_token_fim_sep(model)) {
|
||||
break;
|
||||
}
|
||||
|
||||
size_t n_match = 0;
|
||||
|
||||
while (head_c + n_match < slot.cache_tokens.size() &&
|
||||
head_p + n_match < prompt_tokens.size() &&
|
||||
slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
|
||||
if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
|
||||
slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
|
||||
slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
|
||||
prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
|
||||
prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
|
||||
break;
|
||||
}
|
||||
|
||||
n_match++;
|
||||
}
|
||||
|
||||
if (n_match >= (size_t) params.n_cache_reuse) {
|
||||
SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||
//}
|
||||
|
||||
@@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse(
|
||||
|
||||
// Handle "logprobs" field
|
||||
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
|
||||
if (body.contains("logprobs")) {
|
||||
if (json_value(body, "logprobs", false)) {
|
||||
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
|
||||
} else if (body.contains("top_logprobs")) {
|
||||
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
|
||||
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
|
||||
}
|
||||
|
||||
|
||||
@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -348,7 +348,6 @@ struct tensor_alloc {
|
||||
};
|
||||
|
||||
struct leaf_alloc {
|
||||
int buffer_id;
|
||||
struct tensor_alloc leaf;
|
||||
};
|
||||
|
||||
@@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
||||
if (leaf->view_src || leaf->data) {
|
||||
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
||||
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
||||
|
||||
@@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_BLAS
|
||||
#include "ggml-blas.h"
|
||||
#endif
|
||||
@@ -557,6 +561,9 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_METAL
|
||||
register_backend(ggml_backend_metal_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
@@ -564,7 +571,7 @@ struct ggml_backend_registry {
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
|
||||
// TODO: sycl, vulkan, kompute, cann
|
||||
// TODO: sycl, kompute, cann
|
||||
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
}
|
||||
@@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// backend CPU
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "CPU";
|
||||
|
||||
@@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
||||
auto alloc_size = size;
|
||||
if (alloc_size == 0) {
|
||||
alloc_size = 1;
|
||||
}
|
||||
|
||||
void * data = ggml_aligned_malloc(alloc_size);
|
||||
|
||||
if (data == NULL) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
@@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
||||
ggml_backend_cann_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */
|
||||
new ggml_backend_cann_buffer_type_context{
|
||||
i, "CANN" + std::to_string(i)},
|
||||
@@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
|
||||
ACL_CHECK(aclrtCreateEvent(&event));
|
||||
|
||||
return new ggml_backend_event{
|
||||
/* .backend = */ backend,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ event,
|
||||
};
|
||||
}
|
||||
@@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
|
||||
*
|
||||
* @param event Pointer to the event structure to be recorded.
|
||||
*/
|
||||
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
|
||||
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)event->backend->context;
|
||||
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
|
||||
}
|
||||
|
||||
@@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
|
||||
ggml_backend_event_t event) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
if (ggml_backend_is_cann(event->backend)) {
|
||||
if (ggml_backend_is_cann(backend)) {
|
||||
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
|
||||
(aclrtEvent)event->context));
|
||||
} else {
|
||||
|
||||
@@ -19,6 +19,9 @@ extern "C" {
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// required for mmap as gguf only guarantees 32-byte alignment
|
||||
#define TENSOR_ALIGNMENT 32
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
@@ -196,6 +199,11 @@ struct ggml_cgraph {
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
// Memory allocation
|
||||
|
||||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
if (device->fp16) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
device->name = device->properties.deviceName.data();
|
||||
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
@@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
device->buffer_type = {
|
||||
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
|
||||
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
||||
};
|
||||
|
||||
@@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .device = */ nullptr,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
@@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||
|
||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||
ggml_vk_init(ctx, dev_num);
|
||||
|
||||
ggml_backend_t vk_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .interface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return vk_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||
}
|
||||
|
||||
int ggml_backend_vk_get_device_count() {
|
||||
return ggml_vk_get_device_count();
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
int dev_idx = vk_instance.device_indices[device];
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
|
||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
*free = heap.size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_vk_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||
ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ggml_backend_vk_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return ggml_backend_vk_host_buffer_type();
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* async */ false,
|
||||
/* host_buffer */ true,
|
||||
/* events */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
UNUSED(params);
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ggml_backend_vk_init(ctx->device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
|
||||
return false;
|
||||
}
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||
|
||||
return buft_ctx->device->idx == ctx->device;
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
return buft_ctx->device == ctx->device;
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||
/* .get_name = */ ggml_backend_vk_device_get_name,
|
||||
/* .get_description = */ ggml_backend_vk_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_vk_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_vk_device_get_type,
|
||||
/* .get_props = */ ggml_backend_vk_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_vk_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_vk_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_device_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||
return &guid;
|
||||
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
|
||||
UNUSED(reg);
|
||||
return GGML_VK_NAME;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||
|
||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||
ggml_vk_init(ctx, dev_num);
|
||||
|
||||
ggml_backend_t vk_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .interface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return vk_backend;
|
||||
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
UNUSED(reg);
|
||||
return ggml_backend_vk_get_device_count();
|
||||
}
|
||||
|
||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||
}
|
||||
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
|
||||
static std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
int ggml_backend_vk_get_device_count() {
|
||||
return ggml_vk_get_device_count();
|
||||
}
|
||||
static bool initialized = false;
|
||||
|
||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||
ggml_vk_get_device_description(device, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
|
||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
*free = heap.size;
|
||||
break;
|
||||
{
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||
ctx->device = i;
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(device < devices.size());
|
||||
return devices[device];
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
|
||||
/* .get_name = */ ggml_backend_vk_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_vk_reg_get_device,
|
||||
/* .get_proc_address = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_vk_reg() {
|
||||
static ggml_backend_reg reg = {
|
||||
/* .iface = */ ggml_backend_vk_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return ®
|
||||
}
|
||||
|
||||
// Extension availability
|
||||
|
||||
@@ -35,10 +35,6 @@
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#undef GGML_USE_LLAMAFILE
|
||||
#endif
|
||||
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <unistd.h>
|
||||
#include <mach/mach.h>
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
|
||||
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
||||
//#define GGML_SOFT_MAX_ACCELERATE
|
||||
#endif
|
||||
|
||||
|
||||
void * ggml_aligned_malloc(size_t size) {
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
return _aligned_malloc(size, TENSOR_ALIGNMENT);
|
||||
#else
|
||||
inline static void * ggml_aligned_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * aligned_memory = NULL;
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
int result = hbw_posix_memalign(&aligned_memory, 16, size);
|
||||
int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||
#elif TARGET_OS_OSX
|
||||
kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
|
||||
int result = EFAULT;
|
||||
switch (alloc_status) {
|
||||
case KERN_SUCCESS:
|
||||
result = 0;
|
||||
break;
|
||||
case KERN_INVALID_ADDRESS:
|
||||
result = EINVAL;
|
||||
break;
|
||||
case KERN_NO_SPACE:
|
||||
result = ENOMEM;
|
||||
break;
|
||||
default:
|
||||
result = EFAULT;
|
||||
break;
|
||||
}
|
||||
#elif GGML_USE_METAL
|
||||
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
|
||||
const long page_size = sysconf(_SC_PAGESIZE);
|
||||
int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
|
||||
#else
|
||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
||||
int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||
#endif
|
||||
if (result != 0) {
|
||||
// Handle allocation failure
|
||||
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
return NULL;
|
||||
}
|
||||
return aligned_memory;
|
||||
#endif
|
||||
}
|
||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
|
||||
|
||||
void ggml_aligned_free(void * ptr, size_t size) {
|
||||
GGML_UNUSED(size);
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
_aligned_free(ptr);
|
||||
#elif GGML_USE_CPU_HBM
|
||||
if (ptr != NULL) {
|
||||
hbw_free(ptr);
|
||||
}
|
||||
#elif TARGET_OS_OSX
|
||||
if (ptr != NULL) {
|
||||
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
|
||||
}
|
||||
#else
|
||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||
#endif
|
||||
free(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
inline static void * ggml_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
|
||||
*ctx = (struct ggml_context) {
|
||||
/*.mem_size =*/ mem_size,
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
|
||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||
/*.no_alloc =*/ params.no_alloc,
|
||||
/*.no_alloc_save =*/ params.no_alloc,
|
||||
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
|
||||
__func__, i, ggml_used_mem(ctx));
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
|
||||
}
|
||||
|
||||
found = true;
|
||||
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
|
||||
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||
if (!threadpool) return;
|
||||
|
||||
const int n_threads = threadpool->n_threads_max;
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
struct ggml_compute_state* workers = threadpool->workers;
|
||||
const int n_threads = threadpool->n_threads_max;
|
||||
|
||||
ggml_mutex_lock(&threadpool->mutex);
|
||||
|
||||
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||
ggml_cond_destroy(&threadpool->cond);
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
GGML_ALIGNED_FREE(threadpool->workers);
|
||||
GGML_ALIGNED_FREE(threadpool);
|
||||
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
|
||||
ggml_aligned_free(threadpool->workers, workers_size);
|
||||
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
|
||||
}
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
struct ggml_cplan * cplan) {
|
||||
|
||||
struct ggml_threadpool * threadpool =
|
||||
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
|
||||
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
|
||||
{
|
||||
threadpool->cgraph = cgraph;
|
||||
threadpool->cplan = cplan;
|
||||
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
|
||||
// Allocate and init workers state
|
||||
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
|
||||
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
|
||||
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
|
||||
|
||||
memset(workers, 0, workers_size);
|
||||
for (int j = 0; j < tpp->n_threads; j++) {
|
||||
|
||||
@@ -1101,6 +1101,9 @@ extern "C" {
|
||||
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
||||
|
||||
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
@@ -1145,6 +1148,28 @@ extern "C" {
|
||||
int32_t n_logit_bias,
|
||||
const llama_logit_bias * logit_bias);
|
||||
|
||||
// this sampler is meant to be used for fill-in-the-middle infilling
|
||||
// it's supposed to be used after top_k + top_p sampling
|
||||
//
|
||||
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
|
||||
// 2. combine probs of tokens that have the same prefix
|
||||
//
|
||||
// example:
|
||||
//
|
||||
// - before:
|
||||
// "hel": 0.5
|
||||
// "hell": 0.2
|
||||
// "hello": 0.1
|
||||
// "dummy": 0.1
|
||||
//
|
||||
// - after:
|
||||
// "hel": 0.8
|
||||
// "dummy": 0.1
|
||||
//
|
||||
// 3. discard non-EOG tokens with low prob
|
||||
// 4. if no tokens are left -> pick EOT
|
||||
//
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
|
||||
|
||||
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
||||
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
||||
|
||||
@@ -1 +1 @@
|
||||
564f42082f858f9674b2a2e06e9e779d9ed2c754
|
||||
2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b
|
||||
|
||||
@@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
|
||||
};
|
||||
}
|
||||
|
||||
// xtc
|
||||
|
||||
struct llama_sampler_xtc {
|
||||
const float probability;
|
||||
const float threshold;
|
||||
const size_t min_keep;
|
||||
|
||||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
|
||||
return "xtc";
|
||||
}
|
||||
|
||||
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||
|
||||
if (ctx->probability <= 0.0f
|
||||
|| ctx->threshold > 0.5f
|
||||
|| cur_p->size < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
|
||||
float chance = distribution(ctx->rng);
|
||||
if (chance > ctx->probability) return;
|
||||
|
||||
// in case it's not sorted/recalculated yet
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
|
||||
int pos_last = 0;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].p >= ctx->threshold) {
|
||||
pos_last = i;
|
||||
} else break;
|
||||
}
|
||||
|
||||
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
|
||||
cur_p->data += pos_last;
|
||||
cur_p->size -= pos_last;
|
||||
}
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
|
||||
auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
|
||||
|
||||
// copy the state
|
||||
{
|
||||
auto * result_ctx = (llama_sampler_xtc *) result->ctx;
|
||||
|
||||
result_ctx->rng = ctx->rng;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
|
||||
delete (llama_sampler_xtc *) smpl->ctx;
|
||||
}
|
||||
|
||||
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||
ctx->rng.seed(ctx->seed_cur);
|
||||
}
|
||||
|
||||
static struct llama_sampler_i llama_sampler_xtc_i = {
|
||||
/* .name = */ llama_sampler_xtc_name,
|
||||
/* .accept = */ nullptr,
|
||||
/* .apply = */ llama_sample_xtc_apply,
|
||||
/* .reset = */ llama_sampler_xtc_reset,
|
||||
/* .clone = */ llama_sampler_xtc_clone,
|
||||
/* .free = */ llama_sampler_xtc_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_xtc_i,
|
||||
/* .ctx = */ new llama_sampler_xtc {
|
||||
/* .probability = */ p,
|
||||
/* .threshold = */ t,
|
||||
/* .min_keep = */ min_keep,
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// mirostat
|
||||
|
||||
struct llama_sampler_mirostat {
|
||||
@@ -1644,6 +1739,229 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
};
|
||||
}
|
||||
|
||||
// infill
|
||||
|
||||
//#define GGML_DEBUG_SAMPLER_INFILL
|
||||
|
||||
struct llama_sampler_infill {
|
||||
const struct llama_vocab * vocab;
|
||||
|
||||
std::vector<char> buf0;
|
||||
std::vector<char> buf1;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
|
||||
return "infill";
|
||||
}
|
||||
|
||||
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_infill *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
|
||||
#if defined(GGML_DEBUG_SAMPLER_INFILL)
|
||||
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
|
||||
#else
|
||||
#define LOG_DBG_CUR(...)
|
||||
#endif
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||
}
|
||||
|
||||
float p_txt_sum = 0.0f;
|
||||
float p_eog_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
p_eog_sum += cur_p->data[i].p;
|
||||
} else {
|
||||
p_txt_sum += cur_p->data[i].p;
|
||||
}
|
||||
}
|
||||
|
||||
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
|
||||
|
||||
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
|
||||
|
||||
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
|
||||
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
|
||||
|
||||
// keep just the EOG tokens
|
||||
const auto size_org = cur_p->size;
|
||||
|
||||
cur_p->size = 0;
|
||||
|
||||
float p_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||
p_sum += cur_p->data[i].p;
|
||||
|
||||
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
// normalize probs
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
cur_p->data[i].p /= p_sum;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
size_t n_combined = 0; GGML_UNUSED(n_combined);
|
||||
|
||||
// combine tokens with common prefix
|
||||
for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
|
||||
for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
|
||||
if (cur_p->data[i0].logit == -INFINITY) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
if (len0 < 0) {
|
||||
ctx->buf0.resize(len0);
|
||||
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
|
||||
assert(len0 > 0);
|
||||
}
|
||||
|
||||
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
if (len1 < 0) {
|
||||
ctx->buf1.resize(len1);
|
||||
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
|
||||
assert(len1 > 0);
|
||||
}
|
||||
|
||||
// token i0 is a prefix of token i1
|
||||
if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
|
||||
int dst = i0;
|
||||
int src = i1;
|
||||
|
||||
// merge into the token with higher probability
|
||||
if (cur_p->data[i1].p > cur_p->data[i0].p) {
|
||||
std::swap(dst, src);
|
||||
}
|
||||
|
||||
cur_p->data[dst].p += cur_p->data[src].p;
|
||||
cur_p->data[src].logit = -INFINITY;
|
||||
cur_p->data[src].p = 0.0f;
|
||||
|
||||
n_combined++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_non_eog = 0;
|
||||
|
||||
size_t size_org = cur_p->size;
|
||||
|
||||
float p_sum = 0.0f;
|
||||
float thold = 0.2f;
|
||||
|
||||
cur_p->size = 0;
|
||||
|
||||
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!is_eog) {
|
||||
++n_non_eog;
|
||||
}
|
||||
|
||||
p_sum += cur_p->data[i].p;
|
||||
|
||||
// keep this token
|
||||
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||
}
|
||||
|
||||
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
|
||||
|
||||
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
|
||||
if (n_non_eog == 0) {
|
||||
cur_p->size = 1;
|
||||
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
|
||||
cur_p->data[0].logit = 1.0f;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// normalize probs
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
cur_p->data[i].p /= p_sum;
|
||||
|
||||
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||
}
|
||||
|
||||
size_org = cur_p->size;
|
||||
p_sum = 0.0f;
|
||||
thold = 1.0/(n_non_eog + 1);
|
||||
|
||||
cur_p->size = 0;
|
||||
|
||||
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
|
||||
|
||||
for (size_t i = 0; i < size_org; ++i) {
|
||||
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||
|
||||
if (cur_p->data[i].p < thold && !is_eog) {
|
||||
continue;
|
||||
}
|
||||
|
||||
p_sum += cur_p->data[i].p;
|
||||
|
||||
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||
}
|
||||
|
||||
// normalize probs
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
cur_p->data[i].p /= p_sum;
|
||||
|
||||
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||
}
|
||||
|
||||
#undef LOG_DBG_CUR
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
|
||||
return llama_sampler_init_infill_impl(*ctx->vocab);
|
||||
}
|
||||
|
||||
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
|
||||
delete (llama_sampler_infill *) smpl->ctx;
|
||||
}
|
||||
|
||||
static struct llama_sampler_i llama_sampler_infill_i = {
|
||||
/* .name = */ llama_sampler_infill_name,
|
||||
/* .accept = */ nullptr,
|
||||
/* .apply = */ llama_sampler_infill_apply,
|
||||
/* .reset = */ nullptr,
|
||||
/* .clone = */ llama_sampler_infill_clone,
|
||||
/* .free = */ llama_sampler_infill_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab) {
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_infill_i,
|
||||
/* .ctx = */ new llama_sampler_infill {
|
||||
/* .vocab = */ &vocab,
|
||||
/* .buf0 = */ std::vector<char>(512),
|
||||
/* .buf1 = */ std::vector<char>(512),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// utils
|
||||
|
||||
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
|
||||
#include "llama-grammar.h"
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_grammar;
|
||||
|
||||
@@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||
const struct llama_vocab & vocab);
|
||||
|
||||
@@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
|
||||
}
|
||||
|
||||
// seed the work queue with all possible 2-character tokens.
|
||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
||||
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||
try_add_bigram(i - 1, i);
|
||||
}
|
||||
|
||||
@@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
|
||||
index++;
|
||||
symbols.emplace_back(sym);
|
||||
}
|
||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
||||
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||
add_new_bigram(i - 1, i);
|
||||
}
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ struct llama_vocab {
|
||||
id special_cls_id = LLAMA_TOKEN_NULL;
|
||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
id linefeed_id = 13;
|
||||
id linefeed_id = 13;
|
||||
|
||||
// fim tokens
|
||||
id special_fim_pre_id = LLAMA_TOKEN_NULL;
|
||||
@@ -149,6 +149,12 @@ int32_t llama_token_to_piece_impl(
|
||||
int32_t lstrip,
|
||||
bool special);
|
||||
|
||||
// check if token0 is contained as a prefix in token1
|
||||
bool llama_token_is_prefix_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token0,
|
||||
llama_token token1);
|
||||
|
||||
int32_t llama_detokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const llama_token * tokens,
|
||||
|
||||
@@ -8,9 +8,7 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
# include "ggml-kompute.h"
|
||||
@@ -3418,8 +3416,6 @@ static int llama_get_device_count(const llama_model & model) {
|
||||
|
||||
#if defined(GGML_USE_SYCL)
|
||||
count += ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
count += ggml_backend_vk_get_device_count();
|
||||
#elif defined(GGML_USE_CANN)
|
||||
count += ggml_backend_cann_get_device_count();
|
||||
#endif
|
||||
@@ -3451,10 +3447,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
|
||||
}
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_vk_host_buffer_type();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
@@ -3473,9 +3465,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
||||
}
|
||||
device -= (int)model.devices.size();
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(device);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(device);
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(device);
|
||||
@@ -3535,11 +3525,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||
size_t free;
|
||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_CANN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
@@ -16095,9 +16080,11 @@ struct llm_build_context {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
@@ -19093,8 +19080,7 @@ bool llama_supports_mlock(void) {
|
||||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
@@ -19225,8 +19211,13 @@ struct llama_model * llama_load_model_from_file(
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
||||
{
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
||||
model->devices.push_back(dev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19421,32 +19412,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
main_gpu -= (int)model->devices.size();
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_vk_init(device);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_SYCL)
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
||||
@@ -21830,6 +21796,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
|
||||
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
|
||||
return llama_sampler_init_infill_impl(model->vocab);
|
||||
}
|
||||
|
||||
//
|
||||
// model split
|
||||
//
|
||||
|
||||
@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^abc?d*efg+(hij)?kl$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
|
||||
root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -709,7 +709,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" "[]{}()|+*?" "\"" space
|
||||
root ::= "\"" ("[]{}()|+*?") "\"" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -722,7 +722,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^\"$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" "\"" "\"" space
|
||||
root ::= "\"" ("\"") "\"" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"regexp with top-level alternation",
|
||||
R"""({
|
||||
"type": "string",
|
||||
"pattern": "^A|B|C|D$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -736,7 +749,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
dot ::= [^\x0A\x0D]
|
||||
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
|
||||
root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
|
||||
root-1 ::= [0-9]
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
)"""
|
||||
|
||||
@@ -111,6 +111,28 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
|
||||
}
|
||||
}
|
||||
|
||||
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
|
||||
const size_t n_vocab = probs.size();
|
||||
|
||||
std::vector<llama_token_data> cur;
|
||||
cur.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
const float logit = logf(probs[token_id]);
|
||||
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
|
||||
APPLY(llama_sampler_init_softmax(), &cur_p);
|
||||
DUMP(&cur_p);
|
||||
APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
|
||||
DUMP(&cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < cur_p.size; i++) {
|
||||
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
const size_t n_vocab = probs.size();
|
||||
|
||||
@@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
|
||||
}
|
||||
const int64_t t_end = ggml_time_us();
|
||||
llama_sampler_free(cnstr);
|
||||
printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
|
||||
printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
|
||||
}
|
||||
|
||||
#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
|
||||
@@ -279,12 +301,13 @@ static void test_perf() {
|
||||
data.emplace_back(llama_token_data{i, logit, 0.0f});
|
||||
}
|
||||
|
||||
BENCH(llama_sampler_init_top_k (40), data, 32);
|
||||
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_softmax (), data, 32);
|
||||
BENCH(llama_sampler_init_top_k (40), data, 32);
|
||||
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
|
||||
BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32);
|
||||
BENCH(llama_sampler_init_softmax (), data, 32);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
@@ -309,6 +332,14 @@ int main(void) {
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
||||
|
||||
printf("XTC should:\n");
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f);
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f);
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f);
|
||||
|
||||
printf("XTC should not:\n");
|
||||
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
|
||||
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
|
||||
|
||||
Reference in New Issue
Block a user