This commit adds support for performing sampling operations on the backend (e.g. GPU) as part of the model computation graph. The motivation is to allow some or all of the sampling to run directly on the backend while the graph is being executed. For example, the backend sampler chain might select/sample a token directly, in which case only the sampled token needs to be transferred from device memory to host memory. Alternatively, the backend samplers can filter the logits, or compute and filter the probability distribution, in which case only the filtered logits or probabilities need to be transferred back to system memory for further processing by the CPU samplers. Currently, backend sampling works in a similar manner to pooling: it is a function called by build_graph, and the sampler operations become part of the model's computation graph.
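A minimal host-side sketch of the intended flow, based on the functions defined in this file; model, ctx and gsmpl are assumed to be initialized elsewhere, and attaching the backend chain to the computation graph happens outside this file:

    common_params_sampling sparams;
    sparams.backend_sampling = true; // build sampler ops into the model graph
    sparams.backend_dist     = true; // let the backend select the token itself

    // construct the backend sampler chain (empty if backend sampling is off)
    llama_sampler * backend_chain = common_sampler_backend_init(model, sparams);

    // after decoding, common_sampler_sample() first checks whether the backend
    // already selected a token and only runs the CPU samplers as a fallback
    const llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ 0, /*grammar_first =*/ false);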
685 lines · 25 KiB · C++
#include "sampling.h"

#include "common.h"
#include "log.h"

#include <cmath>
#include <unordered_map>
#include <algorithm>

// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
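//
// usage sketch (illustrative):
//
//   ring_buffer<int> rb(3);
//   rb.push_back(1); rb.push_back(2); rb.push_back(3);
//   rb.push_back(4);    // full: the oldest element (1) is overwritten
//   rb.front();         // 2 - the oldest remaining element
//   rb.rat(0);          // 4 - reverse access: index 0 is the most recent
//   rb.to_vector();     // {2, 3, 4} - ordered oldest to newest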
template<typename T>
struct ring_buffer {
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    void push_back(const T & value) {
        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0;
    size_t sz = 0;
    size_t first = 0;
    size_t pos = 0;
    std::vector<T> data;
};

struct common_sampler {
    common_params_sampling params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;

    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;

    llama_token_data_array cur_p;

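    // populate cur/cur_p with candidates for position idx: prefer the
    // backend-sampled probabilities, then the backend-sampled (filtered)
    // logits, and fall back to the full logits from the context when no
    // backend sampler output is available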
    void set_logits(struct llama_context * ctx, int idx) {
        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);

        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int n_vocab = llama_vocab_n_tokens(vocab);

        // Use the member variable instead of allocating locally
        cur.clear();

        if (sampled_probs) {
            const uint32_t sampled_probs_count = llama_get_backend_sampled_probs_count_ith(ctx, idx);
            cur.reserve(sampled_probs_count);
            // The backend sampler has filtered the probabilities so we need to use the sampled ids.
            if (sampled_ids != nullptr) {
                for (uint32_t i = 0; i < sampled_probs_count; ++i) {
                    cur.emplace_back(llama_token_data{sampled_ids[i], 0.0f, sampled_probs[i]});
                }
            } else {
                for (llama_token token_id = 0; token_id < (int) sampled_probs_count; token_id++) {
                    cur.emplace_back(llama_token_data{token_id, 0.0f, sampled_probs[token_id]});
                }
            }
        } else if (sampled_logits) {
            const uint32_t sampled_logits_count = llama_get_backend_sampled_logits_count_ith(ctx, idx);
            cur.reserve(sampled_logits_count);
            // The backend sampler has filtered the logits so we need to use the sampled ids.
            if (sampled_ids != nullptr) {
                for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
                    cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
                }
            } else {
                for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
                    cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
                }
            }
        } else {
            const auto * logits = llama_get_logits_ith(ctx, idx);
            GGML_ASSERT(logits != nullptr);
            cur.reserve(n_vocab);
            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
            }
        }

        cur_p = { cur.data(), cur.size(), -1, false };
    }
};

static bool sampler_enabled(const struct common_params_sampling & params, enum common_sampler_type type) {
    return std::find(params.samplers.begin(), params.samplers.end(), type) != params.samplers.end();
}

std::string common_params_sampling::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
}

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

    struct llama_sampler * grmr;
    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
    } else {
        std::vector<std::string> trigger_patterns;
        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
            switch (trigger.type) {
                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                    {
                        const auto & word = trigger.value;
                        patterns_anywhere.push_back(regex_escape(word));
                        break;
                    }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                    {
                        patterns_anywhere.push_back(trigger.value);
                        break;
                    }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                    {
                        trigger_patterns.push_back(trigger.value);
                        break;
                    }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
                    {
                        const auto token = trigger.token;
                        trigger_tokens.push_back(token);
                        break;
                    }
                default:
                    GGML_ASSERT(false && "unknown trigger type");
            }
        }

        if (!patterns_anywhere.empty()) {
            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
        }

        std::vector<const char *> trigger_patterns_c;
        trigger_patterns_c.reserve(trigger_patterns.size());
        for (const auto & regex : trigger_patterns) {
            trigger_patterns_c.push_back(regex.c_str());
        }

        grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                        trigger_tokens.data(), trigger_tokens.size())
             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
        if (!grmr) {
            return nullptr;
        }
    }

    auto * result = new common_sampler {
        /* .params = */ params,
        /* .grmr   = */ grmr,
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
        /* .cur_p  = */ {},
    };

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_logit_bias(
                llama_vocab_n_tokens(vocab),
                params.logit_bias.size(),
                params.logit_bias.data()));

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
            switch (cnstr) {
                case COMMON_SAMPLER_TYPE_DRY:
                    {
                        std::vector<const char *> c_breakers;
                        c_breakers.reserve(params.dry_sequence_breakers.size());
                        for (const auto & str : params.dry_sequence_breakers) {
                            c_breakers.push_back(str.c_str());
                        }

                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
                    break;
                case COMMON_SAMPLER_TYPE_TOP_K:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
                    break;
                case COMMON_SAMPLER_TYPE_MIN_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_XTC:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
                case COMMON_SAMPLER_TYPE_TYPICAL_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TEMPERATURE:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                case COMMON_SAMPLER_TYPE_INFILL:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
                    break;
                case COMMON_SAMPLER_TYPE_PENALTIES:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

    return result;
}

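// build the chain of samplers that will run on the backend (e.g. GPU) as part
// of the model graph: logit biases, then temperature, then top-k, then
// optionally final token selection (dist); an empty chain is returned when
// backend sampling is disabled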
struct llama_sampler * common_sampler_backend_init(const struct llama_model * model, const struct common_params_sampling & params) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    chain_params.no_perf = params.no_perf;

    struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
    if (!params.backend_sampling) {
        return chain; // return empty chain
    }

    const bool enable_temp  = params.temp > 0.0f && sampler_enabled(params, COMMON_SAMPLER_TYPE_TEMPERATURE);
    const bool enable_top_k = params.top_k > 0   && sampler_enabled(params, COMMON_SAMPLER_TYPE_TOP_K);
    const bool enable_dist  = params.backend_dist;

    if (!params.logit_bias.empty()) {
        llama_sampler_chain_add(chain, llama_sampler_backend_init_logit_bias(
                    llama_vocab_n_tokens(vocab),
                    params.logit_bias.size(),
                    params.logit_bias.data()));
    }

    if (enable_temp) {
        llama_sampler_chain_add(chain, llama_sampler_backend_init_temp(params.temp));
    }

    if (enable_top_k) {
        llama_sampler_chain_add(chain, llama_sampler_backend_init_top_k(params.top_k));
    }

    if (enable_dist) {
        llama_sampler_chain_add(chain, llama_sampler_backend_init_dist(params.seed));
    }

    return chain;
}

void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
    }
}

void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }

    llama_sampler_accept(gsmpl->chain, token);

    gsmpl->prev.push_back(token);
}

void common_sampler_reset(struct common_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);

    llama_sampler_reset(gsmpl->chain);
}

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
        /* .prev   = */ gsmpl->prev,
        /* .cur    = */ gsmpl->cur,
        /* .cur_p  = */ gsmpl->cur_p,
    };
}

void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance

    if (gsmpl) {
        llama_perf_sampler_print(gsmpl->chain);
    }
    if (ctx) {
        llama_perf_context_print(ctx);
        llama_memory_breakdown_print(ctx);
    }
}

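// if a backend sampler has already selected a token for this position it is
// returned directly; otherwise the CPU chain runs on the (possibly
// backend-filtered) candidates, and a token that is rejected by the grammar
// triggers a single grammar-constrained resample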
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
        return backend_sampled_token;
    }

    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

    const llama_token id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

    // check if the sampled token fits the grammar
    {
        llama_token_data single_token_data = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
}

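// sample up to draft.size() + 1 tokens, accepting each into the sampler state
// and stopping at the first token that disagrees with the draft; this is the
// verification pattern typically used for speculative decoding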
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
    result.reserve(idxs.size());

    size_t i = 0;
    for (; i < draft.size(); i++) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

        common_sampler_accept(gsmpl, id, true);

        result.push_back(id);

        if (draft[i] != id) {
            break;
        }
    }

    if (i == draft.size()) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

        common_sampler_accept(gsmpl, id, true);

        result.push_back(id);
    }

    return result;
}

std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
}

// helpers

llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
    auto * res = &gsmpl->cur_p;

    if (do_sort && !res->sorted) {
        // remember the selected token before sorting
        const llama_token id = res->data[res->selected].id;

        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
            return a.p > b.p;
        });

        // restore the selected token after sorting
        for (size_t i = 0; i < res->size; ++i) {
            if (res->data[i].id == id) {
                res->selected = i;
                break;
            }
        }

        res->sorted = true;
    }

    return res;
}

llama_token common_sampler_last(const struct common_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
}

std::string common_sampler_print(const struct common_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
    }

    return result;
}

std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
        return "";
    }

    std::string result;
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

        result += common_token_to_piece(ctx_main, id);
    }

    return result;
}

char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
    switch (cnstr) {
        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
        default : return '?';
    }
}

std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
    switch (cnstr) {
        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
        default : return "";
    }
}

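// map sampler names to types: canonical names are always accepted, while
// common alternative spellings (e.g. "top-k", "nucleus") are only accepted
// when allow_alt_names is set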
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
    };

    // since sampler names are written in multiple ways,
    // accept both the canonical system names and alternative input spellings
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
    };

    std::vector<common_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
            continue;
        }
        if (allow_alt_names) {
            sampler = sampler_alt_name_map.find(name);
            if (sampler != sampler_alt_name_map.end()) {
                samplers.push_back(sampler->second);
                continue;
            }
        }
        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
}

std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
    std::unordered_map<char, common_sampler_type> sampler_name_map = {
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
    };

    std::vector<common_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
        } else {
            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

    return samplers;
}