backend sampling: support returning post-sampling probs (#22622)

* server: Never return 0.0 post-sampling probabilities

* backend sampling: support returning post-sampling probs
This commit is contained in:
Tim Neumann
2026-05-10 19:12:02 +02:00
committed by GitHub
parent 5d5d2e15d2
commit 2e97c5f96f
4 changed files with 80 additions and 16 deletions

View File

@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
gsmpl->set_logits(ctx, idx);
// Check if a backend sampler has already sampled a token in which case we
// return that token id directly.
{
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
gsmpl->cur[0] = { id, 0.0f, 1.0f };
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
for (size_t i = 0; i < cur_p.size; ++i) {
if (cur_p.data[i].id == id) {
cur_p.selected = i;
break;
}
}
return id;
}
}
gsmpl->set_logits(ctx, idx);
// apply reasoning budget first
llama_sampler_apply(rbudget, &cur_p);