backend sampling: support returning post-sampling probs (#22622)

* server: Never return 0.0 post-sampling probabilities
* backend sampling: support returning post-sampling probs
2026-05-14 21:14:10 +00:00 · 2026-05-10 19:12:02 +02:00
parent 5d5d2e15d2
commit 2e97c5f96f
4 changed files with 80 additions and 16 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    gsmpl->set_logits(ctx, idx);
+
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+            for (size_t i = 0; i < cur_p.size; ++i) {
+                if (cur_p.data[i].id == id) {
+                    cur_p.selected = i;
+                    break;
+                }
+            }

            return id;
        }
    }

-    gsmpl->set_logits(ctx, idx);
-
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);