sampling : use pinned memory for backend sampling buffers

2026-05-14 04:54:06 +00:00 · 2025-11-21 14:02:16 +01:00
parent c1625620f6
commit 61ffe41dc1
7 changed files with 358 additions and 120 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -113,9 +113,9 @@ struct common_sampler {
    llama_token_data_array cur_p;

    void set_logits(struct llama_context * ctx, int idx) {
-        const float *       sampled_probs  = llama_get_backend_sampled_probs_ith    (ctx, idx);
-        const float *       sampled_logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
-        const llama_token * sampled_ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float *       sampled_probs  = llama_get_backend_sampled_probs_ith     (ctx, idx);
+        const float *       sampled_logits = llama_get_backend_sampled_logits_ith    (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_backend_sampled_candidates_ith(ctx, idx);

        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);