mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-14 21:14:10 +00:00
spec : discard last drafted token with low prob (#22506)
This commit is contained in:
@@ -354,6 +354,7 @@ struct server_slot {
|
||||
|
||||
// generate a new draft
|
||||
spec_draft = common_speculative_draft(spec.get(), params_spec, tokens, sampled);
|
||||
n_draft_total += spec_draft.size();
|
||||
|
||||
if (spec_draft.size() > (size_t) n_draft_max) {
|
||||
SLT_WRN(*this, "draft size %d exceeds max %d, truncating\n", (int) spec_draft.size(), n_draft_max);
|
||||
@@ -3019,7 +3020,6 @@ private:
|
||||
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
slot.n_draft_total += n_draft;
|
||||
|
||||
// add accepted tokens to the prompt
|
||||
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
|
||||
|
||||
Reference in New Issue
Block a user