mirror of https://github.com/ggml-org/llama.cpp.git
commit: experiments
@@ -2039,9 +2039,15 @@ private:
                /*.params_spec.n_draft =*/ n_draft_max,
                /*.params_spec.p_min   =*/ slot.task->params.speculative.p_min,
            };

            const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
            llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);

            if (draft.size() > 0) {
                std::string tmp = common_detokenize(slot.ctx, draft);
                //LOG_WRN("XXXXXX: draft: '%s'\n", tmp.c_str());
            }

            // add the sampled token to the batch
            slot.i_batch_dft.push_back(batch.n_tokens);
            common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
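For context, the hunk interleaves draft generation with batch construction: the draft model proposes a short continuation (`common_speculative_gen_draft`), and the previously sampled token is appended to the batch with its index recorded in `slot.i_batch_dft`, so the draft can later be verified against the target model's logits. The standalone sketch below models only that batching pattern; the `batch_t` and `batch_add` types are simplified stand-ins rather than the llama.cpp API, and appending the draft tokens right after the sampled token is an assumption about the code that follows this hunk.

// Minimal standalone sketch of the batching pattern used above: the token
// sampled at the previous step is appended first, followed by the draft
// tokens, so a single decode call could verify the whole draft.
// All types here are simplified stand-ins, NOT the llama.cpp API.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using token_t = int32_t;

struct batch_t {
    std::vector<token_t> tokens;   // token ids to evaluate
    std::vector<int32_t> pos;      // absolute positions in the sequence
    std::vector<bool>    logits;   // request logits for this token?
    std::size_t n_tokens() const { return tokens.size(); }
};

// Append one token to the batch (the counterpart of common_batch_add here).
static void batch_add(batch_t & batch, token_t id, int32_t pos, bool want_logits) {
    batch.tokens.push_back(id);
    batch.pos.push_back(pos);
    batch.logits.push_back(want_logits);
}

int main() {
    batch_t batch;

    const token_t sampled  = 42;                  // token sampled last step (illustrative value)
    const int32_t pos_next = 100;                 // next free position (illustrative value)
    const std::vector<token_t> draft = {7, 8, 9}; // tokens proposed by the draft model

    // Record where each speculative token lands in the batch so the
    // verification step can find the logits for every drafted position.
    std::vector<std::size_t> i_batch_dft;

    // 1) the sampled token itself (mirrors slot.i_batch_dft + common_batch_add)
    i_batch_dft.push_back(batch.n_tokens());
    batch_add(batch, sampled, pos_next, /*want_logits=*/true);

    // 2) the draft tokens at the following positions; logits are requested
    //    for each one so they can all be checked in a single pass
    for (std::size_t i = 0; i < draft.size(); ++i) {
        i_batch_dft.push_back(batch.n_tokens());
        batch_add(batch, draft[i], pos_next + 1 + (int32_t) i, /*want_logits=*/true);
    }

    printf("batch holds %zu tokens (1 sampled + %zu drafted)\n",
           batch.n_tokens(), draft.size());
    return 0;
}

The key design point the sketch isolates: because logits are requested for the sampled token and for every drafted position, one forward pass yields everything needed to accept or reject the draft token by token.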