Merge branch 'master' into pr/18039

Georgi Gerganov
2026-03-16 15:41:24 +02:00
857 changed files with 80689 additions and 43153 deletions


@@ -12,6 +12,7 @@
#include "mtmd.h"
#include "mtmd-helper.h"
#include <algorithm>
#include <cstddef>
#include <cinttypes>
#include <memory>
@@ -561,7 +562,7 @@ private:
llama_model_ptr model_dft;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
@@ -569,6 +570,7 @@ private:
std::vector<server_slot> slots;
int slots_debug = 0;
int n_empty_consecutive = 0;
std::unique_ptr<server_prompt_cache> prompt_cache;
@@ -580,6 +582,8 @@ private:
float slot_prompt_similarity = 0.0f;
std::string model_name; // name of the loaded model, to be used by API
std::set<std::string> model_aliases; // additional names for the model
std::set<std::string> model_tags; // informational tags
bool sleeping = false;
@@ -736,6 +740,13 @@ private:
}
}
if (llama_model_n_swa(model) == 0) {
if (params_base.swa_full) {
params_base.swa_full = false;
SRV_WRN("%s\n", "swa_full is not supported by this model, it will be disabled");
}
}
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
@@ -824,10 +835,9 @@ private:
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
if (!params_base.model_alias.empty()) {
// user explicitly specified model name
model_name = params_base.model_alias;
// backward compat: use first alias as model name
model_name = *params_base.model_alias.begin();
} else if (!params_base.model.name.empty()) {
// use model name in registry format (for models in cache)
model_name = params_base.model.name;
} else {
// fallback: derive model name from file name
@@ -835,6 +845,9 @@ private:
model_name = model_path.filename().string();
}
model_aliases = params_base.model_alias;
model_tags = params_base.model_tags;
if (!is_resume) {
return init();
}
@@ -891,9 +904,10 @@ private:
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 1. It's not explicitly disabled via --reasoning off
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
const bool template_supports_thinking = params_base.use_jinja && common_chat_templates_support_enable_thinking(chat_templates.get());
const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
chat_params = {
@@ -905,6 +919,8 @@ private:
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.reasoning_budget,
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
/* media_path */ params_base.media_path,
};
}
@@ -1006,9 +1022,6 @@ private:
// don't update the cache if the slot's context is empty
update_cache = update_cache && tokens.size() > 0;
// TODO: mtmd does not support prompt cache
update_cache = update_cache && (ret->mctx == nullptr);
if (update_cache) {
SRV_WRN("%s", "updating prompt cache\n");
@@ -1187,6 +1200,9 @@ private:
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
: SLOT_STATE_STARTED;
// reset server kill-switch counter
n_empty_consecutive = 0;
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
return true;
}
@@ -1453,7 +1469,7 @@ private:
res->id = slot.task->id;
res->id_slot = slot.id;
res->index = slot.task->index;
// keep copy of last generated text for debugging purposes
if (slots_debug) {
@@ -2142,6 +2158,9 @@ private:
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
const auto & input_tokens = slot.task->tokens;
// used to determine the number of tokens added to the batch for the current slot
const auto n_tokens_prev = batch.n_tokens;
// TODO: maybe move branch to outside of this loop in the future
if (slot.state == SLOT_STATE_STARTED) {
slot.t_start_process_prompt = ggml_time_us();
@@ -2293,15 +2312,15 @@ private:
n_past = 0;
}
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
// note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1
const auto n_swa = std::max(1, llama_model_n_swa(model));
// the largest pos_min required for a checkpoint to be useful
const auto pos_min_thold = std::max(0, n_past - n_swa);
const auto pos_min_thold = std::max(0, pos_next - n_swa);
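// worked example (illustrative numbers): with pos_next = 100 and n_swa = 32,
// pos_min_thold = max(0, 100 - 32) = 68, so any cached or restored state must
// still hold KV data for positions >= 68 to satisfy the SWA window.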
// note: disallow with mtmd contexts for now
// https://github.com/ggml-org/llama.cpp/issues/17043
if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) {
if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
if (pos_min == -1) {
SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
@@ -2352,17 +2371,16 @@ private:
}
if (pos_min > pos_min_thold) {
// TODO: support can be added in the future when corresponding vision models get released
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
// search for a context checkpoint
const auto it = std::find_if(
slot.prompt.checkpoints.rbegin(),
slot.prompt.checkpoints.rend(),
[&](const auto & cur) {
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
return cur.pos_min < pos_min_thold;
}
);
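// note: iterating rbegin()..rend() scans the checkpoints newest-first, so the
// first match with cur.pos_min < pos_min_thold is the most recent usable one.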
@@ -2375,18 +2393,20 @@ private:
const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024);
SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024);
do_reset = true;
//printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
} else {
n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max));
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024);
pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) checkpoint_size / 1024 / 1024);
}
}
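// worked example (illustrative numbers): restoring a checkpoint with
// pos_min = 10, pos_max = 200, n_tokens = 180 yields
// pos_next = min(pos_next, max(10 + 1, 200)) = min(pos_next, 200), and
// n_past is clamped to at most the checkpoint's 180 cached tokens.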
if (do_reset) {
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
pos_next = 0;
n_past = 0;
}
}
@@ -2397,7 +2417,7 @@ private:
for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) {
const auto & cur = *it;
if (cur.pos_min > pos_min_thold) {
SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024);
SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, (float) cur.data.size() / 1024 / 1024);
it = slot.prompt.checkpoints.erase(it);
} else {
++it;
@@ -2413,7 +2433,7 @@ private:
SLT_WRN(slot, "n_past was set to %d\n", n_past);
}
slot.n_prompt_tokens_cache = n_past;
slot.n_prompt_tokens_processed = 0;
slot.prompt.tokens.keep_first(n_past);
@@ -2446,6 +2466,8 @@ private:
slot.n_prompt_tokens_cache = 0;
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// check if we should process the image
if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
@@ -2465,6 +2487,8 @@ private:
const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
slot.prompt.tokens.push_back(chunk.get()); // copy
}
do_checkpoint = false; // do not checkpoint right after an image chunk
}
// If using an alora, there may be uncached tokens that come
@@ -2481,8 +2505,6 @@ private:
alora_disabled_id = enabled_loras[0];
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// make checkpoints only for completion tasks
do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION;
@@ -2525,15 +2547,29 @@ private:
slot.n_prompt_tokens_processed++;
// process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
const int n_last = std::min(n_batch, 512);
if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) {
break;
// create checkpoints that many tokens before the end of the prompt:
// - 4 + n_ubatch
// - 4
// ref: https://github.com/ggml-org/llama.cpp/pull/20288
{
const int checkpoint_offsets[] = {4 + n_ubatch, 4};
bool should_break = false;
for (int offset : checkpoint_offsets) {
const int n_last = std::min(n_batch, offset);
if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) {
should_break = true;
break;
}
}
if (should_break) {
break;
}
}
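// worked example (assuming n_ubatch = 512 and n_batch >= 516, illustrative
// values only): a checkpoint is attempted when exactly 4 + 512 = 516 or
// 4 prompt tokens remain, i.e. once before the final ubatch and once right
// before the last few tokens.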
}
// SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.prompt.tokens.str().c_str());
SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
// the number of tokens added to the batch for the current slot
const auto n_tokens_cur = batch.n_tokens - n_tokens_prev;
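// e.g. if the batch held 12 tokens before this slot and holds 40 after,
// n_tokens_cur = 40 - 12 = 28 tokens were added for this slot (illustrative).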
// entire prompt has been processed
if (slot.prompt.n_tokens() == slot.task->n_tokens()) {
@@ -2547,43 +2583,75 @@ private:
slot.n_decoded = 0;
slot.i_batch = batch.n_tokens - 1;
SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
slot.init_sampler();
SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
} else {
if (slot.task->n_tokens() >= slot.prompt.n_tokens() + n_ubatch) {
// not near the end of the prompt: only create intermediate checkpoints
// when the "checkpoint every n tokens" option is set
do_checkpoint = do_checkpoint && params_base.checkpoint_every_nt > 0;
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
if (do_checkpoint) {
llama_pos last_checkpoint = 0;
if (!slot.prompt.checkpoints.empty()) {
last_checkpoint = slot.prompt.checkpoints.back().n_tokens;
}
// no need for empty or small checkpoints
do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64);
do_checkpoint = do_checkpoint && slot.prompt.n_tokens() - batch.n_tokens - last_checkpoint >= params_base.checkpoint_every_nt;
// no need to create checkpoints that are too close together
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64);
if (do_checkpoint) {
SLT_INF(slot, "%d tokens since last checkpoint at %d, creating new checkpoint during processing at position %d\n", params_base.checkpoint_every_nt, last_checkpoint, slot.prompt.n_tokens());
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
}
SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
}
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
// no need for empty or small checkpoints
do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64);
// no need to create checkpoints that are too close together
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64);
// note: we create the checkpoint before calling llama_decode(), so the current batch is not
// yet processed and therefore it is not part of the checkpoint.
if (do_checkpoint) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot,
"erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const size_t checkpoint_size =
llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id,
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
}
}
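// --- aside: minimal standalone sketch of the eviction policy above ---
// (simplified assumed types, not the server's actual structs; illustrative only)
#include <cstdint>
#include <utility>
#include <vector>

struct checkpoint_sketch {
    int32_t pos_min;
    int32_t pos_max;
    int64_t n_tokens;
    std::vector<uint8_t> data;
};

// keep at most n_max checkpoints, evicting the oldest first before appending
static void push_checkpoint(std::vector<checkpoint_sketch> & cps, checkpoint_sketch cp, size_t n_max) {
    while (!cps.empty() && cps.size() >= n_max) {
        cps.erase(cps.begin()); // drop the oldest checkpoint
    }
    cps.push_back(std::move(cp));
}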
@@ -2615,6 +2683,12 @@ private:
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
if (++n_empty_consecutive > 3) {
GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
}
} else {
n_empty_consecutive = 0;
}
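// i.e. four consecutive scheduler iterations that produce an empty batch
// trigger the abort; any non-empty batch resets the counter to zero.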
int32_t i_next = 0;
@@ -2906,6 +2980,8 @@ server_context_meta server_context::get_meta() const {
return server_context_meta {
/* build_info */ build_info,
/* model_name */ impl->model_name,
/* model_aliases */ impl->model_aliases,
/* model_tags */ impl->model_tags,
/* model_path */ impl->params_base.model.path,
/* has_mtmd */ impl->mctx != nullptr,
/* has_inp_image */ impl->chat_params.allow_image,
@@ -3702,6 +3778,8 @@ void server_routes::init_routes() {
{"data", {
{
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},