From d719d8aafc2fede0460e87c8674500250b7b9fe0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 May 2026 13:07:44 +0300 Subject: [PATCH] server : improve ctx names [no ci] --- tools/server/server-context.cpp | 292 ++++++++++++++++---------------- tools/server/server-task.cpp | 18 +- tools/server/server-task.h | 14 +- 3 files changed, 161 insertions(+), 163 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index e0036312bf..48f020898e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -38,16 +38,16 @@ constexpr int HTTP_POLLING_SECONDS = 1; static void server_prompt_checkpoint_update( server_prompt_checkpoint & ckpt, - llama_context * ctx, - llama_context * ctx_dft, + llama_context * ctx_main, + llama_context * ctx_drft, int id, int64_t n_tokens, bool on_device, llama_pos pos_min = -1, llama_pos pos_max = -1) { if (pos_min == -1) { - pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id); + pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_main), id); } if (pos_max == -1) { - pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id); + pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_main), id); } auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY; @@ -55,29 +55,27 @@ static void server_prompt_checkpoint_update( flags |= LLAMA_STATE_SEQ_FLAGS_ON_DEVICE; } - const size_t ckpt_size_main = llama_state_seq_get_size_ext(ctx, id, flags); - const size_t ckpt_size_dft = ctx_dft ? llama_state_seq_get_size_ext(ctx_dft, id, flags) : 0; + const size_t ckpt_size_main = llama_state_seq_get_size_ext(ctx_main, id, flags); + const size_t ckpt_size_drft = ctx_drft ? llama_state_seq_get_size_ext(ctx_drft, id, flags) : 0; ckpt.pos_min = pos_min; ckpt.pos_max = pos_max; ckpt.n_tokens = n_tokens; ckpt.data_main.resize(ckpt_size_main); - ckpt.data_dft.resize (ckpt_size_dft); + ckpt.data_drft.resize(ckpt_size_drft); { - const size_t n = llama_state_seq_get_data_ext(ctx, ckpt.data_main.data(), ckpt_size_main, id, flags); + const size_t n = llama_state_seq_get_data_ext(ctx_main, ckpt.data_main.data(), ckpt_size_main, id, flags); if (n != ckpt_size_main) { GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size_main, n); } } - { - if (ctx_dft) { - const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data_dft.data(), ckpt_size_dft, id, flags); - if (n != ckpt_size_dft) { - GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size_dft, n); - } + if (ctx_drft) { + const size_t n = llama_state_seq_get_data_ext(ctx_drft, ckpt.data_drft.data(), ckpt_size_drft, id, flags); + if (n != ckpt_size_drft) { + GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size_drft, n); } } } @@ -100,8 +98,8 @@ enum server_state { struct server_slot { int id; - llama_context * ctx = nullptr; - llama_context * ctx_dft = nullptr; + llama_context * ctx_main = nullptr; + llama_context * ctx_drft = nullptr; // multimodal mtmd_context * mctx = nullptr; @@ -154,27 +152,27 @@ struct server_slot { void prompt_save(server_prompt_cache & prompt_cache) const { GGML_ASSERT(prompt.data.size() == 0); - const size_t cur_size_main = llama_state_seq_get_size_ext(ctx, id, 0); - const size_t cur_size_dft = ctx_dft ? llama_state_seq_get_size_ext(ctx_dft, id, 0) : 0; + const size_t cur_size_main = llama_state_seq_get_size_ext(ctx_main, id, 0); + const size_t cur_size_drft = ctx_drft ? llama_state_seq_get_size_ext(ctx_drft, id, 0) : 0; - const size_t cur_size = cur_size_main + cur_size_dft; + const size_t cur_size = cur_size_main + cur_size_drft; SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB (draft: %.3f MiB)\n", - (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0), cur_size_dft / (1024.0 * 1024.0)); + (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0), cur_size_drft / (1024.0 * 1024.0)); - auto * cur = prompt_cache.alloc(prompt, cur_size_main, cur_size_dft); + auto * cur = prompt_cache.alloc(prompt, cur_size_main, cur_size_drft); if (cur == nullptr) { return; } - llama_state_seq_get_data_ext(ctx, cur->data.main.data(), cur_size_main, id, 0); - if (ctx_dft) { - llama_state_seq_get_data_ext(ctx_dft, cur->data.dft.data(), cur_size_dft, id, 0); + llama_state_seq_get_data_ext(ctx_main, cur->data.main.data(), cur_size_main, id, 0); + if (ctx_drft) { + llama_state_seq_get_data_ext(ctx_drft, cur->data.drft.data(), cur_size_drft, id, 0); } } bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) { - bool res = prompt_cache.load(prompt, tokens, ctx, ctx_dft, id); + bool res = prompt_cache.load(prompt, tokens, ctx_main, ctx_drft, id); if (!res) { SLT_WRN(*this, "%s", "failed to load prompt from cache\n"); } @@ -189,9 +187,9 @@ struct server_slot { SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size()); - llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1); - if (ctx_dft) { - llama_memory_seq_rm(llama_get_memory(ctx_dft), id, -1, -1); + llama_memory_seq_rm(llama_get_memory(ctx_main), id, -1, -1); + if (ctx_drft) { + llama_memory_seq_rm(llama_get_memory(ctx_drft), id, -1, -1); } prompt.tokens.clear(); @@ -251,7 +249,7 @@ struct server_slot { task_prev = std::move(task); task.reset(); - llama_set_sampler(ctx, id, nullptr); + llama_set_sampler(ctx_main, id, nullptr); // clear alora start alora_invocation_start = -1; @@ -288,7 +286,7 @@ struct server_slot { return !task->need_embd() || - (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST); + (llama_get_memory(ctx_main) && llama_pooling_type(ctx_main) == LLAMA_POOLING_TYPE_LAST); } bool can_batch_with(server_slot & other_slot) const { @@ -396,7 +394,7 @@ struct server_slot { //const int64_t t_start = ggml_time_us(); - server_prompt_checkpoint_update(spec_ckpt, ctx, ctx_dft, this->id, n_tokens, true); + server_prompt_checkpoint_update(spec_ckpt, ctx_main, ctx_drft, this->id, n_tokens, true); //const int64_t t_total = ggml_time_us() - t_start; //printf("checkpoint total: %f ms\n", t_total / 1000.0); @@ -568,7 +566,7 @@ struct server_slot { }; if (!only_metrics) { - res["prompt"] = ptask->tokens.detokenize(ctx, true); + res["prompt"] = ptask->tokens.detokenize(ctx_main, true); res["generated"] = generated_text.empty() ? debug_generated_text : generated_text; } } @@ -579,8 +577,8 @@ struct server_slot { void copy_state_to(server_slot & other) const { GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT); - llama_memory_seq_rm(llama_get_memory(ctx), other.id, -1, -1); - llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1); + llama_memory_seq_rm(llama_get_memory(ctx_main), other.id, -1, -1); + llama_memory_seq_cp(llama_get_memory(ctx_main), id, other.id, -1, -1); other.n_decoded = n_decoded; other.n_remaining = n_remaining; @@ -671,7 +669,8 @@ public: // only use these pointers outside of this class: // - when not in sleeping state // - and, with thread-safe APIs (e.g., tokenizer calls) - llama_model * model = nullptr; + llama_model * model_main = nullptr; + mtmd_context * mctx = nullptr; const llama_vocab * vocab = nullptr; @@ -698,17 +697,15 @@ private: // note: keep these alive - they determine the lifetime of the model, context, etc. common_init_result_ptr llama_init; - // TODO: rename to ctx_main - llama_context * ctx = nullptr; + llama_context * ctx_main = nullptr; llama_batch batch {}; - // TODO: rename to *_drft - llama_model_ptr model_dft; - llama_context_ptr ctx_dft; + llama_model_ptr model_drft; + llama_context_ptr ctx_drft; - common_context_seq_rm_type ctx_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; - common_context_seq_rm_type ctx_dft_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; + common_context_seq_rm_type ctx_main_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; + common_context_seq_rm_type ctx_drft_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; bool add_bos_token = true; @@ -743,8 +740,8 @@ private: void destroy() { llama_init.reset(); - ctx = nullptr; - model = nullptr; + ctx_main = nullptr; + model_main = nullptr; mtmd_free(mctx); mctx = nullptr; @@ -794,17 +791,17 @@ private: llama_init = common_init_from_params(params_base); - model = llama_init->model(); - ctx = llama_init->context(); + model_main = llama_init->model(); + ctx_main = llama_init->context(); - if (model == nullptr) { + if (model_main == nullptr) { SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); return false; } - vocab = llama_model_get_vocab(model); + vocab = llama_model_get_vocab(model_main); - n_ctx = llama_n_ctx(ctx); + n_ctx = llama_n_ctx(ctx_main); add_bos_token = llama_vocab_get_add_bos(vocab); @@ -831,20 +828,20 @@ private: auto mparams_dft = common_model_params_to_llama(params_dft); - model_dft.reset(llama_model_load_from_file(params_dft.model.path.c_str(), mparams_dft)); - if (model_dft == nullptr) { + model_drft.reset(llama_model_load_from_file(params_dft.model.path.c_str(), mparams_dft)); + if (model_drft == nullptr) { SRV_ERR("failed to load draft model, '%s'\n", params_dft.model.path.c_str()); return false; } auto cparams = common_context_params_to_llama(params_dft); - ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); + ctx_drft.reset(llama_init_from_model(model_drft.get(), cparams)); - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + ctx_drft_seq_rm_type = common_context_can_seq_rm(ctx_drft.get()); - params_base.speculative.draft.ctx_tgt = ctx; - params_base.speculative.draft.ctx_dft = ctx_dft.get(); - params_base.speculative.draft.use_ckpt = ctx_dft_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + params_base.speculative.draft.ctx_tgt = ctx_main; + params_base.speculative.draft.ctx_dft = ctx_drft.get(); + params_base.speculative.draft.use_ckpt = ctx_drft_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; } std::string & mmproj_path = params_base.mmproj.path; @@ -864,7 +861,7 @@ private: mparams.image_max_tokens = params_base.image_max_tokens; mparams.media_marker = get_media_marker(); - mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); + mctx = mtmd_init_from_file(mmproj_path.c_str(), model_main, mparams); if (mctx == nullptr) { SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); return false; @@ -882,7 +879,7 @@ private: } } - if (!llama_memory_can_shift(llama_get_memory(ctx))) { + if (!llama_memory_can_shift(llama_get_memory(ctx_main))) { if (params_base.ctx_shift) { params_base.ctx_shift = false; SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled"); @@ -894,14 +891,14 @@ private: } } - if (llama_model_n_swa(model) == 0) { + if (llama_model_n_swa(model_main) == 0) { if (params_base.swa_full) { params_base.swa_full = false; SRV_WRN("%s\n", "swa_full is not supported by this model, it will be disabled"); } } - n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model); + n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model_main); // Necessary similarity of prompt for slot selection slot_prompt_similarity = params_base.slot_prompt_similarity; @@ -909,9 +906,9 @@ private: // setup slots SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_model_n_ctx_train(model_main); - int n_ctx_slot = llama_n_ctx_seq(ctx); + int n_ctx_slot = llama_n_ctx_seq(ctx_main); if (n_ctx_slot > n_ctx_train) { SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); n_ctx_slot = n_ctx_train; @@ -919,12 +916,12 @@ private: slots.clear(); - ctx_seq_rm_type = common_context_can_seq_rm(ctx); - if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) { + ctx_main_seq_rm_type = common_context_can_seq_rm(ctx_main); + if (ctx_main_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) { SRV_WRN("%s", "speculative decoding not supported by this context\n"); } - if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { + if (ctx_main_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { SRV_WRN("%s", "speculative decoding will use checkpoints\n"); } @@ -936,15 +933,15 @@ private: for (int i = 0; i < params_base.n_parallel; i++) { server_slot & slot = slots[i]; - slot.id = i; - slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; + slot.id = i; + slot.ctx_main = ctx_main; + slot.n_ctx = n_ctx_slot; slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; // try speculative decoding - if (ctx_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) { + if (ctx_main_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) { try { slot.spec.reset(common_speculative_init(params_base.speculative, slot.id)); } catch (const std::exception & e) { @@ -986,7 +983,7 @@ private: // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { - const int32_t n_batch = llama_n_batch(ctx); + const int32_t n_batch = llama_n_batch(ctx_main); batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } @@ -1030,8 +1027,9 @@ private: // unlike load_model(), this is only called once during initialization bool init() { - GGML_ASSERT(ctx != nullptr); - GGML_ASSERT(model != nullptr); + GGML_ASSERT(ctx_main != nullptr); + GGML_ASSERT(model_main != nullptr); + GGML_ASSERT(!sleeping); // wiring up server queues @@ -1077,7 +1075,7 @@ private: common_chat_templates_ptr chat_templates; try { - chat_templates = common_chat_templates_init(model, params_base.chat_template); + chat_templates = common_chat_templates_init(model_main, params_base.chat_template); LOG_INF("%s: chat template, example_format: '%s'\n", __func__, common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); @@ -1340,7 +1338,7 @@ private: } } - if (!task.tokens.validate(ctx)) { + if (!task.tokens.validate(ctx_main)) { send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); return false; } @@ -1350,7 +1348,7 @@ private: // initialize samplers if (task.need_sampling()) { try { - slot.smpl.reset(common_sampler_init(model, task.params.sampling)); + slot.smpl.reset(common_sampler_init(model_main, task.params.sampling)); } catch (std::exception & e) { std::string err_msg = std::string("Failed to initialize samplers: ") + e.what(); send_error(task, err_msg, ERROR_TYPE_INVALID_REQUEST); @@ -1371,9 +1369,9 @@ private: // TODO: tmp until backend sampling is fully implemented if (backend_sampling) { - llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get())); + llama_set_sampler(ctx_main, slot.id, common_sampler_get(slot.smpl.get())); } else { - llama_set_sampler(ctx, slot.id, nullptr); + llama_set_sampler(ctx_main, slot.id, nullptr); } SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str()); @@ -1546,13 +1544,13 @@ private: for (size_t i = 0; i < n_probs; i++) { result.probs.push_back({ cur_p->data[i].id, - common_token_to_piece(ctx, cur_p->data[i].id, special), + common_token_to_piece(ctx_main, cur_p->data[i].id, special), cur_p->data[i].p }); } } else { // TODO: optimize this with min-p optimization - std::vector cur = get_token_probabilities(ctx, idx); + std::vector cur = get_token_probabilities(ctx_main, idx); const size_t max_probs = cur.size(); const size_t n_probs = std::min(max_probs, n_probs_request); @@ -1570,7 +1568,7 @@ private: for (size_t i = 0; i < n_probs; i++) { result.probs.push_back({ cur[i].id, - common_token_to_piece(ctx, cur[i].id, special), + common_token_to_piece(ctx_main, cur[i].id, special), cur[i].p }); } @@ -1673,7 +1671,7 @@ private: res->tokens = std::move(slot.generated_tokens); } res->timings = slot.get_timings(); - res->prompt = slot.task->tokens.detokenize(ctx, true); + res->prompt = slot.task->tokens.detokenize(ctx_main, true); res->response_fields = std::move(slot.task->params.response_fields); res->truncated = slot.truncated; @@ -1696,7 +1694,7 @@ private: // populate res.probs_output if (slot.task->params.sampling.n_probs > 0) { if (!slot.task->params.stream && slot.stop == STOP_TYPE_WORD) { - const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + const llama_tokens stop_word_toks = common_tokenize(ctx_main, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); res->probs_output = std::vector( @@ -1721,7 +1719,7 @@ private: res->n_tokens = slot.task->n_tokens(); res->res_type = slot.task->params.res_type; - const int n_embd_out = llama_model_n_embd_out(model); + const int n_embd_out = llama_model_n_embd_out(model_main); std::vector embd_res(n_embd_out, 0.0f); @@ -1731,10 +1729,10 @@ private: } const float * embd = nullptr; - if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) { - embd = llama_get_embeddings_ith(ctx, i); + if (llama_pooling_type(slot.ctx_main) == LLAMA_POOLING_TYPE_NONE) { + embd = llama_get_embeddings_ith(slot.ctx_main, i); } else { - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd = llama_get_embeddings_seq(slot.ctx_main, batch.seq_id[i][0]); } if (embd == nullptr) { @@ -1745,7 +1743,7 @@ private: } // normalize only when there is pooling - if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + if (llama_pooling_type(slot.ctx_main) != LLAMA_POOLING_TYPE_NONE) { common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize); res->embedding.push_back(embd_res); break; @@ -1770,9 +1768,9 @@ private: continue; } - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + const float * embd = llama_get_embeddings_seq(ctx_main, batch.seq_id[i][0]); if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); + embd = llama_get_embeddings_ith(ctx_main, i); } if (embd == NULL) { @@ -1883,7 +1881,7 @@ private: } auto & cur = slot.prompt.checkpoints.emplace_back(); - server_prompt_checkpoint_update(cur, ctx, ctx_dft.get(), slot.id, slot.prompt.n_tokens() - n_tokens_cur, false, pos_min, pos_max); + server_prompt_checkpoint_update(cur, ctx_main, ctx_drft.get(), slot.id, slot.prompt.n_tokens() - n_tokens_cur, false, pos_min, pos_max); SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", @@ -2043,7 +2041,7 @@ private: std::string filepath = task.slot_action.filepath; const llama_tokens & tokens = slot->prompt.tokens.get_tokens(); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); + const size_t nwrite = llama_state_seq_save_file(ctx_main, filepath.c_str(), slot->id, tokens.data(), token_count); const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; @@ -2082,7 +2080,7 @@ private: llama_tokens tokens; tokens.resize(slot->n_ctx); size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); + size_t nread = llama_state_seq_load_file(ctx_main, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); if (nread == 0) { slot->prompt.tokens.clear(); // KV may already been invalidated? send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); @@ -2241,8 +2239,8 @@ private: SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep , n_keep + n_discard); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); + llama_memory_seq_rm (llama_get_memory(ctx_main), slot.id, n_keep , n_keep + n_discard); + llama_memory_seq_add(llama_get_memory(ctx_main), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); // add generated tokens to cache // ref: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2473269481 @@ -2288,12 +2286,12 @@ private: continue; } - slot.update_batch(batch, ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL); + slot.update_batch(batch, ctx_main_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL); } // process in chunks of params.n_batch - int32_t n_batch = llama_n_batch(ctx); - int32_t n_ubatch = llama_n_ubatch(ctx); + int32_t n_batch = llama_n_batch(ctx_main); + int32_t n_ubatch = llama_n_ubatch(ctx_main); float alora_scale = -1.0f; size_t alora_disabled_id = 0; @@ -2337,12 +2335,12 @@ private: /*if (1) { // first 16 tokens (avoid flooding logs) for (int i = 0; i < std::min(16, input_tokens.size()); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx_main, input_tokens[i]).c_str()); } } else { // all for (int i = 0; i < (int) input_tokens.size(); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx_main, input_tokens[i]).c_str()); } }*/ @@ -2361,7 +2359,7 @@ private: } // TODO: support memory-less logits computation - if (slot.task->need_logits() && !llama_get_memory(ctx)) { + if (slot.task->need_logits() && !llama_get_memory(ctx_main)) { send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER); slot.release(); continue; @@ -2413,7 +2411,7 @@ private: const auto n_cache_reuse = slot.task->params.n_cache_reuse; const bool can_cache_reuse = - llama_memory_can_shift(llama_get_memory(ctx)) && + llama_memory_can_shift(llama_get_memory(ctx_main)) && !slot.prompt.tokens.has_mtmd; if (!can_cache_reuse && n_cache_reuse > 0) { @@ -2423,7 +2421,7 @@ private: // reuse chunks from the cached prompt by shifting their KV cache in the new position if (can_cache_reuse && n_cache_reuse > 0) { GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - GGML_ASSERT(ctx_dft == nullptr && "TODO: add support for draft context cache reuse"); + GGML_ASSERT(ctx_drft == nullptr && "TODO: add support for draft context cache reuse"); size_t head_c = n_past; // cache size_t head_p = n_past; // current prompt @@ -2448,13 +2446,13 @@ private: if (n_match >= (size_t) n_cache_reuse) { SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { - // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_main, prompt_tokens[i]).c_str()); //} const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift); + llama_memory_seq_rm (llama_get_memory(ctx_main), slot.id, head_p, head_c); + llama_memory_seq_add(llama_get_memory(ctx_main), slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.prompt.tokens.set_token(head_p + i, slot.prompt.tokens[head_c + i]); @@ -2481,7 +2479,7 @@ private: const auto pos_min_thold = std::max(0, pos_next - n_swa); if (n_past > 0 && n_past < slot.prompt.n_tokens()) { - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_main), slot.id); if (pos_min == -1) { SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237"); @@ -2510,14 +2508,14 @@ private: { const auto token = slot.prompt.tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; + const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx_main, token) : "[mtmd]"; ss0 << piece; st0 << std::setw(8) << token; } { const auto token = slot.task->tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; + const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx_main, token) : "[mtmd]"; ss1 << piece; st1 << std::setw(8) << token; } @@ -2550,7 +2548,7 @@ private: if (!do_reset) { // restore the context checkpoint const size_t ckpt_size_main = it->data_main.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, it->data_main.data(), ckpt_size_main, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + const size_t n = llama_state_seq_set_data_ext(ctx_main, it->data_main.data(), ckpt_size_main, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); if (n != ckpt_size_main) { SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) ckpt_size_main / 1024 / 1024); do_reset = true; @@ -2560,10 +2558,10 @@ private: n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) ckpt_size_main / 1024 / 1024); - if (ctx_dft) { - const size_t ckpt_size_dft = it->data_dft.size(); - const size_t n = llama_state_seq_set_data_ext(ctx_dft.get(), it->data_dft.data(), ckpt_size_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - if (n != ckpt_size_dft) { + if (ctx_drft) { + const size_t ckpt_size_drft = it->data_drft.size(); + const size_t n = llama_state_seq_set_data_ext(ctx_drft.get(), it->data_drft.data(), ckpt_size_drft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + if (n != ckpt_size_drft) { GGML_ABORT("inconsistent draft state"); } } @@ -2624,7 +2622,7 @@ private: SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { + if (!llama_memory_seq_rm(llama_get_memory(ctx_main), slot.id, p0, -1)) { SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); slot.prompt_clear(true); @@ -2657,7 +2655,7 @@ private: // - the model does not support partial sequence removal // - the model uses SWA (and we are not using `swa_full`) do_checkpoint = do_checkpoint && ( - (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) || + (ctx_main_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) || (n_swa > 0)); bool has_mtmd = false; @@ -2666,7 +2664,7 @@ private: while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { // process the image size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + int32_t res = input_tokens.process_chunk(ctx_main, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); send_error(slot, "failed to process image", ERROR_TYPE_SERVER); @@ -2775,8 +2773,8 @@ private: SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); } - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); - const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id); + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_main), slot.id); + const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_main), slot.id); // no need for empty or small checkpoints do_checkpoint = do_checkpoint && (pos_min >= 0 && slot.prompt.n_tokens() >= 64); @@ -2809,7 +2807,7 @@ private: if (slot_batched) { // apply lora, only need to do it once per batch - common_set_adapter_lora(ctx, slot_batched->lora); + common_set_adapter_lora(ctx_main, slot_batched->lora); // if the lora is temporarily disabled for an alora, re-enable it // for next time @@ -2818,7 +2816,7 @@ private: slot_batched->lora[alora_disabled_id].scale = alora_scale; } - llama_set_embeddings(ctx, slot_batched->task->need_embd()); + llama_set_embeddings(ctx_main, slot_batched->task->need_embd()); } if (batch.n_tokens == 0) { @@ -2847,7 +2845,7 @@ private: batch.logits + i, }; - const int ret = llama_decode(ctx, batch_view); + const int ret = llama_decode(ctx_main, batch_view); metrics.on_decoded(slots); @@ -2900,12 +2898,12 @@ private: continue; // continue loop of n_batch } - if (ctx_dft) { + if (ctx_drft) { SRV_WRN("%s", "processing the batch using the draft context\n"); // note: for now, to keep things simple, synchronize the target context // TODO: revisit later on - llama_synchronize(ctx); + llama_synchronize(ctx_main); // the logic here varies depending on the speculative decoding method // - some draft contexts require emebeddings from the target context, others don't @@ -2913,13 +2911,13 @@ private: // TODO: extract this in a function ? { // TODO: hook the embeddings from the last target batch here - if (llama_model_has_encoder(model_dft.get())) { - //llama_encode(ctx_dft, ...); + if (llama_model_has_encoder(model_drft.get())) { + //llama_encode(ctx_drft, ...); GGML_ABORT("not implemented yet\n"); } - const int ret = llama_decode(ctx_dft.get(), batch_view); + const int ret = llama_decode(ctx_drft.get(), batch_view); if (ret != 0) { SRV_ERR("failed to decode draft batch, ret = %d\n", ret); @@ -2934,7 +2932,7 @@ private: i_next = i + n_tokens; // on successful decode, restore the original batch size - n_batch = llama_n_batch(ctx); + n_batch = llama_n_batch(ctx_main); // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too for (auto & slot : slots) { @@ -3005,7 +3003,7 @@ private: const int tok_idx = slot.i_batch - i; - llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx, tok_idx); + llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx_main, tok_idx); slot.i_batch = -1; @@ -3026,7 +3024,7 @@ private: completion_token_output result; result.tok = id; - result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok)); + result.text_to_send = common_token_to_piece(slot.ctx_main, result.tok, accept_special_token(slot, result.tok)); result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs if (slot.task->params.sampling.n_probs > 0) { @@ -3057,7 +3055,7 @@ private: // verify and try to accept the draft { - const bool use_ckpt = ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + const bool use_ckpt = ctx_main_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; // only save the sampler sampler state if we use checkpoints common_sampler_ptr smpl_save; @@ -3066,7 +3064,7 @@ private: } GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); - auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft); + auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx_main, slot.spec_i_batch, slot.spec_draft); slot.spec_i_batch.clear(); GGML_ASSERT(accepted.size() >= 1); @@ -3087,23 +3085,23 @@ private: ckpt.pos_min, ckpt.pos_max, ckpt.size()); { - const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data_main.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + const size_t n = llama_state_seq_set_data_ext(slot.ctx_main, ckpt.data_main.data(), ckpt.data_main.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (n != ckpt.size()) { GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu", __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n); } - llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, ckpt.pos_max + 1, -1); + llama_memory_seq_rm(llama_get_memory(slot.ctx_main), slot.id, ckpt.pos_max + 1, -1); } - { - const size_t n = llama_state_seq_set_data_ext(slot.ctx_dft, ckpt.data_dft.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if (slot.ctx_drft) { + const size_t n = llama_state_seq_set_data_ext(slot.ctx_drft, ckpt.data_drft.data(), ckpt.data_drft.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (n != ckpt.size()) { GGML_ABORT("%s: failed to restore draft checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu", __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n); } - llama_memory_seq_rm(llama_get_memory(slot.ctx_dft), slot.id, ckpt.pos_max + 1, -1); + llama_memory_seq_rm(llama_get_memory(slot.ctx_drft), slot.id, ckpt.pos_max + 1, -1); } slot.prompt.tokens.keep_first(ckpt.n_tokens); @@ -3139,13 +3137,13 @@ private: slot.sampled = ids.back(); // last accepted token SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft); - llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, slot.prompt.tokens.pos_next(), -1); + llama_memory_seq_rm(llama_get_memory(slot.ctx_main), slot.id, slot.prompt.tokens.pos_next(), -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; result.tok = ids[i]; - result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok)); + result.text_to_send = common_token_to_piece(slot.ctx_main, result.tok, accept_special_token(slot, result.tok)); result.prob = 1.0f; // set later // TODO: set result.probs @@ -3197,7 +3195,7 @@ void server_context::terminate() { } llama_context * server_context::get_llama_context() const { - return impl->ctx; + return impl->ctx_main; } server_response_reader server_context::get_response_reader() { @@ -3207,8 +3205,8 @@ server_response_reader server_context::get_response_reader() { server_context_meta server_context::get_meta() const { auto bos_id = llama_vocab_bos(impl->vocab); auto eos_id = llama_vocab_eos(impl->vocab); - auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : ""; - auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : ""; + auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_main, bos_id, true) : ""; + auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_main, eos_id, true) : ""; return server_context_meta { /* build_info */ std::string(llama_build_info()), @@ -3221,7 +3219,7 @@ server_context_meta server_context::get_meta() const { /* has_inp_audio */ impl->chat_params.allow_audio, /* json_webui_settings */ impl->json_webui_settings, /* slot_n_ctx */ impl->get_slot_n_ctx(), - /* pooling_type */ llama_pooling_type(impl->ctx), + /* pooling_type */ llama_pooling_type(impl->ctx_main), /* chat_params */ impl->chat_params, /* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()), @@ -3239,10 +3237,10 @@ server_context_meta server_context::get_meta() const { /* model_vocab_type */ llama_vocab_type(impl->vocab), /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), - /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model), - /* model_n_embd_inp */ llama_model_n_embd(impl->model), - /* model_n_params */ llama_model_n_params(impl->model), - /* model_size */ llama_model_size(impl->model), + /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model_main), + /* model_n_embd_inp */ llama_model_n_embd(impl->model_main), + /* model_n_params */ llama_model_n_params(impl->model_main), + /* model_size */ llama_model_size(impl->model_main), }; } @@ -4144,7 +4142,7 @@ void server_routes::init_routes() { std::vector tasks; tasks.reserve(documents.size()); for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + auto tmp = format_prompt_rerank(ctx_server.model_main, ctx_server.vocab, ctx_server.mctx, query, documents[i]); server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = rd.get_new_id(); task.tokens = std::move(tmp); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 7d40b48c64..b002eab98f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1981,7 +1981,7 @@ size_t server_prompt_cache::n_tokens() const { return res; } -server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size_main, size_t state_size_dft) { +server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size_main, size_t state_size_drft) { // first check if the current state is contained fully in the cache for (auto it = states.begin(); it != states.end(); ++it) { const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); @@ -2006,12 +2006,12 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t } std::vector state_data_main; - std::vector state_data_dft; + std::vector state_data_drft; // check if we can allocate enough memory for the new state try { state_data_main.resize(state_size_main); - state_data_dft.resize(state_size_dft); + state_data_drft.resize(state_size_drft); } catch (const std::bad_alloc & e) { SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what()); @@ -2028,7 +2028,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t /*.tokens =*/ prompt.tokens.clone(), /*.data =*/ { /*.main =*/ std::move(state_data_main), - /*.dft =*/ std::move(state_data_dft), + /*.drft =*/ std::move(state_data_drft), }, /*.checkpoints =*/ prompt.checkpoints, }); @@ -2036,7 +2036,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t return &states.back(); } -bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, llama_context * ctx_dft, int32_t id_slot) { +bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx_main, llama_context * ctx_drft, int32_t id_slot) { const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins @@ -2073,7 +2073,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok auto & data = it_best->data.main; const size_t size = data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, data.data(), size, id_slot, 0); + const size_t n = llama_state_seq_set_data_ext(ctx_main, data.data(), size, id_slot, 0); if (n != size) { SRV_WRN("failed to restore state with size %zu\n", size); @@ -2085,13 +2085,13 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok } { - auto & data = it_best->data.dft; + auto & data = it_best->data.drft; if (!data.empty()) { - GGML_ASSERT(ctx_dft); + GGML_ASSERT(ctx_drft); const size_t size = data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx_dft, data.data(), size, id_slot, 0); + const size_t n = llama_state_seq_set_data_ext(ctx_drft, data.data(), size, id_slot, 0); if (n != size) { SRV_WRN("failed to restore state with size %zu\n", size); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 8618fb3887..3611155ec5 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -572,10 +572,10 @@ struct server_prompt_checkpoint { int64_t n_tokens; std::vector data_main; - std::vector data_dft; + std::vector data_drft; size_t size() const { - return data_main.size() + data_dft.size(); + return data_main.size() + data_drft.size(); } bool empty() const { @@ -587,16 +587,16 @@ struct server_prompt_checkpoint { pos_max = 0; n_tokens = 0; data_main.clear(); - data_dft.clear(); + data_drft.clear(); } }; struct server_prompt_data { std::vector main; - std::vector dft; + std::vector drft; size_t size() const { - return main.size() + dft.size(); + return main.size() + drft.size(); } }; @@ -650,9 +650,9 @@ struct server_prompt_cache { size_t n_tokens() const; - server_prompt * alloc(const server_prompt & prompt, size_t state_size_main, size_t state_size_dft); + server_prompt * alloc(const server_prompt & prompt, size_t state_size_main, size_t state_size_drft); - bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, llama_context * ctx_dft, int32_t id_slot); + bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx_main, llama_context * ctx_drft, int32_t id_slot); void update(); };