Compare commits

...

11 Commits
b8838 ... b8849

Author SHA1 Message Date
Aldehir Rojas
d5b780a676 common/autoparser : allow space after tool call (#22073) 2026-04-19 13:28:35 +02:00
uvos
471540ae8a HIP: Remove unnecessary NCCL_CHECK (#21914) 2026-04-19 12:59:44 +02:00
Xuan-Son Nguyen
19124078be mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos (breaking change) (#22082)
* mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos

* fix build
2026-04-19 11:57:21 +02:00
Gaurav Garg
bcdcc1044f ggml : reduce CPU overhead in meta backend (#22041)
* cache subgraph splits when cgraph is unchanged

Skip per-call subgraph construction in ggml_backend_meta_graph_compute when the same ggml_cgraph is used consecutively.

Assign uid to every sub-graph so that CUDA's fast uid check path hits too.

* Address review comments

* Keep the scope as is

* Rename last_uid and last_n_subgraphs field. Remove last_max_tmp_size field. Refactor code.

* Address review comments

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-04-19 12:48:35 +03:00
Sigbjørn Skjæret
037bfe38d0 ci : install spirv-headers for vulkan-cross (#22109) 2026-04-19 10:32:08 +03:00
Dowon
8685e7b075 convert : support sentence-transformer 5.4 config files (#22087)
* convert : support sentence-transformer 5.4 config files

* fix: embeddinggemma

* fix: mapping

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix: pooling_mode

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-19 10:25:39 +03:00
texasich
09b4efa95f cmake: remove CMP0194 policy to restore MSVC builds (#21934)
#21630 added the CMP0194 NEW policy to silence a CMake warning, but on Windows runners it caused CMake to prefer the MinGW toolchain for ASM and broke MSVC builds.

Reverting only that policy block restores the previous working behavior. The CMake 4.1+ warning comes back, but that is cosmetic and does not break any platform.

Reported-by: oobabooga

Refs: #21630

Co-authored-by: texasich <texasich@users.noreply.github.com>
2026-04-19 10:25:05 +03:00
Sascha Rogmann
455d8e4be8 server : speculative checkpointing (#19493)
* server : speculative decoding using checkpoints

* server : fix draft check with checkpoints

* server : rename spec vars

* server : log levels

* server : refactored spec logic to speculative.cpp

* server : renamed spec checkpoints option

* server : fix spec checkpoints, logging

* speculative : checkpoints with draft model, logging

* server : n_tokens_cur and create_checkpoint in draft

* server : fix server_speculative_callback (slot.id)

* spec : fix ngram-map/begin idx_last_check

* spec : init ckpt (begin() wasn't called)

* chore: update webui build output

* server : restore sampler in spec checkpoint and clear mem

* cont : avoid --spec-use-checkpoints argument

* cont : remove server_prompt_checkpoint_with_size

* spec : rename (leave_draft_state)

* cont : clean-up

* cont : do not ignore partial drafts even if they are short

* cont : spec callback owned by session

* cont : simplify

* cont : avoid empty speculative session

* cont : simplify

* cont : simplify

* cont : enable mtmd speculative decoding

* cont : keep the spec sampler alive

* cont : simplify

* cont : fix nullptr deref + draft checkpoints

* cont : remove common_speculative_accept_response

* cont : remove callback

* cont : simplify

* cont : minor

* cont : simplify

* cont : fix accepted number

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-19 10:24:06 +03:00
Radoslav Gerganov
91fef95362 rpc : refactor the RPC transport (#21998)
* rpc : refactor the RPC transport

Move all transport related code into a separate file and use the
socket_t interface to hide all transport implementation details.

* fix win32

* better socket_t construction
2026-04-19 10:21:53 +03:00
Cetarthoriphros
9e5647affa server: Expose media_tag on /props endpoint. (#22028) 2026-04-19 00:27:17 +02:00
Sigbjørn Skjæret
4f02d47339 model : refactor bias tensor variable names (#22079)
* refactor bias tensor variable names

* use create_tensor_qkv for jina-bert-v2
2026-04-18 20:12:00 +02:00
82 changed files with 1514 additions and 1224 deletions

View File

@@ -246,6 +246,7 @@ jobs:
apt-get install -y --no-install-recommends \
build-essential \
glslc \
spirv-headers \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64

View File

@@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
if (!format.per_call_start.empty()) {
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
}
if (!format.section_start.empty()) {
tool_calls = p.trigger_rule("tool-calls",
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
}
} else {
std::string separator = ", "; // Default

View File

@@ -2334,7 +2334,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
? input
: params.generation_prompt + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {

View File

@@ -11,7 +11,6 @@
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>
@@ -303,7 +302,7 @@ struct common_params_speculative {
// general-purpose speculative decoding parameters
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
@@ -312,6 +311,7 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
std::shared_ptr<common_ngram_mod> ngram_mod;

View File

@@ -208,7 +208,7 @@ void common_ngram_map_begin(
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.idx_last_check = size_begin;
map.size_last_begin = size_begin;
}
@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
}
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_created = true;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = 0; // value 0 is used for simple mode
return;
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
// update the value statistics
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}

View File

@@ -13,6 +13,7 @@
#include <cstring>
#include <iomanip>
#include <map>
#include <cinttypes>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -144,10 +145,28 @@ struct common_speculative_state {
virtual void accept(uint16_t n_accepted) = 0;
};
struct common_speculative_checkpoint {
llama_pos pos_min = 0;
llama_pos pos_max = 0;
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
size_t ckpt_size = 0;
};
struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;
struct common_speculative_checkpoint ckpt;
bool use_checkpoint;
common_sampler * smpl;
llama_batch batch;
@@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
enum common_speculative_type type,
llama_context * ctx_tgt,
llama_context * ctx_dft,
const std::vector<std::pair<std::string, std::string>> & replacements)
const std::vector<std::pair<std::string, std::string>> & replacements,
bool use_checkpoint)
: common_speculative_state(type)
, ctx_tgt(ctx_tgt)
, ctx_dft(ctx_dft)
, use_checkpoint(use_checkpoint)
{
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
smpl = nullptr;
@@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
if (use_checkpoint && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
}
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
return n;
}
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
return n;
}
void draft(
@@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
auto * mem_dft = llama_get_memory(ctx_dft);
int reuse_i = 0;
int reuse_n = 0;
int reuse_i = 0; // index of part to be reused in prompt_dft
int reuse_n = 0; // length of part to be reused in prompt_dft
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
@@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_checkpoint && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
reuse_n = 0;
}
result.clear();
result.reserve(params.n_max);
if (reuse_n == 0) {
bool needs_ckpt = use_checkpoint && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_checkpoint && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
@@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}
bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (use_checkpoint) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}
if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
@@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}
}
const llama_pos n_past = prompt_dft.size();
@@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, ret, prompt_cur.size(), prompt_dft.size());
}
common_sampler_reset(smpl);
@@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);
ret = llama_decode(ctx_dft, batch);
if (ret != 0) {
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
}
prompt_dft.push_back(id);
}
@@ -739,6 +852,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
struct common_speculative {
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
};
@@ -798,13 +912,13 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
return COMMON_SPECULATIVE_COMPAT_TYPE_NO;
}
bool res = true;
common_speculative_compat_type res = COMMON_SPECULATIVE_COMPAT_TYPE_FULL;
llama_memory_clear(mem, true);
@@ -816,14 +930,14 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
res = COMMON_SPECULATIVE_COMPAT_TYPE_NO;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
res = COMMON_SPECULATIVE_COMPAT_TYPE_CKPT;
goto done;
}
@@ -909,9 +1023,10 @@ common_speculative * common_speculative_init(
break;
case COMMON_SPECULATIVE_TYPE_DRAFT: {
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements,
/* .use_checkpoint= */ params.use_checkpoints // TODO: this should be based on the draft model!
));
break;
}
@@ -966,7 +1081,8 @@ common_speculative * common_speculative_init(
}
auto * result = new common_speculative {
/* .impls = */ std::move(impls)
/* .impls = */ std::move(impls),
/* .curr_impl = */ nullptr,
};
return result;
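
Condensed, the checkpoint round trip implemented by draft_create_checkpoint() and draft_restore_checkpoint() above follows the pattern below. This is a sketch only, assuming a single sequence/slot id 0 and omitting the pos_min/pos_max/n_tokens bookkeeping:

// save: serialize the draft context's (partial) sequence state
const size_t sz = llama_state_seq_get_size_ext(ctx_dft, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
std::vector<uint8_t> data(sz);
llama_state_seq_get_data_ext(ctx_dft, data.data(), sz, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), 0);

// restore: load the snapshot back and drop anything decoded after it
llama_state_seq_set_data_ext(ctx_dft, data.data(), data.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
llama_memory_seq_rm(llama_get_memory(ctx_dft), 0, pos_max + 1, -1);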

View File

@@ -14,9 +14,15 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
enum common_speculative_compat_type {
COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0,
COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1,
COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2,
};
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
@@ -39,3 +45,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
struct common_speculative_deleter {
void operator()(common_speculative * s) { common_speculative_free(s); }
};
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
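
A minimal sketch of how the new compat enum and the RAII wrapper fit together. The remaining common_speculative_init() arguments are elided in the hunk above, so that call is only indicative, and deriving use_checkpoints from the CKPT result is an assumption for illustration rather than what the server necessarily does:

void try_init_spec(llama_context * ctx_tgt, common_params_speculative & params) {
    // note: common_speculative_is_compat() clears the memory of ctx_tgt
    const common_speculative_compat_type compat = common_speculative_is_compat(ctx_tgt);
    if (compat == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
        return; // speculative decoding not possible for this context
    }
    if (compat == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
        // partial sequence removal unsupported -> rewind via checkpoints instead
        params.use_checkpoints = true;
    }
    common_speculative_ptr spec(common_speculative_init(params /*, ... */));
    // common_speculative_free() runs automatically via common_speculative_deleter
}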

View File

@@ -1850,20 +1850,28 @@ class TextModel(ModelBase):
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
if mod["type"].endswith("Pooling"):
pooling_path = mod["path"]
break
mode_mapping = {
"mean": gguf.PoolingType.MEAN,
"cls": gguf.PoolingType.CLS,
"lasttoken": gguf.PoolingType.LAST,
}
# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
if pooling.get("pooling_mode_mean_tokens"):
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
elif pooling.get("pooling_mode_cls_token"):
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
elif pooling.get("pooling_mode_lasttoken"):
pooling_type = gguf.PoolingType.LAST
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
pooling_type = mode_mapping[pooling_mode]
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
@@ -7180,7 +7188,7 @@ class EmbeddingGemma(Gemma3Model):
with open(modules_file, encoding="utf-8") as modules_json_file:
mods = json.load(modules_json_file)
for mod in mods:
if mod["type"] == "sentence_transformers.models.Dense":
if mod["type"].endswith("Dense"):
mod_path = mod["path"]
# check if model.safetensors file for Dense layer exists
model_tensors_file = self.dir_model / mod_path / "model.safetensors"

View File

@@ -1,11 +1,5 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
# MSVC is not a valid assembler for the ASM language.
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
if (POLICY CMP0194)
cmake_policy(SET CMP0194 NEW)
endif()
project("ggml" C CXX ASM)
### GGML Version

View File

@@ -1456,6 +1456,8 @@ struct ggml_backend_meta_context {
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
size_t n_subgraphs = 0;
uint64_t uid = 0;
void * comm_ctx = nullptr;
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
@@ -1616,6 +1618,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
const size_t n_backends = ggml_backend_meta_n_backends(backend);
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
bool max_nnodes_raised = false;
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
for (size_t j = 0; j < n_backends; j++) {
@@ -1625,173 +1630,181 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
backend_ctx->max_nnodes = cgraph->n_nodes;
max_nnodes_raised = true;
assert(needs_rebuild);
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
if (needs_rebuild) {
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
}
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
};
};
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
i = get_i_delayed(i);
i = get_i_delayed(i);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
}
n_subgraphs++;
i_start = i + 1;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
backend_ctx->uid = cgraph->uid;
backend_ctx->n_subgraphs = n_subgraphs;
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
n_subgraphs++;
i_start = i + 1;
backend_ctx->max_tmp_size = max_tmp_size;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
}
cgraph_ij->uid = ggml_graph_next_uid();
}
}
}
@@ -1898,7 +1911,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
};
for (size_t i = 0; i < n_subgraphs; i++) {
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
@@ -1907,7 +1920,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
}
if (n_backends > 1 && i < n_subgraphs - 1) {
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
bool backend_allreduce_success = false;
if (backend_ctx->comm_ctx) {
std::vector<ggml_tensor *> nodes;
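
The caching introduced above amounts to a per-call uid check; a minimal sketch of the pattern, using the names from the hunks (split construction and the compute loop elided):

// uid == 0 means the graph has no uid assigned, so it is always rebuilt
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
if (needs_rebuild) {
    // ... split cgraph into per-backend subgraphs as shown above ...
    backend_ctx->uid         = cgraph->uid;
    backend_ctx->n_subgraphs = n_subgraphs;
}
// the compute loop then iterates over backend_ctx->n_subgraphs cached subgraphs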

View File

@@ -33,7 +33,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)

View File

@@ -2,6 +2,7 @@ message(STATUS "Using RPC backend")
ggml_add_backend_library(ggml-rpc
ggml-rpc.cpp
transport.cpp
)
if (WIN32)

File diff suppressed because it is too large

View File

@@ -0,0 +1,683 @@
#include "transport.h"
#include "ggml-impl.h"
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <winsock2.h>
#else
# include <arpa/inet.h>
# include <sys/socket.h>
# include <sys/types.h>
# include <netinet/in.h>
# include <netinet/tcp.h>
# include <netdb.h>
# include <unistd.h>
#endif
#include <cstdlib>
#include <mutex>
#include <optional>
#ifdef GGML_RPC_RDMA
# include <infiniband/verbs.h>
# include <time.h>
# ifndef _WIN32
# include <poll.h>
# endif
#endif // GGML_RPC_RDMA
#ifdef _WIN32
typedef SOCKET sockfd_t;
using ssize_t = __int64;
#else
typedef int sockfd_t;
#endif
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
#define LOG_DBG(...) \
do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
#ifdef GGML_RPC_RDMA
static constexpr size_t RDMA_CHUNK = 256 * 1024; // 256 KiB per send/recv (fits default 8 MiB memlock)
static constexpr int RDMA_RX_DEPTH = 24; // pre-posted recv ring: 24 × 256 KiB = 6 MiB
static constexpr size_t RDMA_GID_SIZE = 16; // RoCE GID / IB GID is always 16 bytes
using rdma_gid_t = std::array<uint8_t, RDMA_GID_SIZE>;
struct rdma_conn {
struct ibv_context * ctx = nullptr;
struct ibv_pd * pd = nullptr;
struct ibv_cq * scq = nullptr; // send completions
struct ibv_cq * rcq = nullptr; // recv completions
struct ibv_qp * qp = nullptr;
void * tx_buf = nullptr;
struct ibv_mr * tx_mr = nullptr;
void * rx_buf = nullptr; // RDMA_RX_DEPTH × RDMA_CHUNK contiguous
struct ibv_mr * rx_mr = nullptr;
int rx_head = 0;
uint32_t max_inline = 0;
uint8_t * rx_slot(int i) const {
return static_cast<uint8_t *>(rx_buf) + static_cast<size_t>(i) * RDMA_CHUNK;
}
bool post_rx(int i) {
struct ibv_sge sge = {};
sge.addr = (uintptr_t)rx_slot(i);
sge.length = RDMA_CHUNK;
sge.lkey = rx_mr->lkey;
struct ibv_recv_wr wr = {}, * bad = nullptr;
wr.wr_id = (uint64_t)i;
wr.sg_list = &sge;
wr.num_sge = 1;
return ibv_post_recv(qp, &wr, &bad) == 0;
}
~rdma_conn() {
if (tx_mr) ibv_dereg_mr(tx_mr);
if (rx_mr) ibv_dereg_mr(rx_mr);
free(tx_buf);
free(rx_buf);
if (qp) ibv_destroy_qp(qp);
if (scq) ibv_destroy_cq(scq);
if (rcq) ibv_destroy_cq(rcq);
if (pd) ibv_dealloc_pd(pd);
if (ctx) ibv_close_device(ctx);
}
};
// Local RDMA parameters captured during the probe phase and later consumed
// by rdma_activate() after the remote side's caps arrive via HELLO.
struct rdma_local_info {
uint32_t qpn = 0;
uint32_t psn = 0;
uint8_t gid[RDMA_GID_SIZE] = {};
uint8_t ib_port = 0;
int gid_idx = 0;
enum ibv_mtu path_mtu = IBV_MTU_1024;
};
struct rdma_caps {
uint32_t qpn;
uint32_t psn;
uint8_t gid[RDMA_GID_SIZE];
};
static_assert(sizeof(rdma_caps) == RPC_CONN_CAPS_SIZE, "rdma_caps must match conn_caps size");
#endif // GGML_RPC_RDMA
struct socket_t::impl {
impl(sockfd_t fd) : use_rdma(false), fd(fd) {}
~impl();
bool send_data(const void * data, size_t size);
bool recv_data(void * data, size_t size);
void get_caps(uint8_t * local_caps);
void update_caps(const uint8_t * remote_caps);
#ifdef GGML_RPC_RDMA
bool tcp_peer_closed();
std::optional<rdma_gid_t> rdma_build_target_gid();
bool rdma_probe();
bool rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid);
bool rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc);
bool rdma_send(const void * data, size_t size);
bool rdma_recv(void * data, size_t size);
std::unique_ptr<rdma_conn> rdma;
rdma_local_info rdma_local = {};
#endif // GGML_RPC_RDMA
bool use_rdma;
sockfd_t fd;
};
socket_t::impl::~impl() {
#ifdef GGML_RPC_RDMA
rdma.reset();
#endif // GGML_RPC_RDMA
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
#ifdef _WIN32
if (fd != INVALID_SOCKET) closesocket(this->fd);
#else
if (fd >= 0) close(this->fd);
#endif
}
#ifdef GGML_RPC_RDMA
bool socket_t::impl::tcp_peer_closed() {
if (fd < 0) return false;
#ifndef _WIN32
struct pollfd pfd = { fd, POLLIN | POLLRDHUP, 0 };
int r = poll(&pfd, 1, 0);
return r > 0 && (pfd.revents & (POLLHUP | POLLERR | POLLRDHUP));
#else
return false;
#endif
}
// Build a RoCE GID-shaped 16-byte target from a TCP socket's local address.
// Used to match the socket's local IP against the kernel's GID table so that
// a single memcmp handles IPv4, IPv4-mapped IPv6, and native IPv6 uniformly:
// AF_INET -> ::ffff:a.b.c.d (bytes 10-11 = 0xff, last 4 = IPv4)
// AF_INET6 (IPv4-mapped) -> ::ffff:a.b.c.d (already in GID shape)
// AF_INET6 (native v6) -> the 16-byte IPv6 address as-is
// Returns std::nullopt on unsupported family or getsockname failure.
std::optional<rdma_gid_t> socket_t::impl::rdma_build_target_gid() {
sockaddr_storage addr = {};
socklen_t addr_len = sizeof(addr);
if (getsockname(fd, reinterpret_cast<sockaddr *>(&addr), &addr_len) != 0) {
return std::nullopt;
}
rdma_gid_t target = {};
if (addr.ss_family == AF_INET) {
const auto * a = reinterpret_cast<const sockaddr_in *>(&addr);
target[10] = 0xff;
target[11] = 0xff;
memcpy(&target[12], &a->sin_addr, 4);
return target;
}
if (addr.ss_family == AF_INET6) {
const auto * a = reinterpret_cast<const sockaddr_in6 *>(&addr);
memcpy(target.data(), &a->sin6_addr, RDMA_GID_SIZE);
return target;
}
return std::nullopt;
}
bool socket_t::impl::rdma_probe() {
const char * dev_env = std::getenv("GGML_RDMA_DEV");
const char * gid_env = std::getenv("GGML_RDMA_GID");
auto target_gid = rdma_build_target_gid();
if (!target_gid) {
return false;
}
const uint8_t ib_port = 1;
int num_devs = 0;
ibv_device ** devs = ibv_get_device_list(&num_devs);
if (!devs || num_devs == 0) return false;
ibv_context * ibctx = nullptr;
const char * matched_dev = nullptr;
int gid_idx = gid_env ? atoi(gid_env) : -1;
int gid_version = IBV_GID_TYPE_IB; // 0 = unknown/IB
for (int d = 0; d < num_devs; d++) {
const char * dn = ibv_get_device_name(devs[d]);
if (dev_env && strcmp(dev_env, dn) != 0) continue;
ibv_context * ctx = ibv_open_device(devs[d]);
if (!ctx) continue;
ibv_port_attr pa;
if (ibv_query_port(ctx, ib_port, &pa) != 0) { ibv_close_device(ctx); continue; }
int found_gid = gid_idx;
int found_version = IBV_GID_TYPE_IB;
if (found_gid < 0) {
// Find a GID on this port whose bytes equal the local TCP address
// (IPv4 or IPv6). Prefer RoCE v2 (UDP/IP, L3-routable) over v1
// (raw Ethernet, same-L2 only) so silent hangs on L3-routed paths
// are avoided. ibv_query_gid_ex returns gid+type in one call.
int v2_idx = -1;
int v1_idx = -1;
for (int i = 0; i < pa.gid_tbl_len; i++) {
ibv_gid_entry entry = {};
if (ibv_query_gid_ex(ctx, ib_port, i, &entry, 0) != 0) continue;
if (memcmp(entry.gid.raw, target_gid->data(), RDMA_GID_SIZE) != 0) continue;
if (entry.gid_type == IBV_GID_TYPE_ROCE_V2 && v2_idx < 0) {
v2_idx = i;
} else if (entry.gid_type == IBV_GID_TYPE_ROCE_V1 && v1_idx < 0) {
v1_idx = i;
}
}
if (v2_idx >= 0) {
found_gid = v2_idx;
found_version = IBV_GID_TYPE_ROCE_V2;
} else if (v1_idx >= 0) {
found_gid = v1_idx;
found_version = IBV_GID_TYPE_ROCE_V1;
}
} else {
// Explicit GID index from GGML_RDMA_GID — fetch its type for logging.
ibv_gid_entry entry = {};
if (ibv_query_gid_ex(ctx, ib_port, found_gid, &entry, 0) == 0) {
found_version = entry.gid_type;
}
}
if (found_gid >= 0) {
ibctx = ctx;
gid_idx = found_gid;
gid_version = found_version;
matched_dev = dn;
rdma_local.path_mtu = pa.active_mtu;
break;
}
ibv_close_device(ctx);
}
ibv_free_device_list(devs);
if (!ibctx) return false;
rdma_local.ib_port = ib_port;
rdma_local.gid_idx = gid_idx;
rdma = std::make_unique<rdma_conn>();
rdma->ctx = ibctx;
rdma->pd = ibv_alloc_pd(ibctx);
if (!rdma->pd) return false;
rdma->scq = ibv_create_cq(ibctx, 16, nullptr, nullptr, 0);
rdma->rcq = ibv_create_cq(ibctx, RDMA_RX_DEPTH + 4, nullptr, nullptr, 0);
if (!rdma->scq || !rdma->rcq) return false;
ibv_qp_init_attr qia = {};
qia.send_cq = rdma->scq;
qia.recv_cq = rdma->rcq;
qia.qp_type = IBV_QPT_RC;
qia.cap.max_send_wr = 4;
qia.cap.max_recv_wr = RDMA_RX_DEPTH + 4;
qia.cap.max_send_sge = 1;
qia.cap.max_recv_sge = 1;
qia.cap.max_inline_data = 256;
rdma->qp = ibv_create_qp(rdma->pd, &qia);
if (!rdma->qp) return false;
rdma->max_inline = qia.cap.max_inline_data;
rdma->tx_buf = aligned_alloc(4096, RDMA_CHUNK);
rdma->rx_buf = aligned_alloc(4096, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK);
if (!rdma->tx_buf || !rdma->rx_buf) return false;
rdma->tx_mr = ibv_reg_mr(rdma->pd, rdma->tx_buf, RDMA_CHUNK, IBV_ACCESS_LOCAL_WRITE);
rdma->rx_mr = ibv_reg_mr(rdma->pd, rdma->rx_buf, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
if (!rdma->tx_mr || !rdma->rx_mr) return false;
ibv_gid local_gid;
if (ibv_query_gid(ibctx, ib_port, gid_idx, &local_gid) != 0) return false;
rdma_local.qpn = rdma->qp->qp_num;
rdma_local.psn = rdma->qp->qp_num & 0xffffff;
memcpy(&rdma_local.gid, &local_gid, RDMA_GID_SIZE);
const char * ver_str = "";
if (gid_version == IBV_GID_TYPE_ROCE_V2) {
ver_str = " RoCEv2";
} else if (gid_version == IBV_GID_TYPE_ROCE_V1) {
ver_str = " RoCEv1";
}
GGML_LOG_INFO("RDMA probed: dev=%s gid=%d%s qpn=%u inline=%u\n",
matched_dev, gid_idx, ver_str, rdma_local.qpn, rdma->max_inline);
return true;
}
// Phase 2: Given remote QPN/PSN/GID, transition QP: RESET->INIT->pre-post->RTR->RTS.
// On success, the connection is live and ready for rdma_send/rdma_recv.
bool socket_t::impl::rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid) {
// RESET -> INIT
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_INIT;
a.port_num = rdma_local.ib_port;
a.pkey_index = 0;
a.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
return false;
}
}
for (int i = 0; i < RDMA_RX_DEPTH; i++) {
if (!rdma->post_rx(i)) return false;
}
// INIT -> RTR
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_RTR;
a.path_mtu = rdma_local.path_mtu;
a.dest_qp_num = remote_qpn;
a.rq_psn = remote_psn;
a.max_dest_rd_atomic = 1;
a.min_rnr_timer = 1;
a.ah_attr.is_global = 1;
memcpy(&a.ah_attr.grh.dgid, remote_gid, RDMA_GID_SIZE);
a.ah_attr.grh.hop_limit = 1;
a.ah_attr.grh.sgid_index = rdma_local.gid_idx;
a.ah_attr.dlid = 0;
a.ah_attr.port_num = rdma_local.ib_port;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER) != 0) {
return false;
}
}
// RTR -> RTS
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_RTS;
a.timeout = 14;
a.retry_cnt = 7;
a.rnr_retry = 7;
a.sq_psn = rdma_local.psn;
a.max_rd_atomic = 1;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC) != 0) {
return false;
}
}
GGML_LOG_INFO("RDMA activated: qpn=%u->%u mtu=%d rx_depth=%d\n",
rdma_local.qpn, remote_qpn, 128 << rdma_local.path_mtu, RDMA_RX_DEPTH);
return true;
}
bool socket_t::impl::rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc) {
for (uint64_t s = 0; ; s++) {
int n = ibv_poll_cq(cq, 1, wc);
if (n > 0) {
if (wc->status != IBV_WC_SUCCESS) {
GGML_LOG_ERROR("RDMA CQ wc error: status=%d (%s) vendor_err=0x%x\n",
wc->status, ibv_wc_status_str(wc->status), wc->vendor_err);
}
return wc->status == IBV_WC_SUCCESS;
}
if (n < 0) return false;
if ((s & 0xFFFFF) == 0 && s > 0) {
if (tcp_peer_closed()) {
return false;
}
}
}
}
bool socket_t::impl::rdma_send(const void * data, size_t size) {
rdma_conn * c = rdma.get();
const uint8_t * src = (const uint8_t *)data;
size_t rem = size;
while (rem > 0) {
size_t chunk = std::min(rem, RDMA_CHUNK);
struct ibv_sge sge = {};
struct ibv_send_wr wr = {}, * bad = nullptr;
wr.opcode = IBV_WR_SEND;
wr.sg_list = &sge;
wr.num_sge = 1;
if (chunk <= c->max_inline) {
sge.addr = (uintptr_t)src;
sge.length = chunk;
wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
} else {
memcpy(c->tx_buf, src, chunk);
sge.addr = (uintptr_t)c->tx_buf;
sge.length = chunk;
sge.lkey = c->tx_mr->lkey;
wr.send_flags = IBV_SEND_SIGNALED;
}
if (ibv_post_send(c->qp, &wr, &bad) != 0) return false;
struct ibv_wc wc;
if (!rdma_poll(c->scq, &wc)) return false;
src += chunk;
rem -= chunk;
}
return true;
}
bool socket_t::impl::rdma_recv(void * data, size_t size) {
rdma_conn * c = rdma.get();
uint8_t * dst = (uint8_t *)data;
size_t rem = size;
while (rem > 0) {
struct ibv_wc wc;
if (!rdma_poll(c->rcq, &wc)) return false;
int slot = (int)wc.wr_id;
size_t got = wc.byte_len;
memcpy(dst, c->rx_slot(slot), got);
if (!c->post_rx(slot)) return false;
dst += got;
rem -= got;
}
return true;
}
#endif // GGML_RPC_RDMA
bool socket_t::impl::send_data(const void * data, size_t size) {
#ifdef GGML_RPC_RDMA
if (use_rdma) {
return rdma_send(data, size);
}
#endif
size_t bytes_sent = 0;
while (bytes_sent < size) {
size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
ssize_t n = send(fd, (const char *)data + bytes_sent, size_to_send, 0);
if (n < 0) {
GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
bytes_sent, size_to_send);
return false;
}
bytes_sent += (size_t)n;
}
return true;
}
bool socket_t::impl::recv_data(void * data, size_t size) {
#ifdef GGML_RPC_RDMA
if (use_rdma) {
return rdma_recv(data, size);
}
#endif
size_t bytes_recv = 0;
while (bytes_recv < size) {
size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
ssize_t n = recv(fd, (char *)data + bytes_recv, size_to_recv, 0);
if (n < 0) {
GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
bytes_recv, size_to_recv);
return false;
}
if (n == 0) {
LOG_DBG("recv returned 0 (peer closed?)\n");
return false;
}
bytes_recv += (size_t)n;
}
return true;
}
void socket_t::impl::get_caps(uint8_t * local_caps) {
memset(local_caps, 0, RPC_CONN_CAPS_SIZE);
#ifdef GGML_RPC_RDMA
rdma_local = {};
if (rdma_probe()) {
rdma_caps rc = {};
rc.qpn = rdma_local.qpn;
rc.psn = rdma_local.psn;
memcpy(rc.gid, rdma_local.gid, RDMA_GID_SIZE);
memcpy(local_caps, &rc, sizeof(rc));
} else {
rdma.reset();
}
#endif // GGML_RPC_RDMA
}
void socket_t::impl::update_caps(const uint8_t * remote_caps) {
#ifdef GGML_RPC_RDMA
if (!rdma) {
return;
}
rdma_caps rc = {};
memcpy(&rc, remote_caps, sizeof(rc));
if (rc.qpn == 0) {
rdma.reset();
return;
}
if (rdma_activate(rc.qpn, rc.psn, rc.gid)) {
use_rdma = true;
} else {
GGML_LOG_ERROR("RDMA activate failed, staying on TCP\n");
rdma.reset();
}
#else
(void)remote_caps;
#endif // GGML_RPC_RDMA
}
/////////////////////////////////////////////////////////////////////////////
socket_t::socket_t(std::unique_ptr<impl> p) : pimpl(std::move(p)) {}
socket_t::~socket_t() = default;
bool socket_t::send_data(const void * data, size_t size) {
return pimpl->send_data(data, size);
}
bool socket_t::recv_data(void * data, size_t size) {
return pimpl->recv_data(data, size);
}
void socket_t::get_caps(uint8_t * local_caps) {
return pimpl->get_caps(local_caps);
}
void socket_t::update_caps(const uint8_t * remote_caps) {
return pimpl->update_caps(remote_caps);
}
static bool is_valid_fd(sockfd_t sockfd) {
#ifdef _WIN32
return sockfd != INVALID_SOCKET;
#else
return sockfd >= 0;
#endif
}
static bool set_no_delay(sockfd_t sockfd) {
int flag = 1;
// set TCP_NODELAY to disable Nagle's algorithm
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
return ret == 0;
}
static bool set_reuse_addr(sockfd_t sockfd) {
int flag = 1;
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
return ret == 0;
}
socket_ptr socket_t::accept() {
auto client_socket_fd = ::accept(pimpl->fd, NULL, NULL);
if (!is_valid_fd(client_socket_fd)) {
return nullptr;
}
if (!set_no_delay(client_socket_fd)) {
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(client_socket_fd)));
}
socket_ptr socket_t::create_server(const char * host, int port) {
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (!is_valid_fd(sockfd)) {
return nullptr;
}
if (!set_reuse_addr(sockfd)) {
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
return nullptr;
}
if (inet_addr(host) == INADDR_NONE) {
GGML_LOG_ERROR("Invalid host address: %s\n", host);
return nullptr;
}
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
serv_addr.sin_port = htons(port);
if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
return nullptr;
}
if (listen(sockfd, 1) < 0) {
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
}
socket_ptr socket_t::connect(const char * host, int port) {
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (!is_valid_fd(sockfd)) {
return nullptr;
}
if (!set_no_delay(sockfd)) {
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
return nullptr;
}
struct sockaddr_in addr;
addr.sin_family = AF_INET;
addr.sin_port = htons(port);
struct hostent * server = gethostbyname(host);
if (server == NULL) {
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
return nullptr;
}
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
if (::connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
}
#ifdef _WIN32
static std::mutex g_rpc_transport_mu;
static bool g_rpc_transport_wsa_started = false;
#endif
bool rpc_transport_init() {
#ifdef _WIN32
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
if (g_rpc_transport_wsa_started) {
return true;
}
WSADATA wsaData;
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
if (res != 0) {
return false;
}
g_rpc_transport_wsa_started = true;
return true;
#else
return true;
#endif
}
void rpc_transport_shutdown() {
#ifdef _WIN32
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
if (!g_rpc_transport_wsa_started) {
return;
}
WSACleanup();
g_rpc_transport_wsa_started = false;
#endif
}
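
As a worked example of the GID-shaped target built in rdma_build_target_gid() above (illustrative address, not taken from the change): a local IPv4 address 192.168.1.10 is encoded as the IPv4-mapped form ::ffff:192.168.1.10, i.e.

// bytes  0..9  : 0x00
// bytes 10..11 : 0xff 0xff            (IPv4-mapped IPv6 prefix)
// bytes 12..15 : 0xc0 0xa8 0x01 0x0a  (192.168.1.10)

which can then be compared byte-for-byte against the GID table entries returned by ibv_query_gid_ex().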

View File

@@ -0,0 +1,34 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <memory>
struct socket_t;
typedef std::shared_ptr<socket_t> socket_ptr;
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
static constexpr size_t RPC_CONN_CAPS_SIZE = 24;
struct socket_t {
~socket_t();
bool send_data(const void * data, size_t size);
bool recv_data(void * data, size_t size);
socket_ptr accept();
void get_caps(uint8_t * local_caps);
void update_caps(const uint8_t * remote_caps);
static socket_ptr create_server(const char * host, int port);
static socket_ptr connect(const char * host, int port);
private:
struct impl;
explicit socket_t(std::unique_ptr<impl> p);
std::unique_ptr<impl> pimpl;
};
bool rpc_transport_init();
void rpc_transport_shutdown();
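
A client-side usage sketch of the socket_t interface declared above. The caps exchange framing (sending the raw RPC_CONN_CAPS_SIZE-byte blob back and forth) is an assumption for illustration; the actual HELLO handshake lives in ggml-rpc.cpp, and the host/port values are placeholders:

#include "transport.h"
#include <cstdint>

static bool exchange_caps(socket_t & sock) {
    uint8_t local_caps[RPC_CONN_CAPS_SIZE];
    sock.get_caps(local_caps);                                   // probes RDMA; all zeros if unavailable
    if (!sock.send_data(local_caps, sizeof(local_caps))) return false;
    uint8_t remote_caps[RPC_CONN_CAPS_SIZE];
    if (!sock.recv_data(remote_caps, sizeof(remote_caps))) return false;
    sock.update_caps(remote_caps);                               // upgrades to RDMA if both sides support it
    return true;
}

int main() {
    if (!rpc_transport_init()) return 1;
    socket_ptr sock = socket_t::connect("127.0.0.1", 50052);
    if (!sock || !exchange_caps(*sock)) return 1;
    const char msg[] = "ping";
    bool ok = sock->send_data(msg, sizeof(msg));
    rpc_transport_shutdown();
    return ok ? 0 : 1;
}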

View File

@@ -1077,9 +1077,9 @@ llm_graph_qkv llm_graph_context::build_qkv(
// fused QKV path
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
cb(qkv, "wqkv", il);
if (layer.bqkv) {
qkv = ggml_add(ctx0, qkv, layer.bqkv);
cb(qkv, "bqkv", il);
if (layer.wqkv_b) {
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
cb(qkv, "wqkv_b", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -1097,8 +1097,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
// separate Q/K/V path
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
cb(Qcur, "Qcur", il);
if (layer.bq) {
Qcur = ggml_add(ctx0, Qcur, layer.bq);
if (layer.wq_b) {
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
cb(Qcur, "Qcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@@ -1107,8 +1107,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
cb(Kcur, "Kcur", il);
if (layer.bk) {
Kcur = ggml_add(ctx0, Kcur, layer.bk);
if (layer.wk_b) {
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
cb(Kcur, "Kcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@@ -1117,8 +1117,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
cb(Vcur, "Vcur", il);
if (layer.bv) {
Vcur = ggml_add(ctx0, Vcur, layer.bv);
if (layer.wv_b) {
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
cb(Vcur, "Vcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
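The hunks above, together with the llama_model and llm_build_* hunks that follow, are a mechanical rename of the per-layer attention bias tensors so that each bias carries the name of the weight it belongs to: bq/bk/bv/bo/bqkv become wq_b/wk_b/wv_b/wo_b/wqkv_b. The guard-then-add logic is unchanged; with the new names the pattern reads:

// unchanged logic, new names: weight (wq), optional bias (wq_b), optional scale (wq_s)
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
if (layer.wq_b) {
    Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
}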

View File

@@ -3098,14 +3098,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (layer.wqkv) {
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
} else {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
}
};
@@ -3138,7 +3138,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3201,7 +3201,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
layer.wo =
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
@@ -3336,9 +3336,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
}
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
if (n_ff > 0) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3558,10 +3557,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -3602,8 +3601,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -3719,23 +3718,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; // JinaBertLayer
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -3783,10 +3775,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -3819,10 +3811,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -3889,7 +3881,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -4068,7 +4060,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4127,7 +4119,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
@@ -4291,10 +4283,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4329,7 +4321,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4646,7 +4638,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4890,7 +4882,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
// feed forward (w/ optional biases)
@@ -5152,10 +5144,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5570,10 +5562,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5612,10 +5604,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// attention biases - all have shape n_embd (output dimension of projections)
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5918,7 +5910,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5987,7 +5979,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@@ -6808,7 +6800,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -6890,7 +6882,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// attention layers (with optional bias)
create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
@@ -7026,7 +7018,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
@@ -7191,7 +7183,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
@@ -7422,7 +7414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

View File

@@ -246,6 +246,8 @@ struct llama_layer {
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
struct ggml_tensor * wqkv_b = nullptr;
struct ggml_tensor * wo_b = nullptr;
struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr;
@@ -256,13 +258,6 @@ struct llama_layer {
struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention bias
struct ggml_tensor * bq = nullptr;
struct ggml_tensor * bk = nullptr;
struct ggml_tensor * bv = nullptr;
struct ggml_tensor * bo = nullptr;
struct ggml_tensor * bqkv = nullptr;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;

View File

@@ -50,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur_pos", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -55,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -48,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}

View File

@@ -48,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -72,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View File

@@ -57,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(cur, "attn_sub_norm", il);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
if (model.layers[il].wo_b) {
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
}
cb(cur, "attn_out", il);
}

View File

@@ -33,7 +33,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -47,7 +47,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -58,7 +58,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -54,7 +54,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -59,7 +59,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -49,7 +49,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -49,7 +49,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -43,7 +43,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -46,7 +46,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -37,7 +37,7 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -46,7 +46,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -92,7 +92,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -101,7 +101,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -50,7 +50,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -64,7 +64,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -65,7 +65,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -50,7 +50,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -27,7 +27,7 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -51,7 +51,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -70,7 +70,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -84,7 +84,7 @@ llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_gr
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -56,7 +56,7 @@ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -67,7 +67,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -56,7 +56,7 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -70,7 +70,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -51,7 +51,7 @@ llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -48,7 +48,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
cb(cur, "attn_out", il);

View File

@@ -55,7 +55,7 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {

View File

@@ -49,7 +49,7 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, co
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -51,7 +51,7 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -60,7 +60,7 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
cb(Qcur, "Qcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -53,7 +53,7 @@ llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -56,7 +56,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -56,7 +56,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -62,7 +62,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -62,7 +62,7 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -58,7 +58,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -52,7 +52,7 @@ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -59,7 +59,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
cb(Kcur, "Kcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -55,7 +55,7 @@ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -36,7 +36,7 @@ llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_gr
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -41,7 +41,7 @@ llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_par
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
cur = build_attn(inp_attn_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}

View File

@@ -1796,7 +1796,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
@@ -1809,7 +1809,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
@@ -1826,7 +1826,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<parameter=arg1>\n1\n</parameter>\n"
"<parameter=arg2>\n2\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.parallel_tool_calls(true)
@@ -1849,7 +1849,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
@@ -1892,7 +1892,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>"
"</tool_call>\n"
)
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
@@ -1908,7 +1908,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.expect_tool_calls({
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
})

View File

@@ -42,7 +42,7 @@ int main(void) {
const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
// get position of the last token, which should be (nx - 1, ny - 1)
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, n_tokens - 1);
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, 0, n_tokens - 1);
size_t nx = pos.x + 1;
size_t ny = pos.y + 1;
const char * id = mtmd_image_tokens_get_id(image_tokens);

View File

@@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
return n_pos;
}
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
for (size_t i = 0; i < n_tokens; i++) {
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, pos_0, i);
}
}
@@ -163,15 +163,15 @@ struct decode_embd_batch {
}
// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
void set_position_mrope_2d(const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
seq_id_0[0] = seq_id;
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + rel_pos[i].t;
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i ] = rel_pos[i].t;
pos[i + batch.n_tokens ] = rel_pos[i].y;
pos[i + batch.n_tokens * 2] = rel_pos[i].x;
pos[i + batch.n_tokens * 3] = rel_pos[i].z;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@@ -188,7 +188,7 @@ struct decode_embd_batch {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens * 3] = pos_0 + i;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
}
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
mtmd_helper_image_get_decoder_pos(image_tokens, n_past, rel_pos.data());
batch_embd.set_position_mrope_2d(rel_pos, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {

View File

@@ -49,7 +49,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
// out_pos must have length == mtmd_helper_get_n_tokens(image)
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
// helper function that automatically:
// 1. run llama_decode() on text chunks

View File

@@ -1246,11 +1246,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
mtmd_decoder_pos pos;
pos.t = 0;
pos.x = i % image_tokens->nx;
pos.y = i / image_tokens->nx;
// M-RoPE logic
// TODO: support other types of position encoding if needed
pos.t = pos_0;
pos.x = pos_0 + (i % image_tokens->nx);
pos.y = pos_0 + (i / image_tokens->nx);
pos.z = 0; // unused for now
return pos;
}

View File

@@ -196,11 +196,13 @@ struct mtmd_decoder_pos {
uint32_t t;
uint32_t x;
uint32_t y;
uint32_t z; // unused for now, reserved for future use
};
// get position for decoder attention, to be used by M-RoPE models
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
// pos_0 is the absolute position of the first token
// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it
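A caller-side sketch of the updated position API; the image_tokens pointer and the n_past value are assumed to come from the usual mtmd tokenize/decode flow.

// sketch: query absolute M-RoPE positions for every image embedding token,
// anchoring the image at absolute position n_past
const size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> pos(n_tokens);
for (size_t i = 0; i < n_tokens; i++) {
    pos[i] = mtmd_image_tokens_get_decoder_pos(image_tokens, /*pos_0 =*/ n_past, i);
    // pos[i].t == n_past; pos[i].x and pos[i].y are already offset by pos_0; pos[i].z is reserved
}
// equivalent one-shot helper:
// mtmd_helper_image_get_decoder_pos(image_tokens, n_past, pos.data());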

View File

@@ -806,6 +806,7 @@ By default, it is read-only. To make POST request to change global properties, y
"modalities": {
"vision": false
},
"media_marker": "<__media_YoNhud46VdDqbuFmKYEO9PY7A4ARzRfg__>",
"build_info": "b(build number)-(build commit hash)",
"is_sleeping": false
}

View File

@@ -391,15 +391,25 @@ void server_tokens::push_back(server_tokens & tokens) {
}
void server_tokens::insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
const llama_tokens & server_tokens::get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
const llama_tokens & server_tokens::get_tokens() const {
GGML_ASSERT(!has_mtmd);
return tokens;
}
llama_tokens server_tokens::get_text_tokens() const {
llama_tokens res;
res.reserve(tokens.size());
for (llama_token t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
res.push_back(t);
}
}
return res;
}
void server_tokens::set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;

View File

@@ -190,7 +190,9 @@ public:
void insert(const llama_tokens & inp_tokens);
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const;
const llama_tokens & get_tokens() const;
llama_tokens get_text_tokens() const;
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id);
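Sketch of the difference between the two accessors; the slot/prompt names are illustrative and taken from the server code below.

// get_tokens(): text-only path -- asserts that mtmd is disabled and returns a
// reference to the internal vector (slot save/load, context shift, speculative decoding)
const llama_tokens & all_tokens = slot.prompt.tokens.get_tokens();

// get_text_tokens(): always usable -- returns a copy with the LLAMA_TOKEN_NULL
// placeholder entries (non-text positions) filtered out
llama_tokens text_tokens = slot.prompt.tokens.get_text_tokens();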

View File

@@ -1,3 +1,4 @@
#include "server-context.h"
#include "server-common.h"
#include "server-http.h"
@@ -19,6 +20,7 @@
#include <exception>
#include <memory>
#include <filesystem>
#include <utility>
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -33,6 +35,31 @@ using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
static server_prompt_checkpoint server_get_checkpoint(llama_context * ctx, int id, int64_t n_tokens, llama_pos pos_min = -1, llama_pos pos_max = -1) {
if (pos_min == -1) {
pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id);
}
if (pos_max == -1) {
pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id);
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto cur = server_prompt_checkpoint {
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ n_tokens,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
};
const size_t n = llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
return cur;
}
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
enum slot_state {
SLOT_STATE_IDLE,
@@ -57,7 +84,12 @@ struct server_slot {
// multimodal
mtmd_context * mctx = nullptr;
common_speculative * spec = nullptr;
// speculative decoding
llama_tokens spec_draft;
std::vector<int32_t> spec_i_batch;
server_prompt_checkpoint spec_ckpt;
common_speculative_ptr spec;
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
@@ -83,11 +115,6 @@ struct server_slot {
std::string debug_generated_text;
llama_tokens generated_tokens;
// idx of draft tokens in the main batch
// non-empty if we went to evaluate draft tokens
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
std::vector<int32_t> i_batch_dft;
std::vector<completion_token_output> generated_token_probs;
bool has_next_token = true;
@@ -147,8 +174,7 @@ struct server_slot {
common_sampler_ptr smpl;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
llama_token sampled; // in speculative mode, this is the last accepted token
// stats
size_t n_sent_text = 0; // number of sent text character
@@ -178,8 +204,11 @@ struct server_slot {
stopping_word = "";
n_sent_text = 0;
drafted.clear();
i_batch_dft.clear();
if (can_speculate()) {
spec_draft.clear();
spec_i_batch.clear();
spec_ckpt.clear();
}
generated_tokens.clear();
generated_token_probs.clear();
json_schema = json();
@@ -300,6 +329,85 @@ struct server_slot {
return n_draft_max;
}
void update_batch(llama_batch & batch) {
const int n_draft_max = get_n_draft_max();
if (n_draft_max > 0) {
GGML_ASSERT(can_speculate());
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const llama_tokens & tokens = prompt.tokens.get_text_tokens();
const auto & params_spec = task->params.speculative;
if (!spec_draft.empty()) {
// we have a previous (partial) draft to reuse
if (task->params.speculative.use_checkpoints) {
GGML_ASSERT(!spec_ckpt.empty());
}
} else {
GGML_ASSERT(spec_i_batch.empty());
// generate a new draft
spec_draft = common_speculative_draft(spec.get(), params_spec, tokens, sampled);
if (spec_draft.size() > (size_t) n_draft_max) {
SLT_WRN(*this, "draft size %d exceeds max %d, truncating\n", (int) spec_draft.size(), n_draft_max);
spec_draft.resize(n_draft_max);
}
if (spec_draft.size() < (size_t) params_spec.n_min) {
SLT_DBG(*this, "ignoring small draft: %d < %d\n", (int) spec_draft.size(), params_spec.n_min);
spec_draft.clear();
}
if (!spec_draft.empty() && params_spec.use_checkpoints) {
const auto n_tokens = prompt.tokens.size();
auto & ckpt = spec_ckpt;
ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n",
ckpt.pos_min, ckpt.pos_max, n_tokens, (float) ckpt.data.size() / 1024 / 1024);
}
}
GGML_ASSERT(spec_draft.size() <= (size_t) n_draft_max);
}
if (spec_draft.empty()) {
// no speculative decoding
i_batch = batch.n_tokens;
common_batch_add(batch, sampled, prompt.tokens.pos_next(), { this->id }, true);
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
sampled, n_ctx, prompt.n_tokens(), truncated);
} else {
SLT_DBG(*this, "generate_draft: id=%d, #tokens=%zu, #draft=%zu, pos_next=%d\n",
sampled, prompt.tokens.size(), spec_draft.size(), prompt.tokens.pos_next());
GGML_ASSERT(spec_i_batch.empty());
spec_i_batch.push_back(batch.n_tokens);
for (size_t i = 0; i < spec_draft.size(); i++) {
spec_i_batch.push_back(batch.n_tokens + i + 1);
}
auto pos0 = prompt.tokens.pos_next();
common_batch_add(batch, sampled, pos0++, { this->id }, true);
for (auto token : spec_draft) {
common_batch_add(batch, token, pos0++, { this->id }, true);
}
}
prompt.tokens.push_back(sampled);
prompt.tokens.insert(spec_draft);
}
void release() {
if (is_processing()) {
GGML_ASSERT(task);
@@ -400,7 +508,7 @@ struct server_slot {
);
}
common_speculative_print_stats(spec);
common_speculative_print_stats(spec.get());
}
json to_json(bool only_metrics = false) const {
@@ -591,16 +699,17 @@ private:
void destroy() {
llama_init.reset();
ctx = nullptr;
model = nullptr;
mtmd_free(mctx);
mctx = nullptr;
// Clear any sampling context
for (server_slot & slot : slots) {
common_speculative_free(slot.spec);
slot.spec = nullptr;
if (slot.can_speculate()) {
slot.spec.reset();
}
}
llama_batch_free(batch);
@@ -642,9 +751,6 @@ private:
llama_init = common_init_from_params(params_base);
// propagate model-metadata sampling defaults back to caller
params.sampling = params_base.sampling;
model = llama_init->model();
ctx = llama_init->context();
@@ -660,6 +766,7 @@ private:
add_bos_token = llama_vocab_get_add_bos(vocab);
if (params_base.speculative.has_dft()) {
// TODO speculative: move to common/speculative.cpp?
SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());
const auto & params_spec = params_base.speculative;
@@ -727,11 +834,6 @@ private:
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
}
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
}
}
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@@ -769,14 +871,23 @@ private:
slots.clear();
const bool can_spec = common_speculative_is_compat(ctx);
if (!can_spec) {
const auto spec_type = common_speculative_is_compat(ctx);
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
SRV_WRN("%s", "speculative decoding not supported by this context\n");
}
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
params_base.speculative.use_checkpoints = true;
}
// initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
slots.emplace_back();
}
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot & slot = slots[i];
slot.id = i;
slot.ctx = ctx;
@@ -786,16 +897,11 @@ private:
slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding
if (can_spec) {
slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
if (spec_type != COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx));
if (slot.spec) {
if (mctx) {
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false;
}
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
} else {
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
}
}
@@ -806,8 +912,6 @@ private:
};
slot.reset();
slots.push_back(std::move(slot));
}
{
@@ -854,6 +958,9 @@ private:
model_aliases = params_base.model_alias;
model_tags = params_base.model_tags;
// propagate new defaults back to caller
params = params_base;
if (!is_resume) {
return init();
}
@@ -1197,7 +1304,7 @@ private:
backend_sampling &= task.params.sampling.backend_sampling;
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0);
backend_sampling &= !(slot.can_speculate() && task.params.speculative.n_max > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
@@ -1703,6 +1810,26 @@ private:
return true;
}
// n_tokens_cur: the number of tokens added to the batch for the current slot
void create_checkpoint(server_slot & slot, const int64_t n_tokens_cur, llama_pos pos_min, llama_pos pos_max) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const auto & cur = slot.prompt.checkpoints.emplace_back(server_get_checkpoint(ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max));
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
}
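
The helper above keeps at most params_base.n_ctx_checkpoints entries, evicting the oldest before appending the new one. A small standalone illustration of that bounded-queue behavior, with plain ints standing in for checkpoints:

// sketch_checkpoint_eviction.cpp - illustrative bounded queue, not the real checkpoint type
#include <cstdio>
#include <vector>

int main() {
    const size_t n_ctx_checkpoints = 3;   // assumed limit, standing in for params_base.n_ctx_checkpoints
    std::vector<int> checkpoints;         // oldest at front, newest at back

    for (int ckpt = 1; ckpt <= 5; ckpt++) {
        while (checkpoints.size() >= n_ctx_checkpoints) {
            printf("erasing old checkpoint %d\n", checkpoints.front());
            checkpoints.erase(checkpoints.begin());   // make room, oldest first
        }
        checkpoints.push_back(ckpt);
        printf("created checkpoint %d of %zu\n", (int) checkpoints.size(), n_ctx_checkpoints);
    }
    return 0;
}
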
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@@ -1854,7 +1981,7 @@ private:
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;
const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
const llama_tokens & tokens = slot->prompt.tokens.get_tokens();
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
@@ -2061,7 +2188,7 @@ private:
{
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy
llama_tokens new_tokens = slot.prompt.tokens.get_tokens(); // copy
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
new_tokens[i - n_discard] = new_tokens[i];
}
@@ -2100,61 +2227,7 @@ private:
continue;
}
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const int n_draft_max = slot.get_n_draft_max();
if (n_draft_max > 0) {
if (mctx) {
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
GGML_ABORT("not supported by multimodal");
}
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
const auto & params_spec = slot.task->params.speculative;
llama_tokens draft = common_speculative_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
if (draft.size() > (size_t) n_draft_max) {
SLT_WRN(slot, "draft size %d exceeds max %d, truncating\n", (int) draft.size(), n_draft_max);
draft.resize(n_draft_max);
}
// add the sampled token to the batch
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
if (slot.task->params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
// fallback to normal decoding
slot.i_batch = slot.i_batch_dft[0];
slot.drafted.clear();
slot.i_batch_dft.clear();
} else {
// keep track of total number of drafted tokens tested
slot.n_draft_total += draft.size();
// add all drafted tokens to the batch
for (size_t i = 0; i < draft.size(); i++) {
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(draft[i]);
}
slot.drafted = std::move(draft);
}
} else {
// no speculative decoding
slot.i_batch = batch.n_tokens;
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
}
slot.update_batch(batch);
}
// process in chunks of params.n_batch
@@ -2651,40 +2724,12 @@ private:
// no need to create checkpoints that are too close together
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
// note: we create the checkpoint before calling llama_decode(), so the current batch is not
// yet processed and therefore it is not part of the checkpoint.
if (do_checkpoint) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot,
"erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const size_t checkpoint_size =
llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id,
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
create_checkpoint(slot, n_tokens_cur, pos_min, pos_max);
}
}
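
The spacing guard above only allows a new checkpoint once the prompt (excluding the tokens in the current batch) has grown more than 64 tokens past the previous one. A trivial predicate capturing that rule, with illustrative names:

// sketch_checkpoint_spacing.cpp - illustrative only
#include <cstdio>
#include <cstdint>

// true if the prompt, minus the tokens in the current batch, is far enough
// past the last checkpoint to justify creating a new one
static bool far_enough(int64_t n_prompt_tokens, int64_t n_tokens_cur, int64_t last_ckpt_n_tokens) {
    return (n_prompt_tokens - n_tokens_cur) > last_ckpt_n_tokens + 64;
}

int main() {
    printf("%d\n", far_enough(300, 16, 200)); // 284 > 264 -> 1, checkpoint allowed
    printf("%d\n", far_enough(260, 16, 200)); // 244 <= 264 -> 0, too close
    return 0;
}
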
@@ -2856,19 +2901,19 @@ private:
slot.state = SLOT_STATE_GENERATING;
if (slot.can_speculate()) {
common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens());
common_speculative_begin(slot.spec.get(), slot.prompt.tokens.get_text_tokens());
}
} else if (slot.state != SLOT_STATE_GENERATING) {
continue; // continue loop of slots
}
if (slot.i_batch_dft.size() > 0) {
if (slot.can_speculate() && !slot.spec_draft.empty()) {
continue; // sample using speculative decoding
}
const int tok_idx = slot.i_batch - i;
llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx);
llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx, tok_idx);
slot.i_batch = -1;
@@ -2889,7 +2934,7 @@ private:
completion_token_output result;
result.tok = id;
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
if (slot.task->params.sampling.n_probs > 0) {
@@ -2909,43 +2954,86 @@ private:
// speculative decoding - main model sample and accept
for (auto & slot : slots) {
if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) {
continue;
}
const size_t n_draft = slot.drafted.size();
// save the original draft size
const size_t n_draft = slot.spec_draft.size();
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();
GGML_ASSERT(n_draft > 0);
// verify and try to accept the draft
{
const auto & params_spec = slot.task->params.speculative;
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
slot.spec_i_batch.clear();
SLT_DBG(slot, "%s: n_draft=%zu, accepted=%zu\n", __func__, slot.spec_draft.size(), accepted.size());
GGML_ASSERT(accepted.size() >= 1);
// check for partial draft acceptance
if (accepted.size() < slot.spec_draft.size() + 1) {
if (params_spec.use_checkpoints) {
// partial acceptance is not supported by the context -> truncate the draft and restore the state
slot.spec_draft = std::move(accepted);
auto & ckpt = slot.spec_ckpt;
SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size());
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);
}
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, ckpt.pos_max + 1, -1);
slot.prompt.tokens.keep_first(ckpt.n_tokens);
slot.smpl = std::move(smpl_save);
continue;
}
LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, accepted.size(), slot.spec_draft.size());
}
common_speculative_accept(slot.spec.get(), accepted.size() - 1);
slot.spec_draft = std::move(accepted);
}
const int64_t t_current = ggml_time_us();
slot.n_decoded += ids.size();
const auto ids = std::move(slot.spec_draft);
slot.n_decoded += ids.size();
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
// inform the speculative decoding about the number of accepted tokens
common_speculative_accept(slot.spec, ids.size() - 1);
// rollback to the state before sampling the draft tokens
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.n_draft_total += n_draft;
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.prompt.tokens.insert({ids.begin(), ids.end() - 1});
slot.sampled = ids.back(); // last accepted token
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1);
slot.sampled = ids.back(); // last accepted token
SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft);
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, slot.prompt.n_tokens(), -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // set later
// TODO: set result.probs
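
The partial-acceptance path above restores a per-sequence checkpoint that was taken before the draft tokens were decoded. A hedged sketch of the save/restore pair built on the llama.cpp state APIs used in this diff; the helper names and the checkpoint struct are illustrative, while the llama_* calls and LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY match the server code shown above.

// sketch_partial_state_checkpoint.cpp - helper names are illustrative
#include <cstdint>
#include <vector>

#include "llama.h"

struct seq_checkpoint {
    llama_pos            pos_min = 0;
    llama_pos            pos_max = 0;
    std::vector<uint8_t> data;
};

// capture the partial state of one sequence before decoding draft tokens
static seq_checkpoint save_partial_state(llama_context * ctx, llama_seq_id seq, llama_pos pos_min, llama_pos pos_max) {
    seq_checkpoint ckpt;
    ckpt.pos_min = pos_min;
    ckpt.pos_max = pos_max;
    ckpt.data.resize(llama_state_seq_get_size_ext(ctx, seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY));
    llama_state_seq_get_data_ext(ctx, ckpt.data.data(), ckpt.data.size(), seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    return ckpt;
}

// roll the sequence back: reload the saved state and drop memory past the checkpoint
static bool restore_partial_state(llama_context * ctx, llama_seq_id seq, const seq_checkpoint & ckpt) {
    const size_t n = llama_state_seq_set_data_ext(ctx, ckpt.data.data(), ckpt.data.size(), seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    if (n != ckpt.data.size()) {
        return false; // the server aborts in this case (see the GGML_ABORT above)
    }
    llama_memory_seq_rm(llama_get_memory(ctx), seq, ckpt.pos_max + 1, -1);
    return true;
}
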
@@ -3537,6 +3625,7 @@ void server_routes::init_routes() {
{"vision", meta->has_inp_image},
{"audio", meta->has_inp_audio},
} },
{ "media_marker", get_media_marker() },
{ "endpoint_slots", params.endpoint_slots },
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
@@ -3664,7 +3753,7 @@ void server_routes::init_routes() {
params.n_predict,
meta->slot_n_ctx,
params.spm_infill,
tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
tokenized_prompts[0].get_tokens() // TODO: this could maybe be multimodal.
);
std::vector<raw_buffer> files; // dummy


@@ -162,7 +162,7 @@ common_chat_msg task_result_state::update_chat_msg(
bool filter_tool_calls) {
generated_text += text_added;
auto msg_prv_copy = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
//SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
is_partial,
@@ -304,6 +304,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
params.speculative = defaults.speculative;
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
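
The per-request keys parsed above can be overridden in the completion request body. A hedged example of building such a payload with nlohmann::json, which the server build already ships; only the three speculative keys come from the parsing code above, the other fields and the way the payload is sent are assumptions.

// sketch_request_payload.cpp - assumes nlohmann/json is available, as in the server build
#include <iostream>

#include <nlohmann/json.hpp>

int main() {
    nlohmann::json body = {
        { "prompt",            "Hello" },   // assumed field, not shown in this diff
        { "n_predict",         64 },        // assumed field, not shown in this diff
        { "speculative.n_min", 2 },         // minimum draft size to keep (see the draft-size check above)
        { "speculative.n_max", 8 },         // maximum draft size before truncation
        { "speculative.p_min", 0.75 },      // draft token probability threshold (assumed meaning)
    };
    std::cout << body.dump(2) << std::endl;
    return 0;
}
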


@@ -576,6 +576,17 @@ struct server_prompt_checkpoint {
size_t size() const {
return data.size();
}
bool empty() const {
return data.empty();
}
void clear() {
pos_min = 0;
pos_max = 0;
n_tokens = 0;
data.clear();
}
};
struct server_prompt {