Compare commits

...

11 Commits
b8838 ... b8849

Author SHA1 Message Date
Aldehir Rojas
d5b780a676 common/autoparser : allow space after tool call (#22073) 2026-04-19 13:28:35 +02:00
uvos
471540ae8a HIP: Remove unnecessary NCCL_CHECK (#21914) 2026-04-19 12:59:44 +02:00
Xuan-Son Nguyen
19124078be mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos (breaking change) (#22082)
* mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos

* fix build
2026-04-19 11:57:21 +02:00
Gaurav Garg
bcdcc1044f ggml : reduce CPU overhead in meta backend (#22041)
* cache subgraph splits when cgraph is unchanged

Skip per-call subgraph construction in ggml_backend_meta_graph_compute when the same ggml_cgraph is used consecutively.

Assign uid to every sub-graph so that CUDA's fast uid check path hits too.

* Address review comments

* Keep the scope as is

* Rename last_uid and last_n_subgraphs field. Remove last_max_tmp_size field. Refactor code.

* Address review comments

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-04-19 12:48:35 +03:00
Sigbjørn Skjæret
037bfe38d0 ci : install spirv-headers for vulkan-cross (#22109) 2026-04-19 10:32:08 +03:00
Dowon
8685e7b075 convert : support sentence-transformer 5.4 config files (#22087)
* convert : support sentence-transformer 5.4 config files

* fix: embeddinggemma

* fix: mapping

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix: pooling_mode

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-19 10:25:39 +03:00
texasich
09b4efa95f cmake: remove CMP0194 policy to restore MSVC builds (#21934)
#21630 added the CMP0194 NEW policy to silence a CMake warning, but on Windows runners it caused CMake to prefer the MinGW toolchain for ASM and broke MSVC builds.

Reverting only that policy block restores the previous working behavior. The CMake 4.1+ warning comes back, but that is cosmetic and does not break any platform.

Reported-by: oobabooga

Refs: #21630

Co-authored-by: texasich <texasich@users.noreply.github.com>
2026-04-19 10:25:05 +03:00
Sascha Rogmann
455d8e4be8 server : speculative checkpointing (#19493)
* server : speculative decoding using checkpoints

* server : fix draft check with checkpoints

* server : rename spec vars

* server : log levels

* server : refactored spec logic to speculative.cpp

* server : renamed spec checkpoints option

* server : fix spec checkpoints, logging

* speculative : checkpoints with draft model, logging

* server : n_tokens_cur and create_checkpoint in draft

* server : fix server_speculative_callback (slot.id)

* spec : fix ngram-map/begin idx_last_check

* spec : init ckpt (begin() wasn't called)

* chore: update webui build output

* server : restore sampler in spec checkpoint and clear mem

* cont : avoid --spec-use-checkpoints argument

* cont : remove server_prompt_checkpoint_with_size

* spec : rename (leave_draft_state)

* cont : clean-up

* cont : do not ignore partial drafts even if they are short

* cont : spec callback owned by session

* cont : simplify

* cont : avoid empty speculative session

* cont : simplify

* cont : simplify

* cont : enable mtmd speculative decoding

* cont : keep the spec sampler alive

* cont : simplify

* cont : fix nullptr deref + draft checkpoints

* cont : remove common_speculative_accept_response

* cont : remove callback

* cont : simplify

* cont : minor

* cont : simplify

* cont : fix accepted number

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-19 10:24:06 +03:00
Radoslav Gerganov
91fef95362 rpc : refactor the RPC transport (#21998)
* rpc : refactor the RPC transport

Move all transport related code into a separate file and use the
socket_t interface to hide all transport implementation details.

* fix win32

* better socket_t construction
2026-04-19 10:21:53 +03:00
Cetarthoriphros
9e5647affa server: Expose media_tag on /props endpoint. (#22028) 2026-04-19 00:27:17 +02:00
Sigbjørn Skjæret
4f02d47339 model : refactor bias tensor variable names (#22079)
* refactor bias tensor variable names

* use create_tensor_qkv for jina-bert-v2
2026-04-18 20:12:00 +02:00
82 changed files with 1514 additions and 1224 deletions

View File

@@ -246,6 +246,7 @@ jobs:
apt-get install -y --no-install-recommends \
build-essential \
glslc \
spirv-headers \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64

View File

@@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
if (!format.per_call_start.empty()) {
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
}
if (!format.section_start.empty()) {
tool_calls = p.trigger_rule("tool-calls",
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
}
} else {
std::string separator = ", "; // Default

View File

@@ -2334,7 +2334,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
? input
: params.generation_prompt + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {

View File

@@ -11,7 +11,6 @@
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>
@@ -303,7 +302,7 @@ struct common_params_speculative {
// general-purpose speculative decoding parameters
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
@@ -312,6 +311,7 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
std::shared_ptr<common_ngram_mod> ngram_mod;

View File

@@ -208,7 +208,7 @@ void common_ngram_map_begin(
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.idx_last_check = size_begin;
map.size_last_begin = size_begin;
}
@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
}
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_created = true;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = 0; // value 0 is used for simple mode
return;
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
// update the value statistics
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}

View File

@@ -13,6 +13,7 @@
#include <cstring>
#include <iomanip>
#include <map>
#include <cinttypes>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -144,10 +145,28 @@ struct common_speculative_state {
virtual void accept(uint16_t n_accepted) = 0;
};
struct common_speculative_checkpoint {
llama_pos pos_min = 0;
llama_pos pos_max = 0;
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
size_t ckpt_size = 0;
};
struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;
struct common_speculative_checkpoint ckpt;
bool use_checkpoint;
common_sampler * smpl;
llama_batch batch;
@@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
enum common_speculative_type type,
llama_context * ctx_tgt,
llama_context * ctx_dft,
const std::vector<std::pair<std::string, std::string>> & replacements)
const std::vector<std::pair<std::string, std::string>> & replacements,
bool use_checkpoint)
: common_speculative_state(type)
, ctx_tgt(ctx_tgt)
, ctx_dft(ctx_dft)
, use_checkpoint(use_checkpoint)
{
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
smpl = nullptr;
@@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
if (use_checkpoint && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
}
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
return n;
}
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
return n;
}
void draft(
@@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
auto * mem_dft = llama_get_memory(ctx_dft);
int reuse_i = 0;
int reuse_n = 0;
int reuse_i = 0; // index of part to be reused in prompt_dft
int reuse_n = 0; // length of part to be reused in prompt_dft
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
@@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_checkpoint && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
reuse_n = 0;
}
result.clear();
result.reserve(params.n_max);
if (reuse_n == 0) {
bool needs_ckpt = use_checkpoint && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_checkpoint && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
@@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}
bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (use_checkpoint) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}
if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
@@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}
}
const llama_pos n_past = prompt_dft.size();
@@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, ret, prompt_cur.size(), prompt_dft.size());
}
common_sampler_reset(smpl);
@@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);
ret = llama_decode(ctx_dft, batch);
if (ret != 0) {
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
}
prompt_dft.push_back(id);
}
@@ -739,6 +852,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
struct common_speculative {
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
};
@@ -798,13 +912,13 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
return COMMON_SPECULATIVE_COMPAT_TYPE_NO;
}
bool res = true;
common_speculative_compat_type res = COMMON_SPECULATIVE_COMPAT_TYPE_FULL;
llama_memory_clear(mem, true);
@@ -816,14 +930,14 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
res = COMMON_SPECULATIVE_COMPAT_TYPE_NO;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
res = COMMON_SPECULATIVE_COMPAT_TYPE_CKPT;
goto done;
}
@@ -909,9 +1023,10 @@ common_speculative * common_speculative_init(
break;
case COMMON_SPECULATIVE_TYPE_DRAFT: {
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements,
/* .use_checkpoint= */ params.use_checkpoints // TODO: this should be based on the draft model!
));
break;
}
@@ -966,7 +1081,8 @@ common_speculative * common_speculative_init(
}
auto * result = new common_speculative {
/* .impls = */ std::move(impls)
/* .impls = */ std::move(impls),
/* .curr_impl = */ nullptr,
};
return result;
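
Condensed, the checkpoint round trip implemented by draft_create_checkpoint() and draft_restore_checkpoint() above follows the pattern below. This is a sketch only, assuming a single sequence/slot id 0 and omitting the pos_min/pos_max/n_tokens bookkeeping:

// save: serialize the draft context's (partial) sequence state
const size_t sz = llama_state_seq_get_size_ext(ctx_dft, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
std::vector<uint8_t> data(sz);
llama_state_seq_get_data_ext(ctx_dft, data.data(), sz, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), 0);

// restore: load the snapshot back and drop anything decoded after it
llama_state_seq_set_data_ext(ctx_dft, data.data(), data.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
llama_memory_seq_rm(llama_get_memory(ctx_dft), 0, pos_max + 1, -1);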

View File

@@ -14,9 +14,15 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
enum common_speculative_compat_type {
COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0,
COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1,
COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2,
};
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
@@ -39,3 +45,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
struct common_speculative_deleter {
void operator()(common_speculative * s) { common_speculative_free(s); }
};
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
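
A minimal sketch of how the new compat enum and the RAII wrapper fit together. The remaining common_speculative_init() arguments are elided in the hunk above, so that call is only indicative, and deriving use_checkpoints from the CKPT result is an assumption for illustration rather than what the server necessarily does:

void try_init_spec(llama_context * ctx_tgt, common_params_speculative & params) {
    // note: common_speculative_is_compat() clears the memory of ctx_tgt
    const common_speculative_compat_type compat = common_speculative_is_compat(ctx_tgt);
    if (compat == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
        return; // speculative decoding not possible for this context
    }
    if (compat == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
        // partial sequence removal unsupported -> rewind via checkpoints instead
        params.use_checkpoints = true;
    }
    common_speculative_ptr spec(common_speculative_init(params /*, ... */));
    // common_speculative_free() runs automatically via common_speculative_deleter
}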

View File

@@ -1850,20 +1850,28 @@ class TextModel(ModelBase):
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
if mod["type"].endswith("Pooling"):
pooling_path = mod["path"]
break
mode_mapping = {
"mean": gguf.PoolingType.MEAN,
"cls": gguf.PoolingType.CLS,
"lasttoken": gguf.PoolingType.LAST,
}
# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
if pooling.get("pooling_mode_mean_tokens"):
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
elif pooling.get("pooling_mode_cls_token"):
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
elif pooling.get("pooling_mode_lasttoken"):
pooling_type = gguf.PoolingType.LAST
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
pooling_type = mode_mapping[pooling_mode]
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
@@ -7180,7 +7188,7 @@ class EmbeddingGemma(Gemma3Model):
with open(modules_file, encoding="utf-8") as modules_json_file:
mods = json.load(modules_json_file)
for mod in mods:
if mod["type"] == "sentence_transformers.models.Dense":
if mod["type"].endswith("Dense"):
mod_path = mod["path"]
# check if model.safetensors file for Dense layer exists
model_tensors_file = self.dir_model / mod_path / "model.safetensors"

View File

@@ -1,11 +1,5 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
# MSVC is not a valid assembler for the ASM language.
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
if (POLICY CMP0194)
cmake_policy(SET CMP0194 NEW)
endif()
project("ggml" C CXX ASM)
### GGML Version

View File

@@ -1456,6 +1456,8 @@ struct ggml_backend_meta_context {
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
size_t n_subgraphs = 0;
uint64_t uid = 0;
void * comm_ctx = nullptr;
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
@@ -1616,6 +1618,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
const size_t n_backends = ggml_backend_meta_n_backends(backend);
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
bool max_nnodes_raised = false;
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
for (size_t j = 0; j < n_backends; j++) {
@@ -1625,173 +1630,181 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
backend_ctx->max_nnodes = cgraph->n_nodes;
max_nnodes_raised = true;
assert(needs_rebuild);
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
if (needs_rebuild) {
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
}
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
};
};
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
i = get_i_delayed(i);
i = get_i_delayed(i);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
}
n_subgraphs++;
i_start = i + 1;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
backend_ctx->uid = cgraph->uid;
backend_ctx->n_subgraphs = n_subgraphs;
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
n_subgraphs++;
i_start = i + 1;
backend_ctx->max_tmp_size = max_tmp_size;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
}
cgraph_ij->uid = ggml_graph_next_uid();
}
}
}
@@ -1898,7 +1911,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
};
for (size_t i = 0; i < n_subgraphs; i++) {
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
@@ -1907,7 +1920,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
}
if (n_backends > 1 && i < n_subgraphs - 1) {
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
bool backend_allreduce_success = false;
if (backend_ctx->comm_ctx) {
std::vector<ggml_tensor *> nodes;
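
The caching introduced above amounts to a per-call uid check; a minimal sketch of the pattern, using the names from the hunks (split construction and the compute loop elided):

// uid == 0 means the graph has no uid assigned, so it is always rebuilt
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
if (needs_rebuild) {
    // ... split cgraph into per-backend subgraphs as shown above ...
    backend_ctx->uid         = cgraph->uid;
    backend_ctx->n_subgraphs = n_subgraphs;
}
// the compute loop then iterates over backend_ctx->n_subgraphs cached subgraphs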

View File

@@ -33,7 +33,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)

View File

@@ -2,6 +2,7 @@ message(STATUS "Using RPC backend")
ggml_add_backend_library(ggml-rpc
ggml-rpc.cpp
transport.cpp
)
if (WIN32)

File diff suppressed because it is too large

View File

@@ -0,0 +1,683 @@
#include "transport.h"
#include "ggml-impl.h"
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <winsock2.h>
#else
# include <arpa/inet.h>
# include <sys/socket.h>
# include <sys/types.h>
# include <netinet/in.h>
# include <netinet/tcp.h>
# include <netdb.h>
# include <unistd.h>
#endif
#include <cstdlib>
#include <mutex>
#include <optional>
#ifdef GGML_RPC_RDMA
# include <infiniband/verbs.h>
# include <time.h>
# ifndef _WIN32
# include <poll.h>
# endif
#endif // GGML_RPC_RDMA
#ifdef _WIN32
typedef SOCKET sockfd_t;
using ssize_t = __int64;
#else
typedef int sockfd_t;
#endif
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
#define LOG_DBG(...) \
do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
#ifdef GGML_RPC_RDMA
static constexpr size_t RDMA_CHUNK = 256 * 1024; // 256 KiB per send/recv (fits default 8 MiB memlock)
static constexpr int RDMA_RX_DEPTH = 24; // pre-posted recv ring: 24 × 256 KiB = 6 MiB
static constexpr size_t RDMA_GID_SIZE = 16; // RoCE GID / IB GID is always 16 bytes
using rdma_gid_t = std::array<uint8_t, RDMA_GID_SIZE>;
struct rdma_conn {
struct ibv_context * ctx = nullptr;
struct ibv_pd * pd = nullptr;
struct ibv_cq * scq = nullptr; // send completions
struct ibv_cq * rcq = nullptr; // recv completions
struct ibv_qp * qp = nullptr;
void * tx_buf = nullptr;
struct ibv_mr * tx_mr = nullptr;
void * rx_buf = nullptr; // RDMA_RX_DEPTH × RDMA_CHUNK contiguous
struct ibv_mr * rx_mr = nullptr;
int rx_head = 0;
uint32_t max_inline = 0;
uint8_t * rx_slot(int i) const {
return static_cast<uint8_t *>(rx_buf) + static_cast<size_t>(i) * RDMA_CHUNK;
}
bool post_rx(int i) {
struct ibv_sge sge = {};
sge.addr = (uintptr_t)rx_slot(i);
sge.length = RDMA_CHUNK;
sge.lkey = rx_mr->lkey;
struct ibv_recv_wr wr = {}, * bad = nullptr;
wr.wr_id = (uint64_t)i;
wr.sg_list = &sge;
wr.num_sge = 1;
return ibv_post_recv(qp, &wr, &bad) == 0;
}
~rdma_conn() {
if (tx_mr) ibv_dereg_mr(tx_mr);
if (rx_mr) ibv_dereg_mr(rx_mr);
free(tx_buf);
free(rx_buf);
if (qp) ibv_destroy_qp(qp);
if (scq) ibv_destroy_cq(scq);
if (rcq) ibv_destroy_cq(rcq);
if (pd) ibv_dealloc_pd(pd);
if (ctx) ibv_close_device(ctx);
}
};
// Local RDMA parameters captured during the probe phase and later consumed
// by rdma_activate() after the remote side's caps arrive via HELLO.
struct rdma_local_info {
uint32_t qpn = 0;
uint32_t psn = 0;
uint8_t gid[RDMA_GID_SIZE] = {};
uint8_t ib_port = 0;
int gid_idx = 0;
enum ibv_mtu path_mtu = IBV_MTU_1024;
};
struct rdma_caps {
uint32_t qpn;
uint32_t psn;
uint8_t gid[RDMA_GID_SIZE];
};
static_assert(sizeof(rdma_caps) == RPC_CONN_CAPS_SIZE, "rdma_caps must match conn_caps size");
#endif // GGML_RPC_RDMA
struct socket_t::impl {
impl(sockfd_t fd) : use_rdma(false), fd(fd) {}
~impl();
bool send_data(const void * data, size_t size);
bool recv_data(void * data, size_t size);
void get_caps(uint8_t * local_caps);
void update_caps(const uint8_t * remote_caps);
#ifdef GGML_RPC_RDMA
bool tcp_peer_closed();
std::optional<rdma_gid_t> rdma_build_target_gid();
bool rdma_probe();
bool rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid);
bool rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc);
bool rdma_send(const void * data, size_t size);
bool rdma_recv(void * data, size_t size);
std::unique_ptr<rdma_conn> rdma;
rdma_local_info rdma_local = {};
#endif // GGML_RPC_RDMA
bool use_rdma;
sockfd_t fd;
};
socket_t::impl::~impl() {
#ifdef GGML_RPC_RDMA
rdma.reset();
#endif // GGML_RPC_RDMA
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
#ifdef _WIN32
if (fd != INVALID_SOCKET) closesocket(this->fd);
#else
if (fd >= 0) close(this->fd);
#endif
}
#ifdef GGML_RPC_RDMA
bool socket_t::impl::tcp_peer_closed() {
if (fd < 0) return false;
#ifndef _WIN32
struct pollfd pfd = { fd, POLLIN | POLLRDHUP, 0 };
int r = poll(&pfd, 1, 0);
return r > 0 && (pfd.revents & (POLLHUP | POLLERR | POLLRDHUP));
#else
return false;
#endif
}
// Build a RoCE GID-shaped 16-byte target from a TCP socket's local address.
// Used to match the socket's local IP against the kernel's GID table so that
// a single memcmp handles IPv4, IPv4-mapped IPv6, and native IPv6 uniformly:
// AF_INET -> ::ffff:a.b.c.d (bytes 10-11 = 0xff, last 4 = IPv4)
// AF_INET6 (IPv4-mapped) -> ::ffff:a.b.c.d (already in GID shape)
// AF_INET6 (native v6) -> the 16-byte IPv6 address as-is
// Returns std::nullopt on unsupported family or getsockname failure.
std::optional<rdma_gid_t> socket_t::impl::rdma_build_target_gid() {
sockaddr_storage addr = {};
socklen_t addr_len = sizeof(addr);
if (getsockname(fd, reinterpret_cast<sockaddr *>(&addr), &addr_len) != 0) {
return std::nullopt;
}
rdma_gid_t target = {};
if (addr.ss_family == AF_INET) {
const auto * a = reinterpret_cast<const sockaddr_in *>(&addr);
target[10] = 0xff;
target[11] = 0xff;
memcpy(&target[12], &a->sin_addr, 4);
return target;
}
if (addr.ss_family == AF_INET6) {
const auto * a = reinterpret_cast<const sockaddr_in6 *>(&addr);
memcpy(target.data(), &a->sin6_addr, RDMA_GID_SIZE);
return target;
}
return std::nullopt;
}
bool socket_t::impl::rdma_probe() {
const char * dev_env = std::getenv("GGML_RDMA_DEV");
const char * gid_env = std::getenv("GGML_RDMA_GID");
auto target_gid = rdma_build_target_gid();
if (!target_gid) {
return false;
}
const uint8_t ib_port = 1;
int num_devs = 0;
ibv_device ** devs = ibv_get_device_list(&num_devs);
if (!devs || num_devs == 0) return false;
ibv_context * ibctx = nullptr;
const char * matched_dev = nullptr;
int gid_idx = gid_env ? atoi(gid_env) : -1;
int gid_version = IBV_GID_TYPE_IB; // 0 = unknown/IB
for (int d = 0; d < num_devs; d++) {
const char * dn = ibv_get_device_name(devs[d]);
if (dev_env && strcmp(dev_env, dn) != 0) continue;
ibv_context * ctx = ibv_open_device(devs[d]);
if (!ctx) continue;
ibv_port_attr pa;
if (ibv_query_port(ctx, ib_port, &pa) != 0) { ibv_close_device(ctx); continue; }
int found_gid = gid_idx;
int found_version = IBV_GID_TYPE_IB;
if (found_gid < 0) {
// Find a GID on this port whose bytes equal the local TCP address
// (IPv4 or IPv6). Prefer RoCE v2 (UDP/IP, L3-routable) over v1
// (raw Ethernet, same-L2 only) so silent hangs on L3-routed paths
// are avoided. ibv_query_gid_ex returns gid+type in one call.
int v2_idx = -1;
int v1_idx = -1;
for (int i = 0; i < pa.gid_tbl_len; i++) {
ibv_gid_entry entry = {};
if (ibv_query_gid_ex(ctx, ib_port, i, &entry, 0) != 0) continue;
if (memcmp(entry.gid.raw, target_gid->data(), RDMA_GID_SIZE) != 0) continue;
if (entry.gid_type == IBV_GID_TYPE_ROCE_V2 && v2_idx < 0) {
v2_idx = i;
} else if (entry.gid_type == IBV_GID_TYPE_ROCE_V1 && v1_idx < 0) {
v1_idx = i;
}
}
if (v2_idx >= 0) {
found_gid = v2_idx;
found_version = IBV_GID_TYPE_ROCE_V2;
} else if (v1_idx >= 0) {
found_gid = v1_idx;
found_version = IBV_GID_TYPE_ROCE_V1;
}
} else {
// Explicit GID index from GGML_RDMA_GID — fetch its type for logging.
ibv_gid_entry entry = {};
if (ibv_query_gid_ex(ctx, ib_port, found_gid, &entry, 0) == 0) {
found_version = entry.gid_type;
}
}
if (found_gid >= 0) {
ibctx = ctx;
gid_idx = found_gid;
gid_version = found_version;
matched_dev = dn;
rdma_local.path_mtu = pa.active_mtu;
break;
}
ibv_close_device(ctx);
}
ibv_free_device_list(devs);
if (!ibctx) return false;
rdma_local.ib_port = ib_port;
rdma_local.gid_idx = gid_idx;
rdma = std::make_unique<rdma_conn>();
rdma->ctx = ibctx;
rdma->pd = ibv_alloc_pd(ibctx);
if (!rdma->pd) return false;
rdma->scq = ibv_create_cq(ibctx, 16, nullptr, nullptr, 0);
rdma->rcq = ibv_create_cq(ibctx, RDMA_RX_DEPTH + 4, nullptr, nullptr, 0);
if (!rdma->scq || !rdma->rcq) return false;
ibv_qp_init_attr qia = {};
qia.send_cq = rdma->scq;
qia.recv_cq = rdma->rcq;
qia.qp_type = IBV_QPT_RC;
qia.cap.max_send_wr = 4;
qia.cap.max_recv_wr = RDMA_RX_DEPTH + 4;
qia.cap.max_send_sge = 1;
qia.cap.max_recv_sge = 1;
qia.cap.max_inline_data = 256;
rdma->qp = ibv_create_qp(rdma->pd, &qia);
if (!rdma->qp) return false;
rdma->max_inline = qia.cap.max_inline_data;
rdma->tx_buf = aligned_alloc(4096, RDMA_CHUNK);
rdma->rx_buf = aligned_alloc(4096, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK);
if (!rdma->tx_buf || !rdma->rx_buf) return false;
rdma->tx_mr = ibv_reg_mr(rdma->pd, rdma->tx_buf, RDMA_CHUNK, IBV_ACCESS_LOCAL_WRITE);
rdma->rx_mr = ibv_reg_mr(rdma->pd, rdma->rx_buf, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
if (!rdma->tx_mr || !rdma->rx_mr) return false;
ibv_gid local_gid;
if (ibv_query_gid(ibctx, ib_port, gid_idx, &local_gid) != 0) return false;
rdma_local.qpn = rdma->qp->qp_num;
rdma_local.psn = rdma->qp->qp_num & 0xffffff;
memcpy(&rdma_local.gid, &local_gid, RDMA_GID_SIZE);
const char * ver_str = "";
if (gid_version == IBV_GID_TYPE_ROCE_V2) {
ver_str = " RoCEv2";
} else if (gid_version == IBV_GID_TYPE_ROCE_V1) {
ver_str = " RoCEv1";
}
GGML_LOG_INFO("RDMA probed: dev=%s gid=%d%s qpn=%u inline=%u\n",
matched_dev, gid_idx, ver_str, rdma_local.qpn, rdma->max_inline);
return true;
}
// Phase 2: Given remote QPN/PSN/GID, transition QP: RESET->INIT->pre-post->RTR->RTS.
// On success, the connection is live and ready for rdma_send/rdma_recv.
bool socket_t::impl::rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid) {
// RESET -> INIT
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_INIT;
a.port_num = rdma_local.ib_port;
a.pkey_index = 0;
a.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
return false;
}
}
for (int i = 0; i < RDMA_RX_DEPTH; i++) {
if (!rdma->post_rx(i)) return false;
}
// INIT -> RTR
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_RTR;
a.path_mtu = rdma_local.path_mtu;
a.dest_qp_num = remote_qpn;
a.rq_psn = remote_psn;
a.max_dest_rd_atomic = 1;
a.min_rnr_timer = 1;
a.ah_attr.is_global = 1;
memcpy(&a.ah_attr.grh.dgid, remote_gid, RDMA_GID_SIZE);
a.ah_attr.grh.hop_limit = 1;
a.ah_attr.grh.sgid_index = rdma_local.gid_idx;
a.ah_attr.dlid = 0;
a.ah_attr.port_num = rdma_local.ib_port;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER) != 0) {
return false;
}
}
// RTR -> RTS
{
struct ibv_qp_attr a = {};
a.qp_state = IBV_QPS_RTS;
a.timeout = 14;
a.retry_cnt = 7;
a.rnr_retry = 7;
a.sq_psn = rdma_local.psn;
a.max_rd_atomic = 1;
if (ibv_modify_qp(rdma->qp, &a,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC) != 0) {
return false;
}
}
GGML_LOG_INFO("RDMA activated: qpn=%u->%u mtu=%d rx_depth=%d\n",
rdma_local.qpn, remote_qpn, 128 << rdma_local.path_mtu, RDMA_RX_DEPTH);
return true;
}
bool socket_t::impl::rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc) {
for (uint64_t s = 0; ; s++) {
int n = ibv_poll_cq(cq, 1, wc);
if (n > 0) {
if (wc->status != IBV_WC_SUCCESS) {
GGML_LOG_ERROR("RDMA CQ wc error: status=%d (%s) vendor_err=0x%x\n",
wc->status, ibv_wc_status_str(wc->status), wc->vendor_err);
}
return wc->status == IBV_WC_SUCCESS;
}
if (n < 0) return false;
if ((s & 0xFFFFF) == 0 && s > 0) {
if (tcp_peer_closed()) {
return false;
}
}
}
}
bool socket_t::impl::rdma_send(const void * data, size_t size) {
rdma_conn * c = rdma.get();
const uint8_t * src = (const uint8_t *)data;
size_t rem = size;
while (rem > 0) {
size_t chunk = std::min(rem, RDMA_CHUNK);
struct ibv_sge sge = {};
struct ibv_send_wr wr = {}, * bad = nullptr;
wr.opcode = IBV_WR_SEND;
wr.sg_list = &sge;
wr.num_sge = 1;
if (chunk <= c->max_inline) {
sge.addr = (uintptr_t)src;
sge.length = chunk;
wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
} else {
memcpy(c->tx_buf, src, chunk);
sge.addr = (uintptr_t)c->tx_buf;
sge.length = chunk;
sge.lkey = c->tx_mr->lkey;
wr.send_flags = IBV_SEND_SIGNALED;
}
if (ibv_post_send(c->qp, &wr, &bad) != 0) return false;
struct ibv_wc wc;
if (!rdma_poll(c->scq, &wc)) return false;
src += chunk;
rem -= chunk;
}
return true;
}
bool socket_t::impl::rdma_recv(void * data, size_t size) {
rdma_conn * c = rdma.get();
uint8_t * dst = (uint8_t *)data;
size_t rem = size;
while (rem > 0) {
struct ibv_wc wc;
if (!rdma_poll(c->rcq, &wc)) return false;
int slot = (int)wc.wr_id;
size_t got = wc.byte_len;
memcpy(dst, c->rx_slot(slot), got);
if (!c->post_rx(slot)) return false;
dst += got;
rem -= got;
}
return true;
}
#endif // GGML_RPC_RDMA
bool socket_t::impl::send_data(const void * data, size_t size) {
#ifdef GGML_RPC_RDMA
if (use_rdma) {
return rdma_send(data, size);
}
#endif
size_t bytes_sent = 0;
while (bytes_sent < size) {
size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
ssize_t n = send(fd, (const char *)data + bytes_sent, size_to_send, 0);
if (n < 0) {
GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
bytes_sent, size_to_send);
return false;
}
bytes_sent += (size_t)n;
}
return true;
}
bool socket_t::impl::recv_data(void * data, size_t size) {
#ifdef GGML_RPC_RDMA
if (use_rdma) {
return rdma_recv(data, size);
}
#endif
size_t bytes_recv = 0;
while (bytes_recv < size) {
size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
ssize_t n = recv(fd, (char *)data + bytes_recv, size_to_recv, 0);
if (n < 0) {
GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
bytes_recv, size_to_recv);
return false;
}
if (n == 0) {
LOG_DBG("recv returned 0 (peer closed?)\n");
return false;
}
bytes_recv += (size_t)n;
}
return true;
}
void socket_t::impl::get_caps(uint8_t * local_caps) {
memset(local_caps, 0, RPC_CONN_CAPS_SIZE);
#ifdef GGML_RPC_RDMA
rdma_local = {};
if (rdma_probe()) {
rdma_caps rc = {};
rc.qpn = rdma_local.qpn;
rc.psn = rdma_local.psn;
memcpy(rc.gid, rdma_local.gid, RDMA_GID_SIZE);
memcpy(local_caps, &rc, sizeof(rc));
} else {
rdma.reset();
}
#endif // GGML_RPC_RDMA
}
void socket_t::impl::update_caps(const uint8_t * remote_caps) {
#ifdef GGML_RPC_RDMA
if (!rdma) {
return;
}
rdma_caps rc = {};
memcpy(&rc, remote_caps, sizeof(rc));
if (rc.qpn == 0) {
rdma.reset();
return;
}
if (rdma_activate(rc.qpn, rc.psn, rc.gid)) {
use_rdma = true;
} else {
GGML_LOG_ERROR("RDMA activate failed, staying on TCP\n");
rdma.reset();
}
#else
(void)remote_caps;
#endif // GGML_RPC_RDMA
}
/////////////////////////////////////////////////////////////////////////////
socket_t::socket_t(std::unique_ptr<impl> p) : pimpl(std::move(p)) {}
socket_t::~socket_t() = default;
bool socket_t::send_data(const void * data, size_t size) {
return pimpl->send_data(data, size);
}
bool socket_t::recv_data(void * data, size_t size) {
return pimpl->recv_data(data, size);
}
void socket_t::get_caps(uint8_t * local_caps) {
return pimpl->get_caps(local_caps);
}
void socket_t::update_caps(const uint8_t * remote_caps) {
return pimpl->update_caps(remote_caps);
}
static bool is_valid_fd(sockfd_t sockfd) {
#ifdef _WIN32
return sockfd != INVALID_SOCKET;
#else
return sockfd >= 0;
#endif
}
static bool set_no_delay(sockfd_t sockfd) {
int flag = 1;
// set TCP_NODELAY to disable Nagle's algorithm
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
return ret == 0;
}
static bool set_reuse_addr(sockfd_t sockfd) {
int flag = 1;
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
return ret == 0;
}
socket_ptr socket_t::accept() {
auto client_socket_fd = ::accept(pimpl->fd, NULL, NULL);
if (!is_valid_fd(client_socket_fd)) {
return nullptr;
}
if (!set_no_delay(client_socket_fd)) {
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(client_socket_fd)));
}
socket_ptr socket_t::create_server(const char * host, int port) {
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (!is_valid_fd(sockfd)) {
return nullptr;
}
if (!set_reuse_addr(sockfd)) {
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
return nullptr;
}
if (inet_addr(host) == INADDR_NONE) {
GGML_LOG_ERROR("Invalid host address: %s\n", host);
return nullptr;
}
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
serv_addr.sin_port = htons(port);
if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
return nullptr;
}
if (listen(sockfd, 1) < 0) {
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
}
socket_ptr socket_t::connect(const char * host, int port) {
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (!is_valid_fd(sockfd)) {
return nullptr;
}
if (!set_no_delay(sockfd)) {
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
return nullptr;
}
struct sockaddr_in addr;
addr.sin_family = AF_INET;
addr.sin_port = htons(port);
struct hostent * server = gethostbyname(host);
if (server == NULL) {
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
return nullptr;
}
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
if (::connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
return nullptr;
}
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
}
#ifdef _WIN32
static std::mutex g_rpc_transport_mu;
static bool g_rpc_transport_wsa_started = false;
#endif
bool rpc_transport_init() {
#ifdef _WIN32
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
if (g_rpc_transport_wsa_started) {
return true;
}
WSADATA wsaData;
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
if (res != 0) {
return false;
}
g_rpc_transport_wsa_started = true;
return true;
#else
return true;
#endif
}
void rpc_transport_shutdown() {
#ifdef _WIN32
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
if (!g_rpc_transport_wsa_started) {
return;
}
WSACleanup();
g_rpc_transport_wsa_started = false;
#endif
}
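
As a worked example of the GID-shaped target built in rdma_build_target_gid() above (illustrative address, not taken from the change): a local IPv4 address 192.168.1.10 is encoded as the IPv4-mapped form ::ffff:192.168.1.10, i.e.

// bytes  0..9  : 0x00
// bytes 10..11 : 0xff 0xff            (IPv4-mapped IPv6 prefix)
// bytes 12..15 : 0xc0 0xa8 0x01 0x0a  (192.168.1.10)

which can then be compared byte-for-byte against the GID table entries returned by ibv_query_gid_ex().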

View File

@@ -0,0 +1,34 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <memory>
struct socket_t;
typedef std::shared_ptr<socket_t> socket_ptr;
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
static constexpr size_t RPC_CONN_CAPS_SIZE = 24;
struct socket_t {
~socket_t();
bool send_data(const void * data, size_t size);
bool recv_data(void * data, size_t size);
socket_ptr accept();
void get_caps(uint8_t * local_caps);
void update_caps(const uint8_t * remote_caps);
static socket_ptr create_server(const char * host, int port);
static socket_ptr connect(const char * host, int port);
private:
struct impl;
explicit socket_t(std::unique_ptr<impl> p);
std::unique_ptr<impl> pimpl;
};
bool rpc_transport_init();
void rpc_transport_shutdown();
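
A client-side usage sketch of the socket_t interface declared above. The caps exchange framing (sending the raw RPC_CONN_CAPS_SIZE-byte blob back and forth) is an assumption for illustration; the actual HELLO handshake lives in ggml-rpc.cpp, and the host/port values are placeholders:

#include "transport.h"
#include <cstdint>

static bool exchange_caps(socket_t & sock) {
    uint8_t local_caps[RPC_CONN_CAPS_SIZE];
    sock.get_caps(local_caps);                                   // probes RDMA; all zeros if unavailable
    if (!sock.send_data(local_caps, sizeof(local_caps))) return false;
    uint8_t remote_caps[RPC_CONN_CAPS_SIZE];
    if (!sock.recv_data(remote_caps, sizeof(remote_caps))) return false;
    sock.update_caps(remote_caps);                               // upgrades to RDMA if both sides support it
    return true;
}

int main() {
    if (!rpc_transport_init()) return 1;
    socket_ptr sock = socket_t::connect("127.0.0.1", 50052);
    if (!sock || !exchange_caps(*sock)) return 1;
    const char msg[] = "ping";
    bool ok = sock->send_data(msg, sizeof(msg));
    rpc_transport_shutdown();
    return ok ? 0 : 1;
}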

View File

@@ -1077,9 +1077,9 @@ llm_graph_qkv llm_graph_context::build_qkv(
// fused QKV path
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
cb(qkv, "wqkv", il);
if (layer.bqkv) {
qkv = ggml_add(ctx0, qkv, layer.bqkv);
cb(qkv, "bqkv", il);
if (layer.wqkv_b) {
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
cb(qkv, "wqkv_b", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -1097,8 +1097,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
// separate Q/K/V path
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
cb(Qcur, "Qcur", il);
if (layer.bq) {
Qcur = ggml_add(ctx0, Qcur, layer.bq);
if (layer.wq_b) {
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
cb(Qcur, "Qcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@@ -1107,8 +1107,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
cb(Kcur, "Kcur", il);
if (layer.bk) {
Kcur = ggml_add(ctx0, Kcur, layer.bk);
if (layer.wk_b) {
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
cb(Kcur, "Kcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@@ -1117,8 +1117,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
cb(Vcur, "Vcur", il);
if (layer.bv) {
Vcur = ggml_add(ctx0, Vcur, layer.bv);
if (layer.wv_b) {
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
cb(Vcur, "Vcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
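The hunks above, together with the llama_model and llm_build_* hunks that follow, are a mechanical rename of the per-layer attention bias tensors so that each bias carries the name of the weight it belongs to: bq/bk/bv/bo/bqkv become wq_b/wk_b/wv_b/wo_b/wqkv_b. The guard-then-add logic is unchanged; with the new names the pattern reads:

// unchanged logic, new names: weight (wq), optional bias (wq_b), optional scale (wq_s)
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
if (layer.wq_b) {
    Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
}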

View File

@@ -3098,14 +3098,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (layer.wqkv) {
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
} else {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
}
};
@@ -3138,7 +3138,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3201,7 +3201,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
layer.wo =
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
@@ -3336,9 +3336,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
}
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
if (n_ff > 0) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3558,10 +3557,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -3602,8 +3601,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -3719,23 +3718,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; // JinaBertLayer
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -3783,10 +3775,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -3819,10 +3811,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -3889,7 +3881,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -4068,7 +4060,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4127,7 +4119,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
@@ -4291,10 +4283,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4329,7 +4321,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4646,7 +4638,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -4890,7 +4882,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
// feed forward (w/ optional biases)
@@ -5152,10 +5144,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5570,10 +5562,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5612,10 +5604,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// attention biases - all have shape n_embd (output dimension of projections)
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5918,7 +5910,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@@ -5987,7 +5979,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@@ -6808,7 +6800,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -6890,7 +6882,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// attention layers (with optional bias)
create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
@@ -7026,7 +7018,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
@@ -7191,7 +7183,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
@@ -7422,7 +7414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

View File

@@ -246,6 +246,8 @@ struct llama_layer {
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
struct ggml_tensor * wqkv_b = nullptr;
struct ggml_tensor * wo_b = nullptr;
struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr;
@@ -256,13 +258,6 @@ struct llama_layer {
struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention bias
struct ggml_tensor * bq = nullptr;
struct ggml_tensor * bk = nullptr;
struct ggml_tensor * bv = nullptr;
struct ggml_tensor * bo = nullptr;
struct ggml_tensor * bqkv = nullptr;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;

View File

@@ -50,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur_pos", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -55,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -48,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}

View File

@@ -48,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -72,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View File

@@ -57,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(cur, "attn_sub_norm", il);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
if (model.layers[il].wo_b) {
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
}
cb(cur, "attn_out", il);
}

View File

@@ -33,7 +33,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -47,7 +47,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -58,7 +58,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -54,7 +54,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -59,7 +59,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -49,7 +49,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -49,7 +49,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -43,7 +43,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -46,7 +46,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -37,7 +37,7 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -46,7 +46,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -92,7 +92,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -101,7 +101,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -50,7 +50,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -64,7 +64,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -65,7 +65,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -50,7 +50,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -27,7 +27,7 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -51,7 +51,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -70,7 +70,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -84,7 +84,7 @@ llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_gr
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -56,7 +56,7 @@ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -67,7 +67,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -56,7 +56,7 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@@ -70,7 +70,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@@ -51,7 +51,7 @@ llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -48,7 +48,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
cb(cur, "attn_out", il);

View File

@@ -55,7 +55,7 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {

View File

@@ -49,7 +49,7 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, co
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -51,7 +51,7 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -60,7 +60,7 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
cb(Qcur, "Qcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -53,7 +53,7 @@ llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -56,7 +56,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -56,7 +56,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@@ -62,7 +62,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -62,7 +62,7 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@@ -58,7 +58,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -52,7 +52,7 @@ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -59,7 +59,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
cb(Kcur, "Kcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -55,7 +55,7 @@ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@@ -36,7 +36,7 @@ llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_gr
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -50,7 +50,7 @@ llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@@ -41,7 +41,7 @@ llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_par
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
cur = build_attn(inp_attn_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}

View File

@@ -1796,7 +1796,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
@@ -1809,7 +1809,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
@@ -1826,7 +1826,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"<parameter=arg1>\n1\n</parameter>\n"
"<parameter=arg2>\n2\n</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.parallel_tool_calls(true)
@@ -1849,7 +1849,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.enable_thinking(false)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
@@ -1892,7 +1892,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>"
"</tool_call>\n"
)
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
@@ -1908,7 +1908,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
"</tool_call>\n")
.expect_tool_calls({
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
})

View File

@@ -42,7 +42,7 @@ int main(void) {
const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
// get position of the last token, which should be (nx - 1, ny - 1)
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, n_tokens - 1);
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, 0, n_tokens - 1);
size_t nx = pos.x + 1;
size_t ny = pos.y + 1;
const char * id = mtmd_image_tokens_get_id(image_tokens);

View File

@@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
return n_pos;
}
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
for (size_t i = 0; i < n_tokens; i++) {
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, pos_0, i);
}
}
@@ -163,15 +163,15 @@ struct decode_embd_batch {
}
// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
void set_position_mrope_2d(const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
seq_id_0[0] = seq_id;
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + rel_pos[i].t;
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i ] = rel_pos[i].t;
pos[i + batch.n_tokens ] = rel_pos[i].y;
pos[i + batch.n_tokens * 2] = rel_pos[i].x;
pos[i + batch.n_tokens * 3] = rel_pos[i].z;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@@ -188,7 +188,7 @@ struct decode_embd_batch {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens * 3] = pos_0 + i;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
}
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
mtmd_helper_image_get_decoder_pos(image_tokens, n_past, rel_pos.data());
batch_embd.set_position_mrope_2d(rel_pos, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {

View File

@@ -49,7 +49,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
// out_pos must have length == mtmd_helper_get_n_tokens(image)
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
// helper function that automatically:
// 1. run llama_decode() on text chunks

View File

@@ -1246,11 +1246,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
mtmd_decoder_pos pos;
pos.t = 0;
pos.x = i % image_tokens->nx;
pos.y = i / image_tokens->nx;
// M-RoPE logic
// TODO: support other types of position encoding if needed
pos.t = pos_0;
pos.x = pos_0 + (i % image_tokens->nx);
pos.y = pos_0 + (i / image_tokens->nx);
pos.z = 0; // unused for now
return pos;
}

View File

@@ -196,11 +196,13 @@ struct mtmd_decoder_pos {
uint32_t t;
uint32_t x;
uint32_t y;
uint32_t z; // unused for now, reserved for future use
};
// get position for decoder attention, to be used by M-RoPE models
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
// pos_0 is the absolute position of the first token
// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it
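A caller-side sketch of the updated position API; the image_tokens pointer and the n_past value are assumed to come from the usual mtmd tokenize/decode flow.

// sketch: query absolute M-RoPE positions for every image embedding token,
// anchoring the image at absolute position n_past
const size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> pos(n_tokens);
for (size_t i = 0; i < n_tokens; i++) {
    pos[i] = mtmd_image_tokens_get_decoder_pos(image_tokens, /*pos_0 =*/ n_past, i);
    // pos[i].t == n_past; pos[i].x and pos[i].y are already offset by pos_0; pos[i].z is reserved
}
// equivalent one-shot helper:
// mtmd_helper_image_get_decoder_pos(image_tokens, n_past, pos.data());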

View File

@@ -806,6 +806,7 @@ By default, it is read-only. To make POST request to change global properties, y
"modalities": {
"vision": false
},
"media_marker": "<__media_YoNhud46VdDqbuFmKYEO9PY7A4ARzRfg__>",
"build_info": "b(build number)-(build commit hash)",
"is_sleeping": false
}

View File

@@ -391,15 +391,25 @@ void server_tokens::push_back(server_tokens & tokens) {
}
void server_tokens::insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
const llama_tokens & server_tokens::get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
const llama_tokens & server_tokens::get_tokens() const {
GGML_ASSERT(!has_mtmd);
return tokens;
}
llama_tokens server_tokens::get_text_tokens() const {
llama_tokens res;
res.reserve(tokens.size());
for (llama_token t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
res.push_back(t);
}
}
return res;
}
void server_tokens::set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;

View File

@@ -190,7 +190,9 @@ public:
void insert(const llama_tokens & inp_tokens);
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const;
const llama_tokens & get_tokens() const;
llama_tokens get_text_tokens() const;
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id);
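Sketch of the difference between the two accessors; the slot/prompt names are illustrative and taken from the server code below.

// get_tokens(): text-only path -- asserts that mtmd is disabled and returns a
// reference to the internal vector (slot save/load, context shift, speculative decoding)
const llama_tokens & all_tokens = slot.prompt.tokens.get_tokens();

// get_text_tokens(): always usable -- returns a copy with the LLAMA_TOKEN_NULL
// placeholder entries (non-text positions) filtered out
llama_tokens text_tokens = slot.prompt.tokens.get_text_tokens();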

View File

@@ -1,3 +1,4 @@
#include "server-context.h"
#include "server-common.h"
#include "server-http.h"
@@ -19,6 +20,7 @@
#include <exception>
#include <memory>
#include <filesystem>
#include <utility>
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -33,6 +35,31 @@ using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
static server_prompt_checkpoint server_get_checkpoint(llama_context * ctx, int id, int64_t n_tokens, llama_pos pos_min = -1, llama_pos pos_max = -1) {
if (pos_min == -1) {
pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id);
}
if (pos_max == -1) {
pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id);
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto cur = server_prompt_checkpoint {
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ n_tokens,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
};
const size_t n = llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
return cur;
}
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
enum slot_state {
SLOT_STATE_IDLE,
@@ -57,7 +84,12 @@ struct server_slot {
// multimodal
mtmd_context * mctx = nullptr;
common_speculative * spec = nullptr;
// speculative decoding
llama_tokens spec_draft;
std::vector<int32_t> spec_i_batch;
server_prompt_checkpoint spec_ckpt;
common_speculative_ptr spec;
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
@@ -83,11 +115,6 @@ struct server_slot {
std::string debug_generated_text;
llama_tokens generated_tokens;
// idx of draft tokens in the main batch
// non-empty if we went to evaluate draft tokens
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
std::vector<int32_t> i_batch_dft;
std::vector<completion_token_output> generated_token_probs;
bool has_next_token = true;
@@ -147,8 +174,7 @@ struct server_slot {
common_sampler_ptr smpl;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
llama_token sampled; // in speculative mode, this is the last accepted token
// stats
size_t n_sent_text = 0; // number of sent text character
@@ -178,8 +204,11 @@ struct server_slot {
stopping_word = "";
n_sent_text = 0;
drafted.clear();
i_batch_dft.clear();
if (can_speculate()) {
spec_draft.clear();
spec_i_batch.clear();
spec_ckpt.clear();
}
generated_tokens.clear();
generated_token_probs.clear();
json_schema = json();
@@ -300,6 +329,85 @@ struct server_slot {
return n_draft_max;
}
void update_batch(llama_batch & batch) {
const int n_draft_max = get_n_draft_max();
if (n_draft_max > 0) {
GGML_ASSERT(can_speculate());
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const llama_tokens & tokens = prompt.tokens.get_text_tokens();
const auto & params_spec = task->params.speculative;
if (!spec_draft.empty()) {
// we have a previous (partial) draft to reuse
if (task->params.speculative.use_checkpoints) {
GGML_ASSERT(!spec_ckpt.empty());
}
} else {
GGML_ASSERT(spec_i_batch.empty());
// generate a new draft
spec_draft = common_speculative_draft(spec.get(), params_spec, tokens, sampled);
if (spec_draft.size() > (size_t) n_draft_max) {
SLT_WRN(*this, "draft size %d exceeds max %d, truncating\n", (int) spec_draft.size(), n_draft_max);
spec_draft.resize(n_draft_max);
}
if (spec_draft.size() < (size_t) params_spec.n_min) {
SLT_DBG(*this, "ignoring small draft: %d < %d\n", (int) spec_draft.size(), params_spec.n_min);
spec_draft.clear();
}
if (!spec_draft.empty() && params_spec.use_checkpoints) {
const auto n_tokens = prompt.tokens.size();
auto & ckpt = spec_ckpt;
ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n",
ckpt.pos_min, ckpt.pos_max, n_tokens, (float) ckpt.data.size() / 1024 / 1024);
}
}
GGML_ASSERT(spec_draft.size() <= (size_t) n_draft_max);
}
if (spec_draft.empty()) {
// no speculative decoding
i_batch = batch.n_tokens;
common_batch_add(batch, sampled, prompt.tokens.pos_next(), { this->id }, true);
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
sampled, n_ctx, prompt.n_tokens(), truncated);
} else {
SLT_DBG(*this, "generate_draft: id=%d, #tokens=%zu, #draft=%zu, pos_next=%d\n",
sampled, prompt.tokens.size(), spec_draft.size(), prompt.tokens.pos_next());
GGML_ASSERT(spec_i_batch.empty());
spec_i_batch.push_back(batch.n_tokens);
for (size_t i = 0; i < spec_draft.size(); i++) {
spec_i_batch.push_back(batch.n_tokens + i + 1);
}
auto pos0 = prompt.tokens.pos_next();
common_batch_add(batch, sampled, pos0++, { this->id }, true);
for (auto token : spec_draft) {
common_batch_add(batch, token, pos0++, { this->id }, true);
}
}
prompt.tokens.push_back(sampled);
prompt.tokens.insert(spec_draft);
}
void release() {
if (is_processing()) {
GGML_ASSERT(task);
@@ -400,7 +508,7 @@ struct server_slot {
);
}
common_speculative_print_stats(spec);
common_speculative_print_stats(spec.get());
}
json to_json(bool only_metrics = false) const {
@@ -591,16 +699,17 @@ private:
void destroy() {
llama_init.reset();
ctx = nullptr;
model = nullptr;
mtmd_free(mctx);
mctx = nullptr;
// Clear any sampling context
for (server_slot & slot : slots) {
common_speculative_free(slot.spec);
slot.spec = nullptr;
if (slot.can_speculate()) {
slot.spec.reset();
}
}
llama_batch_free(batch);
@@ -642,9 +751,6 @@ private:
llama_init = common_init_from_params(params_base);
// propagate model-metadata sampling defaults back to caller
params.sampling = params_base.sampling;
model = llama_init->model();
ctx = llama_init->context();
@@ -660,6 +766,7 @@ private:
add_bos_token = llama_vocab_get_add_bos(vocab);
if (params_base.speculative.has_dft()) {
// TODO speculative: move to common/speculative.cpp?
SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());
const auto & params_spec = params_base.speculative;
@@ -727,11 +834,6 @@ private:
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
}
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
}
}
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@@ -769,14 +871,23 @@ private:
slots.clear();
const bool can_spec = common_speculative_is_compat(ctx);
if (!can_spec) {
const auto spec_type = common_speculative_is_compat(ctx);
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
SRV_WRN("%s", "speculative decoding not supported by this context\n");
}
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
params_base.speculative.use_checkpoints = true;
}
// initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
slots.emplace_back();
}
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot & slot = slots[i];
slot.id = i;
slot.ctx = ctx;
@@ -786,16 +897,11 @@ private:
slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding
if (can_spec) {
slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
if (spec_type != COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx));
if (slot.spec) {
if (mctx) {
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false;
}
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
} else {
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
}
}
@@ -806,8 +912,6 @@ private:
};
slot.reset();
slots.push_back(std::move(slot));
}
{
@@ -854,6 +958,9 @@ private:
model_aliases = params_base.model_alias;
model_tags = params_base.model_tags;
// propagate new defaults back to caller
params = params_base;
if (!is_resume) {
return init();
}
@@ -1197,7 +1304,7 @@ private:
backend_sampling &= task.params.sampling.backend_sampling;
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0);
backend_sampling &= !(slot.can_speculate() && task.params.speculative.n_max > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
@@ -1703,6 +1810,26 @@ private:
return true;
}
// n_tokens_cur: the number of tokens added to the batch for the current slot
void create_checkpoint(server_slot & slot, const int64_t n_tokens_cur, llama_pos pos_min, llama_pos pos_max) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const auto & cur = slot.prompt.checkpoints.emplace_back(server_get_checkpoint(ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max));
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
}
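
The helper above keeps at most params_base.n_ctx_checkpoints entries, evicting the oldest before appending the new one. A small standalone illustration of that bounded-queue behavior, with plain ints standing in for checkpoints:

// sketch_checkpoint_eviction.cpp - illustrative bounded queue, not the real checkpoint type
#include <cstdio>
#include <vector>

int main() {
    const size_t n_ctx_checkpoints = 3;   // assumed limit, standing in for params_base.n_ctx_checkpoints
    std::vector<int> checkpoints;         // oldest at front, newest at back

    for (int ckpt = 1; ckpt <= 5; ckpt++) {
        while (checkpoints.size() >= n_ctx_checkpoints) {
            printf("erasing old checkpoint %d\n", checkpoints.front());
            checkpoints.erase(checkpoints.begin());   // make room, oldest first
        }
        checkpoints.push_back(ckpt);
        printf("created checkpoint %d of %zu\n", (int) checkpoints.size(), n_ctx_checkpoints);
    }
    return 0;
}
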
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@@ -1854,7 +1981,7 @@ private:
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;
const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
const llama_tokens & tokens = slot->prompt.tokens.get_tokens();
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
@@ -2061,7 +2188,7 @@ private:
{
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy
llama_tokens new_tokens = slot.prompt.tokens.get_tokens(); // copy
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
new_tokens[i - n_discard] = new_tokens[i];
}
@@ -2100,61 +2227,7 @@ private:
continue;
}
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const int n_draft_max = slot.get_n_draft_max();
if (n_draft_max > 0) {
if (mctx) {
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
GGML_ABORT("not supported by multimodal");
}
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
const auto & params_spec = slot.task->params.speculative;
llama_tokens draft = common_speculative_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
if (draft.size() > (size_t) n_draft_max) {
SLT_WRN(slot, "draft size %d exceeds max %d, truncating\n", (int) draft.size(), n_draft_max);
draft.resize(n_draft_max);
}
// add the sampled token to the batch
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
if (slot.task->params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
// fallback to normal decoding
slot.i_batch = slot.i_batch_dft[0];
slot.drafted.clear();
slot.i_batch_dft.clear();
} else {
// keep track of total number of drafted tokens tested
slot.n_draft_total += draft.size();
// add all drafted tokens to the batch
for (size_t i = 0; i < draft.size(); i++) {
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(draft[i]);
}
slot.drafted = std::move(draft);
}
} else {
// no speculative decoding
slot.i_batch = batch.n_tokens;
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
}
slot.update_batch(batch);
}
// process in chunks of params.n_batch
@@ -2651,40 +2724,12 @@ private:
// no need to create checkpoints that are too close together
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
// note: we create the checkpoint before calling llama_decode(), so the current batch is not
// yet processed and therefore it is not part of the checkpoint.
if (do_checkpoint) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot,
"erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const size_t checkpoint_size =
llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id,
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
create_checkpoint(slot, n_tokens_cur, pos_min, pos_max);
}
}
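
The spacing guard above only allows a new checkpoint once the prompt (excluding the tokens in the current batch) has grown more than 64 tokens past the previous one. A trivial predicate capturing that rule, with illustrative names:

// sketch_checkpoint_spacing.cpp - illustrative only
#include <cstdio>
#include <cstdint>

// true if the prompt, minus the tokens in the current batch, is far enough
// past the last checkpoint to justify creating a new one
static bool far_enough(int64_t n_prompt_tokens, int64_t n_tokens_cur, int64_t last_ckpt_n_tokens) {
    return (n_prompt_tokens - n_tokens_cur) > last_ckpt_n_tokens + 64;
}

int main() {
    printf("%d\n", far_enough(300, 16, 200)); // 284 > 264 -> 1, checkpoint allowed
    printf("%d\n", far_enough(260, 16, 200)); // 244 <= 264 -> 0, too close
    return 0;
}
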
@@ -2856,19 +2901,19 @@ private:
slot.state = SLOT_STATE_GENERATING;
if (slot.can_speculate()) {
common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens());
common_speculative_begin(slot.spec.get(), slot.prompt.tokens.get_text_tokens());
}
} else if (slot.state != SLOT_STATE_GENERATING) {
continue; // continue loop of slots
}
if (slot.i_batch_dft.size() > 0) {
if (slot.can_speculate() && !slot.spec_draft.empty()) {
continue; // sample using speculative decoding
}
const int tok_idx = slot.i_batch - i;
llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx);
llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx, tok_idx);
slot.i_batch = -1;
@@ -2889,7 +2934,7 @@ private:
completion_token_output result;
result.tok = id;
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
if (slot.task->params.sampling.n_probs > 0) {
@@ -2909,43 +2954,86 @@ private:
// speculative decoding - main model sample and accept
for (auto & slot : slots) {
if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) {
continue;
}
const size_t n_draft = slot.drafted.size();
// save the original draft size
const size_t n_draft = slot.spec_draft.size();
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();
GGML_ASSERT(n_draft > 0);
// verify and try to accept the draft
{
const auto & params_spec = slot.task->params.speculative;
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
slot.spec_i_batch.clear();
SLT_DBG(slot, "%s: n_draft=%zu, accepted=%zu\n", __func__, slot.spec_draft.size(), accepted.size());
GGML_ASSERT(accepted.size() >= 1);
// check for partial draft acceptance
if (accepted.size() < slot.spec_draft.size() + 1) {
if (params_spec.use_checkpoints) {
// partial acceptance is not supported by the context -> truncate the draft and restore the state
slot.spec_draft = std::move(accepted);
auto & ckpt = slot.spec_ckpt;
SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size());
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);
}
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, ckpt.pos_max + 1, -1);
slot.prompt.tokens.keep_first(ckpt.n_tokens);
slot.smpl = std::move(smpl_save);
continue;
}
LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, accepted.size(), slot.spec_draft.size());
}
common_speculative_accept(slot.spec.get(), accepted.size() - 1);
slot.spec_draft = std::move(accepted);
}
const int64_t t_current = ggml_time_us();
slot.n_decoded += ids.size();
const auto ids = std::move(slot.spec_draft);
slot.n_decoded += ids.size();
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
// inform the speculative decoding about the number of accepted tokens
common_speculative_accept(slot.spec, ids.size() - 1);
// rollback to the state before sampling the draft tokens
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.n_draft_total += n_draft;
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.prompt.tokens.insert({ids.begin(), ids.end() - 1});
slot.sampled = ids.back(); // last accepted token
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1);
slot.sampled = ids.back(); // last accepted token
SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft);
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, slot.prompt.n_tokens(), -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // set later
// TODO: set result.probs
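
The partial-acceptance path above restores a per-sequence checkpoint that was taken before the draft tokens were decoded. A hedged sketch of the save/restore pair built on the llama.cpp state APIs used in this diff; the helper names and the checkpoint struct are illustrative, while the llama_* calls and LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY match the server code shown above.

// sketch_partial_state_checkpoint.cpp - helper names are illustrative
#include <cstdint>
#include <vector>

#include "llama.h"

struct seq_checkpoint {
    llama_pos            pos_min = 0;
    llama_pos            pos_max = 0;
    std::vector<uint8_t> data;
};

// capture the partial state of one sequence before decoding draft tokens
static seq_checkpoint save_partial_state(llama_context * ctx, llama_seq_id seq, llama_pos pos_min, llama_pos pos_max) {
    seq_checkpoint ckpt;
    ckpt.pos_min = pos_min;
    ckpt.pos_max = pos_max;
    ckpt.data.resize(llama_state_seq_get_size_ext(ctx, seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY));
    llama_state_seq_get_data_ext(ctx, ckpt.data.data(), ckpt.data.size(), seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    return ckpt;
}

// roll the sequence back: reload the saved state and drop memory past the checkpoint
static bool restore_partial_state(llama_context * ctx, llama_seq_id seq, const seq_checkpoint & ckpt) {
    const size_t n = llama_state_seq_set_data_ext(ctx, ckpt.data.data(), ckpt.data.size(), seq, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    if (n != ckpt.data.size()) {
        return false; // the server aborts in this case (see the GGML_ABORT above)
    }
    llama_memory_seq_rm(llama_get_memory(ctx), seq, ckpt.pos_max + 1, -1);
    return true;
}
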
@@ -3537,6 +3625,7 @@ void server_routes::init_routes() {
{"vision", meta->has_inp_image},
{"audio", meta->has_inp_audio},
} },
{ "media_marker", get_media_marker() },
{ "endpoint_slots", params.endpoint_slots },
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
@@ -3664,7 +3753,7 @@ void server_routes::init_routes() {
params.n_predict,
meta->slot_n_ctx,
params.spm_infill,
tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
tokenized_prompts[0].get_tokens() // TODO: this could maybe be multimodal.
);
std::vector<raw_buffer> files; // dummy


@@ -162,7 +162,7 @@ common_chat_msg task_result_state::update_chat_msg(
bool filter_tool_calls) {
generated_text += text_added;
auto msg_prv_copy = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
//SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
is_partial,
@@ -304,6 +304,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
params.speculative = defaults.speculative;
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
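
The per-request keys parsed above can be overridden in the completion request body. A hedged example of building such a payload with nlohmann::json, which the server build already ships; only the three speculative keys come from the parsing code above, the other fields and the way the payload is sent are assumptions.

// sketch_request_payload.cpp - assumes nlohmann/json is available, as in the server build
#include <iostream>

#include <nlohmann/json.hpp>

int main() {
    nlohmann::json body = {
        { "prompt",            "Hello" },   // assumed field, not shown in this diff
        { "n_predict",         64 },        // assumed field, not shown in this diff
        { "speculative.n_min", 2 },         // minimum draft size to keep (see the draft-size check above)
        { "speculative.n_max", 8 },         // maximum draft size before truncation
        { "speculative.p_min", 0.75 },      // draft token probability threshold (assumed meaning)
    };
    std::cout << body.dump(2) << std::endl;
    return 0;
}
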


@@ -576,6 +576,17 @@ struct server_prompt_checkpoint {
size_t size() const {
return data.size();
}
bool empty() const {
return data.empty();
}
void clear() {
pos_min = 0;
pos_max = 0;
n_tokens = 0;
data.clear();
}
};
struct server_prompt {