mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-14 21:14:10 +00:00
* mimo-v2.5: vision support * mimo-v2.5: use fused qkv for vision * mimi-v2.5: fix f16 vision overflow * mimo-v2.5: comment cleanups * mimo-v2.5: Flash doesn't have mmproj more cleanup remember to use filter_tensors * mimo-v2.5: fix trailing whitespace
210 lines
9.2 KiB
C++
210 lines
9.2 KiB
C++
#include "models.h"
|
|
|
|
// Matrix-multiply projection w @ x used throughout the MiMoVL vision graph.
// The product is pinned to F32 accumulation: the vision tower overflows in
// F16 otherwise (large pre-softmax activations), so every build_mm call in
// this graph gets the higher-precision path.
ggml_tensor * clip_graph_mimovl::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    ggml_tensor * out = ggml_mul_mat(ctx0, w, x);
    ggml_mul_mat_set_prec(out, GGML_PREC_F32);
    return out;
}
|
|
|
|
// MiMoVL vision tower for MiMo-V2.5 (non-Pro). Qwen2.5-VL-shaped ViT, except:
// 1. GQA in attention (32 Q / 8 KV heads, head_dim 64).
// 2. Per-head attention sinks on every windowed layer. The sinks adjust
//    the softmax denominator (equivalently, a virtual extra K column with V=0),
//    so they decay attention weight without contributing to the output.
// 3. Per-layer window-attention mode in hparams.wa_pattern_mode:
//    -1 -> full, 0 -> row-window+sinks, 1 -> col-window+sinks.
//    Col mode transposes the merge-unit grid on entry and restores
//    it on exit. Both patch and rotary orderings are pre-computed
//    host-side.
// 4. 1D banded sliding window (|q-k| > window_size -> -inf) as a
//    single 2D mask broadcast across heads.
// 5. Per-block MLP biases.
ggml_cgraph * clip_graph_mimovl::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.patch_embeddings_1 != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);     // this tower has no CLS token
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);      // GQA: Q heads divide evenly over KV heads
    GGML_ASSERT((int) hparams.wa_pattern_mode.size() == n_layer);

    const int batch_size = 1;
    const int n_pos      = n_patches;
    const int n_head_kv  = hparams.n_head_kv;
    // merge = spatial merge factor per axis (default 2); each merge unit packs
    // merge*merge consecutive patches, and reordering happens at unit granularity.
    const int merge      = hparams.n_merge > 0 ? hparams.n_merge : 2;
    const int merge_unit = merge * merge;
    const int n_units    = n_pos / merge_unit;
    GGML_ASSERT(n_units * merge_unit == n_pos);

    // MiMoVL has head_dim=64 with n_embd=1280, so n_embd is NOT n_head*head_dim
    // (the base class's d_head = n_embd/n_head = 40 is wrong here). Derive
    // head_dim from the fused QKV projection: rows = (n_head + 2*n_head_kv)*head_dim.
    GGML_ASSERT(model.layers[0].qkv_w != nullptr);
    const int qkv_rows = model.layers[0].qkv_w->ne[1];
    const int head_dim = qkv_rows / (n_head + 2 * n_head_kv);
    GGML_ASSERT(head_dim * (n_head + 2 * n_head_kv) == qkv_rows);
    const float attn_scale = 1.0f / std::sqrt((float) head_dim);
    // 2D RoPE: half the head dims rotate, split evenly between the two spatial
    // axes via the mrope section table (last two sections unused).
    const int rope_n_dims = head_dim / 2;
    int mrope_sections[4] = {rope_n_dims/2, rope_n_dims/2, 0, 0};

    // Patch embed: Conv3D(kt=2) split into two Conv2D, then interleave-merge
    // along the height axis to match the merge-tile token order.
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw,
                                     patch_size, patch_size, 0, 0, 1, 1);
    {
        // Second temporal slice of the split Conv3D; summing the two Conv2D
        // outputs reproduces the kt=2 temporal convolution.
        ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw,
                                           patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        // Image must tile evenly into 2x2 merge blocks of patches.
        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
        GGML_ASSERT(img.ny % (patch_size * 2) == 0);

        // Interleave adjacent patch rows so each merge unit's 4 patches are
        // consecutive in the token sequence.
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
        inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
    }
    cb(inp, "patch_embed", -1);

    // Host-computed RoPE position ids for the row-major token order
    // (4 ints per position, matching the mrope layout).
    ggml_tensor * positions_row = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_row, "mimovl_positions_row");
    ggml_set_input(positions_row);

    // Same, but for tokens while reordered into col-major merge-unit order.
    ggml_tensor * positions_col = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_col, "mimovl_positions_col");
    ggml_set_input(positions_col);

    // idx_col is the col-major merge-unit permutation. Take it as F32 so we can
    // derive the inverse permutation in-graph via ggml_argsort;
    // ggml_get_rows requires its index tensor to be I32, so cast back as well.
    ggml_tensor * idx_col_f = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_units);
    ggml_set_name(idx_col_f, "mimovl_idx_col");
    ggml_set_input(idx_col_f);
    ggml_tensor * idx_col     = ggml_cast(ctx0, idx_col_f, GGML_TYPE_I32);
    // argsort of a permutation yields its inverse permutation (already I32).
    ggml_tensor * idx_col_inv = ggml_argsort(ctx0, idx_col_f, GGML_SORT_ORDER_ASC);

    // Banded sliding-window mask, shared by all windowed layers and broadcast
    // across heads (0 inside the band, -inf outside; filled host-side).
    ggml_tensor * window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
    ggml_set_name(window_mask, "mimovl_window_mask");
    ggml_set_input(window_mask);

    // Flash attention wants an F16 mask; the non-flash path uses F32 directly.
    ggml_tensor * window_mask_attn = (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED)
        ? ggml_cast(ctx0, window_mask, GGML_TYPE_F16)
        : window_mask;

    // Reorder helper: permute patches at merge-unit granularity. The patch
    // sequence is laid out as n_units groups of merge_unit (=4) consecutive
    // patches; the row<->col transpose only permutes whole groups. We keep
    // the per-group (h,w) ordering intact by reshaping to
    // [n_embd*merge_unit, n_units] before ggml_get_rows.
    auto reorder = [&](ggml_tensor * x, ggml_tensor * idx) {
        ggml_tensor * y = ggml_reshape_2d(ctx0, x, n_embd * merge_unit, n_units);
        y = ggml_get_rows(ctx0, y, idx);
        return ggml_reshape_3d(ctx0, y, n_embd, n_pos, batch_size);
    };

    ggml_tensor * inpL = inp;
    int prev_mode = -1; // mode of the previous layer; layer 0 starts from row order

    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        const int  mode    = hparams.wa_pattern_mode[il];
        const bool is_full = (mode == -1);
        const bool is_col  = (mode == 1);

        // Reorder transitions on entry/exit of a col-mode run.
        if (is_col && prev_mode != 1) {
            inpL = reorder(inpL, idx_col);
            cb(inpL, "reorder_to_col", il);
        } else if (!is_col && prev_mode == 1) {
            inpL = reorder(inpL, idx_col_inv);
            cb(inpL, "reorder_to_row", il);
        }

        ggml_tensor * cur = inpL;

        // Pre-attention RMSNorm.
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ln1", il);

        // Fused QKV with GQA.
        ggml_tensor * qkv = build_mm(layer.qkv_w, cur);
        qkv = ggml_add(ctx0, qkv, layer.qkv_b);

        // Slice the fused projection by byte offset: rows are laid out as
        // [Q (n_head heads) | K (n_head_kv heads) | V (n_head_kv heads)].
        const size_t row   = ggml_row_size(qkv->type, head_dim);
        const size_t off_k = ggml_row_size(qkv->type, n_head * head_dim);
        const size_t off_v = ggml_row_size(qkv->type, (n_head + n_head_kv) * head_dim);

        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim, n_head,    n_pos, row, qkv->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_k);
        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_v);

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // 2D RoPE: col-mode layers use the position table matching the
        // col-reordered token sequence.
        ggml_tensor * pos = is_col ? positions_col : positions_row;
        Qcur = ggml_rope_multi(ctx0, Qcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        Kcur = ggml_rope_multi(ctx0, Kcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        cb(Qcur, "Qcur_rope", il);
        cb(Kcur, "Kcur_rope", il);

        // Full layers: plain attention. Windowed layers: banded mask and per-head sinks.
        ggml_tensor * mask  = is_full ? nullptr : window_mask_attn;
        ggml_tensor * sinks = is_full ? nullptr : layer.attn_sinks;
        if (!is_full) {
            GGML_ASSERT(layer.attn_sinks != nullptr);
        }
        ggml_tensor * attn_out = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, mask, attn_scale, il, sinks);
        cb(attn_out, "attn_out", il);

        // Residual 1.
        cur = ggml_add(ctx0, attn_out, inpL);
        inpL = cur; // keep post-attention state for residual 2
        cb(cur, "ffn_inp", il);

        // Pre-FFN RMSNorm.
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // SwiGLU MLP with biases
        cur = build_ffn(cur,
                        layer.ff_up_w,   layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);
        cb(cur, "ffn_out", il);

        // Residual 2.
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
        prev_mode = mode;
    }

    // If the last block was col-mode, undo the transpose so the merger sees patches in row order.
    if (prev_mode == 1) {
        inpL = reorder(inpL, idx_col_inv);
        cb(inpL, "reorder_to_row_final", -1);
    }

    // Merger: post-LayerNorm
    inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, 1e-6f, n_layer);
    cb(inpL, "post_ln", -1);

    // Spatial merge: pack each merge_unit (=4) of patches into a single
    // (n_embd*merge_unit)-wide row, then run the 2-layer MLP.
    ggml_tensor * embeddings = ggml_reshape_3d(ctx0, inpL, n_embd * merge_unit, n_units, batch_size);
    embeddings = build_ffn(embeddings,
                           model.mm_0_w, nullptr,
                           nullptr,      nullptr,
                           model.mm_1_w, nullptr,
                           FFN_GELU, -1);
    cb(embeddings, "vit_out", -1);

    ggml_build_forward_expand(gf, embeddings);
    return gf;
}
|