Files
llama.cpp/tools/mtmd/models/mimovl.cpp
AesSedai 4178259130 mtmd: add MiMo v2.5 vision (#22883)
* mimo-v2.5: vision support

* mimo-v2.5: use fused qkv for vision

* mimo-v2.5: fix f16 vision overflow

* mimo-v2.5: comment cleanups

* mimo-v2.5: Flash doesn't have mmproj
more cleanup
remember to use filter_tensors

* mimo-v2.5: fix trailing whitespace
2026-05-12 11:11:14 +02:00

210 lines
9.2 KiB
C++

#include "models.h"
ggml_tensor * clip_graph_mimovl::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    // Matmul with the result pinned to F32 accumulation — MiMoVL activations
    // were observed to overflow F16 ("fix f16 vision overflow"), so every
    // projection in this tower goes through this helper.
    ggml_tensor * out = ggml_mul_mat(ctx0, w, x);
    ggml_mul_mat_set_prec(out, GGML_PREC_F32);
    return out;
}
// MiMoVL vision tower for MiMo-V2.5 (non-Pro). Qwen2.5-VL-shaped ViT, except:
// 1. GQA in attention (32 Q / 8 KV heads, head_dim 64).
// 2. Per-head attention sinks on every windowed layer. The sinks adjust
// the softmax denominator (equivalently, a virtual extra K column with V=0),
// so they decay attention weight without contributing to the output.
// 3. Per-layer window-attention mode in hparams.wa_pattern_mode:
// -1 -> full, 0 -> row-window+sinks, 1 -> col-window+sinks.
// Col mode transposes the merge-unit grid on entry and restores
// it on exit. Both patch and rotary orderings are pre-computed
// host-side.
// 4. 1D banded sliding window (|q-k| > window_size -> -inf) as a
// single 2D mask broadcast across heads.
// 5. Per-block MLP biases.
ggml_cgraph * clip_graph_mimovl::build() {
    // The patch embed is a Conv3D(kt=2) stored as two Conv2D kernels, so both
    // halves must be present; this tower has no CLS token.
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.patch_embeddings_1 != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    // GQA: KV heads must evenly divide Q heads, and the per-layer window-mode
    // table must cover every layer.
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);
    GGML_ASSERT((int) hparams.wa_pattern_mode.size() == n_layer);

    const int batch_size = 1;
    const int n_pos      = n_patches;
    const int n_head_kv  = hparams.n_head_kv;
    const int merge      = hparams.n_merge > 0 ? hparams.n_merge : 2; // spatial merge factor (default 2)
    const int merge_unit = merge * merge;                            // patches per merge tile
    const int n_units    = n_pos / merge_unit;                       // merge tiles in the image
    GGML_ASSERT(n_units * merge_unit == n_pos);

    // MiMoVL has head_dim=64 with n_embd=1280, so n_embd is NOT n_head*head_dim
    // (the base class's d_head = n_embd/n_head = 40 is wrong here). Derive
    // head_dim from the fused QKV projection: rows = (n_head + 2*n_head_kv)*head_dim.
    GGML_ASSERT(model.layers[0].qkv_w != nullptr);
    const int qkv_rows = model.layers[0].qkv_w->ne[1];
    const int head_dim = qkv_rows / (n_head + 2 * n_head_kv);
    GGML_ASSERT(head_dim * (n_head + 2 * n_head_kv) == qkv_rows);
    const float attn_scale = 1.0f / std::sqrt((float) head_dim);
    // 2D RoPE: half of the rotary dims per axis (sections {16,16,0,0} for head_dim=64).
    const int rope_n_dims = head_dim / 2;
    int mrope_sections[4] = {rope_n_dims/2, rope_n_dims/2, 0, 0};

    // Patch embed: Conv3D(kt=2) split into two Conv2D, then interleave-merge
    // along the height axis to match the merge-tile token order.
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw,
                                     patch_size, patch_size, 0, 0, 1, 1);
    {
        // Second temporal slice of the Conv3D kernel; the two conv outputs
        // over the same frame are summed.
        ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw,
                                           patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        // Image sides must cover whole 2x2 merge tiles of patches.
        // NOTE(review): the interleave below hard-codes merge == 2 (the "/ 2"
        // and "* 2" factors) even though hparams.n_merge is configurable.
        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
        GGML_ASSERT(img.ny % (patch_size * 2) == 0);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
        inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
    }
    cb(inp, "patch_embed", -1);

    // Rotary position ids, pre-computed host-side for both patch orderings
    // (4 components per position, matching the mrope section layout).
    ggml_tensor * positions_row = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_row, "mimovl_positions_row");
    ggml_set_input(positions_row);
    ggml_tensor * positions_col = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_col, "mimovl_positions_col");
    ggml_set_input(positions_col);

    // idx_col is the col-major merge-unit permutation. Take it as F32 so we can
    // derive the inverse permutation in-graph via ggml_argsort;
    // ggml_get_rows requires its index tensor to be I32, so cast back as well.
    ggml_tensor * idx_col_f = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_units);
    ggml_set_name(idx_col_f, "mimovl_idx_col");
    ggml_set_input(idx_col_f);
    ggml_tensor * idx_col = ggml_cast(ctx0, idx_col_f, GGML_TYPE_I32);
    ggml_tensor * idx_col_inv = ggml_argsort(ctx0, idx_col_f, GGML_SORT_ORDER_ASC);

    // Banded sliding-window mask (-inf outside the band, per the header note),
    // shared by all windowed layers and broadcast across heads. Flash attention
    // wants an F16 mask, so cast once up front.
    ggml_tensor * window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
    ggml_set_name(window_mask, "mimovl_window_mask");
    ggml_set_input(window_mask);
    ggml_tensor * window_mask_attn = (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED)
        ? ggml_cast(ctx0, window_mask, GGML_TYPE_F16)
        : window_mask;

    // Reorder helper: permute patches at merge-unit granularity. The patch
    // sequence is laid out as n_units groups of merge_unit (=4) consecutive
    // patches; the row<->col transpose only permutes whole groups. We keep
    // the per-group (h,w) ordering intact by reshaping to
    // [n_embd*merge_unit, n_units] before ggml_get_rows.
    auto reorder = [&](ggml_tensor * x, ggml_tensor * idx) {
        ggml_tensor * y = ggml_reshape_2d(ctx0, x, n_embd * merge_unit, n_units);
        y = ggml_get_rows(ctx0, y, idx);
        return ggml_reshape_3d(ctx0, y, n_embd, n_pos, batch_size);
    };

    ggml_tensor * inpL = inp;
    int prev_mode = -1; // mode of the previous layer; the stream starts in row order
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];

        const int mode = hparams.wa_pattern_mode[il];
        const bool is_full = (mode == -1); // full attention: no mask, no sinks
        const bool is_col = (mode == 1);   // col-window: operate on the transposed grid

        // Reorder transitions on entry/exit of a col-mode run.
        if (is_col && prev_mode != 1) {
            inpL = reorder(inpL, idx_col);
            cb(inpL, "reorder_to_col", il);
        } else if (!is_col && prev_mode == 1) {
            inpL = reorder(inpL, idx_col_inv);
            cb(inpL, "reorder_to_row", il);
        }

        ggml_tensor * cur = inpL;

        // Pre-attention RMSNorm.
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ln1", il);

        // Fused QKV with GQA. The projection output per position is laid out
        // [Q (n_head heads) | K (n_head_kv) | V (n_head_kv)]; slice it with
        // strided views instead of separate matmuls.
        ggml_tensor * qkv = build_mm(layer.qkv_w, cur);
        qkv = ggml_add(ctx0, qkv, layer.qkv_b);

        const size_t row   = ggml_row_size(qkv->type, head_dim);
        const size_t off_k = ggml_row_size(qkv->type, n_head * head_dim);
        const size_t off_v = ggml_row_size(qkv->type, (n_head + n_head_kv) * head_dim);
        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim, n_head, n_pos, row, qkv->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_k);
        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_v);
        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // 2D RoPE: col-mode layers use the transposed position table so the
        // rotary phases track the physically reordered patches.
        ggml_tensor * pos = is_col ? positions_col : positions_row;
        Qcur = ggml_rope_multi(ctx0, Qcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        Kcur = ggml_rope_multi(ctx0, Kcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        cb(Qcur, "Qcur_rope", il);
        cb(Kcur, "Kcur_rope", il);

        // Full layers: plain attention. Windowed layers: banded mask and per-head sinks.
        ggml_tensor * mask = is_full ? nullptr : window_mask_attn;
        ggml_tensor * sinks = is_full ? nullptr : layer.attn_sinks;
        if (!is_full) {
            GGML_ASSERT(layer.attn_sinks != nullptr);
        }
        ggml_tensor * attn_out = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, mask, attn_scale, il, sinks);
        cb(attn_out, "attn_out", il);

        // Residual 1.
        cur = ggml_add(ctx0, attn_out, inpL);
        inpL = cur;
        cb(cur, "ffn_inp", il);

        // Pre-FFN RMSNorm.
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // SwiGLU MLP with biases (a MiMoVL deviation: every projection has a bias).
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);
        cb(cur, "ffn_out", il);

        // Residual 2.
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
        prev_mode = mode;
    }

    // If the last block was col-mode, undo the transpose so the merger sees patches in row order.
    if (prev_mode == 1) {
        inpL = reorder(inpL, idx_col_inv);
        cb(inpL, "reorder_to_row_final", -1);
    }

    // Merger: post-LayerNorm (standard LayerNorm, not RMS, with fixed 1e-6 eps).
    inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, 1e-6f, n_layer);
    cb(inpL, "post_ln", -1);

    // Spatial merge: pack each merge_unit (=4) of patches into a single
    // (n_embd*merge_unit)-wide row, then run the 2-layer MLP (no biases here).
    ggml_tensor * embeddings = ggml_reshape_3d(ctx0, inpL, n_embd * merge_unit, n_units, batch_size);
    embeddings = build_ffn(embeddings,
        model.mm_0_w, nullptr,
        nullptr, nullptr,
        model.mm_1_w, nullptr,
        FFN_GELU, -1);
    cb(embeddings, "vit_out", -1);

    ggml_build_forward_expand(gf, embeddings);
    return gf;
}