mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-10 02:54:06 +00:00
* Support MiniCPM-V 4.6 in new branch Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix code bug Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix pre-commit Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix convert Signed-off-by: tc-mb <tianchi_cai@icloud.com> * rename clip_graph_minicpmv4_6 Signed-off-by: tc-mb <tianchi_cai@icloud.com> * use new TYPE_MINICPMV4_6 Signed-off-by: tc-mb <tianchi_cai@icloud.com> * use build_attn to allow flash attention support Signed-off-by: tc-mb <tianchi_cai@icloud.com> * no use legacy code, restored here. Signed-off-by: tc-mb <tianchi_cai@icloud.com> * use the existing tensors name Signed-off-by: tc-mb <tianchi_cai@icloud.com> * unused ctx->model.hparams.minicpmv_version Signed-off-by: tc-mb <tianchi_cai@icloud.com> * use n_merge for slice alignment Signed-off-by: tc-mb <tianchi_cai@icloud.com> * borrow wa_layer_indexes for vit_merger insertion point Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix code style Signed-off-by: tc-mb <tianchi_cai@icloud.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * use filter_tensors and add model.vision_tower Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix chkhsh Signed-off-by: tc-mb <tianchi_cai@icloud.com> * fix type check Signed-off-by: tc-mb <tianchi_cai@icloud.com> --------- Signed-off-by: tc-mb <tianchi_cai@icloud.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
406 lines
16 KiB
C++
406 lines
16 KiB
C++
#include "models.h"
|
|
|
|
// Builds the compute graph for the MiniCPM-V image encoder: a ViT followed by
// the resampler projector (one cross-attention step, a layer norm and a final
// linear projection).
//
// Tensors created and marked as graph inputs here (their data is not written
// in this function):
//   "omega"     F32 [n_embd_proj / 4]  base frequencies for the sin/cos embedding
//   "pos_h"     F32 [1, n_pos]         2D input positions (float, for sin/cos)
//   "pos_w"     F32 [1, n_pos]         2D input positions (float, for sin/cos)
//   "positions" I32 [n_pos]            row indices into model.position_embeddings
//
// Returns gf after expanding it with the projected embeddings.
ggml_cgraph * clip_graph_minicpmv::build() {
    GGML_ASSERT(model.class_embedding == nullptr);

    const int n_pos       = n_patches;
    const int n_embd_proj = n_mmproj_embd;

    // Resampler attention geometry. The head size is fixed at 128 and the head
    // count is derived from the projector width. These are deliberately named
    // differently from the ViT's d_head / n_head that other graph builders in
    // this file use, so the two cannot be confused.
    const int resampler_d_head = 128;
    const int resampler_n_head = n_embd_proj / resampler_d_head;
    // number of resampler queries, taken from the model hparams
    const int num_query        = hparams.minicpmv_query_num;

    // The reshapes below split n_embd_proj into resampler_n_head heads of
    // resampler_d_head each, which only works for an exact division. Checking
    // it here gives a direct failure instead of a later element-count mismatch
    // inside ggml_reshape_3d. (A multiple of 128 is also a multiple of 4, which
    // the 4-way sin/cos concat below relies on.)
    GGML_ASSERT(n_embd_proj % resampler_d_head == 0);

    // position embeddings for the projector (not for ViT)
    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
    // base frequency omega
    ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
    ggml_set_name(omega, "omega");
    ggml_set_input(omega);

    // 2D input positions (using float for sinusoidal embeddings)
    ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    // for selecting learned pos embd, used by ViT
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    ggml_tensor * inp        = build_inp();
    ggml_tensor * embeddings = build_vit(
            inp, n_pos,
            NORM_TYPE_NORMAL,
            hparams.ffn_op,
            learned_pos_embd,
            nullptr);

    // resampler projector (it is just another transformer)

    // queries come from the model's query tensor, values from the projected ViT output
    ggml_tensor * q = model.mm_model_query;
    ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings);

    // norm
    q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

    // calculate sinusoidal pos embd: [sin(x), cos(x), sin(y), cos(y)] along dim 0
    ggml_tensor * pos_embed = nullptr;
    {
        // outer product of the base frequencies with each position
        ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
        ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
        ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);

        // sin and cos, concatenated on the first dim
        ggml_tensor * sin_x      = ggml_sin(ctx0, theta_x);
        ggml_tensor * cos_x      = ggml_cos(ctx0, theta_x);
        ggml_tensor * pos_embd_x = ggml_concat(ctx0, sin_x, cos_x, 0);

        ggml_tensor * sin_y      = ggml_sin(ctx0, theta_y);
        ggml_tensor * cos_y      = ggml_cos(ctx0, theta_y);
        ggml_tensor * pos_embd_y = ggml_concat(ctx0, sin_y, cos_y, 0);

        pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
    }

    // k = v + pos_embed
    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

    // attention
    {
        ggml_tensor * Q = ggml_add(ctx0,
                build_mm(model.mm_model_attn_q_w, q),
                model.mm_model_attn_q_b);
        ggml_tensor * K = ggml_add(ctx0,
                build_mm(model.mm_model_attn_k_w, k),
                model.mm_model_attn_k_b);
        ggml_tensor * V = ggml_add(ctx0,
                build_mm(model.mm_model_attn_v_w, v),
                model.mm_model_attn_v_b);

        // split into heads: queries span num_query tokens, keys/values span n_pos
        Q = ggml_reshape_3d(ctx0, Q, resampler_d_head, resampler_n_head, num_query);
        K = ggml_reshape_3d(ctx0, K, resampler_d_head, resampler_n_head, n_pos);
        V = ggml_reshape_3d(ctx0, V, resampler_d_head, resampler_n_head, n_pos);

        cb(Q, "resampler_Q", -1);
        cb(K, "resampler_K", -1);
        cb(V, "resampler_V", -1);

        const float resampler_kq_scale = 1.0f / sqrtf(static_cast<float>(resampler_d_head));
        embeddings = build_attn(
                model.mm_model_attn_o_w,
                model.mm_model_attn_o_b,
                Q, K, V, nullptr, resampler_kq_scale, -1);
        cb(embeddings, "resampler_attn_out", -1);
    }

    // layernorm
    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);

    // projection
    embeddings = build_mm(model.mm_model_proj, embeddings);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
|
|
|
|
// Builds the compute graph for the MiniCPM-V 4.6 image encoder.
//
// Stages, in the order they are constructed below:
//   1. patch embedding + learned positional embedding (+ optional pre-LN)
//   2. ViT layers 0..insert_layer_id on all n_pos tokens
//   3. ViT merger: windowed self-attention, then a 2x2 downsample + MLP
//   4. ViT layers insert_layer_id+1..n_layer-1 on the downsampled tokens
//   5. optional post-LN
//   6. final merger: a second 2x2 merge + MLP producing the output embedding
//
// I32 graph inputs created here: "positions", "vit_merger_window_idx",
// "vit_merger_inv_window_idx" (n_pos each), "vit_merger_ds_idx_0..3" (n_ds each)
// and "merger_ds_idx_0..3" (n_ds2 each). F32 input: "vit_merger_window_mask"
// [n_pos, n_pos]. Their data is not written in this function.
//
// Returns gf after expanding it with the final merger output.
ggml_cgraph * clip_graph_minicpmv4_6::build() {
    // NOTE(review): insert_lid is used below as an index into model.layers
    // without a range check; presumably it is validated when the model is
    // loaded - confirm.
    const int insert_lid = hparams.insert_layer_id;
    const int n_pos      = n_patches;

    // token counts after each 2x2 spatial merge (integer division truncates)
    const int half_h = n_patches_y / 2;
    const int half_w = n_patches_x / 2;
    const int n_ds   = half_h * half_w; // after ViT merger 2x2 downsample
    const int qh     = half_h / 2;
    const int qw     = half_w / 2;
    const int n_ds2  = qh * qw;         // after final merger 2x2 downsample

    // creates a named 1D I32 tensor of length n and marks it as a graph input
    auto add_i32_input = [&](const char * name, int n) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    // Appends ViT encoder layer `il` to the graph, reading from `x`, which is
    // reshaped as n_tokens tokens for attention. Mirrors the separate-qkv path
    // of clip_graph::build_vit; it is shared by the two layer ranges on either
    // side of the ViT merger, which differ only in token count.
    auto build_layer = [&](int il, ggml_tensor * x, int64_t n_tokens) -> ggml_tensor * {
        auto & layer = model.layers[il];

        ggml_tensor * cur = build_norm(x, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_inp_normed", il);

        // self-attention (no mask); q/k/v biases are optional
        {
            ggml_tensor * Qcur = build_mm(layer.q_w, cur);
            if (layer.q_b) {
                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            }
            ggml_tensor * Kcur = build_mm(layer.k_w, cur);
            if (layer.k_b) {
                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            }
            ggml_tensor * Vcur = build_mm(layer.v_w, cur);
            if (layer.v_b) {
                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            }

            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_tokens);
            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // optional per-layer scale on the attention branch, then residual
        if (layer.ls_1_w) {
            cur = ggml_mul(ctx0, cur, layer.ls_1_w);
            cb(cur, "attn_out_scaled", il);
        }
        cur = ggml_add(ctx0, cur, x);
        ggml_tensor * ffn_inp = cur; // residual source for the FFN branch
        cb(cur, "ffn_inp", il);

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_inp_normed", il);

        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b,
                layer.ff_down_w, layer.ff_down_b, hparams.ffn_op, il);
        cb(cur, "ffn_out", il);

        // optional per-layer scale on the FFN branch, then residual
        if (layer.ls_2_w) {
            cur = ggml_mul(ctx0, cur, layer.ls_2_w);
            cb(cur, "ffn_out_scaled", il);
        }
        cur = ggml_add(ctx0, ffn_inp, cur);
        cb(cur, "layer_out", il);

        return cur;
    };

    // concatenates four tensors along dim 0, in argument order
    auto concat4 = [&](ggml_tensor * a, ggml_tensor * b, ggml_tensor * c, ggml_tensor * d) -> ggml_tensor * {
        ggml_tensor * joined = ggml_concat(ctx0, a, b, 0);
        joined = ggml_concat(ctx0, joined, c, 0);
        joined = ggml_concat(ctx0, joined, d, 0);
        return joined;
    };

    // position indices for ViT learned positional embeddings
    ggml_tensor * positions        = add_i32_input("positions", n_pos);
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    // ViT merger window reorder indices + block-diagonal mask
    // (mask layout follows qwen2vl: -inf except for 4x4 blocks on the diagonal,
    //  so each window-major group of 4 tokens only attends to itself)
    ggml_tensor * vit_merger_window_idx     = add_i32_input("vit_merger_window_idx", n_pos);
    ggml_tensor * vit_merger_inv_window_idx = add_i32_input("vit_merger_inv_window_idx", n_pos);
    ggml_tensor * vit_merger_window_mask    = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
    ggml_set_name(vit_merger_window_mask, "vit_merger_window_mask");
    ggml_set_input(vit_merger_window_mask);
    if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
        // when flash attention is enabled the mask is passed on as F16
        vit_merger_window_mask = ggml_cast(ctx0, vit_merger_window_mask, GGML_TYPE_F16);
    }

    // ViT merger 2x2 downsample gather indices
    ggml_tensor * vit_merger_ds_idx_0 = add_i32_input("vit_merger_ds_idx_0", n_ds);
    ggml_tensor * vit_merger_ds_idx_1 = add_i32_input("vit_merger_ds_idx_1", n_ds);
    ggml_tensor * vit_merger_ds_idx_2 = add_i32_input("vit_merger_ds_idx_2", n_ds);
    ggml_tensor * vit_merger_ds_idx_3 = add_i32_input("vit_merger_ds_idx_3", n_ds);

    // final merger 2x2 downsample gather indices
    ggml_tensor * merger_ds_idx_0 = add_i32_input("merger_ds_idx_0", n_ds2);
    ggml_tensor * merger_ds_idx_1 = add_i32_input("merger_ds_idx_1", n_ds2);
    ggml_tensor * merger_ds_idx_2 = add_i32_input("merger_ds_idx_2", n_ds2);
    ggml_tensor * merger_ds_idx_3 = add_i32_input("merger_ds_idx_3", n_ds2);

    // patch embedding + positional embedding
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, learned_pos_embd);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "pre_ln", -1);
    }

    // ViT layers 0..insert_layer_id (inclusive), on the full-resolution tokens
    for (int il = 0; il <= insert_lid; il++) {
        inpL = build_layer(il, inpL, n_pos);
    }

    // ViT merger: window self-attention
    // Tokens are reordered to window-major (4 tokens per window are contiguous),
    // and a block-diagonal mask restricts attention to within each window. This
    // mirrors the qwen2vl windowed-attention pattern so build_attn() can pick the
    // flash-attention path when available.
    {
        ggml_tensor * residual = inpL;
        ggml_tensor * cur      = build_norm(inpL,
                model.vit_merger_ln1_w, model.vit_merger_ln1_b,
                NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "vit_merger_attn_inp_normed", -1);

        cur = ggml_get_rows(ctx0, cur, vit_merger_window_idx);
        cb(cur, "vit_merger_window_reorder", -1);

        ggml_tensor * Qcur = build_mm(model.vit_merger_attn_q_w, cur);
        if (model.vit_merger_attn_q_b) {
            Qcur = ggml_add(ctx0, Qcur, model.vit_merger_attn_q_b);
        }
        ggml_tensor * Kcur = build_mm(model.vit_merger_attn_k_w, cur);
        if (model.vit_merger_attn_k_b) {
            Kcur = ggml_add(ctx0, Kcur, model.vit_merger_attn_k_b);
        }
        ggml_tensor * Vcur = build_mm(model.vit_merger_attn_v_w, cur);
        if (model.vit_merger_attn_v_b) {
            Vcur = ggml_add(ctx0, Vcur, model.vit_merger_attn_v_b);
        }

        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
        cb(Qcur, "vit_merger_Qcur", -1);
        cb(Kcur, "vit_merger_Kcur", -1);
        cb(Vcur, "vit_merger_Vcur", -1);

        cur = build_attn(model.vit_merger_attn_o_w, model.vit_merger_attn_o_b,
                Qcur, Kcur, Vcur, vit_merger_window_mask, kq_scale, -1);
        cb(cur, "vit_merger_attn_out", -1);

        // gather through the inverse-index input before adding the residual,
        // which was taken before the window reorder
        cur  = ggml_get_rows(ctx0, cur, vit_merger_inv_window_idx);
        inpL = ggml_add(ctx0, cur, residual);
        cb(inpL, "vit_merger_attn_residual", -1);
    }

    // ViT merger: 2x2 spatial downsample + MLP (4 tokens -> 1)
    {
        ggml_tensor * p0 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_0);
        ggml_tensor * p1 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_1);
        ggml_tensor * p2 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_2);
        ggml_tensor * p3 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_3);

        // residual branch: mean of the four gathered tokens
        ggml_tensor * mean_res = ggml_add(ctx0, p0, p1);
        mean_res = ggml_add(ctx0, mean_res, p2);
        mean_res = ggml_add(ctx0, mean_res, p3);
        mean_res = ggml_scale(ctx0, mean_res, 0.25f);
        cb(mean_res, "vit_merger_ds_mean_res", -1);

        // MLP branch: the four tokens concatenated along dim 0
        ggml_tensor * cat = concat4(p0, p1, p2, p3);

        ggml_tensor * cur = build_norm(cat,
                model.vit_merger_ds_ln_w, model.vit_merger_ds_ln_b,
                NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "vit_merger_ds_normed", -1);

        // ViTWindowAttentionMerger downsample MLP uses gelu_pytorch_tanh (FFN_GELU)
        cur = build_ffn(cur,
                model.vit_merger_ds_up_w, model.vit_merger_ds_up_b,
                nullptr, nullptr,
                model.vit_merger_ds_down_w, model.vit_merger_ds_down_b,
                FFN_GELU, -1);
        cb(cur, "vit_merger_ds_mlp_out", -1);

        inpL = ggml_add(ctx0, cur, mean_res);
        cb(inpL, "vit_merger_ds_out", -1);
    }

    // ViT layers (insert_layer_id+1)..n_layer-1, operating on the downsampled tokens
    for (int il = insert_lid + 1; il < n_layer; il++) {
        inpL = build_layer(il, inpL, n_ds);
    }

    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "post_ln", -1);
    }

    // Final Merger (DownsampleMLP): another 2x2 spatial merge -> projector embedding
    {
        ggml_tensor * p0 = ggml_get_rows(ctx0, inpL, merger_ds_idx_0);
        ggml_tensor * p1 = ggml_get_rows(ctx0, inpL, merger_ds_idx_1);
        ggml_tensor * p2 = ggml_get_rows(ctx0, inpL, merger_ds_idx_2);
        ggml_tensor * p3 = ggml_get_rows(ctx0, inpL, merger_ds_idx_3);

        ggml_tensor * cat = concat4(p0, p1, p2, p3);

        ggml_tensor * cur = build_norm(cat,
                model.mm_input_norm_w, model.mm_input_norm_b,
                NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "merger_normed", -1);

        // MiniCPMV4_6DownsampleMLP uses nn.GELU() (erf-based, FFN_GELU_ERF)
        cur = build_ffn(cur,
                model.mm_ffn_up_w, model.mm_ffn_up_b,
                nullptr, nullptr,
                model.mm_ffn_down_w, model.mm_ffn_down_b,
                FFN_GELU_ERF, -1);
        cb(cur, "merger_out", -1);

        inpL = cur;
    }

    ggml_build_forward_expand(gf, inpL);
    return gf;
}
|