mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-11 19:44:06 +00:00
* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space. Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel). GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping. Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding. * mtmd: rename gs_ prefixed tensors to generic/architecture names * mtmd: use tensor_mapping.py for all granite_speech tensors * convert: fold GraniteSpeechTextModel into GraniteModel * mtmd: replace n_layer hack with explicit has_standard_layers flag * mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech * mtmd: align KEY_A_ define spacing * convert: register GraniteModel for GraniteSpeechForConditionalGeneration * convert: fix ty type-check for GraniteSpeechMmprojModel registration * mtmd: align TN_ define spacing * mtmd: use generic layer loop for granite speech tensor loading * mtmd: merge qformer_proj_layer into clip_layer * mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs * mtmd: granite_speech add comment explaining why build_attn is not used * mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata * gguf: add spacing between granite_speech tensor mapping blocks * mtmd: make generic audio layer_norm_eps read optional * mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps * mtmd: align defines and struct fields in clip-impl.h and clip-model.h * mtmd: fix alignment and ordering issues across granite speech files * convert: granite_speech use filter_tensors instead of modify_tensors 
for skipping
276 lines
11 KiB
C++
#include "models.h"
|
|
|
|
ggml_cgraph * clip_graph_granite_speech::build() {
    // Granite-speech audio tower: a Conformer encoder with Shaw relative
    // position encoding, followed by a QFormer projector that compresses the
    // encoder output into the LLM embedding space via windowed cross-attention.
    // Input: img.nx stacked log-mel frames (raw audio features); output: one
    // projected embedding per query per window, expanded into gf.

    const int n_frames = img.nx;                              // number of input feature frames
    const int context_size = hparams.audio_chunk_size;        // self-attention block (chunk) length
    const int ctc_layer = n_layer / 2;                        // CTC branch is injected after this encoder layer
    const int conv_kernel = hparams.audio_conv_kernel_size;   // depthwise conv kernel width
    const int conv_pad = conv_kernel / 2;                     // symmetric "same" padding for the depthwise conv

    // attention is computed block-wise over fixed-size chunks; the last chunk
    // is zero-padded up to context_size and masked out below
    const int num_blocks = (n_frames + context_size - 1) / context_size;
    const int padded_len = num_blocks * context_size;
    const int remainder = n_frames % context_size;

    // per-(query,key) relative-distance indices, filled in at eval time and
    // used to gather Shaw relative position embeddings
    ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
    ggml_set_name(attn_dists, "attn_dists");
    ggml_set_input(attn_dists);

    // additive softmax mask hiding the zero-padded tail of the final chunk;
    // only needed when n_frames is not a multiple of context_size
    ggml_tensor * attn_mask = nullptr;
    if (remainder > 0) {
        attn_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
                context_size, context_size, 1, num_blocks);
        ggml_set_name(attn_mask, "attn_mask");
        ggml_set_input(attn_mask);
    }

    // raw feature input; transposed so the feature dimension becomes ne[0]
    // (assumes build_inp_raw yields [n_frames, n_feat] — layout set by caller)
    ggml_tensor * inp = build_inp_raw(1);
    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    cb(cur, "inp_transposed", -1);

    // input projection: feature width -> encoder width (n_embd)
    cur = build_mm(model.inp_proj_w, cur);
    cur = ggml_add(ctx0, cur, model.inp_proj_b);
    cb(cur, "inp_linear", -1);

    // Conformer layers: ffn1(half-step) -> MHSA with Shaw RPE -> conv module
    // -> ffn2(half-step) -> final layer norm, all with residual connections
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        auto * residual = cur;

        // ffn1 (half-step)
        {
            auto * ffn1 = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(ffn1, "ffn1_norm", il);

            ffn1 = build_ffn(ffn1,
                    layer.ff_up_w, layer.ff_up_b,
                    nullptr, nullptr,
                    layer.ff_down_w, layer.ff_down_b,
                    FFN_SILU, il);
            cb(ffn1, "ffn1_out", il);

            // Conformer "macaron" FFN: residual takes only half the FFN output
            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, ffn1, 0.5f));
            cb(residual, "ffn1_residual", il);
        }

        // build_attn not used here: Shaw RPE needs pos_attn = mul_mat(pos_emb, Q)
        // injected between KQ product and softmax, which build_attn doesn't support
        {
            auto * normed = build_norm(residual, layer.ln_1_w, layer.ln_1_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(normed, "attn_norm", il);

            // zero-pad the sequence so it splits evenly into num_blocks chunks
            if (n_frames < padded_len) {
                normed = ggml_pad(ctx0, normed, 0, padded_len - n_frames, 0, 0);
            }

            ggml_tensor * Q = build_mm(layer.q_w, normed);
            ggml_tensor * K = build_mm(layer.k_w, normed);
            ggml_tensor * V = build_mm(layer.v_w, normed);

            // split heads: [d_head, n_head, context_size, num_blocks]
            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, context_size, num_blocks);
            K = ggml_reshape_4d(ctx0, K, d_head, n_head, context_size, num_blocks);
            V = ggml_reshape_4d(ctx0, V, d_head, n_head, context_size, num_blocks);

            ggml_tensor * Q_perm = ggml_permute(ctx0, Q, 0, 2, 1, 3);
            ggml_tensor * K_perm = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));

            // content scores: K^T Q per head per block (unscaled; scaling is
            // applied inside soft_max_ext below)
            ggml_tensor * kq = ggml_mul_mat(ctx0, K_perm, Q_perm);

            // Shaw RPE: pos_emb ne[2]=1 broadcasts against Q ne[2]=num_blocks in mul_mat
            ggml_tensor * pos_emb = ggml_get_rows(ctx0, layer.attn_rel_pos_emb, attn_dists);
            pos_emb = ggml_reshape_3d(ctx0, pos_emb, d_head, context_size, context_size);
            pos_emb = ggml_reshape_4d(ctx0, pos_emb, d_head, context_size, 1, context_size);

            // position scores: one mul_mat per key position, then permute the
            // result back to the [key, query, head, block] layout of kq
            ggml_tensor * Q_shaw = ggml_permute(ctx0, Q, 0, 1, 3, 2);
            ggml_tensor * pos_attn = ggml_mul_mat(ctx0, pos_emb, Q_shaw);
            pos_attn = ggml_cont(ctx0, ggml_permute(ctx0, pos_attn, 0, 2, 3, 1));

            // softmax over (content + position) scores, with kq_scale and the
            // optional padding mask applied in one fused op
            ggml_tensor * scores = ggml_add(ctx0, kq, pos_attn);
            ggml_tensor * attn_weights = ggml_soft_max_ext(ctx0, scores, attn_mask,
                    kq_scale, 0.0f);

            ggml_tensor * V_perm = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
            ggml_tensor * attn_out = ggml_mul_mat(ctx0, V_perm, attn_weights);

            // merge heads back to [n_embd, padded_len]
            attn_out = ggml_permute(ctx0, attn_out, 0, 2, 1, 3);
            attn_out = ggml_cont_2d(ctx0, attn_out, n_embd, padded_len);

            // drop the zero-padded tail rows before the output projection
            if (n_frames < padded_len) {
                attn_out = ggml_view_2d(ctx0, attn_out,
                        n_embd, n_frames, attn_out->nb[1], 0);
            }

            cur = build_mm(layer.o_w, attn_out);
            cur = ggml_add(ctx0, cur, layer.o_b);
            cb(cur, "attn_out", il);
        }

        residual = ggml_add(ctx0, residual, cur);

        // conv module
        {
            cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(cur, "conv_norm", il);

            // pointwise conv 1 expands to 2x width for the GLU gate below
            auto * x = build_mm(layer.conv_pw1_w, cur);
            x = ggml_add(ctx0, x, layer.conv_pw1_b);
            cb(x, "conv_pw1", il);

            // GLU: ggml has no fused op, manual split + sigmoid gate
            {
                int64_t d = x->ne[0] / 2;
                ggml_tensor * gate = ggml_sigmoid(ctx0,
                        ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
                x = ggml_mul(ctx0,
                        ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
                // transpose to time-major for the depthwise conv below
                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            }
            cb(x, "conv_glu", il);

            // symmetric "same" padding via pad + roll + pad, since ggml_ssm_conv
            // consumes conv_kernel-1 leading elements (causal layout)
            x = ggml_pad(ctx0, x, conv_pad, 0, 0, 0);
            x = ggml_roll(ctx0, x, conv_pad, 0, 0, 0);
            x = ggml_pad(ctx0, x, conv_pad, 0, 0, 0);
            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
            cb(x, "conv_dw", il);

            // folded batch norm
            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
            x = ggml_silu(ctx0, x);
            cb(x, "conv_bn_silu", il);

            // pointwise conv 2 projects back to encoder width
            x = build_mm(layer.conv_pw2_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw2_b);
            cb(x, "conv_pw2", il);

            cur = x;
        }

        residual = ggml_add(ctx0, residual, cur);

        // ffn2 (half-step)
        {
            auto * ffn2 = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(ffn2, "ffn2_norm", il);

            ffn2 = build_ffn(ffn2,
                    layer.ff_up_1_w, layer.ff_up_1_b,
                    nullptr, nullptr,
                    layer.ff_down_1_w, layer.ff_down_1_b,
                    FFN_SILU, il);
            cb(ffn2, "ffn2_out", il);

            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, ffn2, 0.5f));
        }

        // closing layer norm of the Conformer block
        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b,
                NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_out", il);

        // CTC branch
        // at the midpoint layer, project to CTC logits, soft-max, project back
        // and add as a residual (intermediate CTC conditioning)
        if (il + 1 == ctc_layer) {
            auto * mid = build_mm(model.ctc_out_w, cur);
            mid = ggml_add(ctx0, mid, model.ctc_out_b);
            mid = ggml_soft_max(ctx0, mid);
            mid = build_mm(model.ctc_out_mid_w, mid);
            mid = ggml_add(ctx0, mid, model.ctc_out_mid_b);
            cur = ggml_add(ctx0, cur, mid);
            cb(cur, "ctc_branch", il);
        }
    }

    cb(cur, "encoder_out", -1);

    // QFormer projector
    // compresses each window of window_size encoder frames into num_queries
    // output embeddings via learned queries + cross-attention
    {
        const int window_size = hparams.audio_proj_window_size;
        const int num_queries = window_size / hparams.audio_proj_downsample_rate;
        const int proj_n_head = hparams.audio_proj_head_count;
        const int proj_d_head = n_embd / proj_n_head;
        const float proj_kq_scale = 1.0f / sqrtf((float)proj_d_head);
        // projector eps is hard-coded (not stored in GGUF metadata)
        const float proj_eps = 1e-12f;
        const int nblocks_proj = (n_frames + window_size - 1) / window_size;
        const int padded_proj = nblocks_proj * window_size;

        // zero-pad so the encoder output splits evenly into windows
        if (n_frames < padded_proj) {
            cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
        }

        // [n_embd, window_size, nblocks_proj]: one cross-attention context per window
        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);

        // learned query embeddings, layer-normed then tiled across all windows
        ggml_tensor * queries = build_norm(model.qf_proj_query,
                model.qf_proj_norm_w, model.qf_proj_norm_b,
                NORM_TYPE_NORMAL, proj_eps, -1);
        {
            ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
            ggml_tensor * q_shape = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32,
                    n_embd, num_queries, nblocks_proj);
            queries = ggml_repeat(ctx0, q_3d, q_shape);
        }

        // QFormer layers: self-attn over queries, cross-attn into the encoder
        // window, FFN — each followed by a post-norm residual
        for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
            const auto & pl = model.qf_proj_layers[il];

            // self-attention
            {
                ggml_tensor * Q = ggml_add(ctx0, build_mm(pl.q_w, queries), pl.q_b);
                ggml_tensor * K = ggml_add(ctx0, build_mm(pl.k_w, queries), pl.k_b);
                ggml_tensor * V = ggml_add(ctx0, build_mm(pl.v_w, queries), pl.v_b);

                Q = ggml_reshape_4d(ctx0, Q, proj_d_head, proj_n_head, num_queries, nblocks_proj);
                K = ggml_reshape_4d(ctx0, K, proj_d_head, proj_n_head, num_queries, nblocks_proj);
                V = ggml_reshape_4d(ctx0, V, proj_d_head, proj_n_head, num_queries, nblocks_proj);

                ggml_tensor * sa_out = build_attn(pl.o_w, pl.o_b,
                        Q, K, V, nullptr, proj_kq_scale, il);
                sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, num_queries, nblocks_proj);

                // post-norm residual
                queries = build_norm(ggml_add(ctx0, sa_out, queries),
                        pl.ln_1_w, pl.ln_1_b,
                        NORM_TYPE_NORMAL, proj_eps, il);
            }

            // cross-attention
            // queries attend to the window_size encoder frames of their window
            {
                ggml_tensor * Q = ggml_add(ctx0, build_mm(pl.cross_attn_q_w, queries), pl.cross_attn_q_b);
                ggml_tensor * K = ggml_add(ctx0, build_mm(pl.cross_attn_k_w, enc_windows), pl.cross_attn_k_b);
                ggml_tensor * V = ggml_add(ctx0, build_mm(pl.cross_attn_v_w, enc_windows), pl.cross_attn_v_b);

                Q = ggml_reshape_4d(ctx0, Q, proj_d_head, proj_n_head, num_queries, nblocks_proj);
                K = ggml_reshape_4d(ctx0, K, proj_d_head, proj_n_head, window_size, nblocks_proj);
                V = ggml_reshape_4d(ctx0, V, proj_d_head, proj_n_head, window_size, nblocks_proj);

                ggml_tensor * ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                        Q, K, V, nullptr, proj_kq_scale, il);
                ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, num_queries, nblocks_proj);

                queries = build_norm(ggml_add(ctx0, ca_out, queries),
                        pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                        NORM_TYPE_NORMAL, proj_eps, il);
            }

            // ffn
            {
                ggml_tensor * ffn_out = build_ffn(queries,
                        pl.ff_up_w, pl.ff_up_b,
                        nullptr, nullptr,
                        pl.ff_down_w, pl.ff_down_b,
                        FFN_GELU, il);

                queries = build_norm(ggml_add(ctx0, ffn_out, queries),
                        pl.ln_2_w, pl.ln_2_b,
                        NORM_TYPE_NORMAL, proj_eps, il);
            }
        }

        // flatten windows and project into the LLM embedding space
        cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj);
        cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
        cb(cur, "projector_out", -1);
    }

    ggml_build_forward_expand(gf, cur);

    return gf;
}
|