mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-08 10:04:10 +00:00
mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) (#22101)
* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech)

  Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space.

  Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel).

  GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping.

  Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding.
* mtmd: rename gs_ prefixed tensors to generic/architecture names
* mtmd: use tensor_mapping.py for all granite_speech tensors
* convert: fold GraniteSpeechTextModel into GraniteModel
* mtmd: replace n_layer hack with explicit has_standard_layers flag
* mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech
* mtmd: align KEY_A_ define spacing
* convert: register GraniteModel for GraniteSpeechForConditionalGeneration
* convert: fix ty type-check for GraniteSpeechMmprojModel registration
* mtmd: align TN_ define spacing
* mtmd: use generic layer loop for granite speech tensor loading
* mtmd: merge qformer_proj_layer into clip_layer
* mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs
* mtmd: granite_speech add comment explaining why build_attn is not used
* mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata
* gguf: add spacing between granite_speech tensor mapping blocks
* mtmd: make generic audio layer_norm_eps read optional
* mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps
* mtmd: align defines and struct fields in clip-impl.h and clip-model.h
* mtmd: fix alignment and ordering issues across granite speech files
* convert: granite_speech use filter_tensors instead of modify_tensors for skipping
This commit is contained in:
@@ -10695,7 +10695,7 @@ class ExaoneMoEModel(Exaone4Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("GraniteForCausalLM")
|
||||
@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
|
||||
class GraniteModel(LlamaModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||
@@ -10728,6 +10728,13 @@ class GraniteModel(LlamaModel):
|
||||
self.gguf_writer.add_logit_scale(logits_scale)
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
if name.startswith("encoder."):
|
||||
return None
|
||||
return super().filter_tensors(item)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
|
||||
class GraniteMoeModel(GraniteModel):
|
||||
@@ -12581,6 +12588,89 @@ class LFM2AudioModel(ConformerAudioModel):
|
||||
return super().filter_tensors(item)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteSpeechForConditionalGeneration")
class GraniteSpeechMmprojModel(MmprojModel):
    """Mmproj converter for Granite Speech checkpoints (audio encoder + projector, no vision).

    Besides writing the audio hyper-parameters, it rewrites a few tensors on the fly:
    batch-norm statistics are folded into a single weight/bias pair, the fused
    ``to_kv`` projection is split into separate K and V weights, and singleton
    dimensions of the conv weights are squeezed away.
    """

    has_vision_encoder = False
    has_audio_encoder = True

    # One dict per encoder block holding the batch-norm tensors seen so far;
    # allocated lazily on the first batch-norm tensor in modify_tensors().
    _batch_norm_tensors: list[dict[str, Tensor]] | None = None

    def get_audio_config(self) -> dict[str, Any] | None:
        """Return the ``encoder_config`` entry of the global config (``None`` when absent)."""
        return self.global_config.get("encoder_config")

    def set_gguf_parameters(self):
        """Write the audio encoder and projector hyper-parameters to the GGUF writer."""
        assert self.hparams_audio is not None
        audio_cfg = self.hparams_audio

        # Copy the checkpoint's own key names onto the keys set up here for
        # the base-class call that follows.
        hidden_dim = audio_cfg["hidden_dim"]
        audio_cfg["hidden_size"] = hidden_dim
        audio_cfg["intermediate_size"] = hidden_dim * audio_cfg["feedforward_mult"]
        audio_cfg["num_attention_heads"] = audio_cfg["num_heads"]
        audio_cfg["num_hidden_layers"] = audio_cfg["num_layers"]

        super().set_gguf_parameters()

        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
        writer.add_audio_num_mel_bins(audio_cfg["input_dim"])
        writer.add_audio_attention_layernorm_eps(1e-5)
        writer.add_audio_chunk_size(audio_cfg["context_size"])
        writer.add_audio_conv_kernel_size(audio_cfg["conv_kernel_size"])
        writer.add_audio_max_pos_emb(audio_cfg["max_pos_emb"])

        # Projector settings are read from the top-level config, not the encoder section.
        top_cfg = self.global_config
        writer.add_audio_projector_window_size(top_cfg["window_size"])
        writer.add_audio_projector_downsample_rate(top_cfg["downsample_rate"])
        writer.add_audio_projector_head_count(top_cfg["projector_config"]["num_attention_heads"])

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force F32 for encoder/projector conv weights; otherwise defer to the parent class."""
        in_audio_stack = "encoder" in name or "projector" in name
        is_conv_weight = ".conv" in name and ".weight" in name
        if in_audio_stack and is_conv_weight:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Skip ``attention_dists`` and ``num_batches_tracked`` tensors; defer the rest to the parent."""
        tensor_name, _loader = item
        skipped_markers = ("attention_dists", "num_batches_tracked")
        if any(marker in tensor_name for marker in skipped_markers):
            return None
        return super().filter_tensors(item)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the converted tensor(s) for one checkpoint tensor.

        Batch-norm tensors are buffered per block and emitted only once four of
        them have been collected for that block; until then nothing is yielded.
        """
        # fold running_mean, running_var and eps into weight and bias for batch_norm
        if "batch_norm" in name and "encoder.layers." in name:
            if self._batch_norm_tensors is None:
                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
            assert bid is not None
            pending = self._batch_norm_tensors[bid]
            pending[name] = data_torch

            if len(pending) >= 4:
                prefix = f"encoder.layers.{bid}.conv.batch_norm"
                gamma = pending[f"{prefix}.weight"]
                beta = pending[f"{prefix}.bias"]
                mean = pending[f"{prefix}.running_mean"]
                var = pending[f"{prefix}.running_var"]
                eps = 1e-5
                # y = gamma * (x - mean) / sqrt(var + eps) + beta  ==  scale * x + shift
                scale = gamma / torch.sqrt(var + eps)
                shift = beta - mean * scale
                yield from super().modify_tensors(scale, f"{prefix}.weight", bid)
                yield from super().modify_tensors(shift, f"{prefix}.bias", bid)
            return

        # The checkpoint stores K and V as one fused matrix; emit them as two halves along dim 0.
        if ".attn.to_kv.weight" in name:
            key_part, value_part = data_torch.chunk(2, dim=0)
            for target, part in (("to_k", key_part), ("to_v", value_part)):
                yield from super().modify_tensors(part, name.replace("to_kv", target), bid)
            return

        is_weight = name.endswith(".weight")

        # up_conv / down_conv weights: drop a trailing singleton dimension.
        if is_weight and ("up_conv" in name or "down_conv" in name):
            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
                data_torch = data_torch.squeeze(2)

        # depth_conv weights: drop the singleton middle dimension.
        if is_weight and "depth_conv" in name:
            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
                data_torch = data_torch.squeeze(1)

        yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Lfm25AudioTokenizer")
|
||||
class LFM25AudioTokenizer(LFM2Model):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2
|
||||
|
||||
@@ -339,6 +339,9 @@ class Keys:
|
||||
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
|
||||
PROJECTION_DIM = "clip.audio.projection_dim"
|
||||
BLOCK_COUNT = "clip.audio.block_count"
|
||||
CHUNK_SIZE = "clip.audio.chunk_size"
|
||||
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
|
||||
MAX_POS_EMB = "clip.audio.max_pos_emb"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.audio.attention.head_count"
|
||||
@@ -346,6 +349,9 @@ class Keys:
|
||||
|
||||
class Projector:
|
||||
STACK_FACTOR = "clip.audio.projector.stack_factor"
|
||||
WINDOW_SIZE = "clip.audio.projector.window_size"
|
||||
DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
|
||||
HEAD_COUNT = "clip.audio.projector.head_count"
|
||||
|
||||
class Diffusion:
|
||||
SHIFT_LOGITS = "diffusion.shift_logits"
|
||||
@@ -854,6 +860,26 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_ENC_CONV_NORM = auto() # SSM conv
|
||||
A_ENC_CONV_PW1 = auto()
|
||||
A_ENC_CONV_PW2 = auto()
|
||||
A_CTC_OUT = auto()
|
||||
A_CTC_OUT_MID = auto()
|
||||
A_ENC_ATTN_REL_POS_EMB = auto()
|
||||
# qformer projector
|
||||
A_QF_PROJ_QUERY = auto()
|
||||
A_QF_PROJ_NORM = auto()
|
||||
A_QF_PROJ_LINEAR = auto()
|
||||
A_QF_SELF_ATTN_Q = auto()
|
||||
A_QF_SELF_ATTN_K = auto()
|
||||
A_QF_SELF_ATTN_V = auto()
|
||||
A_QF_SELF_ATTN_O = auto()
|
||||
A_QF_SELF_ATTN_NORM = auto()
|
||||
A_QF_CROSS_ATTN_Q = auto()
|
||||
A_QF_CROSS_ATTN_K = auto()
|
||||
A_QF_CROSS_ATTN_V = auto()
|
||||
A_QF_CROSS_ATTN_O = auto()
|
||||
A_QF_CROSS_ATTN_NORM = auto()
|
||||
A_QF_FFN_UP = auto()
|
||||
A_QF_FFN_DOWN = auto()
|
||||
A_QF_FFN_NORM = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
@@ -1333,6 +1359,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
|
||||
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
|
||||
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
|
||||
# qformer projector
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
|
||||
# NextN/MTP
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
|
||||
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
|
||||
@@ -1480,6 +1526,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE,
|
||||
MODEL_TENSOR.A_PER_DIM_SCALE,
|
||||
MODEL_TENSOR.A_CTC_OUT,
|
||||
MODEL_TENSOR.A_CTC_OUT_MID,
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
|
||||
# qformer projector
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY,
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM,
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
|
||||
MODEL_TENSOR.A_QF_FFN_UP,
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN,
|
||||
MODEL_TENSOR.A_QF_FFN_NORM,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -4158,6 +4224,7 @@ class VisionProjectorType:
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
HUNYUANVL = "hunyuanvl"
|
||||
GRANITE_SPEECH = "granite_speech" # audio
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -1260,6 +1260,24 @@ class GGUFWriter:
|
||||
def add_audio_stack_factor(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.Projector.STACK_FACTOR."""
    key = Keys.ClipAudio.Projector.STACK_FACTOR
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_chunk_size(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.CHUNK_SIZE."""
    key = Keys.ClipAudio.CHUNK_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_conv_kernel_size(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.CONV_KERNEL_SIZE."""
    key = Keys.ClipAudio.CONV_KERNEL_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_max_pos_emb(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.MAX_POS_EMB."""
    key = Keys.ClipAudio.MAX_POS_EMB
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_window_size(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.Projector.WINDOW_SIZE."""
    key = Keys.ClipAudio.Projector.WINDOW_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_downsample_rate(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.Projector.DOWNSAMPLE_RATE."""
    key = Keys.ClipAudio.Projector.DOWNSAMPLE_RATE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_head_count(self, value: int) -> None:
    """Store `value` through add_uint32 under Keys.ClipAudio.Projector.HEAD_COUNT."""
    key = Keys.ClipAudio.Projector.HEAD_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
def add_xielu_alpha_p(self, values: Sequence[float]):
    """Store `values` through add_array under Keys.xIELU.ALPHA_P."""
    key = Keys.xIELU.ALPHA_P
    self.add_array(key, values)
|
||||
|
||||
|
||||
@@ -155,6 +155,21 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_CTC_OUT: (
|
||||
"encoder.out",
|
||||
),
|
||||
MODEL_TENSOR.A_CTC_OUT_MID: (
|
||||
"encoder.out_mid",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY: (
|
||||
"projector.query",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM: (
|
||||
"projector.qformer.layernorm",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR: (
|
||||
"projector.linear",
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
@@ -1881,6 +1896,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ: (
|
||||
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
|
||||
"encoder.input_linear",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV2D: (
|
||||
@@ -1903,6 +1919,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_q", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||
@@ -1910,6 +1927,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||
@@ -1917,6 +1935,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
|
||||
@@ -1944,6 +1963,7 @@ class TensorNameMap:
|
||||
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_self_att", # lfm2
|
||||
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
|
||||
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: (
|
||||
@@ -1951,18 +1971,21 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.post", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_out", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post_norm", # gemma3n
|
||||
"encoder.layers.{bid}.post_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
|
||||
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||
@@ -1979,6 +2002,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
|
||||
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||
@@ -1988,24 +2012,28 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
|
||||
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
|
||||
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
|
||||
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
|
||||
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||
@@ -2062,26 +2090,31 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.A_ENC_CONV_DW: (
|
||||
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
|
||||
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: (
|
||||
"conformer.layers.{bid}.conv.batch_norm", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
|
||||
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
|
||||
"encoder.layers.{bid}.conv.up_conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
|
||||
"encoder.layers.{bid}.conv.down_conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: (
|
||||
"conformer.layers.{bid}.norm_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||
"encoder.layers.{bid}.conv.norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
|
||||
@@ -2105,6 +2138,62 @@ class TensorNameMap:
|
||||
"model.embed_audio.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: (
|
||||
"encoder.layers.{bid}.attn.rel_pos_emb.weight",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.query",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.key",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.value",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.output.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.output.LayerNorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.query",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.key",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.value",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.output.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_UP: (
|
||||
"projector.qformer.encoder.layer.{bid}.intermediate_query.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN: (
|
||||
"projector.qformer.encoder.layer.{bid}.output_query.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.output_query.LayerNorm",
|
||||
),
|
||||
|
||||
# NextN/MTP tensors
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ: (
|
||||
"model.layers.{bid}.eh_proj",
|
||||
|
||||
@@ -21,6 +21,7 @@ add_library(mtmd
|
||||
models/gemma4a.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/granite-speech.cpp
|
||||
models/hunyuanocr.cpp
|
||||
models/internvl.cpp
|
||||
models/kimivl.cpp
|
||||
|
||||
@@ -60,9 +60,15 @@
|
||||
#define KEY_SAM_N_BLOCK "clip.vision.sam.block_count"
|
||||
#define KEY_SAM_N_EMBD "clip.vision.sam.embedding_length"
|
||||
// audio-specific
|
||||
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
||||
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
||||
#define KEY_A_CHUNK_SIZE "clip.audio.chunk_size"
|
||||
#define KEY_A_CONV_KERNEL_SIZE "clip.audio.conv_kernel_size"
|
||||
#define KEY_A_MAX_POS_EMB "clip.audio.max_pos_emb"
|
||||
#define KEY_A_PROJ_WINDOW_SIZE "clip.audio.projector.window_size"
|
||||
#define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate"
|
||||
#define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count"
|
||||
|
||||
|
||||
//
|
||||
@@ -182,6 +188,27 @@
|
||||
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
|
||||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
#define TN_INP_PROJ "a.input_projection.%s"
|
||||
#define TN_CTC_OUT "a.enc_ctc_out.%s"
|
||||
#define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
|
||||
#define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"
|
||||
// qformer projector
|
||||
#define TN_QF_PROJ_QUERY "a.proj_query"
|
||||
#define TN_QF_PROJ_NORM "a.proj_norm.%s"
|
||||
#define TN_QF_PROJ_LINEAR "a.proj_linear.%s"
|
||||
#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
|
||||
#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
|
||||
#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
|
||||
#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
|
||||
#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
|
||||
#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
|
||||
#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
|
||||
#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
|
||||
#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
|
||||
#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
|
||||
#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s"
|
||||
#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
|
||||
#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
|
||||
|
||||
// gemma4 audio conformer
|
||||
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
|
||||
@@ -304,6 +331,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_NEMOTRON_V2_VL,
|
||||
PROJECTOR_TYPE_HUNYUANOCR,
|
||||
PROJECTOR_TYPE_HUNYUANVL,
|
||||
PROJECTOR_TYPE_GRANITE_SPEECH,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -351,6 +379,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
|
||||
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
|
||||
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
|
||||
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
@@ -92,6 +92,12 @@ struct clip_hparams {
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
int32_t audio_chunk_size = 0;
|
||||
int32_t audio_conv_kernel_size = 0;
|
||||
int32_t audio_max_pos_emb = 0;
|
||||
int32_t audio_proj_window_size = 0;
|
||||
int32_t audio_proj_downsample_rate = 0;
|
||||
int32_t audio_proj_head_count = 0;
|
||||
|
||||
// audio-to-mel preprocessor params
|
||||
int32_t audio_chunk_len = -1; // in seconds
|
||||
@@ -224,6 +230,21 @@ struct clip_layer {
|
||||
ggml_tensor * per_dim_k_scale_w = nullptr;
|
||||
ggml_tensor * ff_post_norm_1_w = nullptr;
|
||||
|
||||
// granite_speech conformer per-layer
|
||||
ggml_tensor * attn_rel_pos_emb = nullptr;
|
||||
|
||||
// granite_speech qformer cross-attention
|
||||
ggml_tensor * cross_attn_q_w = nullptr;
|
||||
ggml_tensor * cross_attn_q_b = nullptr;
|
||||
ggml_tensor * cross_attn_k_w = nullptr;
|
||||
ggml_tensor * cross_attn_k_b = nullptr;
|
||||
ggml_tensor * cross_attn_v_w = nullptr;
|
||||
ggml_tensor * cross_attn_v_b = nullptr;
|
||||
ggml_tensor * cross_attn_o_w = nullptr;
|
||||
ggml_tensor * cross_attn_o_b = nullptr;
|
||||
ggml_tensor * cross_attn_norm_w = nullptr;
|
||||
ggml_tensor * cross_attn_norm_b = nullptr;
|
||||
|
||||
bool has_deepstack() const {
|
||||
return deepstack_fc1_w != nullptr;
|
||||
}
|
||||
@@ -515,6 +536,21 @@ struct clip_model {
|
||||
ggml_tensor * audio_out_proj_w = nullptr;
|
||||
ggml_tensor * audio_out_proj_b = nullptr;
|
||||
|
||||
// granite_speech encoder
|
||||
ggml_tensor * inp_proj_w = nullptr;
|
||||
ggml_tensor * inp_proj_b = nullptr;
|
||||
ggml_tensor * ctc_out_w = nullptr;
|
||||
ggml_tensor * ctc_out_b = nullptr;
|
||||
ggml_tensor * ctc_out_mid_w = nullptr;
|
||||
ggml_tensor * ctc_out_mid_b = nullptr;
|
||||
// qformer projector
|
||||
ggml_tensor * qf_proj_query = nullptr;
|
||||
ggml_tensor * qf_proj_norm_w = nullptr;
|
||||
ggml_tensor * qf_proj_norm_b = nullptr;
|
||||
ggml_tensor * qf_proj_linear_w = nullptr;
|
||||
ggml_tensor * qf_proj_linear_b = nullptr;
|
||||
std::vector<clip_layer> qf_proj_layers;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||
|
||||
@@ -936,6 +936,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_gemma4a>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_granite_speech>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
@@ -1503,6 +1507,20 @@ struct clip_model_loader {
|
||||
hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400)
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
hparams.audio_chunk_len = 0;
|
||||
hparams.audio_sample_rate = 16000;
|
||||
hparams.audio_n_fft = 512;
|
||||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
get_u32(KEY_A_CHUNK_SIZE, hparams.audio_chunk_size);
|
||||
get_u32(KEY_A_CONV_KERNEL_SIZE, hparams.audio_conv_kernel_size);
|
||||
get_u32(KEY_A_MAX_POS_EMB, hparams.audio_max_pos_emb);
|
||||
get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
|
||||
get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
|
||||
get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
hparams.image_pad_color = {127, 127, 127};
|
||||
@@ -1654,13 +1672,13 @@ struct clip_model_loader {
|
||||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
const bool has_standard_layers = (
|
||||
model.proj_type != PROJECTOR_TYPE_GEMMA3NV);
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
const int n_layers_to_load = has_standard_layers ? hparams.n_layer : 0;
|
||||
model.layers.resize(n_layers_to_load);
|
||||
for (int il = 0; il < n_layers_to_load; ++il) {
|
||||
auto & layer = model.layers[il];
|
||||
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
|
||||
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
|
||||
@@ -2415,6 +2433,83 @@ struct clip_model_loader {
|
||||
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
model.inp_proj_w = get_tensor(string_format(TN_INP_PROJ, "weight"));
|
||||
model.inp_proj_b = get_tensor(string_format(TN_INP_PROJ, "bias"));
|
||||
model.ctc_out_w = get_tensor(string_format(TN_CTC_OUT, "weight"));
|
||||
model.ctc_out_b = get_tensor(string_format(TN_CTC_OUT, "bias"));
|
||||
model.ctc_out_mid_w = get_tensor(string_format(TN_CTC_OUT_MID, "weight"));
|
||||
model.ctc_out_mid_b = get_tensor(string_format(TN_CTC_OUT_MID, "bias"));
|
||||
|
||||
// per-layer tensors not loaded by the generic loop above
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = model.layers[il];
|
||||
|
||||
layer.attn_rel_pos_emb = get_tensor(string_format(TN_ATTN_REL_POS_EMB, prefix, il));
|
||||
|
||||
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
|
||||
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
|
||||
|
||||
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
|
||||
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
|
||||
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
|
||||
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
|
||||
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
|
||||
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
|
||||
|
||||
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
|
||||
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
|
||||
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
|
||||
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
|
||||
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
|
||||
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
|
||||
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
|
||||
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
|
||||
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
|
||||
}
|
||||
|
||||
model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY);
|
||||
model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight"));
|
||||
model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias"));
|
||||
model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight"));
|
||||
model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias"));
|
||||
|
||||
const int n_proj_layers = 2;
|
||||
model.qf_proj_layers.resize(n_proj_layers);
|
||||
for (int il = 0; il < n_proj_layers; ++il) {
|
||||
auto & pl = model.qf_proj_layers[il];
|
||||
|
||||
pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight"));
|
||||
pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias"));
|
||||
pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight"));
|
||||
pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias"));
|
||||
pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight"));
|
||||
pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias"));
|
||||
pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight"));
|
||||
pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias"));
|
||||
pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight"));
|
||||
pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias"));
|
||||
|
||||
pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight"));
|
||||
pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias"));
|
||||
pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight"));
|
||||
pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias"));
|
||||
pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight"));
|
||||
pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias"));
|
||||
pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight"));
|
||||
pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias"));
|
||||
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight"));
|
||||
pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias"));
|
||||
|
||||
pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight"));
|
||||
pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias"));
|
||||
pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight"));
|
||||
pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias"));
|
||||
pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight"));
|
||||
pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias"));
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown projector type");
|
||||
}
|
||||
@@ -3105,6 +3200,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
}
|
||||
n_patches = n;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
const int ws = ctx->model.hparams.audio_proj_window_size;
|
||||
const int ds = ctx->model.hparams.audio_proj_downsample_rate;
|
||||
n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported projector type");
|
||||
}
|
||||
@@ -3701,6 +3802,39 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
set_input_f32("pos_emb", pos_emb);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
    {
        // Fills the two graph inputs declared by the granite-speech graph builder:
        //   "attn_dists": relative-distance indices used to gather rows of the
        //                 relative position embedding table
        //   "attn_mask" : additive mask for the block-local attention (only present
        //                 when the last block is partially filled)
        const int context_size = ctx->model.hparams.audio_chunk_size;
        const int max_pos_emb  = ctx->model.hparams.audio_max_pos_emb;

        // dists[i][j] = clamp(i - j, -context_size, context_size) + max_pos_emb.
        // Within one block |i - j| < context_size, so the clamp never fires here;
        // it is kept so the table is written the same way as before.
        std::vector<int32_t> dists(context_size * context_size);
        for (int i = 0; i < context_size; i++) {
            for (int j = 0; j < context_size; j++) {
                int d = i - j;
                if (d < -context_size) d = -context_size;
                if (d >  context_size) d =  context_size;
                dists[i * context_size + j] = d + max_pos_emb;
            }
        }
        set_input_i32("attn_dists", dists);

        const int n_frames  = image_size_width;
        const int remainder = n_frames % context_size;
        if (remainder > 0) {
            const int    num_blocks  = (n_frames + context_size - 1) / context_size;
            const size_t block_elems = (size_t) context_size * context_size;

            // all blocks start unmasked (0.0f); only the last one holds padding
            std::vector<float> mask(block_elems * num_blocks, 0.0f);
            const float  neg_inf           = -INFINITY;
            const size_t last_block_offset = (size_t) (num_blocks - 1) * block_elems;

            // Mask only the padded *key* columns of the last block.
            // Padded query rows are deliberately left attending to the valid keys:
            // a row that is entirely -inf has no finite maximum, so its softmax
            // evaluates exp(-inf - (-inf)) = NaN. Those rows are sliced away after
            // attention (the graph keeps only the first n_frames rows), so the
            // result for real frames is unchanged while the NaNs are avoided.
            for (int q = 0; q < context_size; q++) {
                for (int k = remainder; k < context_size; k++) {
                    mask[last_block_offset + (size_t) q * context_size + k] = neg_inf;
                }
            }
            set_input_f32("attn_mask", mask);
        }
    } break;
|
||||
default:
|
||||
GGML_ABORT("Unknown projector type");
|
||||
}
|
||||
@@ -3849,6 +3983,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.position_embeddings->ne[0];
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
return ctx->model.hparams.projection_dim;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
return ctx->model.qf_proj_linear_w->ne[1];
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return ctx->model.mm_ffn_down_w->ne[1];
|
||||
default:
|
||||
|
||||
275
tools/mtmd/models/granite-speech.cpp
Normal file
275
tools/mtmd/models/granite-speech.cpp
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the ggml graph for the Granite Speech audio tower.
//
// Stages, in order:
//   1. input projection of the (transposed) raw input
//   2. n_layer encoder layers: half-step FFN -> block-local attention with
//      relative position terms -> conv module -> half-step FFN -> layer norm,
//      plus a CTC side branch added back after layer n_layer / 2
//   3. QFormer projector: learned queries, repeated per window, run through
//      self-attention, cross-attention over the encoder window and an FFN,
//      followed by a final linear layer
//
// Graph inputs "attn_dists" (always) and "attn_mask" (only when the last
// attention block is partially filled) are declared here and must be set by
// the caller before the graph is computed.
ggml_cgraph * clip_graph_granite_speech::build() {
    const int n_frames  = img.nx;
    const int chunk_len = hparams.audio_chunk_size;
    const int ctc_at    = n_layer / 2;
    const int dw_kernel = hparams.audio_conv_kernel_size;
    const int dw_pad    = dw_kernel / 2;

    // attention is computed independently inside fixed-size blocks of frames
    const int  n_chunks = (n_frames + chunk_len - 1) / chunk_len;
    const int  n_padded = n_chunks * chunk_len;
    const bool has_tail = (n_frames % chunk_len) > 0; // last block needs padding

    ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, chunk_len * chunk_len);
    ggml_set_name(attn_dists, "attn_dists");
    ggml_set_input(attn_dists);

    ggml_tensor * attn_mask = nullptr;
    if (has_tail) {
        // one [chunk_len, chunk_len] mask per block, broadcast over heads
        attn_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, chunk_len, chunk_len, 1, n_chunks);
        ggml_set_name(attn_mask, "attn_mask");
        ggml_set_input(attn_mask);
    }

    // input projection
    ggml_tensor * raw = build_inp_raw(1);
    ggml_tensor * cur = ggml_cont(ctx0, ggml_transpose(ctx0, raw));
    cb(cur, "inp_transposed", -1);

    cur = build_mm(model.inp_proj_w, cur);
    cur = ggml_add(ctx0, cur, model.inp_proj_b);
    cb(cur, "inp_linear", -1);

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        // running residual stream of this layer
        ggml_tensor * hidden = cur;

        // first feed-forward, added with weight 0.5
        {
            ggml_tensor * ff = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b,
                                          NORM_TYPE_NORMAL, eps, il);
            cb(ff, "ffn1_norm", il);

            ff = build_ffn(ff,
                           layer.ff_up_w,   layer.ff_up_b,
                           nullptr,         nullptr,
                           layer.ff_down_w, layer.ff_down_b,
                           FFN_SILU, il);
            cb(ff, "ffn1_out", il);

            ggml_tensor * ff_half = ggml_scale(ctx0, ff, 0.5f);
            hidden = ggml_add(ctx0, hidden, ff_half);
            cb(hidden, "ffn1_residual", il);
        }

        // Attention is assembled by hand instead of via build_attn: the relative
        // position term mul_mat(pos_emb, Q) has to be added between the KQ
        // product and the softmax, which build_attn does not support.
        {
            ggml_tensor * x = build_norm(hidden, layer.ln_1_w, layer.ln_1_b,
                                         NORM_TYPE_NORMAL, eps, il);
            cb(x, "attn_norm", il);

            if (has_tail) {
                // zero-pad the frame axis up to a whole number of blocks
                x = ggml_pad(ctx0, x, 0, n_padded - n_frames, 0, 0);
            }

            ggml_tensor * q = build_mm(layer.q_w, x);
            ggml_tensor * k = build_mm(layer.k_w, x);
            ggml_tensor * v = build_mm(layer.v_w, x);

            // [d_head, n_head, chunk_len, n_chunks]
            q = ggml_reshape_4d(ctx0, q, d_head, n_head, chunk_len, n_chunks);
            k = ggml_reshape_4d(ctx0, k, d_head, n_head, chunk_len, n_chunks);
            v = ggml_reshape_4d(ctx0, v, d_head, n_head, chunk_len, n_chunks);

            // content scores: [chunk_len(k), chunk_len(q), n_head, n_chunks]
            ggml_tensor * q_heads = ggml_permute(ctx0, q, 0, 2, 1, 3);
            ggml_tensor * k_heads = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
            ggml_tensor * kq      = ggml_mul_mat(ctx0, k_heads, q_heads);

            // position scores: gather one embedding row per (query, key) distance;
            // the size-1 dim of rel broadcasts over the block dim of q_by_pos
            ggml_tensor * rel = ggml_get_rows(ctx0, layer.attn_rel_pos_emb, attn_dists);
            rel = ggml_reshape_3d(ctx0, rel, d_head, chunk_len, chunk_len);
            rel = ggml_reshape_4d(ctx0, rel, d_head, chunk_len, 1, chunk_len);

            ggml_tensor * q_by_pos = ggml_permute(ctx0, q, 0, 1, 3, 2);
            ggml_tensor * kq_pos   = ggml_mul_mat(ctx0, rel, q_by_pos);
            // bring the result back to the same layout as kq
            kq_pos = ggml_cont(ctx0, ggml_permute(ctx0, kq_pos, 0, 2, 3, 1));

            ggml_tensor * logits = ggml_add(ctx0, kq, kq_pos);
            ggml_tensor * probs  = ggml_soft_max_ext(ctx0, logits, attn_mask, kq_scale, 0.0f);

            ggml_tensor * v_t = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
            ggml_tensor * ctx_out = ggml_mul_mat(ctx0, v_t, probs);

            // merge heads and blocks back into [n_embd, n_padded]
            ctx_out = ggml_permute(ctx0, ctx_out, 0, 2, 1, 3);
            ctx_out = ggml_cont_2d(ctx0, ctx_out, n_embd, n_padded);

            if (has_tail) {
                // drop the rows that belong to padding frames
                ctx_out = ggml_view_2d(ctx0, ctx_out, n_embd, n_frames, ctx_out->nb[1], 0);
            }

            cur = build_mm(layer.o_w, ctx_out);
            cur = ggml_add(ctx0, cur, layer.o_b);
            cb(cur, "attn_out", il);
        }

        hidden = ggml_add(ctx0, hidden, cur);

        // convolution module
        {
            ggml_tensor * normed = build_norm(hidden, layer.norm_conv_w, layer.norm_conv_b,
                                              NORM_TYPE_NORMAL, eps, il);
            cb(normed, "conv_norm", il);

            ggml_tensor * x = build_mm(layer.conv_pw1_w, normed);
            x = ggml_add(ctx0, x, layer.conv_pw1_b);
            cb(x, "conv_pw1", il);

            // GLU done manually: first half of the channels is the value,
            // second half goes through a sigmoid and gates it
            {
                const int64_t half = x->ne[0] / 2;

                ggml_tensor * gate_in = ggml_view_2d(ctx0, x, half, x->ne[1], x->nb[1], half * x->nb[0]);
                ggml_tensor * gate    = ggml_sigmoid(ctx0, gate_in);
                ggml_tensor * value   = ggml_view_2d(ctx0, x, half, x->ne[1], x->nb[1], 0);

                x = ggml_mul(ctx0, value, gate);
                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            }
            cb(x, "conv_glu", il);

            // pad + roll + pad leaves dw_pad zeros on both sides of the frame
            // axis before the depthwise convolution
            x = ggml_pad(ctx0, x, dw_pad, 0, 0, 0);
            x = ggml_roll(ctx0, x, dw_pad, 0, 0, 0);
            x = ggml_pad(ctx0, x, dw_pad, 0, 0, 0);
            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
            cb(x, "conv_dw", il);

            // batch norm folded into a per-channel scale and shift
            x = ggml_mul(ctx0, x, layer.conv_norm_w);
            x = ggml_add(ctx0, x, layer.conv_norm_b);
            x = ggml_silu(ctx0, x);
            cb(x, "conv_bn_silu", il);

            x = build_mm(layer.conv_pw2_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw2_b);
            cb(x, "conv_pw2", il);

            cur = x;
        }

        hidden = ggml_add(ctx0, hidden, cur);

        // second feed-forward, added with weight 0.5
        {
            ggml_tensor * ff = build_norm(hidden, layer.ff_norm_1_w, layer.ff_norm_1_b,
                                          NORM_TYPE_NORMAL, eps, il);
            cb(ff, "ffn2_norm", il);

            ff = build_ffn(ff,
                           layer.ff_up_1_w,   layer.ff_up_1_b,
                           nullptr,           nullptr,
                           layer.ff_down_1_w, layer.ff_down_1_b,
                           FFN_SILU, il);
            cb(ff, "ffn2_out", il);

            ggml_tensor * ff_half = ggml_scale(ctx0, ff, 0.5f);
            hidden = ggml_add(ctx0, hidden, ff_half);
        }

        // final norm of the layer
        cur = build_norm(hidden, layer.ln_2_w, layer.ln_2_b,
                         NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_out", il);

        // CTC side branch: softmax over the CTC projection is mapped back to
        // the hidden size and added to the layer output
        if (il + 1 == ctc_at) {
            ggml_tensor * ctc = build_mm(model.ctc_out_w, cur);
            ctc = ggml_add(ctx0, ctc, model.ctc_out_b);
            ctc = ggml_soft_max(ctx0, ctc);
            ctc = build_mm(model.ctc_out_mid_w, ctc);
            ctc = ggml_add(ctx0, ctc, model.ctc_out_mid_b);

            cur = ggml_add(ctx0, cur, ctc);
            cb(cur, "ctc_branch", il);
        }
    }

    cb(cur, "encoder_out", -1);

    // QFormer projector
    {
        const int   window_size   = hparams.audio_proj_window_size;
        const int   num_queries   = window_size / hparams.audio_proj_downsample_rate;
        const int   proj_n_head   = hparams.audio_proj_head_count;
        const int   proj_d_head   = n_embd / proj_n_head;
        const float proj_kq_scale = 1.0f / sqrtf((float)proj_d_head);
        const float proj_eps      = 1e-12f;
        const int   n_windows     = (n_frames + window_size - 1) / window_size;
        const int   n_win_padded  = n_windows * window_size;

        // y = W x + b
        const auto linear = [&](ggml_tensor * w, ggml_tensor * b, ggml_tensor * x) {
            return ggml_add(ctx0, build_mm(w, x), b);
        };
        // [n_embd, n_tok, n_windows] -> [proj_d_head, proj_n_head, n_tok, n_windows]
        const auto split_heads = [&](ggml_tensor * t, int n_tok) {
            return ggml_reshape_4d(ctx0, t, proj_d_head, proj_n_head, n_tok, n_windows);
        };

        if (n_frames < n_win_padded) {
            // zero-pad the encoder output to a whole number of windows
            cur = ggml_pad(ctx0, cur, 0, n_win_padded - n_frames, 0, 0);
        }

        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, n_windows);

        // normalized learned queries, replicated once per window
        ggml_tensor * queries = build_norm(model.qf_proj_query,
                                           model.qf_proj_norm_w, model.qf_proj_norm_b,
                                           NORM_TYPE_NORMAL, proj_eps, -1);
        {
            ggml_tensor * one_window = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
            // shape-only tensor used as the target of ggml_repeat
            ggml_tensor * target = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32,
                                                      n_embd, num_queries, n_windows);
            queries = ggml_repeat(ctx0, one_window, target);
        }

        const int n_proj_layers = (int)model.qf_proj_layers.size();
        for (int il = 0; il < n_proj_layers; ++il) {
            const auto & pl = model.qf_proj_layers[il];

            // self-attention among the queries of each window
            {
                ggml_tensor * q = linear(pl.q_w, pl.q_b, queries);
                ggml_tensor * k = linear(pl.k_w, pl.k_b, queries);
                ggml_tensor * v = linear(pl.v_w, pl.v_b, queries);

                q = split_heads(q, num_queries);
                k = split_heads(k, num_queries);
                v = split_heads(v, num_queries);

                ggml_tensor * attn = build_attn(pl.o_w, pl.o_b,
                                                q, k, v, nullptr, proj_kq_scale, il);
                attn = ggml_reshape_3d(ctx0, attn, n_embd, num_queries, n_windows);

                // post-norm residual
                ggml_tensor * summed = ggml_add(ctx0, attn, queries);
                queries = build_norm(summed, pl.ln_1_w, pl.ln_1_b,
                                     NORM_TYPE_NORMAL, proj_eps, il);
            }

            // cross-attention: queries attend to the encoder frames of their window
            {
                ggml_tensor * q = linear(pl.cross_attn_q_w, pl.cross_attn_q_b, queries);
                ggml_tensor * k = linear(pl.cross_attn_k_w, pl.cross_attn_k_b, enc_windows);
                ggml_tensor * v = linear(pl.cross_attn_v_w, pl.cross_attn_v_b, enc_windows);

                q = split_heads(q, num_queries);
                k = split_heads(k, window_size);
                v = split_heads(v, window_size);

                ggml_tensor * attn = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                                                q, k, v, nullptr, proj_kq_scale, il);
                attn = ggml_reshape_3d(ctx0, attn, n_embd, num_queries, n_windows);

                ggml_tensor * summed = ggml_add(ctx0, attn, queries);
                queries = build_norm(summed, pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                                     NORM_TYPE_NORMAL, proj_eps, il);
            }

            // feed-forward
            {
                ggml_tensor * ff = build_ffn(queries,
                                             pl.ff_up_w,   pl.ff_up_b,
                                             nullptr,      nullptr,
                                             pl.ff_down_w, pl.ff_down_b,
                                             FFN_GELU, il);

                ggml_tensor * summed = ggml_add(ctx0, ff, queries);
                queries = build_norm(summed, pl.ln_2_w, pl.ln_2_b,
                                     NORM_TYPE_NORMAL, proj_eps, il);
            }
        }

        // flatten (queries, windows) into one token axis and apply the output projection
        cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * n_windows);
        cur = linear(model.qf_proj_linear_w, model.qf_proj_linear_b, cur);
        cb(cur, "projector_out", -1);
    }

    ggml_build_forward_expand(gf, cur);

    return gf;
}
|
||||
@@ -111,6 +111,11 @@ struct clip_graph_conformer : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_granite_speech : clip_graph {
|
||||
clip_graph_granite_speech(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_gemma4a : clip_graph {
|
||||
clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
@@ -650,6 +650,108 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_preprocessor_granite_speech
|
||||
//
|
||||
|
||||
// Precomputes the lookup tables that preprocess() asserts on:
// the sin/cos table, the Hann window and the mel filterbank.
void mtmd_audio_preprocessor_granite_speech::initialize() {
    const auto n_fft = hparams.audio_n_fft;
    // the spectrogram uses half of n_mel_bins; preprocess() later stacks
    // two consecutive frames, which brings the width back to n_mel_bins
    const auto n_mel = hparams.n_mel_bins / 2;

    cache.fill_sin_cos_table(n_fft);
    cache.fill_hann_window(hparams.audio_window_len, true);
    // NOTE(review): the trailing arguments are passed through unchanged; their
    // meaning is defined by fill_mel_filterbank_matrix, which is not visible here
    cache.fill_mel_filterbank_matrix(n_mel, n_fft, hparams.audio_sample_rate,
                                     0.0f, -1.0f, false, 1.0f, true);
}
|
||||
|
||||
bool mtmd_audio_preprocessor_granite_speech::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
if (n_samples == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
|
||||
const int n_fft = hparams.audio_n_fft;
|
||||
const int pad = n_fft / 2;
|
||||
|
||||
// reflect padding
|
||||
const int n_padded = (int)n_samples + 2 * pad;
|
||||
std::vector<float> padded(n_padded, 0.0f);
|
||||
std::copy(samples, samples + n_samples, padded.data() + pad);
|
||||
for (int i = 0; i < pad; i++) {
|
||||
int src = i + 1;
|
||||
if (src >= (int)n_samples) {
|
||||
src = (int)n_samples - 1;
|
||||
}
|
||||
padded[pad - 1 - i] = samples[src];
|
||||
}
|
||||
for (int i = 0; i < pad; i++) {
|
||||
int src = (int)n_samples - 2 - i;
|
||||
if (src < 0) {
|
||||
src = 0;
|
||||
}
|
||||
padded[pad + (int)n_samples + i] = samples[src];
|
||||
}
|
||||
|
||||
filter_params params;
|
||||
params.n_mel = hparams.n_mel_bins / 2;
|
||||
params.n_fft_bins = 1 + (n_fft / 2);
|
||||
params.hann_window_size = hparams.audio_window_len;
|
||||
params.hop_length = hparams.audio_hop_len;
|
||||
params.sample_rate = hparams.audio_sample_rate;
|
||||
params.no_padding = true;
|
||||
params.center_padding = false;
|
||||
params.preemph = 0.0f;
|
||||
params.use_natural_log = false;
|
||||
params.norm_per_feature = false;
|
||||
params.mel_floor = 1e-10f;
|
||||
|
||||
mtmd_audio_mel mel;
|
||||
if (!log_mel_spectrogram(padded.data(), n_padded, 4, params, cache, mel)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
double mmax = -1e20;
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
if (mel.data[i] > mmax) {
|
||||
mmax = mel.data[i];
|
||||
}
|
||||
}
|
||||
mmax -= 8.0;
|
||||
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
if (mel.data[i] < mmax) {
|
||||
mel.data[i] = mmax;
|
||||
}
|
||||
mel.data[i] = (mel.data[i] + 4.0) / 4.0;
|
||||
}
|
||||
|
||||
int n_frames = mel.n_len;
|
||||
if (n_frames % 2 == 1) {
|
||||
n_frames--;
|
||||
}
|
||||
const int n_mel = mel.n_mel;
|
||||
const int n_stacked = n_frames / 2;
|
||||
|
||||
mtmd_audio_mel stacked;
|
||||
stacked.n_mel = 2 * n_mel;
|
||||
stacked.n_len = n_stacked;
|
||||
stacked.n_len_org = (int)n_samples;
|
||||
stacked.data.resize(2 * n_mel * n_stacked);
|
||||
|
||||
for (int t = 0; t < n_stacked; t++) {
|
||||
for (int m = 0; m < n_mel; m++) {
|
||||
stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
|
||||
stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
|
||||
}
|
||||
}
|
||||
|
||||
output.push_back(std::move(stacked));
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_preprocessor_gemma4a
|
||||
//
|
||||
|
||||
@@ -78,6 +78,15 @@ struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_granite_speech : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_granite_speech(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
|
||||
@@ -532,6 +532,10 @@ struct mtmd_context {
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_granite_speech>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
aud_beg = "<|audio>";
|
||||
|
||||
Reference in New Issue
Block a user