model : add ASR support for LFM2-Audio-1.5B (conformer) (#18106)

* ASR with LFM2-Audio-1.5B

* Set rope_theta

* Fix comment

* Remove rope_theta setting

* Address PR feedback

* rename functions to conformer

* remove some redundant ggml_cont

* fix missing tensor

* add prefix "a." for conv tensors

* remove redundant reshape

* clean up

* add test model

---------

Co-authored-by: Tarek Dakhran <tarek@liquid.ai>
This commit is contained in:
Xuan-Son Nguyen
2025-12-19 00:18:01 +01:00
committed by GitHub
parent f9ec8858ed
commit 8ea958d4d9
17 changed files with 669 additions and 29 deletions

View File

@@ -690,6 +690,8 @@ class MODEL_TENSOR(IntEnum):
V_TOK_EOI = auto() # cogvlm
# audio (mtmd)
A_ENC_EMBD_POS = auto()
A_ENC_EMBD_NORM = auto()
A_ENC_EMBD_TO_LOGITS = auto()
A_ENC_CONV1D = auto()
A_PRE_NORM = auto()
A_POST_NORM = auto()
@@ -700,8 +702,13 @@ class MODEL_TENSOR(IntEnum):
A_ENC_OUTPUT = auto()
A_ENC_OUTPUT_NORM = auto()
A_ENC_FFN_UP = auto()
A_ENC_FFN_NORM = auto()
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_ENC_FFN_UP_1 = auto()
A_ENC_FFN_NORM_1 = auto()
A_ENC_FFN_GATE_1 = auto()
A_ENC_FFN_DOWN_1 = auto()
A_MMPROJ = auto()
A_MMPROJ_FC = auto()
A_MM_NORM_PRE = auto()
@@ -713,6 +720,16 @@ class MODEL_TENSOR(IntEnum):
NEXTN_HNORM = auto()
NEXTN_SHARED_HEAD_HEAD = auto()
NEXTN_SHARED_HEAD_NORM = auto()
# lfm2 audio
A_ENC_NORM_CONV = auto()
A_ENC_LINEAR_POS = auto()
A_ENC_POS_BIAS_U = auto()
A_ENC_POS_BIAS_V = auto()
A_ENC_OUT = auto()
A_ENC_CONV_DW = auto() # SSM conv
A_ENC_CONV_NORM = auto() # SSM conv
A_ENC_CONV_PW1 = auto()
A_ENC_CONV_PW2 = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1064,7 +1081,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
# audio (mtmd)
# note: all audio tensor names must use prefix "a." or "mm.a."
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
@@ -1074,13 +1094,28 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm",
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1",
MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1",
MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1",
MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
# lfm2 audio
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
MODEL_TENSOR.A_ENC_POS_BIAS_U: "a.blk.{bid}.pos_bias_u",
MODEL_TENSOR.A_ENC_POS_BIAS_V: "a.blk.{bid}.pos_bias_v",
MODEL_TENSOR.A_ENC_OUT: "a.pre_encode.out",
MODEL_TENSOR.A_ENC_CONV_DW: "a.blk.{bid}.conv_dw",
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
@@ -1145,6 +1180,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_TOK_EOI,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_EMBD_NORM,
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
MODEL_TENSOR.A_ENC_CONV1D,
MODEL_TENSOR.A_PRE_NORM,
MODEL_TENSOR.A_POST_NORM,
@@ -1154,13 +1191,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
MODEL_TENSOR.A_ENC_FFN_NORM,
MODEL_TENSOR.A_ENC_FFN_UP,
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_ENC_FFN_NORM_1,
MODEL_TENSOR.A_ENC_FFN_UP_1,
MODEL_TENSOR.A_ENC_FFN_GATE_1,
MODEL_TENSOR.A_ENC_FFN_DOWN_1,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MMPROJ_FC,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
MODEL_TENSOR.A_ENC_NORM_CONV,
MODEL_TENSOR.A_ENC_LINEAR_POS,
MODEL_TENSOR.A_ENC_POS_BIAS_U,
MODEL_TENSOR.A_ENC_POS_BIAS_V,
MODEL_TENSOR.A_ENC_OUT,
MODEL_TENSOR.A_ENC_CONV_DW,
MODEL_TENSOR.A_ENC_CONV_NORM,
MODEL_TENSOR.A_ENC_CONV_PW1,
MODEL_TENSOR.A_ENC_CONV_PW2,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
@@ -3363,6 +3414,7 @@ class VisionProjectorType:
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
LFM2A = "lfm2a" # audio
GLM4V = "glm4v"

View File

@@ -1535,10 +1535,20 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_EMBD_POS: (
"audio_tower.embed_positions", # ultravox
"audio_embedding.embedding", # lfm2
),
MODEL_TENSOR.A_ENC_EMBD_NORM: (
"audio_embedding.embedding_norm", # lfm2
),
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: (
"audio_embedding.to_logits", # lfm2
),
MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
"conformer.pre_encode.conv.{bid}", # lfm2
),
MODEL_TENSOR.A_PRE_NORM: (),
@@ -1550,36 +1560,76 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
),
MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
),
MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
),
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
),
MODEL_TENSOR.A_ENC_LINEAR_POS: (
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
),
MODEL_TENSOR.A_ENC_POS_BIAS_U: (
"conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
),
MODEL_TENSOR.A_ENC_POS_BIAS_V: (
"conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
),
MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@@ -1587,6 +1637,7 @@ class TensorNameMap:
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
"audio_adapter.model.{bid}" # lfm2
),
MODEL_TENSOR.A_MMPROJ_FC: (
@@ -1602,6 +1653,26 @@ class TensorNameMap:
"audio.multi_modal_projector.ln_mid", # ultravox
),
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
),
MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
),
MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
),
MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
),
MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
),
# NextN/MTP tensors for GLM4_MOE
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",