mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-14 13:04:08 +00:00
* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space. Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel). GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping. Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding. * mtmd: rename gs_ prefixed tensors to generic/architecture names * mtmd: use tensor_mapping.py for all granite_speech tensors * convert: fold GraniteSpeechTextModel into GraniteModel * mtmd: replace n_layer hack with explicit has_standard_layers flag * mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech * mtmd: align KEY_A_ define spacing * convert: register GraniteModel for GraniteSpeechForConditionalGeneration * convert: fix ty type-check for GraniteSpeechMmprojModel registration * mtmd: align TN_ define spacing * mtmd: use generic layer loop for granite speech tensor loading * mtmd: merge qformer_proj_layer into clip_layer * mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs * mtmd: granite_speech add comment explaining why build_attn is not used * mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata * gguf: add spacing between granite_speech tensor mapping blocks * mtmd: make generic audio layer_norm_eps read optional * mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps * mtmd: align defines and struct fields in clip-impl.h and clip-model.h * mtmd: fix alignment and ordering issues across granite speech files * convert: granite_speech use filter_tensors instead of modify_tensors 
for skipping
124 lines
3.8 KiB
CMake
124 lines
3.8 KiB
CMake
# mtmd
|
|
|
|
find_package(Threads REQUIRED)
|
|
|
|
# The mtmd library target. Sources are listed explicitly (no globbing) so
# that adding or removing a file shows up in the diff of this list.
add_library(mtmd
    # top-level mtmd sources and headers
    mtmd.cpp
    mtmd-audio.cpp
    mtmd-image.cpp
    mtmd.h
    mtmd-helper.cpp
    mtmd-helper.h

    # clip sources and headers
    clip.cpp
    clip.h
    clip-impl.h
    clip-model.h
    clip-graph.h

    # sources under models/
    models/models.h
    models/cogvlm.cpp
    models/conformer.cpp
    models/dotsocr.cpp
    models/gemma4a.cpp
    models/gemma4v.cpp
    models/glm4v.cpp
    models/granite-speech.cpp
    models/hunyuanocr.cpp
    models/internvl.cpp
    models/kimivl.cpp
    models/kimik25.cpp
    models/nemotron-v2-vl.cpp
    models/llama4.cpp
    models/llava.cpp
    models/minicpmv.cpp
    models/paddleocr.cpp
    models/pixtral.cpp
    models/qwen2vl.cpp
    models/qwen3vl.cpp
    models/qwen3a.cpp
    models/step3vl.cpp
    models/siglip.cpp
    models/whisper-enc.cpp
    models/deepseekocr.cpp
    models/mobilenetv5.cpp
    models/youtuvl.cpp
    models/yasa2.cpp
)
|
|
|
|
# Library versioning properties for mtmd.
set_target_properties(mtmd
    PROPERTIES
        VERSION               ${LLAMA_INSTALL_VERSION}
        SOVERSION             0
        # keep macOS linker from seeing oversized version number
        MACHO_CURRENT_VERSION 0
)
|
|
|
|
# Usage requirements of mtmd.
# PUBLIC entries propagate to targets that link mtmd; PRIVATE entries are
# used only when compiling/linking mtmd itself.
target_link_libraries(mtmd
    PUBLIC  ggml llama
    PRIVATE Threads::Threads
)

target_include_directories(mtmd
    PUBLIC  .
    PRIVATE ../..
            ../../vendor
)

target_compile_features(mtmd PRIVATE cxx_std_17)
|
|
|
|
# Shared-library builds: compile mtmd as position-independent code, define
# LLAMA_BUILD for mtmd's own sources only, and define LLAMA_SHARED for mtmd
# and for every target that links it.
if (BUILD_SHARED_LIBS)
    set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(mtmd
        PRIVATE LLAMA_BUILD
        PUBLIC  LLAMA_SHARED
    )
endif()
|
|
|
|
# Headers exposed as mtmd's public API.
set(MTMD_PUBLIC_HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
)

# Attach the header lists to the target. The public list is quoted so it is
# passed as a single ;-separated property value.
set_target_properties(mtmd
    PROPERTIES
        PUBLIC_HEADER  "${MTMD_PUBLIC_HEADERS}"
        PRIVATE_HEADER debug/mtmd-debug.h
)

# Install rule for the library and its public headers (default destinations).
install(TARGETS mtmd LIBRARY PUBLIC_HEADER)
|
|
|
|
# Warning suppressions applied only to mtmd's own sources.

if (NOT MSVC)
    # for stb_image.h and miniaudio.h
    target_compile_options(mtmd PRIVATE -Wno-cast-qual)
endif()

if (ANDROID)
    # miniaudio.h defines ma_android_sdk_version() without a prior prototype
    target_compile_options(mtmd PRIVATE -Wno-missing-prototypes)
endif()
|
|
|
|
# If a BUILD_INFO target exists, make sure it is built before mtmd.
# Only mtmd is listed here: this file creates no target named mtmd-helper
# (mtmd-helper.cpp is compiled into mtmd), and calling add_dependencies()
# on a non-existent target is a fatal configure-time error.
if (TARGET BUILD_INFO)
    add_dependencies(mtmd BUILD_INFO)
endif()
|
|
|
|
# Configure-time guard: abort with an error if llama-common appears in
# mtmd's LINK_LIBRARIES property.
if (TARGET mtmd)
    get_target_property(libs mtmd LINK_LIBRARIES)
    # `libs` is a false value (libs-NOTFOUND) when the property is unset,
    # so test it before searching the list.
    if (libs AND "llama-common" IN_LIST libs)
        message(FATAL_ERROR
            "mtmd is designed to be a public library.\n"
            "It must not link against llama-common")
    endif()
endif()
|
|
|
|
# Executables that are all built from deprecation-warning.cpp
# (presumably stubs kept under the old tool names; confirm against that source).
foreach(deprecated_cli IN ITEMS
        llama-llava-cli
        llama-gemma3-cli
        llama-minicpmv-cli
        llama-qwen2vl-cli)
    add_executable(${deprecated_cli} deprecation-warning.cpp)
endforeach()
|
|
|
|
# llama-mtmd-cli: command-line tool built from mtmd-cli.cpp.
set(TARGET llama-mtmd-cli)

add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET}
    PROPERTIES
        OUTPUT_NAME llama-mtmd-cli
)

# The install rule is added only when LLAMA_TOOLS_INSTALL is enabled.
if (LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()

target_link_libraries(${TARGET}
    PRIVATE
        llama-common
        mtmd
        Threads::Threads
)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
|
|
|
# mtmd-debug tool: executable built from debug/mtmd-debug.cpp.
# No install rule is declared for it in this file.
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)

set_target_properties(llama-mtmd-debug
    PROPERTIES
        OUTPUT_NAME llama-mtmd-debug
)

target_link_libraries(llama-mtmd-debug
    PRIVATE
        llama-common
        mtmd
        Threads::Threads
)

target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|