mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-15 13:34:06 +00:00
* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space. Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel). GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping. Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding. * mtmd: rename gs_ prefixed tensors to generic/architecture names * mtmd: use tensor_mapping.py for all granite_speech tensors * convert: fold GraniteSpeechTextModel into GraniteModel * mtmd: replace n_layer hack with explicit has_standard_layers flag * mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech * mtmd: align KEY_A_ define spacing * convert: register GraniteModel for GraniteSpeechForConditionalGeneration * convert: fix ty type-check for GraniteSpeechMmprojModel registration * mtmd: align TN_ define spacing * mtmd: use generic layer loop for granite speech tensor loading * mtmd: merge qformer_proj_layer into clip_layer * mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs * mtmd: granite_speech add comment explaining why build_attn is not used * mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata * gguf: add spacing between granite_speech tensor mapping blocks * mtmd: make generic audio layer_norm_eps read optional * mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps * mtmd: align defines and struct fields in clip-impl.h and clip-model.h * mtmd: fix alignment and ordering issues across granite speech files * convert: granite_speech use filter_tensors instead of modify_tensors 
for skipping
133 lines
4.0 KiB
C++
133 lines
4.0 KiB
C++
#pragma once
|
||
|
||
#include "ggml.h"
|
||
#include "clip-model.h"
|
||
|
||
#include <cstdint>
|
||
#include <vector>
|
||
#include <string>
|
||
|
||
#define MTMD_INTERNAL_HEADER
|
||
|
||
// Mel spectrogram buffer handed out by the audio preprocessors.
// The counters carry default member initializers so that a default-constructed
// instance is a well-defined empty spectrogram instead of holding indeterminate
// values. The struct remains an aggregate (C++14 and later).
struct mtmd_audio_mel {
    int n_len     = 0; // NOTE(review): presumably the number of frames held in `data` — confirm
    int n_len_org = 0; // NOTE(review): presumably the frame count before any padding — confirm
    int n_mel     = 0; // NOTE(review): presumably the number of mel bins per frame — confirm

    std::vector<float> data; // spectrogram values; layout is decided by the code that fills it
};
|
||
|
||
// Mel filterbank coefficients plus the dimensions they were built for.
// Dimensions default to zero so an unfilled instance is distinguishable from a
// populated one and never exposes indeterminate values. The struct remains an
// aggregate (C++14 and later).
struct mtmd_audio_mel_filters {
    int32_t n_mel = 0; // NOTE(review): presumably the number of mel bands — confirm
    int32_t n_fft = 0; // NOTE(review): presumably the FFT-related dimension of the matrix — confirm

    std::vector<float> data; // filter weights; layout is decided by the code that fills it
};
|
||
|
||
// cache for audio processing, each processor instance owns its own cache
|
||
struct mtmd_audio_cache {
|
||
std::vector<float> sin_vals;
|
||
std::vector<float> cos_vals;
|
||
|
||
std::vector<float> hann_window;
|
||
|
||
mtmd_audio_mel_filters filters;
|
||
|
||
void fill_sin_cos_table(uint32_t n);
|
||
|
||
void fill_hann_window(uint32_t length, bool periodic);
|
||
|
||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||
void fill_mel_filterbank_matrix(int n_mel,
|
||
int n_fft,
|
||
int sample_rate, // e.g. 16000
|
||
float fmin = 0.0f, // e.g. 0.0
|
||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||
bool slaney_area_norm = true,
|
||
float scale = 1.0f,
|
||
bool use_htk = false
|
||
);
|
||
};
|
||
|
||
struct mtmd_audio_preprocessor {
|
||
const clip_hparams & hparams;
|
||
|
||
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||
|
||
virtual ~mtmd_audio_preprocessor() = default;
|
||
virtual void initialize() = 0; // NOT thread-safe
|
||
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
|
||
};
|
||
|
||
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||
void initialize() override;
|
||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||
|
||
private:
|
||
mtmd_audio_cache cache;
|
||
};
|
||
|
||
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
||
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||
void initialize() override;
|
||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||
|
||
private:
|
||
mtmd_audio_cache cache;
|
||
};
|
||
|
||
struct mtmd_audio_preprocessor_granite_speech : mtmd_audio_preprocessor {
|
||
mtmd_audio_preprocessor_granite_speech(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||
void initialize() override;
|
||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||
|
||
private:
|
||
mtmd_audio_cache cache;
|
||
};
|
||
|
||
struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
|
||
mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||
void initialize() override;
|
||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||
|
||
private:
|
||
mtmd_audio_cache cache;
|
||
};
|
||
|
||
//
|
||
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
|
||
//
|
||
struct mtmd_audio_streaming_istft {
|
||
mtmd_audio_streaming_istft(int n_fft, int hop_length);
|
||
|
||
// reset streaming state
|
||
void reset();
|
||
|
||
// process a single STFT frame (streaming)
|
||
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
|
||
// returns: up to hop_length samples
|
||
std::vector<float> process_frame(const float * frame_spectrum);
|
||
|
||
// flush remaining samples at end of stream
|
||
std::vector<float> flush();
|
||
|
||
private:
|
||
int n_fft;
|
||
int hop_length;
|
||
int n_fft_bins;
|
||
|
||
// Own cache for output processing
|
||
mtmd_audio_cache cache;
|
||
|
||
// Streaming state
|
||
std::vector<float> overlap_buffer;
|
||
std::vector<float> window_sum_buffer;
|
||
int padding_to_remove;
|
||
|
||
// Working buffers for IFFT
|
||
std::vector<float> ifft_in;
|
||
std::vector<float> ifft_out;
|
||
};
|