mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-13 04:24:17 +00:00
eagle3: add support for RedHatAI eagle3 speculator series models
This commit is contained in:
@@ -2396,6 +2396,8 @@ class StableLMModel(TextModel):
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@@ -2445,6 +2447,11 @@ class LlamaModel(TextModel):
|
||||
logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)")
|
||||
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
|
||||
|
||||
# Eagle3Speculator norm_before_residual specific handling
|
||||
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
|
||||
logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)")
|
||||
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
|
||||
|
||||
def set_vocab(self):
|
||||
# For EAGLE-3 models, use tokenizer from target model if provided
|
||||
if hasattr(self, 'is_eagle3') and self.is_eagle3:
|
||||
@@ -2528,15 +2535,23 @@ class LlamaModel(TextModel):
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors = super().index_tensors(remote_hf_model_id)
|
||||
|
||||
# Handle Eagle3Speculator nested config
|
||||
if "transformer_layer_config" in self.hparams:
|
||||
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
|
||||
|
||||
# EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set)
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*")
|
||||
logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*")
|
||||
new_tensors = {}
|
||||
# EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model
|
||||
for name, gen in tensors.items():
|
||||
if name.startswith("midlayer."):
|
||||
new_name = "model.layers.0." + name[len("midlayer."):]
|
||||
new_tensors[new_name] = gen
|
||||
elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format)
|
||||
new_name = "model." + name
|
||||
new_tensors[new_name] = gen
|
||||
else:
|
||||
new_tensors[name] = gen
|
||||
return new_tensors
|
||||
|
||||
@@ -149,6 +149,7 @@ class Keys:
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers"
|
||||
EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
|
||||
EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
||||
@@ -248,8 +248,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
|
||||
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
|
||||
|
||||
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
|
||||
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
|
||||
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
|
||||
|
||||
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
|
||||
// sentence-transformers dense modules feature dims
|
||||
|
||||
@@ -292,6 +292,7 @@ enum llm_kv {
|
||||
|
||||
LLM_KV_EAGLE3_EXTRACT_LAYERS,
|
||||
LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
|
||||
LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
|
||||
|
||||
LLM_KV_SHORTCONV_L_CACHE,
|
||||
|
||||
|
||||
@@ -196,6 +196,9 @@ struct llama_hparams {
|
||||
// EAGLE3 draft model - target model hidden size
|
||||
uint32_t eagle3_target_hidden_size = 0;
|
||||
|
||||
// EAGLE3 draft model - apply hidden_norm before storing residual
|
||||
bool eagle3_norm_before_residual = false;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
@@ -2260,7 +2260,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size);
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
|
||||
hparams.eagle3_target_hidden_size, hparams.n_embd);
|
||||
|
||||
|
||||
// EAGLE3 norm_before_residual (optional, default false)
|
||||
// compatible with RedHatAI eagle3 speculator models
|
||||
ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false);
|
||||
if (hparams.eagle3_norm_before_residual) {
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
|
||||
}
|
||||
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
case LLM_ARCH_COGVLM:
|
||||
|
||||
@@ -77,9 +77,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
|
||||
// Single decoder layer (il = 0)
|
||||
const int il = 0;
|
||||
{
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// Apply input_layernorm to the token embeddings
|
||||
ggml_tensor * embd_norm = build_norm(inp_embd,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
@@ -92,6 +89,12 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(g_norm, "g_norm", il);
|
||||
|
||||
// norm_before_residual: determines what goes into the residual connection (compatible with RedHatAI eagle3 speculator models)
|
||||
// - false (default): use raw inp_g for residual
|
||||
// - true: use normalized g_norm for residual
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
|
||||
|
||||
// Concatenate normalized inp_embd and normalized inp_g
|
||||
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
|
||||
cb(cur, "concat_embd", il);
|
||||
|
||||
Reference in New Issue
Block a user