eagle3: add support for RedHatAI eagle3 speculator series models

This commit is contained in:
ruixiangw
2026-01-16 00:54:14 +00:00
parent 75883cde73
commit 7b78bfa984
7 changed files with 38 additions and 7 deletions

View File

@@ -2396,6 +2396,8 @@ class StableLMModel(TextModel):
"LlavaForConditionalGeneration",
"VoxtralForConditionalGeneration",
"LlamaForCausalLMEagle3",
"Eagle3Speculator",
"Eagle3DraftModel",
"LlamaModel")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2445,6 +2447,11 @@ class LlamaModel(TextModel):
logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)")
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
# Eagle3Speculator norm_before_residual specific handling
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)")
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
def set_vocab(self):
# For EAGLE-3 models, use tokenizer from target model if provided
if hasattr(self, 'is_eagle3') and self.is_eagle3:
@@ -2528,15 +2535,23 @@ class LlamaModel(TextModel):
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
tensors = super().index_tensors(remote_hf_model_id)
# Handle Eagle3Speculator nested config
if "transformer_layer_config" in self.hparams:
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
# EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set)
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*")
logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*")
new_tensors = {}
# EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model
for name, gen in tensors.items():
if name.startswith("midlayer."):
new_name = "model.layers.0." + name[len("midlayer."):]
new_tensors[new_name] = gen
elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format)
new_name = "model." + name
new_tensors[new_name] = gen
else:
new_tensors[name] = gen
return new_tensors

View File

@@ -149,6 +149,7 @@ class Keys:
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers"
EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"

View File

@@ -248,8 +248,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
{ LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
// sentence-transformers dense modules feature dims

View File

@@ -292,6 +292,7 @@ enum llm_kv {
LLM_KV_EAGLE3_EXTRACT_LAYERS,
LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
LLM_KV_SHORTCONV_L_CACHE,

View File

@@ -196,6 +196,9 @@ struct llama_hparams {
// EAGLE3 draft model - target model hidden size
uint32_t eagle3_target_hidden_size = 0;
// EAGLE3 draft model - apply hidden_norm before storing residual
bool eagle3_norm_before_residual = false;
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;

View File

@@ -2260,7 +2260,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size);
LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
hparams.eagle3_target_hidden_size, hparams.n_embd);
// EAGLE3 norm_before_residual (optional, default false)
// compatible with RedHat eagle3 speculator models
ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false);
if (hparams.eagle3_norm_before_residual) {
LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
}
type = LLM_TYPE_UNKNOWN;
} break;
case LLM_ARCH_COGVLM:

View File

@@ -77,9 +77,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
// Single decoder layer (il = 0)
const int il = 0;
{
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
ggml_tensor * inpSA = inpL;
// Apply input_layernorm to the token embeddings
ggml_tensor * embd_norm = build_norm(inp_embd,
model.layers[il].attn_norm, NULL,
@@ -92,6 +89,12 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
LLM_NORM_RMS, -1);
cb(g_norm, "g_norm", il);
// norm_before_residual: determines what goes into the residual connection (compatible with RedHat eagle3 speculator models)
// - false (default): use inpL for the residual
// - true: use normalized g_norm for the residual
// NOTE(review): at this point the concat has not happened yet (it is done just below), so inpL is the
// pre-concat input here; an earlier comment described it as "raw inp_g" / "the concatenated input" —
// confirm which tensor inpL actually holds at this line and keep the wording consistent with the code.
ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
// Concatenate normalized inp_embd and normalized inp_g
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
cb(cur, "concat_embd", il);