mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-13 04:24:17 +00:00
eagle3: add support for RedHatAI eagle3 speculator series models
This commit is contained in:
@@ -2396,6 +2396,8 @@ class StableLMModel(TextModel):
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@@ -2445,6 +2447,11 @@ class LlamaModel(TextModel):
|
||||
logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)")
|
||||
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
|
||||
|
||||
# Eagle3Speculator norm_before_residual specific handling
|
||||
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
|
||||
logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)")
|
||||
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
|
||||
|
||||
def set_vocab(self):
|
||||
# For EAGLE-3 models, use tokenizer from target model if provided
|
||||
if hasattr(self, 'is_eagle3') and self.is_eagle3:
|
||||
@@ -2528,15 +2535,23 @@ class LlamaModel(TextModel):
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors = super().index_tensors(remote_hf_model_id)
|
||||
|
||||
# Handle Eagle3Speculator nested config
|
||||
if "transformer_layer_config" in self.hparams:
|
||||
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
|
||||
|
||||
# EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set)
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*")
|
||||
logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*")
|
||||
new_tensors = {}
|
||||
# EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model
|
||||
for name, gen in tensors.items():
|
||||
if name.startswith("midlayer."):
|
||||
new_name = "model.layers.0." + name[len("midlayer."):]
|
||||
new_tensors[new_name] = gen
|
||||
elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format)
|
||||
new_name = "model." + name
|
||||
new_tensors[new_name] = gen
|
||||
else:
|
||||
new_tensors[name] = gen
|
||||
return new_tensors
|
||||
|
||||
@@ -149,6 +149,7 @@ class Keys:
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers"
|
||||
EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
|
||||
EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
||||
@@ -248,8 +248,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
|
||||
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
|
||||
|
||||
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
|
||||
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
|
||||
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
|
||||
|
||||
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
|
||||
// sentence-transformers dense modules feature dims
|
||||
|
||||
@@ -292,6 +292,7 @@ enum llm_kv {
|
||||
|
||||
LLM_KV_EAGLE3_EXTRACT_LAYERS,
|
||||
LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
|
||||
LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
|
||||
|
||||
LLM_KV_SHORTCONV_L_CACHE,
|
||||
|
||||
|
||||
@@ -196,6 +196,9 @@ struct llama_hparams {
|
||||
// EAGLE3 draft model - target model hidden size
|
||||
uint32_t eagle3_target_hidden_size = 0;
|
||||
|
||||
// EAGLE3 draft model - apply hidden_norm before storing residual
|
||||
bool eagle3_norm_before_residual = false;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
@@ -2260,7 +2260,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size);
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
|
||||
hparams.eagle3_target_hidden_size, hparams.n_embd);
|
||||
|
||||
|
||||
// EAGLE3 norm_before_residual (optional, default false)
|
||||
// compatible with RedHatAI eagle3 speculator models
|
||||
ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false);
|
||||
if (hparams.eagle3_norm_before_residual) {
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
|
||||
}
|
||||
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
case LLM_ARCH_COGVLM:
|
||||
|
||||
@@ -77,9 +77,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
|
||||
// Single decoder layer (il = 0)
|
||||
const int il = 0;
|
||||
{
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// Apply input_layernorm to the token embeddings
|
||||
ggml_tensor * embd_norm = build_norm(inp_embd,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
@@ -92,6 +89,12 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(g_norm, "g_norm", il);
|
||||
|
||||
// norm_before_residual: determines what goes into the residual connection (compatible with RedHatAI eagle3 speculator models)
|
||||
// - false (default): use raw inp_g for residual
|
||||
// - true: use normalized g_norm for residual
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
|
||||
|
||||
// Concatenate normalized inp_embd and normalized inp_g
|
||||
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
|
||||
cb(cur, "concat_embd", il);
|
||||
|
||||
Reference in New Issue
Block a user