diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 52140107fb..2babd7f9f0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2396,6 +2396,8 @@ class StableLMModel(TextModel): "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", "LlamaForCausalLMEagle3", + "Eagle3Speculator", + "Eagle3DraftModel", "LlamaModel") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA @@ -2445,6 +2447,11 @@ class LlamaModel(TextModel): logger.info(f"EAGLE3: target_hidden_size = {target_hidden_size} (from target model config)") self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + # Eagle3Speculator norm_before_residual specific handling + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE3: norm_before_residual = {norm_before_residual} (from EAGLE3 config)") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): # For EAGLE-3 models, use tokenizer from target model if provided if hasattr(self, 'is_eagle3') and self.is_eagle3: @@ -2528,15 +2535,23 @@ class LlamaModel(TextModel): def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + # EAGLE-3 detection: check hparams directly (before self.is_eagle3 is set) if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: - logger.info("EAGLE-3: Renaming midlayer.* to model.layers.0.*") + logger.info("EAGLE-3: Renaming midlayer.* or layers.0.* to model.layers.0.*") new_tensors = {} # EAGLE-3: rename midlayer.* to model.layers.0.* for compatibility with llama model for name, gen in tensors.items(): if name.startswith("midlayer."): new_name = 
"model.layers.0." + name[len("midlayer."):] new_tensors[new_name] = gen + elif name.startswith("layers.0."): # layers.0.* -> model.layers.0.* (Eagle3Speculator format) + new_name = "model." + name + new_tensors[new_name] = gen else: new_tensors[name] = gen return new_tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b1160ca26d..2ae5094619 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -149,6 +149,7 @@ class Keys: DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 4caa5f77ae..8304c63615 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -248,8 +248,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, - { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, - { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims diff --git a/src/llama-arch.h b/src/llama-arch.h index 3e731b5005..36cad138a8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -292,6 +292,7 @@ enum llm_kv { LLM_KV_EAGLE3_EXTRACT_LAYERS, LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, LLM_KV_SHORTCONV_L_CACHE, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 9272c728e3..f8ed7f364c 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -196,6 +196,9 @@ struct llama_hparams { // EAGLE3 draft model - target model hidden size uint32_t eagle3_target_hidden_size = 
0; + // EAGLE3 draft model - apply hidden_norm before storing residual + bool eagle3_norm_before_residual = false; + // needed by encoder-decoder models (e.g. T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 287bfe7f14..4879376aef 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2260,7 +2260,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.eagle3_target_hidden_size); LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, hparams.eagle3_target_hidden_size, hparams.n_embd); - + + // EAGLE3 norm_before_residual (optional, default false) + // compatible with Red Hat eagle3 speculator model + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); + if (hparams.eagle3_norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + } + type = LLM_TYPE_UNKNOWN; } break; case LLM_ARCH_COGVLM: diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 629d89d327..4f9410b360 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -77,9 +77,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons // Single decoder layer (il = 0) const int il = 0; { - // inpL is the concatenated input (normalized inp_embd + normalized inp_g) - ggml_tensor * inpSA = inpL; - // Apply input_layernorm to the token embeddings ggml_tensor * embd_norm = build_norm(inp_embd, model.layers[il].attn_norm, NULL, @@ -92,6 +89,12 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons LLM_NORM_RMS, -1); cb(g_norm, "g_norm", il); + // norm_before_residual: determines what goes into the residual connection (compatible with Red Hat eagle3 speculator model) + // - false (default): use raw inp_g for residual + // - true: use
normalized g_norm for residual + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) + ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL; + // Concatenate normalized inp_embd and normalized inp_g cur = ggml_concat(ctx0, embd_norm, g_norm, il); cb(cur, "concat_embd", il);