From bf3f12df4c87ffb6f8dc73d511ef4333321d0bdf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 2 Jan 2026 15:46:45 +0200 Subject: [PATCH] graph : constant topology for tokens/embeddings inputs --- src/llama-graph.cpp | 28 +++++++++++++++------------- src/llama-graph.h | 4 +++- src/models/gemma3n-iswa.cpp | 2 +- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1d0d7197e1..e292300f0a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -21,7 +21,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } if (ubatch->embd) { - const int64_t n_embd = embd->ne[0]; + GGML_ASSERT(n_embd == embd->ne[0]); + const int64_t n_tokens = ubatch->n_tokens; ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); @@ -1206,17 +1207,21 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { const int64_t n_embd = hparams.n_embd_inp(); - auto inp = std::make_unique<llm_graph_input_embd>(); + auto inp = std::make_unique<llm_graph_input_embd>(n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + res->t_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + ggml_set_input(inp->embd); ggml_tensor * cur = nullptr; - if (ubatch.token) { - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); - ggml_set_input(inp->tokens); - res->t_tokens = inp->tokens; - + { cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cur = ggml_scale(ctx0, cur, ubatch.token ? 
1.0f : 0.0f); // apply lora for embedding tokens if needed for (const auto & lora : *loras) { @@ -1235,13 +1240,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { cur = ggml_add(ctx0, cur, inpL_delta); } - } else { - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - ggml_set_input(inp->embd); - - cur = inp->embd; } + cur = ggml_add(ctx0, cur, ggml_scale(ctx0, inp->embd, ubatch.embd ? 1.0f : 0.0f)); + // For Granite architecture if (hparams.f_embedding_scale != 0.0f) { cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); diff --git a/src/llama-graph.h b/src/llama-graph.h index 81ac329cc3..07c81e79f9 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -104,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; class llm_graph_input_embd : public llm_graph_input_i { public: - llm_graph_input_embd() = default; + llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} virtual ~llm_graph_input_embd() = default; void set_input(const llama_ubatch * ubatch) override; @@ -113,6 +113,8 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + + const int64_t n_embd = 0; }; class llm_graph_input_pos : public llm_graph_input_i { diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index 9c7b3ba0bb..15054cf1ce 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -245,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) { // equivalent to get_per_layer_inputs() in python code // output shape: [n_embd_altup, n_layer, n_tokens] ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { - auto inp = std::make_unique<llm_graph_input_embd>(); + auto inp = std::make_unique<llm_graph_input_embd>(n_embd); ggml_tensor * inp_per_layer; if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);