From adf436f5ec0f888c5072c32e6e5e24aa9eafb4e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 May 2026 10:50:42 +0300 Subject: [PATCH] server : sketch the ctx_dft decode loop [no ci] --- tools/server/server-context.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index dfe42ca64c..e3c2749809 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2859,6 +2859,36 @@ private: continue; // continue loop of n_batch } + if (ctx_dft) { + SRV_WRN("%s", "processing the batch using the draft context\n"); + + // note: for now, to keep things simple, synchronize the target context + // TODO: revisit later on + llama_synchronize(ctx); + + // the logic here varies depending on the speculative decoding method + // - some draft contexts require embeddings from the target context, others don't + // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings + // TODO: extract this in a function ? + { + // TODO: hook the embeddings from the last target batch here + if (llama_model_has_encoder(model_dft.get())) { + //llama_encode(ctx_dft, ...); + + GGML_ABORT("not implemented yet\n"); + } + + const int ret = llama_decode(ctx_dft.get(), batch_view); + + if (ret != 0) { + SRV_ERR("failed to decode draft batch, ret = %d\n", ret); + + // TODO: handle error + break; + } + } + } + // move the head of the batch forward with the number of tokens we just processed i_next = i + n_tokens;