cuda : add f32 to bf16 copy op (#12806 )

This allows BF16 KV-cache on CUDA.
llava: improve clip_ctx destructor to not memleak load_image_size (#12834 )
2026-05-21 08:24:06 +00:00 · 2025-04-08 23:21:31 +02:00 · 2025-04-08 22:01:58 +02:00 · 2025-04-08 19:54:51 +03:00 · 2025-04-08 18:37:06 +02:00 · 2025-04-08 15:49:13 +02:00
36 changed files with 814 additions and 85 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -163,6 +163,8 @@ struct common_hf_file_res {
 #   if !defined(PATH_MAX)
 #   define PATH_MAX MAX_PATH
 #   endif
+#elif defined(_AIX)
+#include <sys/limits.h>
 #else
 #include <sys/syslimits.h>
 #endif
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -714,6 +714,9 @@ class Model:
        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
            # ref: https://huggingface.co/inclusionAI/Ling-lite
            res = "bailingmoe"
+        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
+            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+            res = "llama4"

        if res is None:
            logger.warning("\n")
@@ -1608,6 +1611,7 @@ class StableLMModel(Model):
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA
+    undo_permute = True

    def set_vocab(self):
        try:
@@ -1672,10 +1676,11 @@ class LlamaModel(Model):
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

        # process the experts separately
        if name.find("block_sparse_moe.experts") != -1:
@@ -1752,6 +1757,61 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("Llama4ForConditionalGeneration")
+class Llama4Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA4
+    has_vision: bool = False
+    undo_permute = False
+
+    # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config"
+    # same with llama, but we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+            kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
+        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
+        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
+        name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now
+
+        # split the gate_up into gate and up
+        if "gate_up_proj" in name:
+            name_up = name.replace("gate_up_proj", "up_proj.weight")
+            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+            dim_half = data_torch.shape[-1] // 2
+            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+
+        if name.endswith("down_proj"):
+            name += ".weight"
+            data_torch = data_torch.transpose(-1, -2)
+
+        if "multi_modal_projector" in name or "vision_model" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
@Model.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -113,6 +113,7 @@ models = [
    {"name": "superbpe",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
    {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
    {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
+    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
 ]


--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -380,6 +380,7 @@ struct clip_ctx {
        if (backend_cpu != backend) {
            ggml_backend_free(backend_cpu);
        }
+        clip_image_size_free(load_image_size);
    }
 };

@@ -1618,6 +1619,12 @@ struct clip_image_f32 * clip_image_f32_init() {
    return new clip_image_f32();
 }

+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
 void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
@@ -2270,6 +2277,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }

 void clip_free(clip_ctx * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
    delete ctx;
 }

@@ -2840,10 +2850,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 bool clip_is_glm(const struct clip_ctx * ctx) {
    return ctx->has_glm_projector;
 }
+
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
    return ctx->has_qwen2vl_merger;
 }

+bool clip_is_llava(const struct clip_ctx * ctx) {
+    return ctx->has_llava_projector;
+}
+
+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
+}
+
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
    // Get the index of the second to last layer; this is the
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -77,6 +77,7 @@ CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();

+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
@@ -106,6 +107,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);

 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {

    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);

-    LOG("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\t95%% confidence interval\n");

    double acc = 0.0f;

@@ -985,8 +985,22 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
                acc += 1.0;
            }

-            // Print the accumulated accuracy mean x 100
-            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            double freq = acc / double(i + 1);
+
+            const double za = 1.95996398454;
+
+            // // Wald normal approx
+            // double conf =za*sqrt(freq*(1-freq)/double(i + 1));
+            // LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
+
+            // Wilson score interval, more accurate
+            double z   = za * za / double(i + 1);
+            double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
+            double a   = (freq + z * 0.5 - cnf) / (1.0 + z);
+            double b   = (freq + z * 0.5 + cnf) / (1.0 + z);
+
+            // Print the accumulated accuracy mean x 100 and confidence interval
+            LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
        }

        i0 = i1 - 1;
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1705,6 +1705,8 @@ private:
 };

 struct server_response {
+    bool running = true;
+
    // for keeping track of all tasks waiting for the result
    std::unordered_set<int> waiting_task_ids;

@@ -1759,6 +1761,10 @@ struct server_response {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_results);
            condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                return !queue_results.empty();
            });

@@ -1789,6 +1795,10 @@ struct server_response {
            }

            std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
            if (cr_res == std::cv_status::timeout) {
                return nullptr;
            }
@@ -1818,6 +1828,12 @@ struct server_response {
            }
        }
    }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };

 struct server_context {
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };

    // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
        SRV_INF("%s: cleaning up before exit...\n", __func__);
        svr->stop();
+        ctx_server.queue_results.terminate();
        llama_backend_free();
    };

@@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) {

    if (!ctx_server.load_model(params)) {
        clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }
@@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) {
    ctx_server.queue_tasks.start_loop();

    clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();

    return 0;
 }
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
@@ -49,6 +49,26 @@ def test_embedding_multiple():
        assert len(d['embedding']) > 1


+def test_embedding_multiple_with_fa():
+    server = ServerPreset.bert_bge_small_with_fa()
+    server.pooling = 'last'
+    server.start()
+    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "a "*253,
+            "b "*254,
+            "c "*255,
+            "d "*256,
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
@pytest.mark.parametrize(
    "input,is_multi_prompt",
    [
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -323,6 +323,21 @@ class ServerPreset:
        server.server_embeddings = True
        return server

+    @staticmethod
+    def bert_bge_small_with_fa() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 1024
+        server.n_batch = 300
+        server.n_ubatch = 300
+        server.n_slots = 2
+        server.fa = True
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
    @staticmethod
    def tinyllama_infill() -> ServerProcess:
        server = ServerProcess()
--- a/examples/server/webui/src/components/ChatScreen.tsx
+++ b/examples/server/webui/src/components/ChatScreen.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useMemo, useRef, useState } from 'react';
+import { useEffect, useMemo, useState } from 'react';
 import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
 import ChatMessage from './ChatMessage';
 import { CanvasType, Message, PendingMessage } from '../utils/types';
@@ -6,6 +6,7 @@ import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
 import CanvasPyInterpreter from './CanvasPyInterpreter';
 import StorageUtils from '../utils/storage';
 import { useVSCodeContext } from '../utils/llama-vscode';
+import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';

 /**
 * A message display is a message node with additional information for rendering.
@@ -99,7 +100,8 @@ export default function ChatScreen() {
    canvasData,
    replaceMessageAndGenerate,
  } = useAppContext();
-  const textarea = useOptimizedTextarea(prefilledMsg.content());
+
+  const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());

  const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
  // TODO: improve this when we have "upload file" feature
@@ -248,14 +250,16 @@ export default function ChatScreen() {
        </div>

        {/* chat input */}
-        <div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
+        <div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
          <textarea
-            className="textarea textarea-bordered w-full"
+            // Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
+            // Large screens (lg:): Disable manual resize, apply max-height for autosize limit
+            className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
            placeholder="Type a message (Shift+Enter to add a new line)"
            ref={textarea.ref}
+            onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
            onKeyDown={(e) => {
              if (e.nativeEvent.isComposing || e.keyCode === 229) return;
-              if (e.key === 'Enter' && e.shiftKey) return;
              if (e.key === 'Enter' && !e.shiftKey) {
                e.preventDefault();
                sendNewMessage();
@@ -263,7 +267,11 @@ export default function ChatScreen() {
            }}
            id="msg-input"
            dir="auto"
+            // Set a base height of 2 rows for mobile views
+            // On lg+ screens, the hook will calculate and set the initial height anyway
+            rows={2}
          ></textarea>
+
          {isGenerating(currConvId ?? '') ? (
            <button
              className="btn btn-neutral ml-2"
@@ -286,43 +294,3 @@ export default function ChatScreen() {
    </div>
  );
 }
-
-export interface OptimizedTextareaValue {
-  value: () => string;
-  setValue: (value: string) => void;
-  focus: () => void;
-  ref: React.RefObject<HTMLTextAreaElement>;
-}
-
-// This is a workaround to prevent the textarea from re-rendering when the inner content changes
-// See https://github.com/ggml-org/llama.cpp/pull/12299
-function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
-  const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
-  const textareaRef = useRef<HTMLTextAreaElement>(null);
-
-  useEffect(() => {
-    if (textareaRef.current && savedInitValue) {
-      textareaRef.current.value = savedInitValue;
-      setSavedInitValue('');
-    }
-  }, [textareaRef, savedInitValue, setSavedInitValue]);
-
-  return {
-    value: () => {
-      return textareaRef.current?.value ?? savedInitValue;
-    },
-    setValue: (value: string) => {
-      if (textareaRef.current) {
-        textareaRef.current.value = value;
-      }
-    },
-    focus: () => {
-      if (textareaRef.current) {
-        // focus and move the cursor to the end
-        textareaRef.current.focus();
-        textareaRef.current.selectionStart = textareaRef.current.value.length;
-      }
-    },
-    ref: textareaRef,
-  };
-}
--- a/examples/server/webui/src/components/useChatTextarea.ts
+++ b/examples/server/webui/src/components/useChatTextarea.ts
@@ -0,0 +1,96 @@
+import { useEffect, useRef, useState, useCallback } from 'react';
+
+// Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint)
+const LARGE_SCREEN_MQ = '(min-width: 1024px)';
+
+// Calculates and sets the textarea height based on its scrollHeight
+const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => {
+  if (!textarea) return;
+
+  // Only perform auto-sizing on large screens
+  if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
+    // On small screens, reset inline height and max-height styles.
+    // This allows CSS (e.g., `rows` attribute or classes) to control the height,
+    // and enables manual resizing if `resize-vertical` is set.
+    textarea.style.height = ''; // Use 'auto' or '' to reset
+    textarea.style.maxHeight = '';
+    return; // Do not adjust height programmatically on small screens
+  }
+
+  const computedStyle = window.getComputedStyle(textarea);
+  // Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
+  const currentMaxHeight = computedStyle.maxHeight;
+
+  // Temporarily remove max-height to allow scrollHeight to be calculated correctly
+  textarea.style.maxHeight = 'none';
+  // Reset height to 'auto' to measure the actual scrollHeight needed
+  textarea.style.height = 'auto';
+  // Set the height to the calculated scrollHeight
+  textarea.style.height = `${textarea.scrollHeight}px`;
+  // Re-apply the original max-height from CSS to enforce the limit
+  textarea.style.maxHeight = currentMaxHeight;
+};
+
+// Interface describing the API returned by the hook
+export interface ChatTextareaApi {
+  value: () => string;
+  setValue: (value: string) => void;
+  focus: () => void;
+  ref: React.RefObject<HTMLTextAreaElement>;
+  onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
+}
+
+// This is a workaround to prevent the textarea from re-rendering when the inner content changes
+// See https://github.com/ggml-org/llama.cpp/pull/12299
+// combined now with auto-sizing logic.
+export function useChatTextarea(initValue: string): ChatTextareaApi {
+  const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
+  const textareaRef = useRef<HTMLTextAreaElement>(null);
+
+  // Effect to set initial value and height on mount or when initValue changes
+  useEffect(() => {
+    const textarea = textareaRef.current;
+    if (textarea) {
+      if (typeof savedInitValue === 'string' && savedInitValue.length > 0) {
+        textarea.value = savedInitValue;
+        // Call adjustTextareaHeight - it will check screen size internally
+        setTimeout(() => adjustTextareaHeight(textarea), 0);
+        setSavedInitValue(''); // Reset after applying
+      } else {
+        // Adjust height even if there's no initial value (for initial render)
+        setTimeout(() => adjustTextareaHeight(textarea), 0);
+      }
+    }
+  }, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue
+
+  const handleInput = useCallback(
+    (event: React.FormEvent<HTMLTextAreaElement>) => {
+      // Call adjustTextareaHeight on every input - it will decide whether to act
+      adjustTextareaHeight(event.currentTarget);
+    },
+    []
+  );
+
+  return {
+    // Method to get the current value directly from the textarea
+    value: () => {
+      return textareaRef.current?.value ?? '';
+    },
+    // Method to programmatically set the value and trigger height adjustment
+    setValue: (value: string) => {
+      const textarea = textareaRef.current;
+      if (textarea) {
+        textarea.value = value;
+        // Call adjustTextareaHeight - it will check screen size internally
+        setTimeout(() => adjustTextareaHeight(textarea), 0);
+      }
+    },
+    focus: () => {
+      if (textareaRef.current) {
+        textareaRef.current.focus();
+      }
+    },
+    ref: textareaRef,
+    onInput: handleInput,
+  };
+}
--- a/examples/server/webui/src/utils/llama-vscode.ts
+++ b/examples/server/webui/src/utils/llama-vscode.ts
@@ -1,6 +1,6 @@
 import { useEffect, useState } from 'react';
 import { MessageExtraContext } from './types';
-import { OptimizedTextareaValue } from '../components/ChatScreen';
+import { ChatTextareaApi } from '../components/useChatTextarea.ts';

 // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
 // Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,7 +15,7 @@ interface SetTextEvData {
 * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n  return 123' }, '*');
 */

-export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
+export const useVSCodeContext = (textarea: ChatTextareaApi) => {
  const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
    null
  );
--- a/examples/server_embd.py
+++ b/examples/server_embd.py
@@ -15,7 +15,7 @@ async def main():
    model_url = "http://127.0.0.1:6900"
    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
        url= f"{model_url}/embedding",
-        json= {"content": str(0)*1024}
+        json= {"content": "a "*1022}
    ) for i in range(n)])

    for response in responses:
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6721,8 +6721,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
    ggml_vec_dot_t    const kq_vec_dot     = ggml_get_type_traits_cpu(k->type)->vec_dot;
    ggml_to_float_t   const v_to_float     = ggml_get_type_traits(v->type)->to_float;

-    GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
-    GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+    GGML_ASSERT((                            q_to_vec_dot) && "fattn: unsupported K-type");
+    GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float  ) && "fattn: unsupported V-type");

    // loop over n_batch and n_head
    for (int ir = ir0; ir < ir1; ++ir) {
@@ -6818,10 +6818,14 @@ static void ggml_compute_forward_flash_attn_ext_f16(
                    vs = expf(s - M);
                }

-                v_to_float(v_data, V32, DV);
-
                // V += v*expf(s - M)
-                ggml_vec_mad_f32(DV, VKQ32, V32, vs);
+                if (v_to_float) {
+                    v_to_float(v_data, V32, DV);
+                    ggml_vec_mad_f32(DV, VKQ32, V32, vs);
+                } else {
+                    // V is F32
+                    ggml_vec_mad_f32(DV, VKQ32, (const float *) v_data, vs);
+                }
            }

            S = S*ms + vs; // scale and increment sum with partial sum
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -10,6 +10,13 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }

+static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
+
+    *dsti = *xi;
+}
+
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    half * dsti = (half *) cdsti;
@@ -386,6 +393,16 @@ static void ggml_cpy_f32_f32_cuda(
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

+static void ggml_cpy_f32_bf16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+}
+
 static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -581,6 +598,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
@@ -634,6 +653,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
        return nullptr;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3079,6 +3079,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
+                    return true;
+                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1345,6 +1345,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
+            if (op->src[0]->ne[0] == 32) {
+                // head size == 32 (e.g. bert-bge-small)
+                // TODO: not sure if it is worth adding kernels for this size
+                return false;
+            }
            if (op->src[1]->type != op->src[2]->type) {
                return false;
            }
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -415,6 +415,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
        unsigned number;
        cl_device_type type;
        char name[128];
+        char version[128];
    };

    enum { NPLAT = 16, NDEV = 16 };
@@ -455,6 +456,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
            d->platform = p;
            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));

            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
                p->default_device = d;
@@ -547,7 +549,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    }

    GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
-    GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
+    GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
    if (default_device->type != CL_DEVICE_TYPE_GPU) {
        GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
    }
@@ -556,9 +558,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    dev_ctx->device = default_device->id;
    backend_ctx->device = default_device->id;

-    if (strstr(default_device->name, "Adreno")) {
+    if (strstr(default_device->name, "Adreno") ||
+        strstr(default_device->name, "Qualcomm") ||
+        strstr(default_device->version, "Adreno")) {
        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+        // Usually device version contains the detailed device name
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
+        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
+            backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+        }

        // Use wave size of 64 for all Adreno GPUs.
        backend_ctx->adreno_wave_size = 64;
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -372,9 +372,14 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
    SYCL_CHECK(
        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+    // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
+    // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, data, size)
+        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
                             .wait()));
+    free(host_buf);
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -116,6 +116,7 @@ class Keys:
        RESIDUAL_SCALE                    = "{arch}.residual_scale"
        EMBEDDING_SCALE                   = "{arch}.embedding_scale"
        TOKEN_SHIFT_COUNT                 = "{arch}.token_shift_count"
+        INTERLEAVE_MOE_LAYER_STEP         = "{arch}.interleave_moe_layer_step"

    class Attention:
        HEAD_COUNT                   = "{arch}.attention.head_count"
@@ -227,6 +228,7 @@ class GGUFType:

 class MODEL_ARCH(IntEnum):
    LLAMA            = auto()
+    LLAMA4           = auto()
    DECI             = auto()
    FALCON           = auto()
    BAICHUAN         = auto()
@@ -431,6 +433,7 @@ class MODEL_TENSOR(IntEnum):

 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA:            "llama",
+    MODEL_ARCH.LLAMA4:           "llama4",
    MODEL_ARCH.DECI:             "deci",
    MODEL_ARCH.FALCON:           "falcon",
    MODEL_ARCH.BAICHUAN:         "baichuan",
@@ -654,6 +657,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
+    MODEL_ARCH.LLAMA4: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
    MODEL_ARCH.DECI: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -746,6 +746,9 @@ class GGUFWriter:
    def add_token_shift_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)

+    def add_interleave_moe_layer_step(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value)
+
    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -139,6 +139,16 @@ class LazyBase(ABC, metaclass=LazyMeta):

            if isinstance(res, cls._tensor_type):
                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
+            elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res):
+                # share the evaluation between lazy tuple elements
+                shared_args: list = [args, None]
+
+                def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase:
+                    assert len(a) == 2
+                    if a[1] is None:
+                        a[1] = fn(*a[0], **kw)
+                    return a[1][i]
+                return tuple(cls(meta=cls.eager_to_meta(res[i]), args=(shared_args, i), kwargs=kwargs, func=eager_tuple_element) for i in range(len(res)))
            else:
                del res  # not needed
                # non-tensor return likely relies on the contents of the args
--- a/include/llama.h
+++ b/include/llama.h
@@ -110,6 +110,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
    };

    enum llama_rope_type {
--- a/models/ggml-vocab-llama4.gguf.inp
+++ b/models/ggml-vocab-llama4.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-llama4.gguf.out
+++ b/models/ggml-vocab-llama4.gguf.out
@@ -0,0 +1,46 @@
+ 1190 220 32 220 18215 7112
+ 50 16800 258
+
+ 220
+ 256
+ 277
+ 197
+ 198
+ 368
+ 2946
+ 3271
+ 19873 3817
+ 39715 3817
+ 19873 7353
+ 39715 7353
+ 39715 7353 13
+ 19873 24 3817 13
+ 39715 24 3817 13
+ 544 373 9522 112 247 26 36315
+ 99 39923 220 35 9607 21498 21470 3679 9433
+ 1595 7653 633 79829 34051 1636
+ 8755 102595 115960 21125 148305 96819 102816 39048 14105 22528 160234
+ 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 330 7384 88230 511 947 1492 3742 7233 21
+ 19873
+ 39715
+ 220 39715
+ 256 39715
+ 277 39715
+ 277 39715 198 277 39715
+ 330
+ 198 319
+ 19 7359
+ 19873 24 386 87799 13 2403 583 650 51358 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645
+ 17931 4959
+ 31
+ 1922
+ 12325
+ 12325 31
+ 12325 1922
+ 12325 12325
+ 12325 12325 31
+ 12325 12325 1922
+ 12325 12325 12325
+ 47 19811 12077
+ 3260 3579
+ 198 7283 51499 191231 20192 3271 3322 9287 2143 17860 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 9522 112 247 172394 247 220 31 220 1922 220 12325 220 12325 31 220 12325 1922 220 12325 12325 220 12325 12325 31 220 12325 12325 1922 220 31 26 31 220 31 396 31 220 31 1043 31 117131 102595 115960 21125 148305 96819 102816 80883 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 79745 150278 117079 633 79829 34051 1636 25611 41990 109428 1488 91054 24072 17931 4959 29795 9296 16517 1806 481 96 1386 36633 1609 24 481 1109 650 5074 43 481 57 702 5074 27088 2170 536 24 481 48 650 1933 1696 30262 43 1665 19 32818 262 27236 56
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,            "llama"            },
+    { LLM_ARCH_LLAMA4,           "llama4"           },
    { LLM_ARCH_DECI,             "deci"             },
    { LLM_ARCH_FALCON,           "falcon"           },
    { LLM_ARCH_GROK,             "grok"             },
@@ -114,6 +115,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
    { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
    { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },

    { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
@@ -233,6 +235,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
        },
    },
+    {
+        LLM_ARCH_LLAMA4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
+        },
+    },
    {
        LLM_ARCH_DECI,
        {
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@

 enum llm_arch {
    LLM_ARCH_LLAMA,
+    LLM_ARCH_LLAMA4,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
@@ -118,6 +119,7 @@ enum llm_kv {
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
+    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -61,6 +61,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
    { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
+    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -174,6 +175,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_YANDEX;
    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA4;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -608,7 +611,16 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<role>ASSISTANT</role>";
        }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+        // Llama 4
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+        }
+        if (add_ass) {
+            ss << "<|header_start|>assistant<|header_end|>\n\n";
+        }
+    }  else {
        // template not supported
        return -1;
    }
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -40,6 +40,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MEGREZ,
    LLM_CHAT_TEMPLATE_YANDEX,
    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -59,6 +59,22 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
    }
 }

+void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->pos && attn_scale) {
+        const int64_t n_tokens = ubatch->n_tokens;
+
+        std::vector<float> attn_scale_data(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const float pos = ubatch->pos[i];
+            attn_scale_data[i] = std::log(
+                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+            ) * f_attn_temp_scale + 1.0;
+        }
+
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+    }
+}
+
 void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
    if (pos_bucket) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -458,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
                        }

                        // may need to cut off old tokens for sliding window
+                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
                        if (data_swa) {
-                            if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                f = -INFINITY;
+                            if (hparams.n_attn_chunk) {
+                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                                    f = -INFINITY;
+                                }
+                            } else {
+                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
+                                    f = -INFINITY;
+                                }
                            }
                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                        }
@@ -812,8 +836,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
               float   w_scale,
         llama_expert_gating_func_type gating_op,
                 int   il) const {
-    int64_t n_embd = cur->ne[0];
-    int64_t n_tokens = cur->ne[1];
+    const int64_t n_embd   = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN

    ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
    cb(logits, "ffn_moe_logits", il);
@@ -841,6 +866,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(selection_probs, "ffn_moe_probs_biased", il);
    }

+    // llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
+    // see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
+    if (arch == LLM_ARCH_LLAMA4) {
+        selection_probs = logits;
+    }
+
    // select experts
    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
    cb(selected_experts->src[0], "ffn_moe_argsort", il);
@@ -867,6 +898,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    }

    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
+    if (weight_before_ffn) {
+        // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
+        ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
+        repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+        cur = ggml_mul(ctx0, repeated, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
+
    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
    cb(up, "ffn_moe_up", il);

@@ -894,7 +934,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);

-    experts = ggml_mul(ctx0, experts, weights);
+    if (!weight_before_ffn) {
+        experts = ggml_mul(ctx0, experts, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }

    // aggregate experts
    ggml_tensor * moe_out = nullptr;
@@ -914,6 +957,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        moe_out = ggml_cont(ctx0, moe_out);
    }

+    cb(moe_out, "ffn_moe_out", il);
+
    return moe_out;
 }

@@ -981,6 +1026,19 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
    return cur;
 }

+ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+
+    auto & cur = inp->attn_scale;
+
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    ggml_set_input(cur);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
    auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);

@@ -1157,6 +1215,15 @@ ggml_tensor * llm_graph_context::build_attn_mha(
            v = ggml_transpose(ctx0, v);
        }

+        // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn)
+        if (k->type == GGML_TYPE_F32) {
+            k = ggml_cast(ctx0, k, GGML_TYPE_F16);
+        }
+
+        if (v->type == GGML_TYPE_F32) {
+            v = ggml_cast(ctx0, v, GGML_TYPE_F16);
+        }
+
        cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);

--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -100,6 +100,23 @@ public:
    const int64_t n_pos_per_token = 1;
 };

+// temperature tuning, used by llama4
+class llm_graph_input_attn_temp : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    virtual ~llm_graph_input_attn_temp() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
+
+    const int64_t n_pos_per_token = 1;
+
+    const uint32_t n_attn_temp_floor_scale;
+    const float    f_attn_temp_scale;
+};
+
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
@@ -470,6 +487,7 @@ struct llm_graph_context {

    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
+    ggml_tensor * build_inp_attn_scale() const;
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -112,6 +112,14 @@ struct llama_hparams {
    bool use_alibi     = false;
    bool attn_soft_cap = false;

+    uint32_t n_moe_layer_step        = 0;
+    bool     use_kq_norm             = true;
+    uint32_t n_attn_chunk            = 0;
+    // values below seems to be fixed on llama4
+    uint32_t n_no_rope_layer_step    = 4;
+    uint32_t n_attn_temp_floor_scale = 8192;
+    float    f_attn_temp_scale       = 0.1;
+
    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -90,6 +90,8 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
        case LLM_TYPE_27B:           return "27B";
        case LLM_TYPE_290B:          return "290B";
+        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
+        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
        default:                     return "?B";
    }
 }
@@ -550,6 +552,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    }
                }
            } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
+                hparams.n_swa_pattern = 4;    // pattern: 3 chunked - 1 full
+                hparams.n_attn_chunk  = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+                hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+                switch (hparams.n_expert) {
+                    case 16:  type = LLM_TYPE_17B_16E; break;
+                    case 128: type = LLM_TYPE_17B_128E; break;
+                    default:  type = LLM_TYPE_UNKNOWN;
+                }
+
+                if (type == LLM_TYPE_17B_128E) {
+                    hparams.use_kq_norm = false;
+                }
+            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1690,6 +1711,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
+            case LLM_ARCH_LLAMA4:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+                    for (int i = 0; i < n_layer; ++i) {
+                        bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        if (is_moe_layer) {
+                            int n_ff_exp = hparams.n_ff_exp;
+
+                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+                            // Shared expert
+                            const int64_t n_ff_shexp = n_ff_exp;
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
+                        } else {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        }
+                    }
+                } break;
            case LLM_ARCH_DECI:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4203,12 +4274,22 @@ struct llm_build_llama : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

+        // temperature tuning
+        ggml_tensor * inp_attn_scale = nullptr;
+        if (arch == LLM_ARCH_LLAMA4) {
+            inp_attn_scale = build_inp_attn_scale();
+        }
+
        auto * inp_attn = build_attn_inp_kv_unified();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

+            bool use_rope = arch == LLM_ARCH_LLAMA4
+                ? (il + 1) % hparams.n_no_rope_layer_step != 0
+                : true;
+
            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
@@ -4246,25 +4327,38 @@ struct llm_build_llama : public llm_graph_context {
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-                Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
+                if (use_rope) {
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );

-                Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                } else if (inp_attn_scale) {
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                }

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

+                if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+                    // Llama4TextL2Norm
+                    Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
+                    cb(Qcur, "Qcur_normed", il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
                cur = build_attn(inp_attn, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1) {
@@ -4282,7 +4376,7 @@ struct llm_build_llama : public llm_graph_context {
            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

-            // feed-forward network
+            // feed-forward network (non-MoE)
            if (model.layers[il].ffn_gate_inp == nullptr) {

                cur = build_norm(ffn_inp,
@@ -4297,6 +4391,38 @@ struct llm_build_llama : public llm_graph_context {
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
+
+            } else if (arch == LLM_ARCH_LLAMA4) {
+                // llama4 MoE
+                ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                        il);
+
+                // Shared experts
+                ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+                    model.layers[il].ffn_up_shexp,   NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shexp_out, "ffn_moe_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, shexp_out);
+                cb(cur, "ffn_moe_out_merged", il);
+
            } else {
                // MoE branch
                cur = build_norm(ffn_inp,
@@ -12091,6 +12217,7 @@ llm_graph_result_ptr llama_model::build_graph(

    switch (arch) {
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
@@ -12440,6 +12567,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -86,6 +86,8 @@ enum llm_type {
    LLM_TYPE_57B_A14B,
    LLM_TYPE_27B,
    LLM_TYPE_290B,
+    LLM_TYPE_17B_16E, // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
 };

 struct llama_layer_posnet {
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1616,7 +1616,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "megrez") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else if (
-                tokenizer_pre == "gpt-4o") {
+                    tokenizer_pre == "gpt-4o" ||
+                    tokenizer_pre == "llama4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                clean_spaces = false;
            } else if (
Author	SHA1	Message	Date
Sigbjørn Skjæret	7538246e7c	cuda : add f32 to bf16 copy op (#12806 ) This allows BF16 KV-cache on CUDA.	2025-04-08 23:21:31 +02:00
Matt Clayton	b32efad2bc	llava: improve clip_ctx destructor to not memleak load_image_size (#12834 )	2025-04-08 22:01:58 +02:00
Georgi Gerganov	a19b5cef16	llama : fix FA when KV cache is not used (i.e. embeddings) (#12825 ) * ggml : FA supports F32 V * graph : cast KV to F16 when the KV cache is not used ggml-ci * server : add test that exercises embeddings with FA enabled ggml-ci	2025-04-08 19:54:51 +03:00
Xuan-Son Nguyen	78a1ba0a4f	server : fix thread.join() on exit (#12831 )	2025-04-08 18:37:06 +02:00
dm4	2dabf759e7	llava: add more helper functions to check projector types in clip context (#12824 ) Signed-off-by: dm4 <sunrisedm4@gmail.com>	2025-04-08 15:49:13 +02:00
Prajwal B Mehendarkar	1d343b4069	arg : Including limits file on AIX (#12822 )	2025-04-08 14:30:59 +02:00
characharm	8ca6e1c3a4	server : webui : Improve Chat Input with Auto-Sizing Textarea (#12785 ) * Update ChatScreen.tsx * useAutosizeTextarea.ts useAutosizeTextarea to encapsulate the logic. * Implement responsive auto-sizing chat textarea Replaces the manual textarea resizing with an automatic height adjustment based on content. - `useChatTextarea` hook to manage textarea state and auto-sizing logic via refs, preserving the optimization - Textarea now grows vertically up to a maximum height (`lg:max-h-48`) on large screens (lg breakpoint and up). - Disables auto-sizing and enables manual vertical resizing (`resize-vertical`) on smaller screens for better mobile usability. - Aligns the "Send" button to the bottom of the textarea (`items-end`) for consistent positioning during resize. * -update compressed index.html.gz after npm run build -refactor: replace OptimizedTextareaValue with AutosizeTextareaApi in VSCode context hook * chore: normalize line endings to LF refactor: AutosizeTextareaApi -> chatTextareaApi * refactor: Rename interface to PascalCase --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2025-04-08 11:14:59 +02:00
Neo Zhang Jianyu	656babd6c2	Revert "sycl:remove redundant memcopy in function ggml_backend_sycl_buffer_set_tensor" (#12812 ) * Revert "sycl: remove redundant memcopy in function ggml_backend_sycl_buffer_s…" This reverts commit `518a01480e`. * Update ggml/src/ggml-sycl/ggml-sycl.cpp * Update ggml/src/ggml-sycl/ggml-sycl.cpp * rm tail space	2025-04-08 15:03:21 +08:00
compilade	a226bc7a9a	gguf-py : support lazy tensor splitting (#12809 ) * gguf-py : support lazy tensor splitting Splitting usually involves returning tuples of tensors, which need to be handled properly to avoid early eager evaluation. * gguf-py : fix flake8 lint	2025-04-08 09:03:07 +02:00
Xuan-Son Nguyen	1466621e73	llama : Support llama 4 text-only (#12791 ) * llama4 conversion * initial support, no chat template * clean up a bit * fix tokenizer conversion * correct hparams * try this * fix shexp * ffn_inp_normed * chat template * clean up model conversion * add_bos * add scale_before_ffn * fix order * weight_before_ffn * llm_graph_input_attn_temp * add chunk attn mask * build_inp_attn_scale() * add comment about ggml_repeat * clarify comments * fix build	2025-04-07 23:06:44 +02:00
lhez	82974011f3	opencl: better identify Adreno GPU (#12760 )	2025-04-07 13:22:54 -07:00
stduhpf	4ccea213bc	hellaswag: display estimated score confidence interval (#12797 )	2025-04-07 18:47:08 +03:00