Compare commits


11 Commits
b8244 ... b8255

Author SHA1 Message Date
Georgi Gerganov
43e1cbd6c1 models : fix assert in mamba2 graph (#20270) 2026-03-09 13:15:15 +02:00
Georgi Gerganov
107d599952 server : add kill switch when server is stuck (#20277) 2026-03-09 10:33:12 +02:00
Aman Gupta
e8bbc736cb ggml-cuda: disable gdn for musa (#20278) 2026-03-09 16:15:36 +08:00
ddh0
b518195101 llama-quant : left-align tensor names in output (#20117) 2026-03-09 09:28:41 +02:00
Aman Gupta
e2763a6723 contributing: limit open PRs for new contributors to 1 (#20036) 2026-03-09 15:05:34 +08:00
Bertay Eren
0beb8db3a0 ggml-vulkan: add SGN operator, auto-generate Vulkan.csv and ops.md (#20219) 2026-03-09 07:24:16 +01:00
Ruben Ortlam
b2f460bd3c vulkan: skip zero size tensors in backend copies (#20233) 2026-03-09 07:23:45 +01:00
Michael Huang
5f4cdac385 cuda : display total and free VRAM capacity during device initialization (#20185) 2026-03-09 12:45:43 +08:00
Aaron Teo
ae87863dc1 llama-bench: introduce -hf and -hff flags & use --mmap 1 by default (#20211) 2026-03-09 09:05:44 +08:00
Piotr Wilkin (ilintar)
97c64fbdbd PEG parser for LFM2 (#20251)
* PEG parser for LFM2

* Simplify using python_value()
2026-03-09 01:11:22 +01:00
Georgi Gerganov
d417bc43dd server : do not create checkpoints right after mtmd chunks (#20232) 2026-03-08 22:16:46 +02:00
17 changed files with 473 additions and 133 deletions

View File

@@ -39,6 +39,7 @@ Before submitting your PR:
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability

View File

@@ -476,6 +476,74 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const nlohmann::json & tools,
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
auto tool_choices = choice();
for (const auto & tool_def : tools) {
if (!tool_def.contains("function")) {
continue;
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
auto args = eps();
if (params.contains("properties") && !params["properties"].empty()) {
auto arg_choice = choice();
for (const auto & el : params["properties"].items()) {
const std::string & prop_name = el.key();
const auto & prop_def = el.value();
bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""),
literal("'") + tool_arg_string_value(json_string_content()) + literal("'")
});
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(python_value());
}
// Full argument: name="value" or name=value
auto arg_rule = tool_arg(
tool_arg_open(eps()) +
tool_arg_name(arg_name_parser) +
literal("=") +
arg_value_parser +
tool_arg_close(eps())
);
arg_choice |= arg_rule;
}
args = arg_choice + zero_or_more("," + space() + arg_choice);
}
auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
space() + tool_args(args) + space() + tool_close(literal(")"))
);
tool_choices |= rule("tool-" + name, tool_parser);
}
if (parallel_tool_calls) {
return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
}
return "[" + space() + tool_choices + space() + "]";
}
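As a rough illustration (these pairs are illustrative and not copied from the test suite), the grammar built above is meant to accept bracketed python-style calls and map them to JSON arguments along these lines:
#include <utility>  // std::pair

// Illustrative only: hypothetical inputs the python-style grammar above should
// accept, paired with the arguments JSON the resulting tool call should carry.
static const std::pair<const char *, const char *> python_style_examples[] = {
    // { raw tool-call text,           expected arguments JSON for that call }
    { "[get_time(city=\"Paris\")]",    "{\"city\": \"Paris\"}" },  // double-quoted string value
    { "[get_time(city='Paris')]",      "{\"city\": \"Paris\"}" },  // single-quoted string value
    { "[special_function(arg1=1)]",    "{\"arg1\": 1}"         },  // non-string value via python_value()
};
// With parallel_tool_calls enabled, one bracket may hold several calls,
// e.g. [get_time(city="Paris"), special_function(arg1=1)].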
// Helper: Parse dot notation key into prefix and field name
static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
auto dot_pos = key.find('.');

View File

@@ -112,6 +112,11 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool parallel_tool_calls,
bool force_tool_calls);
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
bool parallel_tool_calls);
private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,

View File

@@ -1274,6 +1274,82 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
);
auto content = p.content(p.until(TOOL_CALL_START));
return reasoning + content + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
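For orientation, a hypothetical raw assistant message that exercises the three parts described in the format comment above (the tool call matches the tests further down; the wording is made up):
// Hypothetical LFM2 output: reasoning, plain content, then a python-style
// tool call between the <|tool_call_start|>/<|tool_call_end|> markers.
static const char * lfm2_example_output =
    "<think>The user wants the local time, so I should call get_time.</think>"
    "Let me check that for you."
    "<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>";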
namespace workaround {
// if first message is system and template does not support it, merge it with next message
@@ -1422,6 +1498,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
try {
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;

View File

@@ -47,6 +47,7 @@ Legend:
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -92,7 +93,7 @@ Legend:
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | | ✅ | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |

View File

@@ -1,8 +1,8 @@
"backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -85,8 +85,8 @@
"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
@@ -13591,3 +13591,16 @@
"Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan"
"Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","Vulkan"
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","Vulkan"

View File

@@ -205,7 +205,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
total_vram += prop.totalGlobalMem;
}
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
__func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
total_vram = 0;
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
for (int id = 0; id < info.device_count; ++id) {
@@ -243,6 +250,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
// cudaMemGetInfo returns info for the current device
size_t free_mem;
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -257,22 +270,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
device_vmm ? "yes" : "no", prop.warpSize);
device_vmm ? "yes" : "no", prop.warpSize,
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -4976,9 +4992,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_GATED_DELTA_NET:
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
// TODO: enable once the MUSA compiler issue is resolved: https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
#ifdef GGML_USE_MUSA
return false;
#else
return true;
#endif // GGML_USE_MUSA
case GGML_OP_FLASH_ATTN_EXT:
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
case GGML_OP_CROSS_ENTROPY_LOSS:
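A minimal standalone host-side sketch (separate from the ggml code above) using the same CUDA runtime calls the diff relies on: cudaGetDeviceProperties for the total VRAM and cudaMemGetInfo for the free amount on the currently selected device.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        cudaSetDevice(id);  // cudaMemGetInfo reports the current device
        size_t free_mem = 0, total_mem = 0;
        cudaMemGetInfo(&free_mem, &total_mem);
        printf("Device %d: %s, VRAM: %zu MiB (%zu MiB free)\n",
               id, prop.name,
               (size_t)(prop.totalGlobalMem / (1024 * 1024)),
               free_mem / (1024 * 1024));
    }
    return 0;
}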

View File

@@ -763,6 +763,7 @@ struct vk_device_struct {
vk_pipeline pipeline_ceil[2];
vk_pipeline pipeline_floor[2];
vk_pipeline pipeline_trunc[2];
vk_pipeline pipeline_sgn[2];
vk_pipeline pipeline_add1_f16_f16;
vk_pipeline pipeline_add1_f16_f32;
@@ -4393,6 +4394,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_UNARY(ceil)
CREATE_UNARY(floor)
CREATE_UNARY(trunc)
CREATE_UNARY(sgn)
#undef CREATE_UNARY
#define CREATE_UNARY_RTE(name) \
@@ -9281,6 +9283,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_TRUNC:
return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16];
case GGML_UNARY_OP_SGN:
return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16];
default:
break;
}
@@ -12875,6 +12879,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
case GGML_UNARY_OP_XIELU:
@@ -13253,6 +13258,10 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
uint32_t val32 = (uint32_t)value * 0x01010101;
ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size);
}
@@ -13262,6 +13271,10 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
vk_buffer buf = buf_ctx->dev_buffer;
if (size == 0) {
return;
}
ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
@@ -13269,12 +13282,20 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons
VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
if (size == 0) {
return;
}
vk_buffer buf = buf_ctx->dev_buffer;
ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
}
static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_nbytes(src) == 0) {
return true;
}
if (ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
@@ -13464,6 +13485,10 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context cpy_ctx;
@@ -13507,6 +13532,10 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
if (size == 0) {
return;
}
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
@@ -13533,9 +13562,14 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
}
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
// Skip zero-size tensors
if (ggml_nbytes(src) == 0) {
return true;
}
if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) {
return false;
}
@@ -14975,6 +15009,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
case GGML_UNARY_OP_SGN:
return ggml_is_contiguous(op->src[0]) &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
@@ -16141,6 +16176,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
case GGML_UNARY_OP_TRUNC:
tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]);
break;
case GGML_UNARY_OP_SGN:
tensor_clone = ggml_sgn(ggml_ctx, src_clone[0]);
break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ABORT("fatal error");

View File

@@ -0,0 +1,21 @@
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
// flatten the 3D dispatch into a linear element index (262144 = 512 * 512)
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
data_d[i] = D_TYPE(sign(float(data_a[i])));
}
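For reference, GLSL sign() returns -1.0, 0.0, or 1.0; an equivalent scalar sketch in C++ of what the shader computes per element:
// Scalar reference for the shader above:
// negative -> -1, zero -> 0, positive -> +1
static float sgn_ref(float x) {
    return (x > 0.0f) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
}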

View File

@@ -871,6 +871,8 @@ void process_shaders() {
string_to_spv("elu_f32", "elu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("sgn_f16", "sgn.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("sgn_f32", "sgn.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

View File

@@ -6,7 +6,7 @@
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "You can use the following tools: <|tool_list_start|>[" -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
@@ -17,7 +17,6 @@
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{{- '**IMPORTANT**: The syntax for calling the tools is: <|tool_call_start|>JSON tool call goes here<|tool_call_end|>. Please only call tools in the specified manner.' -}}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
@@ -30,18 +29,9 @@
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- elif message["role"] == "assistant" -%}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<|tool_call_start|>\n{"name": "' + tool_call.name + '", "arguments": ' + (tool_call.arguments if tool_call.arguments is string else tool_call.arguments | tojson) + '}\n<|tool_call_end|>\n' }}
{%- endfor %}
{%- endif %}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
{%- endif -%}

View File

@@ -1,37 +0,0 @@
{{- bos_token -}}
{%- set system_prompt = "" -%}
{%- set ns = namespace(system_prompt="") -%}
{%- if messages[0]["role"] == "system" -%}
{%- set ns.system_prompt = messages[0]["content"] -%}
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
{%- endif -%}
{%- set ns.system_prompt = ns.system_prompt + tool -%}
{%- if not loop.last -%}
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{%- for message in messages -%}
{{- "<|im_start|>" + message["role"] + "\n" -}}
{%- set content = message["content"] -%}
{%- if content is not string -%}
{%- set content = content | tojson -%}
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}

View File

@@ -778,7 +778,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ml.load_data_for(tensor);
}
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
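The only functional change in this hunk is the printf width flag; a minimal sketch of the difference, using an illustrative tensor name:
#include <cstdio>
int main() {
    // "%36s"  pads on the left  -> right-aligned column (old behaviour)
    // "%-36s" pads on the right -> left-aligned column  (new behaviour)
    printf("[%36s]\n",  "blk.0.attn_q.weight");
    printf("[%-36s]\n", "blk.0.attn_q.weight");
    return 0;
}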

View File

@@ -155,7 +155,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
const auto kv_head = mctx_cur->get_head();
const int64_t n_embd = hparams.n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
@@ -170,7 +169,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
GGML_ASSERT(d_inner % (n_group*d_state) == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

View File

@@ -2387,6 +2387,78 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|>
{
auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with reasoning and content
tst.test("<think>I need to call a function</think>"
"Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Python tool with multiline code in string
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
})
.run();
// Partial tool call (streaming)
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
}
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
{

View File

@@ -20,6 +20,7 @@
#include <unordered_set>
#include "common.h"
#include "download.h"
#include "ggml.h"
#include "llama.h"
@@ -312,6 +313,9 @@ static std::vector<int> parse_int_range(const std::string & s) {
struct cmd_params {
std::vector<std::string> model;
std::vector<std::string> hf_repo;
std::vector<std::string> hf_file;
std::string hf_token;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -351,6 +355,9 @@ struct cmd_params {
static const cmd_params cmd_params_defaults = {
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
/* hf_repo */ {},
/* hf_file */ {},
/* hf_token */ "",
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -372,7 +379,7 @@ static const cmd_params cmd_params_defaults = {
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { false },
/* use_mmap */ { true },
/* use_direct_io */ { false },
/* embeddings */ { false },
/* no_op_offload */ { false },
@@ -393,74 +400,57 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n",
cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
}
printf("\n");
printf("test parameters:\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n",
join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n",
join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n",
join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n",
join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n",
join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n");
printf(" (default: unused)\n");
printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
printf(" (default: unused)\n");
printf(" -hft, --hf-token <token> Hugging Face access token\n");
printf(" (default: value from HF_TOKEN environment variable)\n");
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n", join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n",
join(cmd_params_defaults.no_host, ",").c_str());
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n", join(cmd_params_defaults.no_host, ",").c_str());
printf("\n");
printf(
"Multiple values can be given for each parameter by separating them with ','\n"
@@ -514,6 +504,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.progress = cmd_params_defaults.progress;
params.no_warmup = cmd_params_defaults.no_warmup;
if (const char * env = getenv("HF_TOKEN")) {
params.hf_token = env;
}
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
@@ -531,6 +525,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-hf" || arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_repo.insert(params.hf_repo.end(), p.begin(), p.end());
} else if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.hf_file.insert(params.hf_file.end(), p.begin(), p.end());
} else if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hf_token = argv[i];
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -961,6 +975,44 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
exit(1);
}
if (!params.hf_repo.empty()) {
for (size_t i = 0; i < params.hf_repo.size(); i++) {
common_params_model model;
// step 1: no `-hff` provided, we auto-detect based on the `-hf` flag
if (params.hf_file.empty() || params.hf_file[i].empty()) {
auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1);
}
model.name = params.hf_repo[i];
model.hf_repo = auto_detected.repo;
model.hf_file = auto_detected.ggufFile;
} else {
model.hf_file = params.hf_file[i];
}
// step 2: construct the model cache path
std::string clean_fname = model.hf_repo + "_" + model.hf_file;
string_replace_all(clean_fname, "\\", "_");
string_replace_all(clean_fname, "/", "_");
model.path = fs_get_cache_file(clean_fname);
// step 3: download the model if it does not exist yet
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
bool ok = common_download_model(model, params.hf_token, false);
if (!ok) {
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str());
exit(1);
}
params.model.push_back(model.path);
}
}
// set defaults
if (params.model.empty()) {
params.model = cmd_params_defaults.model;
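A standalone sketch of the cache-name sanitization in "step 2" above: the repo and file names are joined and path separators are flattened to underscores. string_replace_all and fs_get_cache_file are llama.cpp helpers; here a plain std::string loop stands in for them, the repo is taken from the help text above, and the file name is hypothetical.
#include <string>
#include <iostream>

static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string repo = "unsloth/phi-4-GGUF";   // example repo from the -hf help text
    std::string file = "phi-4-Q4_K_M.gguf";    // hypothetical GGUF file name
    std::string clean_fname = repo + "_" + file;
    replace_all(clean_fname, "\\", "_");
    replace_all(clean_fname, "/", "_");
    std::cout << clean_fname << "\n";          // unsloth_phi-4-GGUF_phi-4-Q4_K_M.gguf
    return 0;
}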

View File

@@ -562,7 +562,7 @@ private:
llama_model_ptr model_dft;
bool add_bos_token = true;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
@@ -570,6 +570,7 @@ private:
std::vector<server_slot> slots;
int slots_debug = 0;
int n_empty_consequtive = 0;
std::unique_ptr<server_prompt_cache> prompt_cache;
@@ -2438,6 +2439,8 @@ private:
slot.n_prompt_tokens_cache = 0;
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// check if we should process the image
if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
@@ -2457,6 +2460,8 @@ private:
const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
slot.prompt.tokens.push_back(chunk.get()); // copy
}
do_checkpoint = false; // do not checkpoint right after an image chunk
}
// If using an alora, there may be uncached tokens that come
@@ -2473,8 +2478,6 @@ private:
alora_disabled_id = enabled_loras[0];
}
bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
// make checkpoints only for completion tasks
do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION;
@@ -2626,6 +2629,12 @@ private:
if (batch.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
if (++n_empty_consequtive > 3) {
GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
}
} else {
n_empty_consequtive = 0;
}
int32_t i_next = 0;