rpc : check src buffer when copying tensor (#16421 )

Only dst buffer is guaranteed to be an RPC buffer. Add check for the src one.
rpc : add support for multiple devices (#16276 )
2026-05-04 16:14:06 +00:00 · 2025-10-04 16:22:45 +03:00 · 2025-10-04 12:49:16 +03:00 · 2025-10-04 11:42:56 +02:00 · 2025-10-03 21:51:48 +03:00 · 2025-10-03 21:34:51 +03:00
191 changed files with 2972 additions and 893 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -97,7 +97,7 @@ jobs:
          ctest -L 'main|curl' --verbose --timeout 900

  macOS-latest-cmake-x64:
-    runs-on: macos-13
+    runs-on: macos-15-intel

    steps:
      - name: Clone
@@ -362,11 +362,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-rpc
-          evict-old-files: 1d
+      # - name: ccache
+      #   uses: ggml-org/ccache-action@v1.2.16
+      #   with:
+      #     key: ubuntu-latest-cmake-rpc
+      #     evict-old-files: 1d

      - name: Dependencies
        id: depends
@@ -387,8 +387,8 @@ jobs:
          cd build
          ctest -L main --verbose

-  ubuntu-22-cmake-vulkan:
-    runs-on: ubuntu-22.04
+  ubuntu-24-cmake-vulkan:
+    runs-on: ubuntu-24.04

    steps:
      - name: Clone
@@ -398,20 +398,40 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-22-cmake-vulkan
+          key: ubuntu-24-cmake-vulkan
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+
+      - name: Get latest Vulkan SDK version
+        id: vulkan_sdk_version
+        run: |
+          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
+
+      - name: Cache Vulkan SDK
+        id: cache_vulkan_sdk
+        uses: actions/cache@v4
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+
+      - name: Install Vulkan SDK
+        if: steps.cache_vulkan_sdk.outputs.cache-hit != 'true'
+        id: vulkan_sdk_install
+        run: |
+          mkdir -p vulkan_sdk
+          cd vulkan_sdk
+          curl --no-progress-meter https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz | tar -Jx --strip-components=1

      - name: Build
        id: cmake_build
        run: |
+          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)
@@ -421,6 +441,7 @@ jobs:
        run: |
          cd build
          export GGML_VK_VISIBLE_DEVICES=0
+          export GGML_VK_DISABLE_F16=1
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 4200

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -75,7 +75,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-x64:
-    runs-on: macos-13
+    runs-on: macos-15-intel

    steps:
      - name: Clone
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -34,9 +34,9 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")

-rm -f "$OUT/*.log"
-rm -f "$OUT/*.exit"
-rm -f "$OUT/*.md"
+rm -f $OUT/*.log
+rm -f $OUT/*.exit
+rm -f $OUT/*.md

 sd=`dirname $0`
 cd $sd/../
@@ -607,6 +607,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
+
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

@@ -624,4 +625,6 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run ctest_with_model_release
 fi

+cat $OUT/README.md
+
 exit $ret
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1615,18 +1615,14 @@ static void add_rpc_devices(const std::string & servers) {
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
-    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-    if (!ggml_backend_rpc_add_device_fn) {
-        throw std::invalid_argument("failed to find RPC device add function");
+    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
+    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+    if (!ggml_backend_rpc_add_server_fn) {
+        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
-        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-        if (dev) {
-            ggml_backend_device_register(dev);
-        } else {
-            throw std::invalid_argument("failed to register RPC device");
-        }
+        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
+        ggml_backend_register(reg);
    }
 }

@@ -1932,13 +1928,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
-        {"--swa-checkpoints"}, "N",
-        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+        string_format("max number of context checkpoints to create per slot (default: %d)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
-            params.n_swa_checkpoints = value;
+            params.n_ctx_checkpoints = value;
        }
-    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -75,6 +75,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
    }
    return true;
 }
+
+bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
+    if (!tool_call.is_object() || tool_call.size() != 1) {
+        return false;
+    }
+
+    // Get the tool name (the single key in the object)
+    auto it = tool_call.begin();
+    std::string name = it.key();
+
+    if (name.empty()) {
+        return false;
+    }
+
+    // Get the arguments (the nested object)
+    const json & args_json = it.value();
+    std::string arguments = "";
+
+    if (args_json.is_object()) {
+        arguments = args_json.dump();
+    } else if (args_json.is_string()) {
+        arguments = args_json;
+    } else if (!args_json.is_null()) {
+        // For other types, convert to string representation
+        arguments = args_json.dump();
+    }
+
+    return add_tool_call(name, "", arguments);
+}
 void common_chat_msg_parser::finish() {
    if (!is_partial_ && pos_ != input_.size()) {
        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -64,6 +64,9 @@ class common_chat_msg_parser {
    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
    void finish();

    bool consume_spaces();
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -625,6 +625,7 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
        case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
+        case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
        case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -638,6 +639,7 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
+        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -801,6 +803,7 @@ static std::string apply(
    }
    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
    tmpl_inputs.extra_context = inputs.extra_context;
+    tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
    if (additional_context) {
        tmpl_inputs.extra_context.merge_patch(*additional_context);
    }
@@ -982,6 +985,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
    data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
    return data;
 }
+
+static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                        {"id", {
+                            {"type", "string"},
+                            {"pattern", "^[a-zA-Z0-9]{9}$"},
+                        }},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+        });
+        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+        data.preserved_tokens.push_back("[TOOL_CALLS]");
+    } else {
+        data.grammar_lazy = false;
+        if (!inputs.json_schema.is_null()) {
+            if (!inputs.grammar.empty()) {
+                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+            }
+            data.grammar = json_schema_to_grammar(inputs.json_schema);
+        } else {
+            data.grammar = inputs.grammar;
+        }
+    }
+
+    return data;
+}
+
 static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
@@ -992,6 +1054,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    parse_prefixed_json_tool_call_array(builder, prefix);
 }

+static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("[THINK]", "[/THINK]");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
+    parse_prefixed_json_tool_call_array(builder, prefix);
+}
+
 static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

@@ -1264,6 +1338,75 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
    }
    return data;
 }
+
+static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_APERTUS;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "<|inner_suffix|>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <|tools_prefix|> format
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type",       "object"                                                   },
+                    { "properties",
+                        {
+                            { function.at("name"), function.at("parameters") }
+                        }                                                                        },
+                    { "required",   json::array({ function.at("name") }) },
+                });
+            });
+            auto schema = json{
+                        { "type",     "array"                                                         },
+                        { "items",    schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                        { "minItems", 1                                                               },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                                std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
+                                    "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
+                            });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                            "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
+                            "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
+                "(<\\|tools_prefix\\|>)[\\s\\S]*" });
+        data.preserved_tokens = {
+            "<|system_start|>",
+            "<|system_end|>",
+            "<|developer_start|>",
+            "<|developer_end|>",
+            "<|user_start|>",
+            "<|user_end|>",
+            "<|assistant_start|>",
+            "<|assistant_end|>",
+            "<|inner_prefix|>",
+            "<|inner_suffix|>",
+            "<|tools_prefix|>",
+            "<|tools_suffix|>",
+        };
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
@@ -2323,6 +2466,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
 }

+static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            builder.consume_spaces();
+            if (!builder.try_consume_literal("<|tools_suffix|>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            for (const auto & value : tool_calls_data.json) {
+                if (value.is_object()) {
+                    builder.add_tool_call_short_form(value);
+                }
+            }
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
    // Parse thinking tags first - this handles the main reasoning content
    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2567,6 +2741,11 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_nemotron_v2(tmpl, params);
    }

+    // Apertus format detection
+    if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
+        return common_chat_params_init_apertus(tmpl, params);
+    }
+
    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2595,6 +2774,10 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
    }

+    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
+        return common_chat_params_init_magistral(tmpl, params);
+    }
+
    // Plain handler (no tools)
    if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
        return common_chat_params_init_without_tools(tmpl, params);
@@ -2695,6 +2878,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            common_chat_parse_mistral_nemo(builder);
            break;
+        case COMMON_CHAT_FORMAT_MAGISTRAL:
+            common_chat_parse_magistral(builder);
+            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            common_chat_parse_llama_3_1(builder);
            break;
@@ -2734,6 +2920,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
            common_chat_parse_nemotron_v2(builder);
            break;
+        case COMMON_CHAT_FORMAT_APERTUS:
+            common_chat_parse_apertus(builder);
+            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
--- a/common/chat.h
+++ b/common/chat.h
@@ -101,6 +101,7 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -114,6 +115,7 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
+    COMMON_CHAT_FORMAT_APERTUS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
--- a/common/common.h
+++ b/common/common.h
@@ -424,7 +424,7 @@ struct common_params {
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t n_swa_checkpoints = 3;            // max number of SWA checkpoints per slot
+    int32_t n_ctx_checkpoints = 3;            // max number of context checkpoints per slot

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8945,6 +8945,43 @@ class SmallThinkerModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("ApertusForCausalLM")
+class ApertusModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.APERTUS
+    undo_permute = False
+
+    _alpha_n = {}
+    _alpha_p = {}
+    _beta = {}
+    _eps = {}
+
+    def modify_tensors(self, data_torch, name, bid):
+        # Handle xIELU activation parameters
+        n_layers = self.hparams["num_hidden_layers"]
+        if name.endswith(".act_fn.alpha_n"):
+            self._alpha_n[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_n) == n_layers):
+                self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
+            return []
+        if name.endswith(".act_fn.alpha_p"):
+            self._alpha_p[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_p) == n_layers):
+                self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
+            return []
+        if name.endswith(".act_fn.beta"):
+            self._beta[bid] = data_torch.to("cpu").float().item()
+            if (len(self._beta) == n_layers):
+                self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
+            return []
+        if name.endswith(".act_fn.eps"):
+            self._eps[bid] = data_torch.to("cpu").float().item()
+            if (len(self._eps) == n_layers):
+                self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 class MistralModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
    model_name = "Mistral"
@@ -9112,7 +9149,7 @@ class LazyTorchTensor(gguf.LazyBase):
    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
        dtype = cls._dtype_str_map[st_slice.get_dtype()]
        shape: tuple[int, ...] = tuple(st_slice.get_shape())
-        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
        return cast(torch.Tensor, lazy)

    @classmethod
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -215,6 +215,8 @@ extern "C" {
    // Backend registry
    //

+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+
    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

    // Backend (reg) enumeration
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -7,26 +7,25 @@
 extern "C" {
 #endif

-#define RPC_PROTO_MAJOR_VERSION    2
+#define RPC_PROTO_MAJOR_VERSION    3
 #define RPC_PROTO_MINOR_VERSION    0
 #define RPC_PROTO_PATCH_VERSION    0
 #define GGML_RPC_MAX_SERVERS       16

 // backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
 GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);

-GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                                    const char * cache_dir,
-                                                    size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
+                                                    size_t n_threads, size_t n_devices,
+                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
-
-GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);

 #ifdef  __cplusplus
 }
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -576,6 +576,7 @@ extern "C" {
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
        GGML_UNARY_OP_GELU_ERF,
+        GGML_UNARY_OP_XIELU,

        GGML_UNARY_OP_COUNT,
    };
@@ -1150,6 +1151,18 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // xIELU activation function
+    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
+    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
+    // that constrain the positive and negative source alpha values respectively
+    GGML_API struct ggml_tensor * ggml_xielu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float alpha_n,
+            float alpha_p,
+            float beta,
+            float eps);
+
    // gated linear unit ops
    // A: n columns, r rows,
    // result is n / 2 columns, r rows,
@@ -1617,6 +1630,13 @@ extern "C" {
            float                 scale,
            float                 max_bias);

+    GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale,
+            float                 max_bias);
+
    GGML_API void ggml_soft_max_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    free(alloc);
 }

-static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    size_t max_size = 0;
-    for (int i = 0; i < alloc->n_chunks; i++) {
-        max_size += alloc->chunks[i]->max_size;
-    }
-    return max_size;
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
+    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
 }


@@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
    free(buf);
 }

-static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
-    int n = 0;
-    while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
-    return n;
+static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
+    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
 }

 static size_t ggml_vbuffer_size(struct vbuffer * buf) {
@@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }

-        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
-        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || galloc->buffers[i] == NULL) {
+        bool realloc = galloc->buffers[i] == NULL;
+        size_t new_size = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
+            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
+            new_size += new_chunk_size;
+            if (new_chunk_size > cur_chunk_size) {
+                realloc = true;
+            }
+        }
+        if (realloc) {
 #ifndef NDEBUG
+            size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -209,9 +209,6 @@ extern "C" {
        void * context;
    };

-    // Internal backend registry API
-    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
-
    // Add backend dynamic loading support to the backend

    // Initialize the backend
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_XIELU:
                    {
                        n_tasks = n_threads;
                    } break;
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8637,7 +8637,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                // n_head
                for (int h = ih0; h < ih1; ++h) {
                    // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                    const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                    const float dt_soft_plus = ggml_softplus(dt[h]);
                    const float dA = expf(dt_soft_plus * A[h]);
                    const int g = h / (nh / ng); // repeat_interleave

@@ -8734,7 +8734,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                // n_head
                for (int h = ih0; h < ih1; ++h) {
                    // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                    const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                    const float dt_soft_plus = ggml_softplus(dt[h]);
                    const int g = h / (nh / ng); // repeat_interleave

                    // dim
@@ -8997,6 +8997,10 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_exp(params, dst);
            } break;
+        case GGML_UNARY_OP_XIELU:
+            {
+                ggml_compute_forward_xielu(params, dst);
+            } break;
        default:
            {
                GGML_ABORT("fatal error");
--- a/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ggml/src/ggml-cpu/unary-ops.cpp
@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
    return sqrtf(x);
 }

+static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+    if (x > 0.0f) {
+        return alpha_p * x * x + beta * x;
+    } else {
+        const float min_x_eps = fminf(x, eps);
+        return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
+    }
+}
+
 static inline float op_sin(float x) {
    return sinf(x);
 }
@@ -121,6 +130,86 @@ static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    }
 }

+template <float (*op)(float, ggml_tensor *)>
+static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+// Extend vec_unary_op to support functors
+template <typename Op, typename src0_t, typename dst_t>
+static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+// Extend apply_unary_op to support functors
+template <typename Op, typename src0_t, typename dst_t>
+static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
+    }
+}
+
+// Generic dispatcher for functors
+template <typename Op>
+static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op_functor<Op, float, float>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
 void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_abs>(params, dst);
 }
@@ -184,3 +273,17 @@ void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor *
 void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
    unary_op<op_log>(params, dst);
 }
+
+void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
+    const float alpha_n = ggml_get_op_params_f32(dst, 1);
+    const float alpha_p = ggml_get_op_params_f32(dst, 2);
+    const float beta = ggml_get_op_params_f32(dst, 3);
+    const float eps = ggml_get_op_params_f32(dst, 4);
+
+    const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
+        return op_xielu(f, alpha_n, alpha_p, beta, eps);
+    };
+
+    unary_op_functor(params, dst, xielu_op_params);
+}
+
--- a/ggml/src/ggml-cpu/unary-ops.h
+++ b/ggml/src/ggml-cpu/unary-ops.h
@@ -22,6 +22,7 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus
 }
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -535,8 +535,6 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_ten
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-
    if (Q->ne[1] == 1) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2334,6 +2334,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                case GGML_UNARY_OP_ELU:
                    ggml_cuda_op_elu(ctx, dst);
                    break;
+                case GGML_UNARY_OP_XIELU:
+                    ggml_cuda_op_xielu(ctx, dst);
+                    break;
                default:
                    return false;
            }
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -13,7 +13,7 @@

    It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
 */
-template <size_t n_experts, bool with_norm>
+template <int n_experts, bool with_norm>
 __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
                                                                  float *       weights,
                                                                  int32_t *     ids,
@@ -204,8 +204,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,

    GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);

-    cudaStream_t stream = ctx.stream();
-
    const int n_expert_used = weights->ne[1];

    if (with_norm) {
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -1,4 +1,5 @@
 #include "unary.cuh"
+#include "convert.cuh"

 static __device__ __forceinline__ float op_abs(float x) {
    return fabsf(x);
@@ -375,6 +376,59 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
    swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
 }

+/* CUDA kernel + launcher for xIELU */
+
+template <typename T>
+static __global__ void xielu_kernel(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const float xi = ggml_cuda_cast<float>(x[i]);
+
+    const float gate_pos = (xi > 0.0f);
+    const float y_pos = alpha_p * xi * xi + beta * xi;
+    const float min_v_eps = fminf(xi, eps);
+    const float y_neg = (expm1f(min_v_eps) - xi) * alpha_n + beta * xi;
+    const float out = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
+
+    dst[i] = ggml_cuda_cast<T>(out);
+}
+
+template <typename T>
+static void xielu_cuda(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_XIELU_BLOCK_SIZE) / CUDA_XIELU_BLOCK_SIZE;
+    xielu_kernel<<<num_blocks, CUDA_XIELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, alpha_n, alpha_p, beta, eps);
+}
+
+void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const float alpha_n = ggml_get_op_params_f32(dst, 1);
+    const float alpha_p = ggml_get_op_params_f32(dst, 2);
+    const float beta    = ggml_get_op_params_f32(dst, 3);
+    const float eps     = ggml_get_op_params_f32(dst, 4);
+
+    if (src0->type == GGML_TYPE_F16) {
+        xielu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
+    } else {
+        xielu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
+    }
+}
+
+
+
 /* silu_back */

 static __device__ __forceinline__ float op_silu_back(float grad, float x) {
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -16,6 +16,7 @@
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256
 #define CUDA_GLU_BLOCK_SIZE 256
+#define CUDA_XIELU_BLOCK_SIZE 256

 void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -72,3 +73,5 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -102,6 +102,9 @@ static bool ggml_op_is_empty(enum ggml_op op) {
    }
 }

+static inline float ggml_softplus(float input) {
+    return (input > 20.0f) ? input : logf(1 + expf(input));
+}
 //
 // logging
 //
--- a/ggml/src/ggml-metal/ggml-metal-common.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-common.cpp
@@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
 }

 bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (tensor->src[i]) {
            ggml_mem_ranges_add_src(mrs, tensor->src[i]);
        }
@@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
 }

 bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (tensor->src[i]) {
            if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
                return false;
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -56,7 +56,7 @@ if (MUSAToolkit_FOUND)

    set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
    foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
+        set(COMPILE_FLAGS "-Od3 -fno-strict-aliasing -ffast-math -fsigned-char -x musa -mtgpu -fmusa-flush-denormals-to-zero")
        foreach(ARCH ${MUSA_ARCHITECTURES})
            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
        endforeach()
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -105,9 +105,12 @@ enum rpc_cmd {
    RPC_CMD_INIT_TENSOR,
    RPC_CMD_GET_ALLOC_SIZE,
    RPC_CMD_HELLO,
+    RPC_CMD_DEVICE_COUNT,
    RPC_CMD_COUNT,
 };

+static_assert(RPC_CMD_HELLO == 14, "RPC_CMD_HELLO must be always 14");
+
 // Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
 const size_t HASH_THRESHOLD = 10 * 1024 * 1024;

@@ -117,7 +120,12 @@ struct rpc_msg_hello_rsp {
    uint8_t patch;
 };

+struct rpc_msg_device_count_rsp {
+    uint32_t device_count;
+};
+
 struct rpc_msg_get_alloc_size_req {
+    uint32_t   device;
    rpc_tensor tensor;
 };

@@ -130,6 +138,7 @@ struct rpc_msg_init_tensor_req {
 };

 struct rpc_msg_alloc_buffer_req {
+    uint32_t device;
    uint64_t size;
 };

@@ -138,10 +147,18 @@ struct rpc_msg_alloc_buffer_rsp {
    uint64_t remote_size;
 };

+struct rpc_msg_get_alignment_req {
+    uint32_t device;
+};
+
 struct rpc_msg_get_alignment_rsp {
    uint64_t alignment;
 };

+struct rpc_msg_get_max_size_req {
+    uint32_t device;
+};
+
 struct rpc_msg_get_max_size_rsp {
    uint64_t max_size;
 };
@@ -192,6 +209,10 @@ struct rpc_msg_graph_compute_rsp {
    uint8_t result;
 };

+struct rpc_msg_get_device_memory_req {
+    uint32_t device;
+};
+
 struct rpc_msg_get_device_memory_rsp {
    uint64_t free_mem;
    uint64_t total_mem;
@@ -207,13 +228,15 @@ static ggml_guid_t ggml_backend_rpc_guid() {

 struct ggml_backend_rpc_buffer_type_context {
    std::string endpoint;
+    uint32_t    device;
    std::string name;
-    size_t alignment;
-    size_t max_size;
+    size_t      alignment;
+    size_t      max_size;
 };

 struct ggml_backend_rpc_context {
    std::string endpoint;
+    uint32_t    device;
    std::string name;
 };

@@ -608,23 +631,30 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
    RPC_STATUS_ASSERT(status);
 }

+static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
+    return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
+}
+
 static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
-    // check if src and dst are on the same server
-    ggml_backend_buffer_t src_buffer = src->buffer;
-    ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
-    ggml_backend_buffer_t dst_buffer = dst->buffer;
-    ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context;
-    if (src_ctx->sock != dst_ctx->sock) {
-        return false;
+    if (ggml_backend_buffer_is_rpc(src->buffer)) {
+        // check if src and dst are on the same server
+        ggml_backend_buffer_t src_buffer = src->buffer;
+        ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
+        ggml_backend_buffer_t dst_buffer = dst->buffer;
+        ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context;
+        if (src_ctx->sock != dst_ctx->sock) {
+            return false;
+        }
+        ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+        rpc_msg_copy_tensor_req request;
+        request.src = serialize_tensor(src);
+        request.dst = serialize_tensor(dst);
+        rpc_msg_copy_tensor_rsp response;
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
+        RPC_STATUS_ASSERT(status);
+        return response.result;
    }
-    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    rpc_msg_copy_tensor_req request;
-    request.src = serialize_tensor(src);
-    request.dst = serialize_tensor(dst);
-    rpc_msg_copy_tensor_rsp response;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
-    RPC_STATUS_ASSERT(status);
-    return response.result;
+    return false;
 }

 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -653,7 +683,7 @@ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t

 static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    rpc_msg_alloc_buffer_req request = {size};
+    rpc_msg_alloc_buffer_req request = {buft_ctx->device, size};
    rpc_msg_alloc_buffer_rsp response;
    auto sock = get_socket(buft_ctx->endpoint);
    bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
@@ -669,9 +699,10 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
    }
 }

-static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
+static size_t get_alignment(const std::shared_ptr<socket_t> & sock, uint32_t device) {
+    rpc_msg_get_alignment_req request = {device};
    rpc_msg_get_alignment_rsp response;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, &request, sizeof(request), &response, sizeof(response));
    RPC_STATUS_ASSERT(status);
    return response.alignment;
 }
@@ -681,9 +712,10 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
    return buft_ctx->alignment;
 }

-static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
+static size_t get_max_size(const std::shared_ptr<socket_t> & sock, uint32_t device) {
+    rpc_msg_get_max_size_req request = {device};
    rpc_msg_get_max_size_rsp response;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, &request, sizeof(request), &response, sizeof(response));
    RPC_STATUS_ASSERT(status);
    return response.max_size;
 }
@@ -700,7 +732,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
        auto sock = get_socket(buft_ctx->endpoint);

        rpc_msg_get_alloc_size_req request;
-
+        request.device = buft_ctx->device;
        request.tensor = serialize_tensor(tensor);

        rpc_msg_get_alloc_size_rsp response;
@@ -754,7 +786,7 @@ static void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors,
    tensors.push_back(serialize_tensor(tensor));
 }

-static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
    uint32_t n_nodes = cgraph->n_nodes;
    std::vector<rpc_tensor> tensors;
    std::unordered_set<ggml_tensor*> visited;
@@ -762,24 +794,29 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
        add_tensor(cgraph->nodes[i], tensors, visited);
    }
    // serialization format:
-    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    // | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
    uint32_t n_tensors = tensors.size();
-    int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+    int output_size = 2*sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
    output.resize(output_size, 0);
-    memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+    uint8_t * dest = output.data();
+    memcpy(dest, &device, sizeof(device));
+    dest += sizeof(device);
+    memcpy(dest, &n_nodes, sizeof(n_nodes));
+    dest += sizeof(n_nodes);
    for (uint32_t i = 0; i < n_nodes; i++) {
-        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+        memcpy(dest + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
    }
-    uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
-    *out_ntensors = n_tensors;
-    rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+    dest += n_nodes * sizeof(uint64_t);
+    memcpy(dest, &n_tensors, sizeof(n_tensors));
+    dest += sizeof(n_tensors);
+    rpc_tensor * out_tensors = (rpc_tensor *)dest;
    memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
 }

 static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
    std::vector<uint8_t> input;
-    serialize_graph(cgraph, input);
+    serialize_graph(rpc_ctx->device, cgraph, input);
    rpc_msg_graph_compute_rsp response;
    auto sock = get_socket(rpc_ctx->endpoint);
    bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
@@ -804,12 +841,13 @@ static ggml_backend_i ggml_backend_rpc_interface = {
    /* .graph_optimize          = */ NULL,
 };

-ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
+ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);
+    std::string buft_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
    // NOTE: buffer types are allocated and never freed; this is by design
    static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
-    auto it = buft_map.find(endpoint);
+    auto it = buft_map.find(buft_name);
    if (it != buft_map.end()) {
        return it->second;
    }
@@ -818,34 +856,37 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
        GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
        return nullptr;
    }
-    size_t alignment = get_alignment(sock);
-    size_t max_size = get_max_size(sock);
+    size_t alignment = get_alignment(sock, device);
+    size_t max_size = get_max_size(sock, device);
    ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
        /* .endpoint  = */ endpoint,
-        /* .name      = */ "RPC[" + std::string(endpoint) + "]",
+        /* .device    = */ device,
+        /* .name      = */ buft_name,
        /* .alignment = */ alignment,
        /* .max_size  = */ max_size
    };
-
+    auto reg = ggml_backend_rpc_add_server(endpoint);
    ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
        /* .iface   = */ ggml_backend_rpc_buffer_type_interface,
-        /* .device  = */ ggml_backend_rpc_add_device(endpoint),
+        /* .device  = */ ggml_backend_reg_dev_get(reg, device),
        /* .context = */ buft_ctx
    };
-    buft_map[endpoint] = buft;
+    buft_map[buft_name] = buft;
    return buft;
 }

-ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
+ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
+    std::string dev_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
    ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
-        /* .endpoint  = */ endpoint,
-        /* .name      = */ "RPC[" + std::string(endpoint) + "]",
+        /* .endpoint = */ endpoint,
+        /* .device   = */ device,
+        /* .name     = */ dev_name
    };
-
+    auto reg = ggml_backend_rpc_add_server(endpoint);
    ggml_backend_t backend = new ggml_backend {
        /* .guid    = */ ggml_backend_rpc_guid(),
        /* .iface   = */ ggml_backend_rpc_interface,
-        /* .device  = */ ggml_backend_rpc_add_device(endpoint),
+        /* .device  = */ ggml_backend_reg_dev_get(reg, device),
        /* .context = */ ctx
    };
    return backend;
@@ -855,37 +896,39 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
 }

-static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
+static void get_device_memory(const std::shared_ptr<socket_t> & sock, uint32_t device, size_t * free, size_t * total) {
+    rpc_msg_get_device_memory_req request;
+    request.device = device;
    rpc_msg_get_device_memory_rsp response;
-    bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, &request, sizeof(request), &response, sizeof(response));
    RPC_STATUS_ASSERT(status);
    *free = response.free_mem;
    *total = response.total_mem;
 }

-void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
+void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total) {
    auto sock = get_socket(endpoint);
    if (sock == nullptr) {
        *free = 0;
        *total = 0;
        return;
    }
-    get_device_memory(sock, free, total);
+    get_device_memory(sock, device, free, total);
 }

 // RPC server-side implementation

 class rpc_server {
 public:
-    rpc_server(ggml_backend_t backend, const char * cache_dir)
-        : backend(backend), cache_dir(cache_dir) {
+    rpc_server(std::vector<ggml_backend_t> backends, const char * cache_dir)
+        : backends(std::move(backends)), cache_dir(cache_dir) {
    }
    ~rpc_server();

    void hello(rpc_msg_hello_rsp & response);
-    void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
-    void get_alignment(rpc_msg_get_alignment_rsp & response);
-    void get_max_size(rpc_msg_get_max_size_rsp & response);
+    bool alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
+    bool get_alignment(const rpc_msg_get_alignment_req & request, rpc_msg_get_alignment_rsp & response);
+    bool get_max_size(const rpc_msg_get_max_size_req & request, rpc_msg_get_max_size_rsp & response);
    bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response);
    bool free_buffer(const rpc_msg_free_buffer_req & request);
    bool buffer_clear(const rpc_msg_buffer_clear_req & request);
@@ -906,7 +949,7 @@ private:
                              std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);


-    ggml_backend_t backend;
+    std::vector<ggml_backend_t> backends;
    const char * cache_dir;
    std::unordered_set<ggml_backend_buffer_t> buffers;
 };
@@ -919,6 +962,10 @@ void rpc_server::hello(rpc_msg_hello_rsp & response) {
 }

 bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+    uint32_t dev_id = request.device;
+    if (dev_id >= backends.size()) {
+        return false;
+    }
    ggml_backend_buffer_type_t buft;
    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead(),
@@ -935,10 +982,10 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
        return false;
    }
-    LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
+    LOG_DBG("[%s] device: %d, buffer: %p, data: %p\n", __func__, dev_id, (void*)tensor->buffer, tensor->data);
    if (tensor->buffer == nullptr) {
        //No buffer allocated.
-        buft = ggml_backend_get_default_buffer_type(backend);
+        buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
    } else {
        buft = tensor->buffer->buft;
    }
@@ -948,33 +995,49 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
    return true;
 }

-void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
-    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+bool rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
+    uint32_t dev_id = request.device;
+    if (dev_id >= backends.size()) {
+        return false;
+    }
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
    response.remote_ptr = 0;
    response.remote_size = 0;
    if (buffer != nullptr) {
        response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
        response.remote_size = buffer->size;
-        LOG_DBG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
+        LOG_DBG("[%s] device: %d, size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n",
+            __func__, dev_id, request.size, response.remote_ptr, response.remote_size);
        buffers.insert(buffer);
    } else {
-        LOG_DBG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+        LOG_DBG("[%s] device: %d, size: %" PRIu64 " -> failed\n", __func__, dev_id, request.size);
    }
+    return true;
 }

-void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
-    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+bool rpc_server::get_alignment(const rpc_msg_get_alignment_req & request, rpc_msg_get_alignment_rsp & response) {
+    uint32_t dev_id = request.device;
+    if (dev_id >= backends.size()) {
+        return false;
+    }
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
    size_t alignment = ggml_backend_buft_get_alignment(buft);
-    LOG_DBG("[%s] alignment: %lu\n", __func__, alignment);
+    LOG_DBG("[%s] device: %d, alignment: %lu\n", __func__, dev_id, alignment);
    response.alignment = alignment;
+    return true;
 }

-void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
-    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+bool rpc_server::get_max_size(const rpc_msg_get_max_size_req & request, rpc_msg_get_max_size_rsp & response) {
+    uint32_t dev_id = request.device;
+    if (dev_id >= backends.size()) {
+        return false;
+    }
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
    size_t max_size = ggml_backend_buft_get_max_size(buft);
-    LOG_DBG("[%s] max_size: %lu\n", __func__, max_size);
+    LOG_DBG("[%s] device: %d, max_size: %lu\n", __func__, dev_id, max_size);
    response.max_size = max_size;
+    return true;
 }

 bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
@@ -1332,23 +1395,33 @@ ggml_tensor * rpc_server::create_node(uint64_t id,

 bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
    // serialization format:
-    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
-    if (input.size() < sizeof(uint32_t)) {
+    // | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    if (input.size() < 2*sizeof(uint32_t)) {
+        return false;
+    }
+    const uint8_t * src = input.data();
+    uint32_t device;
+    memcpy(&device, src, sizeof(device));
+    src += sizeof(device);
+    if (device >= backends.size()) {
        return false;
    }
    uint32_t n_nodes;
-    memcpy(&n_nodes, input.data(), sizeof(n_nodes));
-    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
+    memcpy(&n_nodes, src, sizeof(n_nodes));
+    src += sizeof(n_nodes);
+    if (input.size() < 2*sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
        return false;
    }
-    const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
+    const uint64_t * nodes = (const uint64_t *)src;
+    src += n_nodes*sizeof(uint64_t);
    uint32_t n_tensors;
-    memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
-    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
+    memcpy(&n_tensors, src, sizeof(n_tensors));
+    src += sizeof(n_tensors);
+    if (input.size() < 2*sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
        return false;
    }
-    const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
-    LOG_DBG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
+    const rpc_tensor * tensors = (const rpc_tensor *)src;
+    LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);

    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);

@@ -1380,7 +1453,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
            return false;
        }
    }
-    ggml_status status = ggml_backend_graph_compute(backend, graph);
+    ggml_status status = ggml_backend_graph_compute(backends[device], graph);
    response.result = status;
    return true;
 }
@@ -1391,9 +1464,9 @@ rpc_server::~rpc_server() {
    }
 }

-static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
-                             sockfd_t sockfd, size_t free_mem, size_t total_mem) {
-    rpc_server server(backend, cache_dir);
+static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const char * cache_dir,
+                             sockfd_t sockfd, const std::vector<size_t> & free_mem, const std::vector<size_t> & total_mem) {
+    rpc_server server(backends, cache_dir);
    uint8_t cmd;
    if (!recv_data(sockfd, &cmd, 1)) {
        return;
@@ -1425,13 +1498,26 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                // HELLO command is handled above
                return;
            }
+            case RPC_CMD_DEVICE_COUNT: {
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_device_count_rsp response;
+                response.device_count = backends.size();
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
            case RPC_CMD_ALLOC_BUFFER: {
                rpc_msg_alloc_buffer_req request;
                if (!recv_msg(sockfd, &request, sizeof(request))) {
                    return;
                }
                rpc_msg_alloc_buffer_rsp response;
-                server.alloc_buffer(request, response);
+                if (!server.alloc_buffer(request, response)) {
+                    return;
+                }
                if (!send_msg(sockfd, &response, sizeof(response))) {
                    return;
                }
@@ -1452,22 +1538,28 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                break;
            }
            case RPC_CMD_GET_ALIGNMENT: {
-                if (!recv_msg(sockfd, nullptr, 0)) {
+                rpc_msg_get_alignment_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
                    return;
                }
                rpc_msg_get_alignment_rsp response;
-                server.get_alignment(response);
+                if (!server.get_alignment(request, response)) {
+                    return;
+                }
                if (!send_msg(sockfd, &response, sizeof(response))) {
                    return;
                }
                break;
            }
            case RPC_CMD_GET_MAX_SIZE: {
-                if (!recv_msg(sockfd, nullptr, 0)) {
+                rpc_msg_get_max_size_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
                    return;
                }
                rpc_msg_get_max_size_rsp response;
-                server.get_max_size(response);
+                if (!server.get_max_size(request, response)) {
+                    return;
+                }
                if (!send_msg(sockfd, &response, sizeof(response))) {
                    return;
                }
@@ -1593,12 +1685,19 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                break;
            }
            case RPC_CMD_GET_DEVICE_MEMORY: {
-                if (!recv_msg(sockfd, nullptr, 0)) {
+                rpc_msg_get_device_memory_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                auto dev_id = request.device;
+                if (dev_id >= backends.size()) {
                    return;
                }
                rpc_msg_get_device_memory_rsp response;
-                response.free_mem = free_mem;
-                response.total_mem = total_mem;
+                response.free_mem = free_mem[dev_id];
+                response.total_mem = total_mem[dev_id];
+                LOG_DBG("[get_device_mem] device: %u, free_mem: %" PRIu64 ", total_mem: %" PRIu64 "\n", dev_id,
+                    response.free_mem, response.total_mem);
                if (!send_msg(sockfd, &response, sizeof(response))) {
                    return;
                }
@@ -1612,16 +1711,41 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
    }
 }

-void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                   const char * cache_dir,
-                                   size_t free_mem, size_t total_mem) {
+void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
+                                   size_t n_threads, size_t n_devices,
+                                   ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem) {
+    if (n_devices == 0 || devices == nullptr || free_mem == nullptr || total_mem == nullptr) {
+        fprintf(stderr, "Invalid arguments to ggml_backend_rpc_start_server\n");
+        return;
+    }
+    std::vector<ggml_backend_t> backends;
+    std::vector<size_t> free_mem_vec(free_mem, free_mem + n_devices);
+    std::vector<size_t> total_mem_vec(total_mem, total_mem + n_devices);
    printf("Starting RPC server v%d.%d.%d\n",
        RPC_PROTO_MAJOR_VERSION,
        RPC_PROTO_MINOR_VERSION,
        RPC_PROTO_PATCH_VERSION);
    printf("  endpoint       : %s\n", endpoint);
    printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
-    printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
+    printf("Devices:\n");
+    for (size_t i = 0; i < n_devices; i++) {
+        auto dev = devices[i];
+        printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+               total_mem[i] / 1024 / 1024, free_mem[i] / 1024 / 1024);
+        auto backend = ggml_backend_dev_init(dev, nullptr);
+        if (!backend) {
+            fprintf(stderr, "Failed to create backend for device %s\n", dev->iface.get_name(dev));
+            return;
+        }
+        backends.push_back(backend);
+        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+        if (reg) {
+            auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+            if (ggml_backend_set_n_threads_fn) {
+                ggml_backend_set_n_threads_fn(backend, n_threads);
+            }
+        }
+    }

    std::string host;
    int port;
@@ -1649,22 +1773,27 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
            fprintf(stderr, "Failed to accept client connection\n");
            return;
        }
-        printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        printf("Accepted client connection\n");
        fflush(stdout);
-        rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
+        rpc_serve_client(backends, cache_dir, client_socket->fd, free_mem_vec, total_mem_vec);
        printf("Client connection closed\n");
        fflush(stdout);
    }
 #ifdef _WIN32
    WSACleanup();
 #endif
+    for (auto backend : backends) {
+        ggml_backend_free(backend);
+    }
 }

 // device interface

 struct ggml_backend_rpc_device_context {
    std::string endpoint;
+    uint32_t    device;
    std::string name;
+    std::string description;
 };

 static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
@@ -1676,15 +1805,13 @@ static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
 static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;

-    return ctx->name.c_str();
+    return ctx->description.c_str();
 }

 static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;

-    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
-
-    GGML_UNUSED(dev);
+    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), ctx->device, free, total);
 }

 static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
@@ -1710,7 +1837,7 @@ static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggm
 static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;

-    return ggml_backend_rpc_init(ctx->endpoint.c_str());
+    return ggml_backend_rpc_init(ctx->endpoint.c_str(), ctx->device);

    GGML_UNUSED(params);
 }
@@ -1718,7 +1845,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
 static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;

-    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str(), ctx->device);

    GGML_UNUSED(dev);
 }
@@ -1736,7 +1863,7 @@ static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_b
    }
    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
    ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
-    return buft_ctx->endpoint == dev_ctx->endpoint;
+    return buft_ctx->endpoint == dev_ctx->endpoint && buft_ctx->device == dev_ctx->device;
 }

 static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
@@ -1759,28 +1886,34 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {

 // backend reg interface

-static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
-    return "RPC";
+struct ggml_backend_rpc_reg_context {
+    std::string                     name;
+    std::vector<ggml_backend_dev_t> devices;
+};

-    GGML_UNUSED(reg);
+static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
+    ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
+    return ctx ? ctx->name.c_str() : "RPC";
 }

 static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 0;
-
-    GGML_UNUSED(reg);
+    ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
+    return ctx ? ctx->devices.size() : 0;
 }

 static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
+    ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
+    if (ctx == nullptr) {
+        GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_rpc_add_server instead");
+    } else {
+        GGML_ASSERT(index < ctx->devices.size());
+        return ctx->devices[index];
+    }
 }

 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
-        return (void *)ggml_backend_rpc_add_device;
+    if (std::strcmp(name, "ggml_backend_rpc_add_server") == 0) {
+        return (void *)ggml_backend_rpc_add_server;
    }
    if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
        return (void *)ggml_backend_rpc_start_server;
@@ -1807,30 +1940,61 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
    return &ggml_backend_rpc_reg;
 }

-ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
-    static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
-
-    static std::mutex mutex;
-    std::lock_guard<std::mutex> lock(mutex);
-
-    if (dev_map.find(endpoint) != dev_map.end()) {
-        return dev_map[endpoint];
-    }
-
-    ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
-        /* .endpoint = */ endpoint,
-        /* .name     = */ "RPC[" + std::string(endpoint) + "]",
-    };
-
-    ggml_backend_dev_t dev = new ggml_backend_device {
-        /* .iface   = */ ggml_backend_rpc_device_i,
-        /* .reg     = */ ggml_backend_rpc_reg(),
-        /* .context = */ ctx,
-    };
-
-    dev_map[endpoint] = dev;
-
-    return dev;
+static uint32_t ggml_backend_rpc_get_device_count(const char * endpoint) {
+    auto sock = get_socket(endpoint);
+    rpc_msg_device_count_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_DEVICE_COUNT, nullptr, 0, &response, sizeof(response));
+    RPC_STATUS_ASSERT(status);
+    return response.device_count;
 }

+static const ggml_backend_reg_i ggml_backend_rpc_reg_interface = {
+    /* .get_name          = */ ggml_backend_rpc_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_rpc_reg_get_device_count,
+    /* .get_device        = */ ggml_backend_rpc_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_rpc_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
+    static std::unordered_map<std::string, ggml_backend_reg_t> reg_map;
+    static std::mutex mutex;
+    static uint32_t dev_id = 0;
+    std::lock_guard<std::mutex> lock(mutex);
+    if (reg_map.find(endpoint) != reg_map.end()) {
+        return reg_map[endpoint];
+    }
+    uint32_t dev_count = ggml_backend_rpc_get_device_count(endpoint);
+    if (dev_count == 0) {
+        return nullptr;
+    }
+    ggml_backend_rpc_reg_context * ctx = new ggml_backend_rpc_reg_context;
+    ctx->name = "RPC[" + std::string(endpoint) + "]";
+    for (uint32_t ind = 0; ind < dev_count; ind++) {
+        std::string dev_name = "RPC" + std::to_string(dev_id);
+        std::string dev_desc = std::string(endpoint);
+        ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
+            /* .endpoint    = */ endpoint,
+            /* .device      = */ ind,
+            /* .name        = */ dev_name,
+            /* .description = */ dev_desc
+        };
+
+        ggml_backend_dev_t dev = new ggml_backend_device {
+            /* .iface   = */ ggml_backend_rpc_device_i,
+            /* .reg     = */ ggml_backend_rpc_reg(),
+            /* .context = */ dev_ctx,
+        };
+        ctx->devices.push_back(dev);
+        dev_id++;
+    }
+    ggml_backend_reg_t reg = new ggml_backend_reg {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_rpc_reg_interface,
+        /* .context     = */ ctx
+    };
+    reg_map[endpoint] = reg;
+    return reg;
+}
+
+
 GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.19)
 cmake_policy(SET CMP0114 NEW)
+cmake_policy(SET CMP0116 NEW)

 find_package(Vulkan COMPONENTS glslc REQUIRED)

@@ -54,25 +55,25 @@ if (Vulkan_FOUND)
    # Test all shader extensions
    test_shader_extension_support(
        "GL_KHR_cooperative_matrix"
-        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat.comp"
        "GGML_VULKAN_COOPMAT_GLSLC_SUPPORT"
    )

    test_shader_extension_support(
        "GL_NV_cooperative_matrix2"
-        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat2.comp"
        "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
    )

    test_shader_extension_support(
        "GL_EXT_integer_dot_product"
-        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/integer_dot.comp"
        "GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT"
    )

    test_shader_extension_support(
        "GL_EXT_bfloat16"
-        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/bfloat16.comp"
        "GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT"
    )

@@ -160,7 +161,6 @@ if (Vulkan_FOUND)
    set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$<CONFIG>")
    set (_ggml_vk_genshaders_cmd "${_ggml_vk_genshaders_dir}/vulkan-shaders-gen${_ggml_vk_host_suffix}")
    set (_ggml_vk_header     "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp")
-    set (_ggml_vk_source     "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp")
    set (_ggml_vk_input_dir  "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders")
    set (_ggml_vk_output_dir "${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv")

@@ -176,24 +176,35 @@ if (Vulkan_FOUND)

    add_custom_command(
        OUTPUT ${_ggml_vk_header}
-               ${_ggml_vk_source}
-
        COMMAND ${_ggml_vk_genshaders_cmd}
-            --glslc      ${Vulkan_GLSLC_EXECUTABLE}
-            --input-dir  ${_ggml_vk_input_dir}
            --output-dir ${_ggml_vk_output_dir}
            --target-hpp ${_ggml_vk_header}
-            --target-cpp ${_ggml_vk_source}
-            --no-clean
-
-        DEPENDS ${_ggml_vk_shader_files}
-                ${_ggml_vk_shaders_gen_sources}
+        DEPENDS ${_ggml_vk_shaders_gen_sources}
                vulkan-shaders-gen
-
-        COMMENT "Generate vulkan shaders"
+        COMMENT "Generate vulkan shaders header"
    )
+    target_sources(ggml-vulkan PRIVATE ${_ggml_vk_header})

-    target_sources(ggml-vulkan PRIVATE ${_ggml_vk_source} ${_ggml_vk_header})
+    foreach (file_full ${_ggml_vk_shader_files})
+        get_filename_component(file ${file_full} NAME)
+        set (_ggml_vk_target_cpp "${CMAKE_CURRENT_BINARY_DIR}/${file}.cpp")
+
+        add_custom_command(
+            OUTPUT  ${_ggml_vk_target_cpp}
+            DEPFILE ${_ggml_vk_target_cpp}.d
+            COMMAND ${_ggml_vk_genshaders_cmd}
+                --glslc      ${Vulkan_GLSLC_EXECUTABLE}
+                --source     ${file_full}
+                --output-dir ${_ggml_vk_output_dir}
+                --target-hpp ${_ggml_vk_header}
+                --target-cpp ${_ggml_vk_target_cpp}
+            DEPENDS ${file_full}
+                    ${_ggml_vk_shaders_gen_sources}
+                    vulkan-shaders-gen
+            COMMENT "Generate vulkan shaders for ${file}"
+        )
+        target_sources(ggml-vulkan PRIVATE ${_ggml_vk_target_cpp})
+    endforeach()

 else()
    message(WARNING "Vulkan not found")
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -393,6 +393,7 @@ struct vk_device_struct {
    vk::PhysicalDeviceProperties properties;
    std::string name;
    uint64_t max_memory_allocation_size;
+    uint64_t max_buffer_size;
    uint64_t suballocation_block_size;
    bool fp16;
    bool bf16;
@@ -1563,6 +1564,12 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx

 static void ggml_backend_vk_free(ggml_backend_t backend);

+static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
+    const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
+                                        VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
+    return range;
+}
+
 // Wait for ctx->fence to be signaled.
 static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
@@ -2012,8 +2019,8 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
    VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
-    if (size > device->max_memory_allocation_size) {
-        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+    if (size > device->max_buffer_size) {
+        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
    }

    vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -2159,8 +2166,8 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) {
    buf.reset();
 }

-static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
-    return { buf, 0, VK_WHOLE_SIZE };
+static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) {
+    return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
 }

 static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
@@ -2614,8 +2621,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        const uint32_t D_lsb = D ^ (D & (D-1));
        uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);

-        // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
-        GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
        return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
    };

@@ -3855,17 +3860,27 @@ static vk_device ggml_vk_get_device(size_t idx) {
        const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");

        if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
-            device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
+            device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
        } else if (maintenance4_support) {
            device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
        } else {
            device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
        }

+        const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE");
+
+        if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) {
+            device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE);
+        } else if (maintenance4_support) {
+            device->max_buffer_size = props4.maxBufferSize;
+        } else {
+            device->max_buffer_size = device->max_memory_allocation_size;
+        }
+
        const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");

        if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
-            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+            device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
        } else {
            // Limit batching of allocations to 1GB by default to avoid fragmentation issues
            device->suballocation_block_size = 1024*1024*1024;
@@ -6150,9 +6165,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        }
        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
        if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) {
            GGML_ABORT("Requested preallocation size is too large");
        }
        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6227,7 +6242,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    }

    if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
    } else if (qx_needs_dequant) {
        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
@@ -6239,7 +6254,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -6250,7 +6265,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -6272,14 +6287,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
    }

-    // No bounds checking is needed for dst. This is basically VK_WHOLE_SIZE but clamped to maxStorageBufferRange.
-    VkDeviceSize d_range = std::min(VkDeviceSize{d_D->size - d_buf_offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
-
    // compute
    ggml_vk_matmul(
        ctx, subctx, pipeline,
        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
-        { d_D, d_buf_offset, d_range }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
+        ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
        ne01, ne11, ne10,
        ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d,
        split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
@@ -6446,8 +6458,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
            y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
        }
        if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
            GGML_ABORT("Requested preallocation size is too large");
        }
        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6512,7 +6524,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        }

        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
    }
    if (y_non_contig) {
        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
@@ -6521,7 +6533,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -6532,7 +6544,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -6931,8 +6943,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
        if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
            GGML_ABORT("Requested preallocation size is too large");
        }
        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6999,7 +7011,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
    }

    if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
    } else if (qx_needs_dequant) {
        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
@@ -7012,7 +7024,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -7145,8 +7157,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
        if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
            GGML_ABORT("Requested preallocation size is too large");
        }
        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -7212,7 +7224,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte

    if (x_non_contig) {
        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
    }
    if (y_non_contig) {
        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
@@ -7221,7 +7233,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
            if (ctx->prealloc_y_need_sync) {
                ggml_vk_sync_buffers(ctx, subctx);
            }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
            ctx->prealloc_y_last_tensor_used = src1;
        }
@@ -7457,8 +7469,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    if (((HSK | HSV) % 16) != 0 && path == FA_COOPMAT2) {
        aligned = false;
    }
-    // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
-    GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);

    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;

@@ -7498,7 +7508,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
    // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
    const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
-    if (split_k_size > ctx->device->max_memory_allocation_size) {
+    if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
        GGML_ABORT("Requested preallocation size is too large");
    }
    if (ctx->prealloc_size_split_k < split_k_size) {
@@ -7620,12 +7630,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                    {
-                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
                                    },
                                    // We only use split_k when group query attention is enabled, which means
                                    // there's no more than one tile of rows (i.e. workgroups_x would have been
@@ -7637,21 +7647,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
        ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
                                    {
-                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
                                    },
                                    pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
        ctx->prealloc_split_k_need_sync = true;
    } else {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                    {
-                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
                                    },
                                    pc, { workgroups_x, workgroups_y, workgroups_z });
    }
@@ -8360,18 +8370,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        }
    }

-    uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
-    uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
-    uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
-    uint64_t d_sz = ggml_type_size(dst->type) * ned;
-
    vk_buffer d_D = dst_buf_ctx->dev_buffer;

-    // Workaround for tiny tensor inputs on ROPE
-    if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
-        y_sz = VK_WHOLE_SIZE;
-    }
-
    GGML_ASSERT(d_D != nullptr);
    uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
    if(!src0_uma) {
@@ -8396,26 +8396,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);

-    if (op_supports_incontiguous) {
-        x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
-        y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
-        z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
-        d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
-
-        if (x_buf_offset + x_sz >= d_X->size) {
-            x_sz = VK_WHOLE_SIZE;
-        }
-        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
-            y_sz = VK_WHOLE_SIZE;
-        }
-        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
-            z_sz = VK_WHOLE_SIZE;
-        }
-        if (d_buf_offset + d_sz >= d_D->size) {
-            d_sz = VK_WHOLE_SIZE;
-        }
-    }
-
    std::array<uint32_t, 3> elements;

    // Single call if dimension 2 is contiguous
@@ -8606,19 +8586,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        break;
    }

-    if (!op_supports_incontiguous) {
-        if (x_sz != VK_WHOLE_SIZE) {
-            x_sz *= ne02 * ne03;
+    uint64_t x_sz, y_sz, z_sz, d_sz;
+
+    if (op_supports_incontiguous) {
+        x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
+        d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
+
+        if (x_buf_offset + x_sz >= d_X->size) {
+            x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset);
        }
-        if (use_src1 && y_sz != VK_WHOLE_SIZE) {
-            y_sz *= ne12 * ne13;
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset);
        }
-        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
-            z_sz *= ne22 * ne23;
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
        }
-        if (d_sz != VK_WHOLE_SIZE) {
-            d_sz *= ned2 * ned3;
+        if (d_buf_offset + d_sz >= d_D->size) {
+            d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
        }
+    } else {
+        x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
+        y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
+        z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
+        d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
    }

    if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
@@ -8628,7 +8620,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            { vk_subbuffer{ d_X, x_buf_offset, x_sz },
              vk_subbuffer{ d_Y, y_buf_offset, y_sz },
              vk_subbuffer{ d_D, d_buf_offset, d_sz },
-              vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE },
+              ggml_vk_subbuffer(ctx, d_A, a_buf_offset),
            }, pc, elements);
    } else if (op == GGML_OP_GLU) {
        // Empty src1 is possible in glu, but the shader needs a buffer
@@ -8821,18 +8813,18 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
    static_assert(MAX_PARAMETER_COUNT == 12);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
        {
-            vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[9], offset[9], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE },
+            ggml_vk_subbuffer(ctx, buf[0], offset[0]),
+            ggml_vk_subbuffer(ctx, buf[1], offset[1]),
+            ggml_vk_subbuffer(ctx, buf[2], offset[2]),
+            ggml_vk_subbuffer(ctx, buf[3], offset[3]),
+            ggml_vk_subbuffer(ctx, buf[4], offset[4]),
+            ggml_vk_subbuffer(ctx, buf[5], offset[5]),
+            ggml_vk_subbuffer(ctx, buf[6], offset[6]),
+            ggml_vk_subbuffer(ctx, buf[7], offset[7]),
+            ggml_vk_subbuffer(ctx, buf[8], offset[8]),
+            ggml_vk_subbuffer(ctx, buf[9], offset[9]),
+            ggml_vk_subbuffer(ctx, buf[10], offset[10]),
+            ggml_vk_subbuffer(ctx, buf[11], offset[11]),
        }, pc, elements);
 }

@@ -10006,7 +9998,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
    ggml_vk_ctx_begin(ctx->device, subctx);
    for (size_t i = 0; i < num_it; i++) {
        ggml_vk_matmul(
-            ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
+            ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k),
            m, n, k,
            k, k, m, k*m, k*n, m*n,
            split_k, batch, batch, batch, 1, 1, n
@@ -10317,7 +10309,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 //
 //     vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
 //     ggml_vk_ctx_begin(ctx->device, subctx);
-//     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
+//     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne);
 //     ggml_vk_ctx_end(subctx);
 //
 //     auto begin = std::chrono::high_resolution_clock::now();
--- a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_binary_head.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
@@ -6,8 +6,8 @@
 #extension GL_KHR_shader_subgroup_basic : enable
 #endif

-#include "types.comp"
-#include "generic_binary_head.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"

 const uint num_threads = 256;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp
@@ -2,7 +2,7 @@

 #extension GL_EXT_control_flow_attributes : require

-#include "types.comp"
+#include "types.glsl"

 layout (push_constant) uniform parameter
 {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
@@ -1,7 +1,7 @@
 #version 450
 #extension GL_EXT_control_flow_attributes : enable

-#include "types.comp"
+#include "types.glsl"

 layout(constant_id = 0) const int BLOCK_SIZE = 1024;
 layout(constant_id = 1) const int BLOCK_SIZE_LOG2 = 10;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_unary_head.comp"
+#include "types.glsl"
+#include "generic_unary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_binary_head.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_unary_head.comp"
+#include "types.glsl"
+#include "generic_unary_head.glsl"

 #extension GL_EXT_control_flow_attributes : require

--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "types.comp"
+#include "types.glsl"

 layout (push_constant) uniform parameter
 {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -11,7 +11,7 @@
 #    extension GL_KHR_shader_subgroup_shuffle : enable
 #endif

-#include "types.comp"
+#include "types.glsl"

 // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
 layout(binding = 0) readonly buffer A {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "types.comp"
+#include "types.glsl"

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};   // src0 - kernel:    [K, Cout, Cin]
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};   // src1 - input:     [L, Cin]
--- a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_unary_head.comp"
+#include "types.glsl"
+#include "generic_unary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
@@ -1,8 +1,8 @@
 #version 450

-#include "types.comp"
-#include "generic_unary_head.comp"
-#include "dequant_funcs.comp"
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+#include "dequant_funcs.glsl"

 #if defined(DATA_A_IQ4_NL) || defined(DATA_A_MXFP4)
 // 16 invocations needed for init_iq_shmem
--- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "rte.comp"
-#include "types.comp"
+#include "rte.glsl"
+#include "types.glsl"

 #if defined(SET_ROWS) && QUANT_K == 1
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@@ -14,7 +14,7 @@ const uint BLOCK_SIZE = 32;
 layout (binding = 0) readonly buffer S {float data_s[];};

 #if defined(SET_ROWS)
-#include "generic_binary_head.comp"
+#include "generic_binary_head.glsl"
 layout (binding = 1) readonly buffer C {B_TYPE data_i[];};
 layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];};

@@ -25,7 +25,7 @@ layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];};
 #endif

 #else
-#include "generic_unary_head.comp"
+#include "generic_unary_head.glsl"
 layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
 #endif

--- a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_unary_head.comp"
+#include "types.glsl"
+#include "generic_unary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp
@@ -2,8 +2,8 @@

 #extension GL_EXT_control_flow_attributes : enable

-#include "types.comp"
-#include "generic_head.comp"
+#include "types.glsl"
+#include "generic_head.glsl"

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
@@ -2,7 +2,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
 #endif

-#include "types.comp"
+#include "types.glsl"

 #if defined(A_TYPE_PACKED16)
 layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
@@ -1,5 +1,5 @@

-#include "types.comp"
+#include "types.glsl"

 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
   block_q4_0_packed16 block;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl
@@ -10,4 +10,4 @@ layout (push_constant) uniform parameter
    uint nel;
 } p;

-#include "types.comp"
+#include "types.glsl"
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp
@@ -2,7 +2,7 @@

 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "dequant_head.comp"
+#include "dequant_head.glsl"

 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
@@ -10,7 +10,7 @@ layout (push_constant) uniform parameter
    uint n_past;
 } p;

-#include "types.comp"
+#include "types.glsl"

 layout(local_size_x = 1, local_size_y = 512, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/div.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/div.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_binary_head.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"

 const uint num_threads = 256;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp
@@ -1,8 +1,8 @@
 #version 450

-#include "rte.comp"
-#include "generic_head.comp"
-#include "types.comp"
+#include "rte.glsl"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp
--- a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp
--- a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp
--- a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
@@ -8,8 +8,8 @@

 #extension GL_KHR_shader_subgroup_shuffle : enable

-#include "types.comp"
-#include "flash_attn_base.comp"
+#include "types.glsl"
+#include "flash_attn_base.glsl"

 const uint32_t HSK_per_thread = HSK / D_split;
 const uint32_t HSV_per_thread = HSV / D_split;
@@ -153,12 +153,13 @@ void main() {
        }

        if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
+            bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;

            [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
                uint32_t c = (idx + tid) % Bc;
                uint32_t r = (idx + tid) / Bc;
                if (idx + tid < Bc * Br) {
-                    if (!KV_bounds_check || j * Bc + c < KV) {
+                    if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
                        masksh[c][r] = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
                    } else {
                        masksh[c][r] = float(0);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@@ -10,8 +10,8 @@
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable

-#include "types.comp"
-#include "flash_attn_base.comp"
+#include "types.glsl"
+#include "flash_attn_base.glsl"

 const uint32_t HSK_per_thread = HSK / D_split;
 const uint32_t HSV_per_thread = HSV / D_split;
@@ -201,11 +201,13 @@ void main() {
        }

        if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
+            bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
+
            [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
                uint32_t c = (idx + tid) % Bc;
                uint32_t r = (idx + tid) / Bc;
                if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
-                    if (!KV_bounds_check || j * Bc + c < KV) {
+                    if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
                        sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]));
                    }
                }
@@ -356,8 +358,8 @@ void main() {
    }

    if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
-        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
-            float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2);
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            float sink = perElemOpGetSink(tile_row(r), 0u, ACC_TYPE(0), iq2);

            float ms = 1.0f;
            float vs = 1.0f;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -16,9 +16,9 @@
 #extension GL_KHR_shader_subgroup_vote : enable
 #extension GL_EXT_null_initializer : enable

-#include "types.comp"
-#include "dequant_funcs_cm2.comp"
-#include "flash_attn_base.comp"
+#include "types.glsl"
+#include "dequant_funcs_cm2.glsl"
+#include "flash_attn_base.glsl"

 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
@@ -154,15 +154,31 @@ void main() {
        }

        if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
-            tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
-            tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
-            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+            bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;

-            coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+            if (nem1_bounds_check) {
+                tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+                tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
+                tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);

-            coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+                coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;

-            S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+                coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+                S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+            } else {
+                tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
+                // Don't clamp against nem1 when GQA is enabled
+                uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1;
+                tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV);
+                tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+
+                coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+
+                coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+                S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+            }
        }

        // Clear padding elements to -inf, so they don't contribute to rowmax
--- a/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "glu_head.comp"
+#include "glu_head.glsl"

 const float GELU_COEF_A    = 0.044715f;
 const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -10,4 +10,4 @@ float op(float a, float b) {
    return 0.5f*a*(2.0f - 2.0f / (exp(2 * val) + 1)) * b;
 }

-#include "glu_main.comp"
+#include "glu_main.glsl"
--- a/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "glu_head.comp"
+#include "glu_head.glsl"

 // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
 // ref: https://www.johndcook.com/blog/python_erf/
@@ -24,4 +24,4 @@ float op(float a, float b) {
    return 0.5f * a * (1.0f + erf_approx) * b;
 }

-#include "glu_main.comp"
+#include "glu_main.glsl"
--- a/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp
@@ -1,6 +1,6 @@
 #version 450

-#include "glu_head.comp"
+#include "glu_head.glsl"

 const float GELU_QUICK_COEF = -1.702f;

@@ -8,4 +8,4 @@ float op(float a, float b) {
    return a * (1.0f / (1.0f + exp(GELU_QUICK_COEF * a))) * b;
 }

-#include "glu_main.comp"
+#include "glu_main.glsl"
--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl
@@ -1,8 +1,8 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_control_flow_attributes : require

-#include "rte.comp"
-#include "utils.comp"
+#include "rte.glsl"
+#include "utils.glsl"

 layout (push_constant) uniform parameter
 {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl
--- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl
--- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "types.comp"
-#include "generic_binary_head.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
@@ -2,9 +2,9 @@

 #extension GL_EXT_control_flow_attributes : enable

-#include "types.comp"
-#include "generic_binary_head.comp"
-#include "dequant_funcs.comp"
+#include "types.glsl"
+#include "generic_binary_head.glsl"
+#include "dequant_funcs.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl
@@ -1,6 +1,6 @@
 #extension GL_EXT_shader_16bit_storage : require

-#include "rte.comp"
+#include "rte.glsl"

 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl
--- a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable
 #define BLOCK_SIZE 512
--- a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable

--- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -3,9 +3,8 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_control_flow_attributes : require

-#include "rte.comp"
-
-#include "types.comp"
+#include "rte.glsl"
+#include "types.glsl"

 layout (push_constant) uniform parameter
 {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp
@@ -4,9 +4,8 @@
 #extension GL_EXT_control_flow_attributes : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require

-#include "rte.comp"
-
-#include "types.comp"
+#include "rte.glsl"
+#include "types.glsl"

 layout (push_constant) uniform parameter
 {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp
@@ -1,7 +1,7 @@
 #version 450

-#include "generic_head.comp"
-#include "types.comp"
+#include "generic_head.glsl"
+#include "types.glsl"

 #extension GL_EXT_control_flow_attributes : enable
 #define BLOCK_SIZE 512
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Radoslav Gerganov	f39283960b	rpc : check src buffer when copying tensor (#16421 ) Only dst buffer is guaranteed to be an RPC buffer. Add check for the src one.	2025-10-04 16:22:45 +03:00
Radoslav Gerganov	898acba681	rpc : add support for multiple devices (#16276 ) * rpc : add support for multiple devices Allow rpc-server to expose multiple devices from a single endpoint. Change RPC protocol to include device identifier where needed. closes: #15210 * fixes * use ggml_backend_reg_t * address review comments * fix llama-bench backend report * address review comments, change device naming * fix cmd order	2025-10-04 12:49:16 +03:00
Acly	e29acf74fe	vulkan : incremental shader builds (#16341 ) * vulkan (DRAFT): split shader generation by GLSL source file, to improve incremental build times * support dep-files so shaders are recompiled if their included files change * rename shader files which are used as "headers" to use .glsl extension * move glslc extension detection shaders to separate folders * the above is to prevent them from getting glob'd with the actual compute shaders that need to be compiled * vulkan : only write embedded shader .hpp/.cpp when they change * avoid recompiling ggml-vulkan.cpp when editing shaders * pass single --source argument instead of --input-dir & --filter to shader gen * check for source file match earlier * fix hang in vulkan-shaders-gen when there are compilation errors * early out did not decrement compile_count * clean up * fix glslc integer dot product test * unconditionally write the embedded shader cpp output * replace output filepath in generated dep-files to match output in CMakeLists --------- Co-authored-by: Jeff Bolz <jbolz@nvidia.com>	2025-10-04 11:42:56 +02:00
Pascal	128d522c04	chat : support Magistral thinking (#16413 ) * feat: added a dedicated Magistral chat format that preserves [THINK] spans, parses reasoning before tool calls * feat: new flow in the chat template test suite for Magistral	2025-10-03 21:51:48 +03:00
ddh0	f6dcda3900	server : context checkpointing for hybrid and recurrent models (#16382 ) * initial commit for branch 3 * generalize `swa_checkpoint` to `ctx_checkpoint` this extends `llama-server`'s SWA checkpointing logic to include hybrid/recurrent models such as Jamba, Granite * oops * disable debug prints * keep backwards compat with `--swa-checkpoints` Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * update prompt re-processing message * fix off-by-one error per GG * keep `seq_rm` log per GG Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * server : fix checkpoint logic to support recurrent caches * server : cleanup and fixes --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-10-03 21:34:51 +03:00
Georgi Gerganov	606a73f531	metal : fix loop bound in ggml_mem_ranges (#16412 )	2025-10-03 19:18:56 +03:00
Sigbjørn Skjæret	946f71ed9a	llama : fix shapes for bert/mpt q/k norm (#16409 )	2025-10-03 14:40:25 +02:00
Acly	638d330246	ggml : fix graph reallocation with multiple chunks (#16396 ) reallocation is needed if a single chunk grows in size, even if total allocation size stays the same or is lower	2025-10-03 13:49:08 +02:00
Aleksander Grygier	84c8e305e8	Fix missing messages on sibling navigation (#16408 ) * fix: resolve message disappearing issue when navigating between regenerated siblings by using current leaf nodes instead of cached sibling IDs * chore: update webui build output * chore: update webui build output	2025-10-03 12:51:40 +02:00
Jeff Bolz	2aaf0a2a20	vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE (#16354 ) * vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE Replace maxMemoryAllocationSize check with maxBufferSize when creating buffers. The maxMemoryAllocationSize limit is a "soft" limit and allocations can succeed beyond that limit. This allows > 4GB buffers to be allocated on some implementations (e.g. NVIDIA) and tensors this large can be used for im2col and mul_mat. For temporary buffers (prealloc_x/y/etc) check against maxStorageBufferRange. I'm not sure this check is ideal, but we always use these buffers as a single full size binding and the limit may be smaller than maxMemoryAllocationSize or maxBufferSize, so I think this is reasonable. Replace descriptor range uses of VK_WHOLE_SIZE with a manually computed range. The maxStorageBufferRange may be smaller than the maxBufferSize or maxMemoryAllocationSize (and the Vulkan spec warns about this in a note) and it's invalid usage if VK_WHOLE_SIZE computes a range larger than maxStorageBufferRange. With this change, it should be possible to generate videos using wan networks in stable-diffusion.cpp. * vulkan: Add env var GGML_VK_FORCE_MAX_BUFFER_SIZE and use stoull	2025-10-03 12:50:46 +02:00
Jeff Bolz	0e1f838556	vulkan: Fix FA coopmat1 invalid array indexing (#16365 ) When computing sinks, the cm1 shader was looping r from 0 to Br rather than to rows_per_thread. I must have copied this from the scalar path (where it is correct), and somehow it wasn't causing failures on current drivers.	2025-10-03 11:52:46 +02:00
Daniel Bevenius	ad126479c2	ci : change macos-13 to macos-15-intel (#16401 ) This commit updates the macos-13 runners to macos-15-intel. The motivation for this changes is the macos-13 runners are scheduled to be retired on 2025-12-04. Refs: https://github.blog/changelog/2025-09-19-github-actions-macos-13-runner-image-is-closing-down/	2025-10-03 11:45:16 +02:00
Aleksander Grygier	77233277c9	Capture model name only after first token (streaming) or completed request (#16405 ) * feat: Capture model name only after first token (streaming) or completed request (non-streaming) * chore: update webui build output * chore: update webui build output	2025-10-03 11:30:39 +02:00
Jeff Bolz	e308efda8e	vulkan: in flash attention, bounds check against nem1 (don't rely on GGML_KQ_MASK_PAD) (#16316 )	2025-10-03 10:33:08 +02:00
Aleksander Grygier	136bda78c5	webui : Fix messages payload sent to chat completions (#16402 ) * fix: Include just the currently active message branches instead of all in chat completions request * chore: Build webui static output * chore: Formatting * chore: update webui build output	2025-10-03 10:11:34 +03:00
Pascal	5113efd34c	fix: track viewportHeight via window.innerHeight to avoid unwanted scrolling (#16356 ) Use <svelte:window bind:innerHeight> instead of manual resize listener Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>	2025-10-03 08:01:31 +02:00
Sigbjørn Skjæret	d64c8104f0	test-barrier : do not use more threads than physically available (#16389 ) * do not use more threads than physically available * ensure n_threads > 0 Co-authored-by: Jeff Bolz <jbolz@nvidia.com> --------- Co-authored-by: Jeff Bolz <jbolz@nvidia.com>	2025-10-02 20:10:12 +02:00
Reese Levine	ef07a40906	ggml webgpu: add support for soft_max, optimize rms_norm (#16357 ) * Add inplace softmax * Move rms_norm to split row approach * Update debug for supports_op * clean up debug statements * Update tests/test-backend-ops.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-10-02 11:00:31 -07:00
Piotr Wilkin (ilintar)	34fcc5a4ac	model : Apertus model implementation (#15852 ) * First attempt * No permute during convert (fixes qk tensors), proper norm application. * RoPE = NeoX * Coherence! * Migrate xielu params from tensors to hyperparameters * Simple CUDA kernel * Revert stupid LLM refactorings * Chat template support * configchecker / flake8 errors * Reorder unary.cu * I do conclude that LLMs are, in fact, stupid. * Fix after merge * Final newline * Make xIELU an UNARY_OP * Final newline * Correctly account for parameter shift * Argh. * Update ggml/src/ggml-cpu/unary-ops.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Refactor: remove unused methods, inline and factorize softplus, add const modifiers * Revert CUDA changes, implement xIELU as a separate OP * Pesky newline * Add float2half / half2float for F16 inputs/outputs * CUDA variants, attempt 2 * Actually, attempt 3 * Update ggml/src/ggml-cuda/unary.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Missing convert header * Proper formula and reference for xIELU in the comments. * Modify unary-ops.cpp to add the functor-based logic besides the template system to retain optimizations * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Add tensor mappings for Apertus to global list instead * Fix lazy on scalars * Update ggml/src/ggml-cuda/unary.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Add comment about the constraints on positive/negative alpha * Change `softplus` to `ggml_softplus` --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-10-02 20:43:22 +03:00
R0CKSTAR	91a2a56556	musa: update compile flags (#16265 ) Signed-off-by: Xiaodong Ye <yeahdongcn@gmail.com>	2025-10-02 16:29:56 +03:00
Sigbjørn Skjæret	72ee736c44	ci : fix ubuntu-latest-cmake-rpc (disable ccache) (#16388 )	2025-10-02 13:51:36 +02:00
Eve	f09aefaa84	ci: update vulkan ci (#16294 )	2025-10-02 10:10:07 +02:00
Georgi Gerganov	bbd32bc038	ci : fix clean-up of old logs (#16381 )	2025-10-02 10:35:43 +03:00