Compare commits

...

25 Commits
b7844 ... b7869

Author SHA1 Message Date
Vishal Singh
b33df266d0 ggml-zendnn : resolve ZenDNN backend cross-module symbol dependency (#19159) 2026-01-29 12:28:57 +08:00
Aman Gupta
3bcc990997 CUDA: refactor topk-moe to enable more models (GLM 4.7, Nemotron etc.) (#19126) 2026-01-29 10:31:28 +08:00
Neo Zhang
d4964a7c66 sycl: fix norm kernels: l2_norm, group_norm, rms_norm by removing asserts to support more cases (#19154)
Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>
2026-01-29 09:20:22 +08:00
Sigbjørn Skjæret
50e8962f79 ci : find latest release with asset for winget (#19161) 2026-01-28 22:05:39 +01:00
Ruben Ortlam
f6b533d898 Vulkan Flash Attention Coopmat1 Refactor (#19075)
* vulkan: use coopmat for flash attention p*v matrix multiplication

* fix P loading issue

* fix barrier position

* remove reduction that is no longer needed

* move max thread reduction into loop

* remove osh padding

* add bounds checks and padding

* remove unused code

* fix shmem sizes, loop duration and accesses

* don't overwrite Qf, add new shared psh buffer instead

* add missing bounds checks

* use subgroup reductions

* optimize

* move bounds check, reduce barriers

* support other Bc values and other subgroup sizes

* remove D_split

* replace Of register array with shared memory Ofsh array

* parallelize HSV across the rowgroups

* go back to Of in registers, not shmem

* vectorize sfsh

* don't store entire K tile in shmem

* fixes

* load large k tiles to shmem on Nvidia

* adapt shared memory host check function to shader changes

* remove Bc 32 case

* remove unused variable

* fix missing mask reduction tmspsh barrier

* fix mask bounds check

* fix rowmax f16 under/overflow to inf

* fix flash_attn_cm2 BLOCK_SIZE preprocessor directives
2026-01-28 18:52:45 +01:00
Sascha Rogmann
72d3b1898a spec : add self-speculative decoding (no draft model required) + refactor (#18471)
* server: introduce self-speculative decoding

* server: moved self-call into speculative.cpp

* can_speculate() includes self-speculation

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* server: can_speculate() tests self-spec

* server: replace can_speculate() with slot.can_speculate()

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* common: use %zu format specifier for size_t in logging

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* server: can_speculate() requires a task instance

* common: ngram map, config self-speculative decoding

* common: add enum common_speculative_type

* common: add vector of speculative states

* common: add option --spec-draftless

* server: cleanup (remove slot.batch_spec, rename)

* common: moved self-spec impl to ngram-map

* common: cleanup (use common_speculative_state_draft)

* spec : refactor

* cont : naming

* spec: remove --spec-config

* doc: (draftless) speculative decoding

* common: print performance in spec decoding

* minor : cleanup

* common : better names

* minor : cleanup + fix build

* minor: comments

* CODEOWNERS: add common/ngram-map.* (#18471)

* common : rename speculative.draftless_type -> speculative.type

* ngram-map : fix uninitialized values

* ngram-map : take into account the input can become shorter

* ngram-map : revert len check for now

* arg : change `--spec-draftless` -> `--spec-type`

* spec : add common_speculative_state::accept()

* spec : refactor + add common_speculative_begin()

* spec : fix begin() call with mtmd

* spec : additional refactor + remove common_speculative_params

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-01-28 19:42:42 +02:00
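
The draftless mode replaces the draft model with an n-gram lookup over the target model's own context: the last N generated tokens form a key, and if that key has been seen with enough hits, the M tokens that followed it are proposed as a draft and verified by the target model as usual. A minimal C++ sketch of the idea, with illustrative names only (the actual implementation lives in common/ngram-map.* and common/speculative.*):

#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using llama_token = int32_t;

// illustrative sketch, not the actual common/ngram-map.* code
struct ngram_map_sketch {
    size_t n = 12; // --spec-ngram-size-n: length of the lookup n-gram (key)
    size_t m = 48; // --spec-ngram-size-m: length of the proposed m-gram (draft)

    // key n-gram -> (most recent following m-gram, hit count)
    std::map<std::vector<llama_token>, std::pair<std::vector<llama_token>, int>> idx;

    // index every (n-gram -> following m-gram) pair seen in the context so far
    void update(const std::vector<llama_token> & toks) {
        for (size_t i = 0; i + n + m <= toks.size(); ++i) {
            std::vector<llama_token> key(toks.begin() + i, toks.begin() + i + n);
            auto & e = idx[key];
            e.first.assign(toks.begin() + i + n, toks.begin() + i + n + m);
            e.second += 1;
        }
    }

    // propose a draft if the last n tokens were seen at least min_hits times
    std::vector<llama_token> draft(const std::vector<llama_token> & toks, int min_hits) const {
        if (toks.size() < n) {
            return {};
        }
        std::vector<llama_token> key(toks.end() - n, toks.end());
        auto it = idx.find(key);
        if (it == idx.end() || it->second.second < min_hits) {
            return {};
        }
        return it->second.first; // accepted/rejected by the target model like any draft
    }
};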
Daniel Bevenius
ebf5725870 convert : yield Mamba2Model/GraniteMoeModel modify_tensors (#19157)
* convert : yield Mamba2Model/GraniteMoeModel modify_tensors

This commit updates the `GraniteHybridModel` class' modify_tensors
function to properly delegate to `Mamba2Model.modify_tensors` and
`GraniteMoeModel.modify_tensors` using 'yield from' instead of 'return'.

The motivation for this is that modify_tensors is a generator function
(it uses 'yield from'), but the two calls above used return statements
and did not yield anything, which means the caller of this function
would not receive any yielded values from it. This caused layer tensors
to be silently dropped during conversion.
2026-01-28 16:49:36 +01:00
Patryk Kaminski
0cd7032ca4 ggml-sycl: remove unused syclcompat header (#19140)
syclcompat/math.hpp is no longer used. The change that introduced it was successfully reverted (https://github.com/ggml-org/llama.cpp/pull/17826).
This include path will become obsolete and be dropped in oneAPI 2026.0, effectively breaking ggml-sycl builds.
2026-01-28 23:33:54 +08:00
Sigbjørn Skjæret
60368e1d73 jinja : undefined should be treated as sequence/iterable (return string/array) by filters/tests (#19147)
* undefined is treated as iterable (string/array) by filters

`tojson` is not a supported `undefined` filter

* add tests

* add sequence and iterable tests

keep it DRY and fix some types
2026-01-28 14:40:29 +01:00
Oleksandr Kuvshynov
88d23ad515 vulkan: handle device dedup on MacOS + Vega II Duo cards (#19058)
Deduplication here relied on Vulkan returning a unique UUID for each
physical GPU. At the moment that is not always the case.
On a Mac Pro 2019 running macOS, with 2 Vega II Duo cards (so 4 GPUs total),
MoltenVK assigns the same UUID to pairs of GPUs unless they
are connected with Infinity Fabric.

See more details here: KhronosGroup/MoltenVK#2683.

The right way is to fix that in MoltenVK, but until it is fixed,
llama.cpp would only recognize 2 of the 4 GPUs in such a configuration.

The deduplication logic here is changed to only filter out GPUs whose
UUID is the same but whose driver is different.
2026-01-28 12:35:54 +01:00
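
The adjusted rule can be sketched as a predicate over (UUID, driver) pairs, with hypothetical names (the real check lives in the Vulkan backend's device enumeration): two physical devices are treated as duplicates only when they report the same UUID but come from different drivers, while the same UUID from the same driver, as in the MoltenVK case above, keeps both GPUs.

#include <array>
#include <cstdint>

// illustrative only, not the actual ggml-vulkan code
struct vk_dev_id {
    std::array<uint8_t, 16> uuid;      // from VkPhysicalDeviceIDProperties::deviceUUID
    uint32_t                driver_id; // from VkPhysicalDeviceDriverProperties::driverID
};

// same GPU exposed twice by different drivers -> duplicate, filter one out;
// same UUID reported by the same driver (the MoltenVK bug) -> keep both
static bool is_duplicate(const vk_dev_id & a, const vk_dev_id & b) {
    return a.uuid == b.uuid && a.driver_id != b.driver_id;
}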
Ben Chen
0a95026da9 doc: add build instruction to use Vulkan backend on macos (#19029) 2026-01-28 12:30:16 +01:00
Kevin Pouget
b7feacf7f3 ggml: new backend for Virglrenderer API Remoting acceleration (v2) (#18718) 2026-01-28 17:49:40 +08:00
Alberto Cabrera Pérez
6ad70c5a77 ggml-cpu: arm64: Q4_K scale unroll and vectorization (#19108) 2026-01-28 09:15:56 +02:00
Georgi Gerganov
631cbfcc7a cuda : fix "V is K view" check for non-unified KV cache (#19145) 2026-01-28 09:15:27 +02:00
Georgi Gerganov
2eee6c866c CUDA: tune GLM 4.7 Flash FA kernel selection logic (DGX Spark) (#19142) 2026-01-28 09:15:11 +02:00
Georgi Gerganov
b931f81b5a server : adjust spec tests to generate up to 16 tokens (#19093) 2026-01-28 09:11:40 +02:00
Georgi Gerganov
c5c64f72ac llama : disable Direct IO by default (#19109)
* llama : disable Direct IO by default

* cont : override mmap if supported
2026-01-28 09:11:13 +02:00
Daniel Bevenius
eef375ce16 sampling : remove sampling branching in output_reserve (#18811)
* sampling : remove sampling branching in output_reserve

This commit updates output_reserve in llama-context.cpp to always
allocate sampling buffers regardless of whether sampling is needed for
the current batch.

The motivation for this is to avoid reallocations and branching based on
the sampling requirements of the batch.
2026-01-28 05:59:30 +01:00
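
In effect, the sampling buffers are sized for the full output capacity up front instead of being resized per batch. A simplified illustration of the pattern, not the actual llama-context.cpp code:

#include <cstddef>
#include <vector>

// sketch: unconditional reservation keeps the capacity stable across batches
struct output_buffers_sketch {
    std::vector<float> logits;
    std::vector<float> embd;

    void reserve(size_t n_outputs, size_t n_vocab, size_t n_embd) {
        // before: logits was only resized when the current batch sampled,
        // so toggling sampling between batches could trigger reallocation
        logits.resize(n_outputs * n_vocab); // now always allocated
        embd.resize(n_outputs * n_embd);
    }
};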
Nikhil Jain
06961e2876 ggml webgpu: Split shared state (webgpu_context) into global state and per-thread state (#18976)
* Squashed commit of the following:

commit b3c6bf4b0450d8d452b934df27a0fb7cb53cd755
Author: Abhijit Ramesh <abhijitramesh2k@gmail.com>
Date:   Mon Dec 1 18:29:00 2025 -0800

    ggml webgpu: fix xielu parameter passing (#11)

    The XIELU operation was incorrectly using static_cast to convert
    float parameters to uint32_t, which converted numeric values instead
    of preserving IEEE 754 bit patterns. This caused incorrect values
    to be interpreted by the GPU shader.

    * Use reinterpret_cast to preserve float bit patterns when passing
      through uint32_t params buffer
    * Update WGSL shader parameter types from u32 to f32
    * Re-enable XIELU support (was disabled due to numerical issues)

    Fixes NMSE test failures for XIELU operation on WebGPU backend.

commit 5ca9b5e49e
Author: neha-ha <137219201+neha-ha@users.noreply.github.com>
Date:   Tue Nov 18 12:17:00 2025 -0800

    Refactored pipelines and workgroup calculations (#10)

    * refactored pipelines

    * refactored workgroup calculation

    * removed commented out block of prior maps

    * Clean up ceiling division pattern

    ---------

    Co-authored-by: Neha Abbas <nehaabbas@eduroam-169-233-141-223.ucsc.edu>
    Co-authored-by: Reese Levine <reeselevine1@gmail.com>

Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 29 23:13:06 2025 -0700

    formatted embed wgsl and ggml-webgpu.cpp

commit e1f6baea31
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 29 23:08:37 2025 -0700

    implemented REPL_Template support and removed bug in unary operators kernel

commit 8c70b8fece
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 15 16:14:20 2025 -0700

    responded and dealt with PR comments

commit f9282c660c
Author: James Contini <jamescontini@gmail.com>
Date:   Sun Oct 12 13:41:41 2025 -0700

    removed unnecessary checking if node->src[1] exists for unary operators

commit 4cf28d7dec
Author: James Contini <jamescontini@gmail.com>
Date:   Sun Oct 12 13:32:45 2025 -0700

    All operators (including xielu) working

commit 74c6add176
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 13:16:48 2025 -0700

    fixed autoconfig

commit 362749910b
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 13:10:46 2025 -0700

    removed vestigial files

commit cb08583337
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 12:59:32 2025 -0700

    abides by editor-config

commit 5360e2852a
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 12:45:57 2025 -0700

    rms_norm double declaration bug atoned

commit 7b09baa4aa
Merge: 8a6ec843 74b8fc17
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 11:50:03 2025 -0700

    resolving merge conflicts

commit 8a6ec843a5
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 8 18:06:47 2025 -0700

    unary operators pass ggml tests

commit c3ae38278a
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 1 16:22:40 2025 -0700

    neg passes backend test

commit aa1c9b2f88
Author: James Contini <jamescontini@gmail.com>
Date:   Tue Sep 30 23:55:27 2025 -0700

    neg f16xf32xip builds and runs, haven't actually run a model that uses the neg kernel yet though

Co-authored-by: James Contini <jamescontini@gmail.com>
Co-authored-by: Neha Abbas <neabbas@ucsc.edu>
Co-authored-by: Abhijit Ramesh <abhijitramesh2k@gmail.com>

* Remove extra code and format

* Add ops documentation (finally)

* ggml webgpu: add SOFTPLUS unary operator

Implements SOFTPLUS (log(1 + exp(x))) with f16/f32 support. Uses f32
precision for intermediate calculations to prevent f16 overflow.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support
* Follow Vulkan backend numerical stability pattern

* ggml webgpu: add EXPM1 unary operator

Implements EXPM1 (exp(x) - 1) with f16/f32 support.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support

* ggml webgpu: add FLOOR unary operator

Implements FLOOR (rounds down to nearest integer) with f16/f32 support.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support

* ggml webgpu: add CEIL unary operator

Implements CEIL (rounds up to nearest integer) with f16/f32 support.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support

* ggml webgpu: add ROUND unary operator

Implements ROUND (rounds to nearest integer) with f16/f32 support.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support

* ggml webgpu: add TRUNC unary operator

Implements TRUNC (truncates towards zero) with f16/f32 support.

* Add shader implementation and 4 variants (f32/f16, inplace/non-inplace)
* Register pipelines and device support

* docs : update WebGPU support for unary operators (FLOOR, CEIL, ROUND, TRUNC, EXPM1, SOFTPLUS)

* Updates to webgpu get_memory

* Move shared state (webgpu_context) and device creation out of registration context, device context, and buffer context, and move into backend context

* Small cleanup

* Move Instance, Device, Adapter, Device creation, and capabilities to global state while moving Queue, pipelines, and buffers to per-thread state.

* Cleanups

* More cleanup

* Move staging_buf mutex to global context

* Resolve merge

* Resolve merge

* Resolve merge

* Clean up merge errors, delete forward declaration, and run clang-format

* Rename device_init to backend_init

* Move webgpu_context to backend_context

* Move buffer context members into global context and refactor function calls

* Run clang-format

* Remove comments

* Move parameter buffers to per-thread, add single memset_tensor param buf

* Fix CI compilation issue

* Fix builds for emscripten not supporting subgroups

* cleanup

* cleanup

---------

Co-authored-by: Reese Levine <reeselevine1@gmail.com>
2026-01-27 20:53:36 -08:00
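
Two of the fixes folded into this PR are worth illustrating: passing float shader parameters through the u32 params buffer must preserve the IEEE 754 bit pattern (static_cast converts the numeric value instead, the XIELU bug described above), and SOFTPLUS intermediates are computed in f32 because exp(x) overflows f16 for moderate inputs. A minimal C++ sketch of both ideas (std::memcpy stands in as the well-defined equivalent of the reinterpret_cast used in the commit; _Float16 is a compiler extension used here only for illustration):

#include <cmath>
#include <cstdint>
#include <cstring>

// bit-preserving: 0.5f -> 0x3F000000, decoded back as f32 by the WGSL shader;
// by contrast, static_cast<uint32_t>(0.5f) == 0 -- the bug being fixed
static uint32_t f32_to_bits(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return u;
}

// softplus(x) = log(1 + exp(x)); f16 maxes out at ~65504, so exp(x) already
// overflows near x ~ 11.1 -- hence the f32 intermediate
static _Float16 softplus_f16(_Float16 x) {
    return static_cast<_Float16>(std::log1p(std::exp(static_cast<float>(x))));
}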
Vishal Singh
f2571df8b7 ggml-zendnn : update ZenDNN git tag to main branch (#19133) 2026-01-28 06:21:36 +08:00
Sigbjørn Skjæret
2b4cbd2834 jinja : implement mixed type object keys (#18955)
* implement mixed type object keys

* add tests

* refactor

* minor fixes

* massive refactor

* add more tests

* forgotten tuples

* fix array/object is_hashable

* correct (albeit broken) jinja responses

verified with transformers

* improved hashing and equality

* refactor hash function

* more exhaustive test case

* clean up

* cont

* cont (2)

* missing cstring

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-01-27 19:50:42 +01:00
David Lima
68ac3acb43 docs: Remove duplicated word on CUDA build section (#19136) 2026-01-27 14:48:51 +01:00
Johannes Gäßler
a5bb8ba4c5 CUDA: tune GLM 4.7 Flash FA kernel selection logic (#19097) 2026-01-27 14:28:56 +01:00
Sigbjørn Skjæret
c0204a0893 ci : revert slim runner for winget (#19129) 2026-01-27 11:54:25 +01:00
Alberto Cabrera Pérez
be8890e721 ggml-cpu: aarch64: q6_K repack gemm and gemv (and generic) implementations (i8mm) #18860 (#18888)
* Boilerplate for q6_K repack

* q6_K repack to q6_Kx8 implementation

Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>

* q6_K generic gemv and gemm

* wip, gemm_q6_K 8x8

* Still WIP: loading of q8s, q6h and q6l

* first working version of q6_K gemm

* Moved q6 loads outside of sb block, Unrolled inner loop

* Replaced modulo with mask

* First implementation of GEMV

* ggml_vdotq_s32 -> vdotq_s32

* Reduce width of accumulators in q6_K gemv

* Bsums instead of calc bias. Preload scales to use vget_lane. Unroll.

* Reuse scales in GEMM (same GEMV opt)

* Added todos for bsum and different qh repack

* Arch fallback

* VSLIQ for merging qh and ql

* Removed TODO, already tested

* Apply suggestions

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Removed unused import

---------

Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-01-27 11:08:10 +02:00
105 changed files with 10474 additions and 2346 deletions

View File

@@ -8,7 +8,7 @@ on:
jobs:
update:
name: Update Winget Package
runs-on: ubuntu-slim
runs-on: ubuntu-latest
if: github.repository_owner == 'ggml-org'
steps:
@@ -28,16 +28,17 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
});
console.log("Latest release:", releases[0].tag_name);
return releases[0].tag_name;
const { tag_name: version, assets: assets } = releases.find(({assets}) => assets.find(asset => asset.name.includes('win-vulkan')));
const { browser_download_url: asset_url } = assets.find(asset => asset.name.includes('win-vulkan'));
console.log("Latest release:", version);
core.setOutput('VERSION', version);
core.setOutput('ASSETURL', asset_url);
- name: Update manifest
env:
VERSION: ${{ steps.find_latest_release.outputs.result }}
run: |
echo "Updating manifest..."
komac update --version ${{ env.VERSION }} \
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
komac update --version ${{ steps.find_latest_release.outputs.VERSION }} \
--urls "${{ steps.find_latest_release.outputs.ASSETURL }}" \
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
--submit \
ggml.llamacpp

View File

@@ -18,6 +18,7 @@
/common/jinja/ @ngxson @CISC @aldehir
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/ngram-map.* @srogmann
/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
@@ -67,6 +68,7 @@
/ggml/src/ggml-rpc/ @rgerganov
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml.c @ggerganov

View File

@@ -73,6 +73,8 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
ngram-map.cpp
ngram-map.h
peg-parser.cpp
peg-parser.h
preset.cpp

View File

@@ -6,6 +6,7 @@
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "speculative.h"
#include "preset.h"
// fix problem with std::min and std::max
@@ -579,14 +580,14 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
for (const auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
common_params_handle_model(params.speculative.mparams_dft, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}
// model is required (except for server)
@@ -1216,16 +1217,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-lcs", "--lookup-cache-static"}, "FNAME",
"path to static lookup cache to use for lookup decoding (not updated by generation)",
[](common_params & params, const std::string & value) {
params.lookup_cache_static = value;
params.speculative.lookup_cache_static = value;
}
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
[](common_params & params, const std::string & value) {
params.lookup_cache_dynamic = value;
params.speculative.lookup_cache_dynamic = value;
}
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-c", "--ctx-size"}, "N",
string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
@@ -1295,9 +1296,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-kvu", "--kv-unified"},
{"-no-kvu", "--no-kv-unified"},
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
[](common_params & params, bool value) {
params.kv_unified = value;
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
add_opt(common_arg(
@@ -2198,18 +2200,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
if (value) {
params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
}
}
).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_direct_io = value;
}
@@ -2565,7 +2564,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.hf_repo = value;
params.speculative.mparams_dft.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
@@ -3386,7 +3385,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
params.speculative.mparams_dft.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
@@ -3396,6 +3395,66 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
common_speculative_type_to_str(params.speculative.type).c_str()),
[](common_params & params, const std::string & value) {
if (value == "none") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
} else if (value == "ngram-cache") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
} else if (value == "ngram-simple") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
} else if (value == "ngram-map-k") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
} else if (value == "ngram-map-k4v") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
} else {
throw std::invalid_argument("unknown speculative decoding type without draft model");
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-size-n"}, "N",
string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
[](common_params & params, int value) {
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_n = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-size-m"}, "N",
string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
[](common_params & params, int value) {
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_m = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-check-rate"}, "N",
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram check rate must be at least 1");
}
params.speculative.ngram_check_rate = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-min-hits"}, "N",
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram min hits must be at least 1");
}
params.speculative.ngram_min_hits = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@@ -3622,8 +3681,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.port = 8012;
params.n_ubatch = 1024;
params.n_batch = 1024;
@@ -3638,8 +3697,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.port = 8012;
params.n_ubatch = 1024;
params.n_batch = 1024;

View File

@@ -1097,7 +1097,10 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.tensor_split,
params.tensor_buft_overrides.data(),
params.fit_params_target.data(),
params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
@@ -1208,10 +1211,6 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}
void common_init_result::free_context() {
pimpl->context.reset();
}
common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result_ptr res(new common_init_result(params));

View File

@@ -164,6 +164,16 @@ enum common_params_sampling_config : uint64_t {
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
};
enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
// sampling parameters
struct common_params_sampling {
@@ -243,16 +253,35 @@ struct common_params_model {
};
struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
// general-purpose speculative decoding parameters
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
// ngram-based speculative decoding
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
// draft-model speculative decoding
struct common_params_model mparams_dft;
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
int32_t n_ctx = 0; // draft context size
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -260,7 +289,14 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct common_params_model model;
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool has_dft() const {
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
}
};
struct common_params_vocoder {
@@ -378,8 +414,6 @@ struct common_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
// llama-debug specific options
@@ -438,7 +472,7 @@ struct common_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_direct_io = false; // read from disk without buffering
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
@@ -575,10 +609,6 @@ struct common_params {
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}
};
// call once at the start of a program if it uses libcommon
@@ -714,8 +744,6 @@ struct common_init_result {
std::vector<llama_adapter_lora_ptr> & lora();
void free_context();
private:
struct impl;
std::unique_ptr<impl> pimpl;

View File

@@ -44,6 +44,12 @@ static std::string get_line_col(const std::string & source, size_t pos) {
return "line " + std::to_string(line) + ", column " + std::to_string(col);
}
static void ensure_key_type_allowed(const value & val) {
if (!val->is_hashable()) {
throw std::runtime_error("Type: " + val->type() + " is not allowed as object key");
}
}
// execute with error handling
value statement::execute(context & ctx) {
try {
@@ -95,20 +101,10 @@ value identifier::execute_impl(context & ctx) {
value object_literal::execute_impl(context & ctx) {
auto obj = mk_val<value_object>();
for (const auto & pair : val) {
value key_val = pair.first->execute(ctx);
if (!is_val<value_string>(key_val) && !is_val<value_int>(key_val)) {
throw std::runtime_error("Object literal: keys must be string or int values, got " + key_val->type());
}
std::string key = key_val->as_string().str();
value key = pair.first->execute(ctx);
value val = pair.second->execute(ctx);
JJ_DEBUG("Object literal: setting key '%s' with value type %s", key.c_str(), val->type().c_str());
JJ_DEBUG("Object literal: setting key '%s' with value type %s", key->as_string().str().c_str(), val->type().c_str());
obj->insert(key, val);
if (is_val<value_int>(key_val)) {
obj->val_obj.is_key_numeric = true;
} else if (obj->val_obj.is_key_numeric) {
throw std::runtime_error("Object literal: cannot mix numeric and non-numeric keys");
}
}
return obj;
}
@@ -127,9 +123,9 @@ value binary_expression::execute_impl(context & ctx) {
value right_val = right->execute(ctx);
JJ_DEBUG("Executing binary expression %s '%s' %s", left_val->type().c_str(), op.value.c_str(), right_val->type().c_str());
if (op.value == "==") {
return mk_val<value_bool>(value_compare(left_val, right_val, value_compare_op::eq));
return mk_val<value_bool>(*left_val == *right_val);
} else if (op.value == "!=") {
return mk_val<value_bool>(!value_compare(left_val, right_val, value_compare_op::eq));
return mk_val<value_bool>(!(*left_val == *right_val));
}
auto workaround_concat_null_with_str = [&](value & res) -> bool {
@@ -230,7 +226,7 @@ value binary_expression::execute_impl(context & ctx) {
auto & arr = right_val->as_array();
bool member = false;
for (const auto & item : arr) {
if (value_compare(left_val, item, value_compare_op::eq)) {
if (*left_val == *item) {
member = true;
break;
}
@@ -265,10 +261,9 @@ value binary_expression::execute_impl(context & ctx) {
}
}
// String in object
if (is_val<value_string>(left_val) && is_val<value_object>(right_val)) {
auto key = left_val->as_string().str();
bool has_key = right_val->has_key(key);
// Value key in object
if (is_val<value_object>(right_val)) {
bool has_key = right_val->has_key(left_val);
if (op.value == "in") {
return mk_val<value_bool>(has_key);
} else if (op.value == "not in") {
@@ -465,14 +460,8 @@ value for_statement::execute_impl(context & ctx) {
JJ_DEBUG("%s", "For loop over object keys");
auto & obj = iterable_val->as_ordered_object();
for (auto & p : obj) {
auto tuple = mk_val<value_array>();
if (iterable_val->val_obj.is_key_numeric) {
tuple->push_back(mk_val<value_int>(std::stoll(p.first)));
} else {
tuple->push_back(mk_val<value_string>(p.first));
}
tuple->push_back(p.second);
items.push_back(tuple);
auto tuple = mk_val<value_tuple>(p);
items.push_back(std::move(tuple));
}
if (ctx.is_get_stats) {
iterable_val->stats.used = true;
@@ -602,11 +591,13 @@ value set_statement::execute_impl(context & ctx) {
auto rhs = val ? val->execute(ctx) : exec_statements(body, ctx);
if (is_stmt<identifier>(assignee)) {
// case: {% set my_var = value %}
auto var_name = cast_stmt<identifier>(assignee)->val;
JJ_DEBUG("Setting global variable '%s' with value type %s", var_name.c_str(), rhs->type().c_str());
ctx.set_val(var_name, rhs);
} else if (is_stmt<tuple_literal>(assignee)) {
// case: {% set a, b = value %}
auto tuple = cast_stmt<tuple_literal>(assignee);
if (!is_val<value_array>(rhs)) {
throw std::runtime_error("Cannot unpack non-iterable type in set: " + rhs->type());
@@ -625,6 +616,7 @@ value set_statement::execute_impl(context & ctx) {
}
} else if (is_stmt<member_expression>(assignee)) {
// case: {% set ns.my_var = value %}
auto member = cast_stmt<member_expression>(assignee);
if (member->computed) {
throw std::runtime_error("Cannot assign to computed member");
@@ -767,22 +759,22 @@ value member_expression::execute_impl(context & ctx) {
}
JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
ensure_key_type_allowed(property);
value val = mk_val<value_undefined>("object_property");
if (is_val<value_undefined>(object)) {
JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined");
return val;
} else if (is_val<value_object>(object)) {
if (!is_val<value_string>(property)) {
throw std::runtime_error("Cannot access object with non-string: got " + property->type());
}
auto key = property->as_string().str();
val = object->at(key, val);
val = object->at(property, val);
if (is_val<value_undefined>(val)) {
val = try_builtin_func(ctx, key, object, true);
}
JJ_DEBUG("Accessed property '%s' value, got type: %s", key.c_str(), val->type().c_str());
} else if (is_val<value_array>(object) || is_val<value_string>(object)) {
if (is_val<value_int>(property)) {
int64_t index = property->as_int();
@@ -806,6 +798,7 @@ value member_expression::execute_impl(context & ctx) {
auto key = property->as_string().str();
JJ_DEBUG("Accessing %s built-in '%s'", is_val<value_array>(object) ? "array" : "string", key.c_str());
val = try_builtin_func(ctx, key, object, true);
} else {
throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type());
}

View File

@@ -79,18 +79,18 @@ struct context {
}
value get_val(const std::string & name) {
auto it = env->val_obj.unordered.find(name);
if (it != env->val_obj.unordered.end()) {
return it->second;
} else {
return mk_val<value_undefined>(name);
}
value default_val = mk_val<value_undefined>(name);
return env->at(name, default_val);
}
void set_val(const std::string & name, const value & val) {
env->insert(name, val);
}
void set_val(const value & name, const value & val) {
env->insert(name, val);
}
void print_vars() const {
printf("Context Variables:\n%s\n", value_to_json(env, 2).c_str());
}
@@ -344,9 +344,19 @@ struct array_literal : public expression {
}
};
struct tuple_literal : public array_literal {
explicit tuple_literal(statements && val) : array_literal(std::move(val)) {}
struct tuple_literal : public expression {
statements val;
explicit tuple_literal(statements && val) : val(std::move(val)) {
for (const auto& item : this->val) chk_type<expression>(item);
}
std::string type() const override { return "TupleLiteral"; }
value execute_impl(context & ctx) override {
auto arr = mk_val<value_array>();
for (const auto & item_stmt : val) {
arr->push_back(item_stmt->execute(ctx));
}
return mk_val<value_tuple>(std::move(arr->as_array()));
}
};
struct object_literal : public expression {

View File

@@ -61,6 +61,12 @@ size_t string::length() const {
return len;
}
void string::hash_update(hasher & hash) const noexcept {
for (const auto & part : parts) {
hash.update(part.val.data(), part.val.length());
}
}
bool string::all_parts_are_input() const {
for (const auto & part : parts) {
if (!part.is_input) {

View File

@@ -4,6 +4,8 @@
#include <string>
#include <vector>
#include "utils.h"
namespace jinja {
// allow differentiate between user input strings and template strings
@@ -37,6 +39,7 @@ struct string {
std::string str() const;
size_t length() const;
void hash_update(hasher & hash) const noexcept;
bool all_parts_are_input() const;
bool is_uppercase() const;
bool is_lowercase() const;

View File

@@ -3,6 +3,8 @@
#include <string>
#include <sstream>
#include <algorithm>
#include <cstdint>
#include <cstring>
namespace jinja {
@@ -46,4 +48,102 @@ static std::string fmt_error_with_source(const std::string & tag, const std::str
return oss.str();
}
// Note: this is a simple hasher, not cryptographically secure, just for hash table usage
struct hasher {
static constexpr auto size_t_digits = sizeof(size_t) * 8;
static constexpr size_t prime = size_t_digits == 64 ? 0x100000001b3 : 0x01000193;
static constexpr size_t seed = size_t_digits == 64 ? 0xcbf29ce484222325 : 0x811c9dc5;
static constexpr auto block_size = sizeof(size_t); // in bytes; allowing the compiler to vectorize the computation
static_assert(size_t_digits == 64 || size_t_digits == 32);
static_assert(block_size == 8 || block_size == 4);
uint8_t buffer[block_size];
size_t idx = 0; // current index in buffer
size_t state = seed;
hasher() = default;
hasher(const std::type_info & type_inf) noexcept {
const auto type_hash = type_inf.hash_code();
update(&type_hash, sizeof(type_hash));
}
// Properties:
// - update is not associative: update(a).update(b) != update(b).update(a)
// - update(a ~ b) == update(a).update(b) with ~ as concatenation operator --> useful for streaming
// - update("", 0) --> state unchanged with empty input
hasher& update(void const * bytes, size_t len) noexcept {
const uint8_t * c = static_cast<uint8_t const *>(bytes);
if (len == 0) {
return *this;
}
size_t processed = 0;
// first, fill the existing buffer if it's partial
if (idx > 0) {
size_t to_fill = block_size - idx;
if (to_fill > len) {
to_fill = len;
}
std::memcpy(buffer + idx, c, to_fill);
idx += to_fill;
processed += to_fill;
if (idx == block_size) {
update_block(buffer);
idx = 0;
}
}
// process full blocks from the remaining input
for (; processed + block_size <= len; processed += block_size) {
update_block(c + processed);
}
// buffer any remaining bytes
size_t remaining = len - processed;
if (remaining > 0) {
std::memcpy(buffer, c + processed, remaining);
idx = remaining;
}
return *this;
}
// convenience function for testing only
hasher& update(const std::string & s) noexcept {
return update(s.data(), s.size());
}
// finalize and get the hash value
// note: after calling digest, the hasher state is modified, do not call update() again
size_t digest() noexcept {
// if there are remaining bytes in buffer, fill the rest with zeros and process
if (idx > 0) {
for (size_t i = idx; i < block_size; ++i) {
buffer[i] = 0;
}
update_block(buffer);
idx = 0;
}
return state;
}
private:
// IMPORTANT: block must have at least block_size bytes
void update_block(const uint8_t * block) noexcept {
size_t blk = static_cast<uint32_t>(block[0])
| (static_cast<uint32_t>(block[1]) << 8)
| (static_cast<uint32_t>(block[2]) << 16)
| (static_cast<uint32_t>(block[3]) << 24);
if constexpr (block_size == 8) {
blk = blk | (static_cast<uint64_t>(block[4]) << 32)
| (static_cast<uint64_t>(block[5]) << 40)
| (static_cast<uint64_t>(block[6]) << 48)
| (static_cast<uint64_t>(block[7]) << 56);
}
state ^= blk;
state *= prime;
}
};
} // namespace jinja
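
A hypothetical usage of the streaming property documented above, i.e. hashing string parts without concatenating them first (assumes the jinja utils.h header is on the include path):

#include <cassert>
#include "utils.h"

static void hasher_streaming_check() {
    jinja::hasher h1, h2;
    h1.update("hello world", 11);
    h2.update("hello ", 6).update("world", 5);
    assert(h1.digest() == h2.digest()); // update(a ~ b) == update(a).update(b)
}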

View File

@@ -114,6 +114,18 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
return result;
}
template<typename T>
static value empty_value_fn(const func_args &) {
if constexpr (std::is_same_v<T, value_int>) {
return mk_val<T>(0);
} else if constexpr (std::is_same_v<T, value_float>) {
return mk_val<T>(0.0);
} else if constexpr (std::is_same_v<T, value_bool>) {
return mk_val<T>(false);
} else {
return mk_val<T>();
}
}
template<typename T>
static value test_type_fn(const func_args & args) {
args.ensure_count(1);
@@ -128,6 +140,13 @@ static value test_type_fn(const func_args & args) {
JJ_DEBUG("test_type_fn: type=%s or %s result=%d", typeid(T).name(), typeid(U).name(), is_type ? 1 : 0);
return mk_val<value_bool>(is_type);
}
template<typename T, typename U, typename V>
static value test_type_fn(const func_args & args) {
args.ensure_count(1);
bool is_type = is_val<T>(args.get_pos(0)) || is_val<U>(args.get_pos(0)) || is_val<V>(args.get_pos(0));
JJ_DEBUG("test_type_fn: type=%s, %s or %s result=%d", typeid(T).name(), typeid(U).name(), typeid(V).name(), is_type ? 1 : 0);
return mk_val<value_bool>(is_type);
}
template<value_compare_op op>
static value test_compare_fn(const func_args & args) {
args.ensure_count(2, 2);
@@ -163,7 +182,7 @@ static value selectattr(const func_args & args) {
args.ensure_vals<value_array, value_string, value_string, value_string>(true, true, false, false);
auto arr = args.get_pos(0)->as_array();
auto attr_name = args.get_pos(1)->as_string().str();
auto attribute = args.get_pos(1);
auto out = mk_val<value_array>();
value val_default = mk_val<value_undefined>();
@@ -173,7 +192,7 @@ static value selectattr(const func_args & args) {
if (!is_val<value_object>(item)) {
throw raised_exception("selectattr: item is not an object");
}
value attr_val = item->at(attr_name, val_default);
value attr_val = item->at(attribute, val_default);
bool is_selected = attr_val->as_bool();
if constexpr (is_reject) is_selected = !is_selected;
if (is_selected) out->push_back(item);
@@ -217,7 +236,7 @@ static value selectattr(const func_args & args) {
if (!is_val<value_object>(item)) {
throw raised_exception("selectattr: item is not an object");
}
value attr_val = item->at(attr_name, val_default);
value attr_val = item->at(attribute, val_default);
func_args test_args(args.ctx);
test_args.push_back(attr_val); // attribute value
test_args.push_back(extra_arg); // extra argument
@@ -347,8 +366,8 @@ const func_builtins & global_builtins() {
{"test_is_integer", test_type_fn<value_int>},
{"test_is_float", test_type_fn<value_float>},
{"test_is_number", test_type_fn<value_int, value_float>},
{"test_is_iterable", test_type_fn<value_array, value_string>},
{"test_is_sequence", test_type_fn<value_array, value_string>},
{"test_is_iterable", test_type_fn<value_array, value_string, value_undefined>},
{"test_is_sequence", test_type_fn<value_array, value_string, value_undefined>},
{"test_is_mapping", test_type_fn<value_object>},
{"test_is_lower", [](const func_args & args) -> value {
args.ensure_vals<value_string>();
@@ -741,6 +760,7 @@ const func_builtins & value_array_t::get_builtins() const {
args.ensure_count(1, 4);
args.ensure_vals<value_array, value_int, value_int, value_int>(true, true, false, false);
auto val = args.get_pos(0);
auto arg0 = args.get_pos(1);
auto arg1 = args.get_pos(2, mk_val<value_undefined>());
auto arg2 = args.get_pos(3, mk_val<value_undefined>());
@@ -762,10 +782,8 @@ const func_builtins & value_array_t::get_builtins() const {
if (step == 0) {
throw raised_exception("slice step cannot be zero");
}
auto arr = slice(args.get_pos(0)->as_array(), start, stop, step);
auto res = mk_val<value_array>();
res->val_arr = std::move(arr);
return res;
auto arr = slice(val->as_array(), start, stop, step);
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"selectattr", selectattr<false>},
{"select", selectattr<false>},
@@ -785,15 +803,14 @@ const func_builtins & value_array_t::get_builtins() const {
}
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
const std::string delim = val_delim->is_undefined() ? "" : val_delim->as_string().str();
const std::string attr_name = attribute->is_undefined() ? "" : attribute->as_string().str();
std::string result;
for (size_t i = 0; i < arr.size(); ++i) {
value val_arr = arr[i];
if (!attribute->is_undefined()) {
if (attr_is_int && is_val<value_array>(val_arr)) {
val_arr = val_arr->at(attr_int);
} else if (!attr_is_int && !attr_name.empty() && is_val<value_object>(val_arr)) {
val_arr = val_arr->at(attr_name);
} else if (!attr_is_int && is_val<value_object>(val_arr)) {
val_arr = val_arr->at(attribute);
}
}
if (!is_val<value_string>(val_arr) && !is_val<value_int>(val_arr) && !is_val<value_float>(val_arr)) {
@@ -808,9 +825,7 @@ const func_builtins & value_array_t::get_builtins() const {
}},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_array>();
auto str = mk_val<value_string>();
gather_string_parts_recursive(args.get_pos(0), str);
return str;
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"tojson", tojson},
{"map", [](const func_args & args) -> value {
@@ -821,26 +836,26 @@ const func_builtins & value_array_t::get_builtins() const {
if (!is_val<value_kwarg>(args.get_args().at(1))) {
throw not_implemented_exception("map: filter-mapping not implemented");
}
value val = args.get_pos(0);
value attribute = args.get_kwarg_or_pos("attribute", 1);
const bool attr_is_int = is_val<value_int>(attribute);
if (!is_val<value_string>(attribute) && !attr_is_int) {
throw raised_exception("map: attribute must be string or integer");
}
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
const std::string attr_name = attribute->as_string().str();
value default_val = args.get_kwarg("default", mk_val<value_undefined>());
auto out = mk_val<value_array>();
auto arr = args.get_pos(0)->as_array();
auto arr = val->as_array();
for (const auto & item : arr) {
value attr_val;
if (attr_is_int) {
attr_val = is_val<value_array>(item) ? item->at(attr_int, default_val) : default_val;
} else {
attr_val = is_val<value_object>(item) ? item->at(attr_name, default_val) : default_val;
attr_val = is_val<value_object>(item) ? item->at(attribute, default_val) : default_val;
}
out->push_back(attr_val);
}
return out;
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(out->as_array())) : out;
}},
{"append", [](const func_args & args) -> value {
args.ensure_count(2);
@@ -867,6 +882,7 @@ const func_builtins & value_array_t::get_builtins() const {
if (!is_val<value_array>(args.get_pos(0))) {
throw raised_exception("sort: first argument must be an array");
}
value val = args.get_pos(0);
value val_reverse = args.get_kwarg_or_pos("reverse", 1);
value val_case = args.get_kwarg_or_pos("case_sensitive", 2);
value attribute = args.get_kwarg_or_pos("attribute", 3);
@@ -875,8 +891,7 @@ const func_builtins & value_array_t::get_builtins() const {
const bool reverse = val_reverse->as_bool(); // undefined == false
const bool attr_is_int = is_val<value_int>(attribute);
const int64_t attr_int = attr_is_int ? attribute->as_int() : 0;
const std::string attr_name = attribute->is_undefined() ? "" : attribute->as_string().str();
std::vector<value> arr = cast_val<value_array>(args.get_pos(0))->as_array(); // copy
std::vector<value> arr = val->as_array(); // copy
std::sort(arr.begin(), arr.end(),[&](const value & a, const value & b) {
value val_a = a;
value val_b = b;
@@ -884,22 +899,23 @@ const func_builtins & value_array_t::get_builtins() const {
if (attr_is_int && is_val<value_array>(a) && is_val<value_array>(b)) {
val_a = a->at(attr_int);
val_b = b->at(attr_int);
} else if (!attr_is_int && !attr_name.empty() && is_val<value_object>(a) && is_val<value_object>(b)) {
val_a = a->at(attr_name);
val_b = b->at(attr_name);
} else if (!attr_is_int && is_val<value_object>(a) && is_val<value_object>(b)) {
val_a = a->at(attribute);
val_b = b->at(attribute);
} else {
throw raised_exception("sort: unsupported object attribute comparison");
throw raised_exception("sort: unsupported object attribute comparison between " + a->type() + " and " + b->type());
}
}
return value_compare(val_a, val_b, reverse ? value_compare_op::gt : value_compare_op::lt);
});
return mk_val<value_array>(arr);
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"reverse", [](const func_args & args) -> value {
args.ensure_vals<value_array>();
std::vector<value> arr = cast_val<value_array>(args.get_pos(0))->as_array(); // copy
value val = args.get_pos(0);
std::vector<value> arr = val->as_array(); // copy
std::reverse(arr.begin(), arr.end());
return mk_val<value_array>(arr);
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
@@ -930,7 +946,7 @@ const func_builtins & value_object_t::get_builtins() const {
default_val = args.get_pos(2);
}
const value obj = args.get_pos(0);
std::string key = args.get_pos(1)->as_string().str();
const value key = args.get_pos(1);
return obj->at(key, default_val);
}},
{"keys", [](const func_args & args) -> value {
@@ -938,7 +954,7 @@ const func_builtins & value_object_t::get_builtins() const {
const auto & obj = args.get_pos(0)->as_ordered_object();
auto result = mk_val<value_array>();
for (const auto & pair : obj) {
result->push_back(mk_val<value_string>(pair.first));
result->push_back(pair.first);
}
return result;
}},
@@ -956,15 +972,16 @@ const func_builtins & value_object_t::get_builtins() const {
const auto & obj = args.get_pos(0)->as_ordered_object();
auto result = mk_val<value_array>();
for (const auto & pair : obj) {
auto item = mk_val<value_array>();
item->push_back(mk_val<value_string>(pair.first));
item->push_back(pair.second);
auto item = mk_val<value_tuple>(pair);
result->push_back(std::move(item));
}
return result;
}},
{"tojson", tojson},
{"string", tojson},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_object>();
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"length", [](const func_args & args) -> value {
args.ensure_vals<value_object>();
const auto & obj = args.get_pos(0)->as_ordered_object();
@@ -985,11 +1002,11 @@ const func_builtins & value_object_t::get_builtins() const {
const bool reverse = val_reverse->as_bool(); // undefined == false
const bool by_value = is_val<value_string>(val_by) && val_by->as_string().str() == "value" ? true : false;
auto result = mk_val<value_object>(val_input); // copy
std::sort(result->val_obj.ordered.begin(), result->val_obj.ordered.end(), [&](const auto & a, const auto & b) {
std::sort(result->val_obj.begin(), result->val_obj.end(), [&](const auto & a, const auto & b) {
if (by_value) {
return value_compare(a.second, b.second, reverse ? value_compare_op::gt : value_compare_op::lt);
} else {
return reverse ? a.first > b.first : a.first < b.first;
return value_compare(a.first, b.first, reverse ? value_compare_op::gt : value_compare_op::lt);
}
});
return result;
@@ -1005,7 +1022,12 @@ const func_builtins & value_none_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
{"tojson", tojson},
{"string", [](const func_args &) -> value { return mk_val<value_string>("None"); }}
{"string", [](const func_args &) -> value {
return mk_val<value_string>("None");
}},
{"safe", [](const func_args &) -> value {
return mk_val<value_string>("None");
}},
};
return builtins;
}
@@ -1014,10 +1036,33 @@ const func_builtins & value_none_t::get_builtins() const {
const func_builtins & value_undefined_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
{"tojson", [](const func_args & args) -> value {
args.ensure_vals<value_undefined>();
return mk_val<value_string>("null");
}},
{"capitalize", empty_value_fn<value_string>},
{"first", empty_value_fn<value_undefined>},
{"items", empty_value_fn<value_array>},
{"join", empty_value_fn<value_string>},
{"last", empty_value_fn<value_undefined>},
{"length", empty_value_fn<value_int>},
{"list", empty_value_fn<value_array>},
{"lower", empty_value_fn<value_string>},
{"map", empty_value_fn<value_array>},
{"max", empty_value_fn<value_undefined>},
{"min", empty_value_fn<value_undefined>},
{"reject", empty_value_fn<value_array>},
{"rejectattr", empty_value_fn<value_array>},
{"replace", empty_value_fn<value_string>},
{"reverse", empty_value_fn<value_array>},
{"safe", empty_value_fn<value_string>},
{"select", empty_value_fn<value_array>},
{"selectattr", empty_value_fn<value_array>},
{"sort", empty_value_fn<value_array>},
{"string", empty_value_fn<value_string>},
{"strip", empty_value_fn<value_string>},
{"sum", empty_value_fn<value_int>},
{"title", empty_value_fn<value_string>},
{"truncate", empty_value_fn<value_string>},
{"unique", empty_value_fn<value_array>},
{"upper", empty_value_fn<value_string>},
{"wordcount", empty_value_fn<value_int>},
};
return builtins;
}
@@ -1134,6 +1179,8 @@ void global_from_json(context & ctx, const nlohmann::ordered_json & json_obj, bo
}
}
// recursively convert value to JSON string
// TODO: avoid circular references
static void value_to_json_internal(std::ostringstream & oss, const value & val, int curr_lvl, int indent, const std::string_view item_sep, const std::string_view key_sep) {
auto indent_str = [indent, curr_lvl]() -> std::string {
return (indent > 0) ? std::string(curr_lvl * indent, ' ') : "";
@@ -1196,7 +1243,8 @@ static void value_to_json_internal(std::ostringstream & oss, const value & val,
size_t i = 0;
for (const auto & pair : obj) {
oss << indent_str() << (indent > 0 ? std::string(indent, ' ') : "");
oss << "\"" << pair.first << "\"" << key_sep;
value_to_json_internal(oss, mk_val<value_string>(pair.first->as_string().str()), curr_lvl + 1, indent, item_sep, key_sep);
oss << key_sep;
value_to_json_internal(oss, pair.second, curr_lvl + 1, indent, item_sep, key_sep);
if (i < obj.size() - 1) {
oss << item_sep;
@@ -1219,4 +1267,19 @@ std::string value_to_json(const value & val, int indent, const std::string_view
return oss.str();
}
// TODO: avoid circular references
std::string value_to_string_repr(const value & val) {
if (is_val<value_string>(val)) {
const std::string val_str = val->as_string().str();
if (val_str.find('\'') != std::string::npos) {
return value_to_json(val);
} else {
return "'" + val_str + "'";
}
} else {
return val->as_repr();
}
}
} // namespace jinja
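For illustration, a minimal sketch (not part of the patch; `mk_val` and the `value` aliases are assumed from the header below) of the quoting rule implemented by `value_to_string_repr`:

```cpp
using namespace jinja;

void repr_example() {
    value_to_string_repr(mk_val<value_string>("hello")); // -> 'hello'
    value_to_string_repr(mk_val<value_string>("it's"));  // -> "it's"  (contains ', falls back to JSON)
    value_to_string_repr(mk_val<value_int>(42));         // -> 42      (non-strings use as_repr())
}
```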


@@ -1,8 +1,10 @@
#pragma once
#include "string.h"
#include "utils.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <map>
@@ -93,7 +95,8 @@ void global_from_json(context & ctx, const T_JSON & json_obj, bool mark_input);
struct func_args; // function argument values
using func_handler = std::function<value(const func_args &)>;
using func_hptr = value(const func_args &);
using func_handler = std::function<func_hptr>;
using func_builtins = std::map<std::string, func_handler>;
enum value_compare_op { eq, ge, gt, lt, ne };
@@ -103,28 +106,9 @@ struct value_t {
int64_t val_int;
double val_flt;
string val_str;
bool val_bool;
std::vector<value> val_arr;
struct map {
// once set to true, all keys must be numeric
// caveat: we only allow either all numeric keys or all non-numeric keys
// for now, this only applied to for_statement in case of iterating over object keys/items
bool is_key_numeric = false;
std::map<std::string, value> unordered;
std::vector<std::pair<std::string, value>> ordered;
void insert(const std::string & key, const value & val) {
if (unordered.find(key) != unordered.end()) {
// if key exists, remove from ordered list
ordered.erase(std::remove_if(ordered.begin(), ordered.end(),
[&](const std::pair<std::string, value> & p) { return p.first == key; }),
ordered.end());
}
unordered[key] = val;
ordered.push_back({key, val});
}
} val_obj;
std::vector<std::pair<value, value>> val_obj;
func_handler val_func;
@@ -139,6 +123,7 @@ struct value_t {
value_t(const value_t &) = default;
virtual ~value_t() = default;
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
@@ -146,7 +131,7 @@ struct value_t {
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<std::string, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
@@ -154,43 +139,66 @@ struct value_t {
throw std::runtime_error("No builtins available for type " + type());
}
virtual bool has_key(const std::string & key) {
return val_obj.unordered.find(key) != val_obj.unordered.end();
}
virtual value & at(const std::string & key, value & default_val) {
auto it = val_obj.unordered.find(key);
if (it == val_obj.unordered.end()) {
return default_val;
}
return val_obj.unordered.at(key);
}
virtual value & at(const std::string & key) {
auto it = val_obj.unordered.find(key);
if (it == val_obj.unordered.end()) {
throw std::runtime_error("Key '" + key + "' not found in value of type " + type());
}
return val_obj.unordered.at(key);
}
virtual value & at(int64_t index, value & default_val) {
if (index < 0) {
index += val_arr.size();
}
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
return default_val;
}
return val_arr[index];
}
virtual value & at(int64_t index) {
if (index < 0) {
index += val_arr.size();
}
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
throw std::runtime_error("Index " + std::to_string(index) + " out of bounds for array of size " + std::to_string(val_arr.size()));
}
return val_arr[index];
}
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
virtual bool is_immutable() const { return true; }
virtual hasher unique_hash() const noexcept = 0;
// TODO: C++20 <=> operator
// NOTE: We are treating == as equivalent (for normal comparisons) and != as strict nonequal (for strict (is) comparisons)
virtual bool operator==(const value_t & other) const { return equivalent(other); }
virtual bool operator!=(const value_t & other) const { return nonequal(other); }
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
};
//
// utils
//
const func_builtins & global_builtins();
std::string value_to_json(const value & val, int indent = -1, const std::string_view item_sep = ", ", const std::string_view key_sep = ": ");
// Note: only used for debugging purposes
std::string value_to_string_repr(const value & val);
struct not_implemented_exception : public std::runtime_error {
not_implemented_exception(const std::string & msg) : std::runtime_error("NotImplemented: " + msg) {}
};
struct value_hasher {
size_t operator()(const value & val) const noexcept {
return val->unique_hash().digest();
}
};
struct value_equivalence {
bool operator()(const value & lhs, const value & rhs) const {
return *lhs == *rhs;
}
bool operator()(const std::pair<value, value> & lhs, const std::pair<value, value> & rhs) const {
return *(lhs.first) == *(rhs.first) && *(lhs.second) == *(rhs.second);
}
};
struct value_equality {
bool operator()(const value & lhs, const value & rhs) const {
return !(*lhs != *rhs);
}
};
//
@@ -198,24 +206,49 @@ struct value_t {
//
struct value_int_t : public value_t {
value_int_t(int64_t v) { val_int = v; }
value_int_t(int64_t v) {
val_int = v;
val_flt = static_cast<double>(v);
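        // note: when v is not exactly representable as a double, val_flt is pushed
        // to +/-inf, presumably so cross-type numeric comparisons (which require
        // both val_int and val_flt to match) cannot produce false positives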
if (static_cast<int64_t>(val_flt) != v) {
val_flt = v < 0 ? -INFINITY : INFINITY;
}
}
virtual std::string type() const override { return "Integer"; }
virtual int64_t as_int() const override { return val_int; }
virtual double as_float() const override { return static_cast<double>(val_int); }
virtual double as_float() const override { return val_flt; }
virtual string as_string() const override { return std::to_string(val_int); }
virtual bool as_bool() const override {
return val_int != 0;
}
virtual const func_builtins & get_builtins() const override;
virtual bool is_numeric() const override { return true; }
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
return hasher(typeid(*this))
.update(&val_int, sizeof(val_int))
.update(&val_flt, sizeof(val_flt));
}
protected:
virtual bool equivalent(const value_t & other) const override {
return other.is_numeric() && val_int == other.val_int && val_flt == other.val_flt;
}
virtual bool nonequal(const value_t & other) const override {
return !(typeid(*this) == typeid(other) && val_int == other.val_int);
}
};
using value_int = std::shared_ptr<value_int_t>;
struct value_float_t : public value_t {
value_float_t(double v) { val_flt = v; }
value val;
value_float_t(double v) {
val_flt = v;
val_int = std::isfinite(v) ? static_cast<int64_t>(v) : 0;
val = mk_val<value_int>(val_int);
}
virtual std::string type() const override { return "Float"; }
virtual double as_float() const override { return val_flt; }
virtual int64_t as_int() const override { return static_cast<int64_t>(val_flt); }
virtual int64_t as_int() const override { return val_int; }
virtual string as_string() const override {
std::string out = std::to_string(val_flt);
out.erase(out.find_last_not_of('0') + 1, std::string::npos); // remove trailing zeros
@@ -226,6 +259,24 @@ struct value_float_t : public value_t {
return val_flt != 0.0;
}
virtual const func_builtins & get_builtins() const override;
virtual bool is_numeric() const override { return true; }
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
if (static_cast<double>(val_int) == val_flt) {
return val->unique_hash();
} else {
return hasher(typeid(*this))
.update(&val_int, sizeof(val_int))
.update(&val_flt, sizeof(val_flt));
}
}
protected:
virtual bool equivalent(const value_t & other) const override {
return other.is_numeric() && val_int == other.val_int && val_flt == other.val_flt;
}
virtual bool nonequal(const value_t & other) const override {
return !(typeid(*this) == typeid(other) && val_flt == other.val_flt);
}
};
using value_float = std::shared_ptr<value_float_t>;
@@ -247,19 +298,49 @@ struct value_string_t : public value_t {
return val_str.length() > 0;
}
virtual const func_builtins & get_builtins() const override;
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
const auto type_hash = typeid(*this).hash_code();
auto hash = hasher();
hash.update(&type_hash, sizeof(type_hash));
val_str.hash_update(hash);
return hash;
}
void mark_input() {
val_str.mark_input();
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other) && val_str.str() == other.val_str.str();
}
};
using value_string = std::shared_ptr<value_string_t>;
struct value_bool_t : public value_t {
value_bool_t(bool v) { val_bool = v; }
value val;
value_bool_t(bool v) {
val_int = static_cast<int64_t>(v);
val_flt = static_cast<double>(v);
val = mk_val<value_int>(val_int);
}
virtual std::string type() const override { return "Boolean"; }
virtual bool as_bool() const override { return val_bool; }
virtual string as_string() const override { return std::string(val_bool ? "True" : "False"); }
virtual int64_t as_int() const override { return val_int; }
virtual bool as_bool() const override { return val_int; }
virtual string as_string() const override { return std::string(val_int ? "True" : "False"); }
virtual const func_builtins & get_builtins() const override;
virtual bool is_numeric() const override { return true; }
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
return val->unique_hash();
}
protected:
virtual bool equivalent(const value_t & other) const override {
return other.is_numeric() && val_int == other.val_int && val_flt == other.val_flt;
}
virtual bool nonequal(const value_t & other) const override {
return !(typeid(*this) == typeid(other) && val_int == other.val_int);
}
};
using value_bool = std::shared_ptr<value_bool_t>;
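To make the `==`/`!=` asymmetry noted above concrete, a minimal sketch (not part of the patch; `mk_val` and the pointer aliases are assumed from this header): numeric types compare Python-style under `==`, while `!=` stays type-strict for `is`-style comparisons.

```cpp
#include <cassert>
using namespace jinja;

void numeric_equivalence_example() {
    value a = mk_val<value_int>(1);
    value b = mk_val<value_int>(1);
    value f = mk_val<value_float>(1.0);
    value t = mk_val<value_bool>(true);

    // equivalent(): numeric types match on (val_int, val_flt), Python-style
    assert(*a == *f); // 1 == 1.0
    assert(*a == *t); // 1 == True

    // nonequal(): additionally requires the same dynamic type ("is"-style)
    assert(!(*a != *b)); // Integer(1) is Integer(1)
    assert(*a != *f);    // Integer(1) is not Float(1.0), despite == above
}
```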
@@ -269,13 +350,34 @@ struct value_array_t : public value_t {
value_array_t(value & v) {
val_arr = v->val_arr;
}
value_array_t(std::vector<value> && arr) {
val_arr = arr;
}
value_array_t(const std::vector<value> & arr) {
val_arr = arr;
}
void reverse() { std::reverse(val_arr.begin(), val_arr.end()); }
void push_back(const value & val) { val_arr.push_back(val); }
void push_back(value && val) { val_arr.push_back(std::move(val)); }
void reverse() {
if (is_immutable()) {
throw std::runtime_error("Attempting to modify immutable type");
}
std::reverse(val_arr.begin(), val_arr.end());
}
void push_back(const value & val) {
if (is_immutable()) {
throw std::runtime_error("Attempting to modify immutable type");
}
val_arr.push_back(val);
}
void push_back(value && val) {
if (is_immutable()) {
throw std::runtime_error("Attempting to modify immutable type");
}
val_arr.push_back(std::move(val));
}
value pop_at(int64_t index) {
if (is_immutable()) {
throw std::runtime_error("Attempting to modify immutable type");
}
if (index < 0) {
index = static_cast<int64_t>(val_arr.size()) + index;
}
@@ -287,64 +389,225 @@ struct value_array_t : public value_t {
return val;
}
virtual std::string type() const override { return "Array"; }
virtual bool is_immutable() const override { return false; }
virtual const std::vector<value> & as_array() const override { return val_arr; }
virtual string as_string() const override {
const bool immutable = is_immutable();
std::ostringstream ss;
ss << "[";
ss << (immutable ? "(" : "[");
for (size_t i = 0; i < val_arr.size(); i++) {
if (i > 0) ss << ", ";
ss << val_arr.at(i)->as_repr();
value val = val_arr.at(i);
ss << value_to_string_repr(val);
}
ss << "]";
if (immutable && val_arr.size() == 1) {
ss << ",";
}
ss << (immutable ? ")" : "]");
return ss.str();
}
virtual bool as_bool() const override {
return !val_arr.empty();
}
virtual value & at(int64_t index, value & default_val) override {
if (index < 0) {
index += val_arr.size();
}
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
return default_val;
}
return val_arr[index];
}
virtual value & at(int64_t index) override {
if (index < 0) {
index += val_arr.size();
}
if (index < 0 || static_cast<size_t>(index) >= val_arr.size()) {
throw std::runtime_error("Index " + std::to_string(index) + " out of bounds for array of size " + std::to_string(val_arr.size()));
}
return val_arr[index];
}
virtual const func_builtins & get_builtins() const override;
virtual bool is_hashable() const override {
if (std::all_of(val_arr.begin(), val_arr.end(), [&](auto & val) -> bool {
return val->is_immutable() && val->is_hashable();
})) {
return true;
}
return false;
}
virtual hasher unique_hash() const noexcept override {
auto hash = hasher(typeid(*this));
for (const auto & val : val_arr) {
// must use digest to prevent problems from "concatenation" property of hasher
// for ex. hash of [ "ab", "c" ] should be different from [ "a", "bc" ]
const size_t val_hash = val->unique_hash().digest();
hash.update(&val_hash, sizeof(size_t));
}
return hash;
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_arr.begin(), val_arr.end(), other.val_arr.begin(), value_equivalence());
}
};
using value_array = std::shared_ptr<value_array_t>;
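A rough sketch (not part of the patch; the `hasher` interface — default construction, chained `update(ptr, size)`, `digest()` — is assumed from its use above) of why element hashes are digested before being folded into the container hash:

```cpp
using namespace jinja;

void hasher_boundary_example() {
    // feeding raw bytes loses the element boundaries:
    hasher h1, h2;
    h1.update("ab", 2).update("c", 1); // stream: 'a' 'b' 'c'
    h2.update("a", 1).update("bc", 2); // stream: 'a' 'b' 'c'  -> same digest

    // digesting each element first folds it into one fixed-width word,
    // so the boundaries become part of the hashed data:
    const size_t ab = hasher().update("ab", 2).digest();
    const size_t c  = hasher().update("c",  1).digest();
    const size_t a  = hasher().update("a",  1).digest();
    const size_t bc = hasher().update("bc", 2).digest();

    hasher g1, g2;
    g1.update(&ab, sizeof(ab)).update(&c,  sizeof(c));
    g2.update(&a,  sizeof(a)).update(&bc, sizeof(bc));
    // g1.digest() != g2.digest() (with overwhelming probability)
}
```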
struct value_tuple_t : public value_array_t {
value_tuple_t(value & v) {
val_arr = v->val_arr;
}
value_tuple_t(std::vector<value> && arr) {
val_arr = arr;
}
value_tuple_t(const std::vector<value> & arr) {
val_arr = arr;
}
value_tuple_t(const std::pair<value, value> & pair) {
val_arr.push_back(pair.first);
val_arr.push_back(pair.second);
}
virtual std::string type() const override { return "Tuple"; }
virtual bool is_immutable() const override { return true; }
};
using value_tuple = std::shared_ptr<value_tuple_t>;
struct value_object_t : public value_t {
std::unordered_map<value, value, value_hasher, value_equivalence> unordered;
bool has_builtins = true; // context and loop objects do not have builtins
value_object_t() = default;
value_object_t(value & v) {
val_obj = v->val_obj;
}
value_object_t(const std::map<std::string, value> & obj) {
for (const auto & pair : obj) {
val_obj.insert(pair.first, pair.second);
for (const auto & pair : val_obj) {
unordered[pair.first] = pair.second;
}
}
value_object_t(const std::vector<std::pair<std::string, value>> & obj) {
value_object_t(const std::map<value, value> & obj) {
for (const auto & pair : obj) {
val_obj.insert(pair.first, pair.second);
insert(pair.first, pair.second);
}
}
value_object_t(const std::vector<std::pair<value, value>> & obj) {
for (const auto & pair : obj) {
insert(pair.first, pair.second);
}
}
void insert(const std::string & key, const value & val) {
val_obj.insert(key, val);
insert(mk_val<value_string>(key), val);
}
virtual std::string type() const override { return "Object"; }
virtual const std::vector<std::pair<std::string, value>> & as_ordered_object() const override { return val_obj.ordered; }
virtual bool is_immutable() const override { return false; }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const override { return val_obj; }
virtual string as_string() const override {
std::ostringstream ss;
ss << "{";
for (size_t i = 0; i < val_obj.size(); i++) {
if (i > 0) ss << ", ";
auto & [key, val] = val_obj.at(i);
ss << value_to_string_repr(key) << ": " << value_to_string_repr(val);
}
ss << "}";
return ss.str();
}
virtual bool as_bool() const override {
return !val_obj.unordered.empty();
return !unordered.empty();
}
virtual bool has_key(const value & key) override {
if (!key->is_immutable() || !key->is_hashable()) {
throw std::runtime_error("Object key of unhashable type: " + key->type());
}
return unordered.find(key) != unordered.end();
}
virtual void insert(const value & key, const value & val) override {
bool replaced = false;
if (is_immutable()) {
throw std::runtime_error("Attempting to modify immutable type");
}
if (has_key(key)) {
// if key exists, replace value in ordered list instead of appending
for (auto & pair : val_obj) {
if (*(pair.first) == *key) {
pair.second = val;
replaced = true;
break;
}
}
}
unordered[key] = val;
if (!replaced) {
val_obj.push_back({key, val});
}
}
virtual value & at(const value & key, value & default_val) override {
if (!has_key(key)) {
return default_val;
}
return unordered.at(key);
}
virtual value & at(const value & key) override {
if (!has_key(key)) {
throw std::runtime_error("Key '" + key->as_string().str() + "' not found in value of type " + type());
}
return unordered.at(key);
}
virtual value & at(const std::string & key, value & default_val) override {
value key_val = mk_val<value_string>(key);
return at(key_val, default_val);
}
virtual value & at(const std::string & key) override {
value key_val = mk_val<value_string>(key);
return at(key_val);
}
virtual const func_builtins & get_builtins() const override;
virtual bool is_hashable() const override {
if (std::all_of(val_obj.begin(), val_obj.end(), [&](auto & pair) -> bool {
const auto & val = pair.second;
return val->is_immutable() && val->is_hashable();
})) {
return true;
}
return false;
}
virtual hasher unique_hash() const noexcept override {
auto hash = hasher(typeid(*this));
for (const auto & [key, val] : val_obj) {
// must use digest to prevent problems from "concatenation" property of hasher
// for ex. hash of key="ab", value="c" should be different from key="a", value="bc"
const size_t key_hash = key->unique_hash().digest();
const size_t val_hash = val->unique_hash().digest();
hash.update(&key_hash, sizeof(key_hash));
hash.update(&val_hash, sizeof(val_hash));
}
return hash;
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_obj.begin(), val_obj.end(), other.val_obj.begin(), value_equivalence());
}
};
using value_object = std::shared_ptr<value_object_t>;
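A minimal usage sketch (not part of the patch; `mk_val` is assumed from this header) of the order-preserving insert: re-inserting an existing key replaces the value in place instead of appending.

```cpp
using namespace jinja;

void object_insert_example() {
    auto obj = mk_val<value_object>();
    obj->insert("a", mk_val<value_int>(1));
    obj->insert("b", mk_val<value_int>(2));
    obj->insert("a", mk_val<value_int>(3)); // replaces; "a" keeps its first position

    // obj->as_string()       -> "{'a': 3, 'b': 2}"
    // obj->at("a")->as_int() -> 3
}
```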
//
// null and undefined types
// none and undefined types
//
struct value_none_t : public value_t {
virtual std::string type() const override { return "None"; }
virtual bool is_none() const override { return true; }
virtual bool as_bool() const override { return false; }
virtual string as_string() const override { return string("None"); }
virtual string as_string() const override { return string(type()); }
virtual std::string as_repr() const override { return type(); }
virtual const func_builtins & get_builtins() const override;
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
return hasher(typeid(*this));
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other);
}
};
using value_none = std::shared_ptr<value_none_t>;
@@ -356,6 +619,13 @@ struct value_undefined_t : public value_t {
virtual bool as_bool() const override { return false; }
virtual std::string as_repr() const override { return type(); }
virtual const func_builtins & get_builtins() const override;
virtual hasher unique_hash() const noexcept override {
return hasher(typeid(*this));
}
protected:
virtual bool equivalent(const value_t & other) const override {
return is_undefined() == other.is_undefined();
}
};
using value_undefined = std::shared_ptr<value_undefined_t>;
@@ -436,7 +706,23 @@ struct value_func_t : public value_t {
return val_func(new_args);
}
virtual std::string type() const override { return "Function"; }
virtual std::string as_repr() const override { return type(); }
virtual std::string as_repr() const override { return type() + "<" + name + ">(" + (arg0 ? arg0->as_repr() : "") + ")"; }
virtual bool is_hashable() const override { return false; }
virtual hasher unique_hash() const noexcept override {
// Note: this is unused for now, we don't support function as object keys
// use function pointer as unique identifier
const auto target = val_func.target<func_hptr>();
return hasher(typeid(*this)).update(&target, sizeof(target));
}
protected:
virtual bool equivalent(const value_t & other) const override {
// Note: this is unused for now, we don't support function as object keys
// compare function pointers
// (val_func == other.val_func does not work as std::function::operator== is only used for nullptr check)
const auto target_this = this->val_func.target<func_hptr>();
const auto target_other = other.val_func.target<func_hptr>();
return typeid(*this) == typeid(other) && target_this == target_other;
}
};
using value_func = std::shared_ptr<value_func_t>;
@@ -447,18 +733,21 @@ struct value_kwarg_t : public value_t {
value_kwarg_t(const std::string & k, const value & v) : key(k), val(v) {}
virtual std::string type() const override { return "KwArg"; }
virtual std::string as_repr() const override { return type(); }
virtual bool is_hashable() const override { return true; }
virtual hasher unique_hash() const noexcept override {
const auto type_hash = typeid(*this).hash_code();
auto hash = val->unique_hash();
hash.update(&type_hash, sizeof(type_hash))
.update(key.data(), key.size());
return hash;
}
protected:
virtual bool equivalent(const value_t & other) const override {
const value_kwarg_t & other_val = static_cast<const value_kwarg_t &>(other);
return typeid(*this) == typeid(other) && key == other_val.key && val == other_val.val;
}
};
using value_kwarg = std::shared_ptr<value_kwarg_t>;
// utils
const func_builtins & global_builtins();
std::string value_to_json(const value & val, int indent = -1, const std::string_view item_sep = ", ", const std::string_view key_sep = ": ");
struct not_implemented_exception : public std::runtime_error {
not_implemented_exception(const std::string & msg) : std::runtime_error("NotImplemented: " + msg) {}
};
} // namespace jinja


@@ -192,12 +192,12 @@ void common_ngram_cache_draft(
break;
}
LOG(" - draft candidate: token=%d\n", drafted_token);
LOG_DBG(" - draft candidate: token=%d\n", drafted_token);
draft.push_back(drafted_token);
}
}
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
void common_ngram_cache_save(common_ngram_cache & ngram_cache, const std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
const common_ngram ngram = item.first;
@@ -217,10 +217,9 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
}
}
}
common_ngram_cache common_ngram_cache_load(std::string & filename) {
common_ngram_cache common_ngram_cache_load(const std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);


@@ -88,12 +88,12 @@ void common_ngram_cache_draft(
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
void common_ngram_cache_save(common_ngram_cache & ngram_cache, const std::string & filename);
// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
common_ngram_cache common_ngram_cache_load(std::string & filename);
common_ngram_cache common_ngram_cache_load(const std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.

common/ngram-map.cpp Normal file

@@ -0,0 +1,367 @@
#include "common.h"
#include "log.h"
#include "ngram-map.h"
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <sstream>
// n-gram simple
//
/**
* Perform speculative generation using the model's own token history.
* Searches for a matching pattern in the token history and returns draft tokens.
*
* @param state Current state of this implementation
* @param tokens Token history to search in
* @param sampled Last sampled token
* @return Vector of draft tokens, empty if no matching pattern is found
*/
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const llama_tokens & tokens, llama_token sampled) {
// Simple implementation of self-speculative decoding without a draft model.
//
const size_t cur_len = tokens.size();
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (state.idx_last_check + state.config.check_rate > cur_len) {
llama_tokens draft_tokens;
return draft_tokens;
}
size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
// vector for tokens we want to verify.
// return empty vector if there is no match.
llama_tokens draft_tokens;
// We need at least n_draft_min + n_draft_max + 1 tokens.
if (cur_len <= static_cast<size_t>(n_draft_min + n_draft_max + 1)) {
return draft_tokens;
}
// pattern search
llama_tokens pattern;
pattern.reserve(n_draft_min);
for (size_t j = cur_len - n_draft_min + 1; j < cur_len; ++j) {
pattern.push_back(tokens[j]);
}
pattern.push_back(sampled); // add the last token to the pattern
// We do a search in the token history.
state.idx_last_check = cur_len;
size_t match_pos = 0; // we ignore position 0, position 0 == no match
// search backwards, but skip the current match (we are currently there)
for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
bool match = true;
for (size_t k = 0; k < pattern.size(); ++k) {
if (tokens[j + k] != pattern[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
if (match_pos == 0) {
return draft_tokens;
}
const size_t copy_max = std::min(
n_draft_max,
cur_len - (match_pos + n_draft_min)
);
if (copy_max < n_draft_min) {
return draft_tokens;
}
LOG_DBG("%s: #tokens = %zu: found matching pattern at pos %zu, length %zu, draft length %zu\n",
__func__, cur_len,
match_pos, pattern.size(), copy_max);
draft_tokens.reserve(copy_max);
for (size_t j = 0; j < copy_max; ++j) {
draft_tokens.push_back(tokens[match_pos + n_draft_min + j]);
}
return draft_tokens;
}
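For illustration, a hypothetical call (token IDs invented): the just-completed 3-gram also occurs earlier in the history, so the tokens that followed it there become the draft.

```cpp
#include "ngram-map.h"

llama_tokens ngram_simple_example() {
    common_ngram_simple_config cfg = { /*size_ngram=*/3, /*size_mgram=*/4, /*check_rate=*/1 };
    common_ngram_simple_state  state(cfg);

    llama_tokens history = { 1, 5, 6, 7, 8, 9, 10, 11, 2, 3, 5, 6 };

    // sampling token 7 completes the 3-gram {5, 6, 7} at the end of the history;
    // the same 3-gram starts at index 1, so the 4 tokens after it are drafted
    return common_ngram_simple_draft(state, history, /*sampled=*/7);
    // returns { 8, 9, 10, 11 }
}
```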
// n-gram map
//
// saturation limit for the occurrence counters of the n-gram map.
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
void common_ngram_map_draft(common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
llama_tokens & draft) {
// reset last key and value.
map.last_draft_created = false;
map.last_draft_key_idx = 0;
map.last_draft_value_idx = 0;
const size_t cur_len = inp.size();
const uint16_t n = map.size_key;
const uint16_t m = map.size_value;
if (cur_len < static_cast<size_t>(2 * n + m)) {
return;
}
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (map.idx_last_check + map.check_rate > cur_len) {
return;
}
map.idx_last_check = cur_len;
// search pattern, the key n-gram
std::vector<llama_token> key_tokens;
key_tokens.reserve(n);
for (size_t j = cur_len - n + 1; j < cur_len; ++j) {
key_tokens.push_back(inp[j]);
}
key_tokens.push_back(sampled);
// search for the key in the map
size_t match_pos = 0;
for (size_t j = cur_len - n - m - 1; j > 0; --j) {
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
if (match_pos > 0) {
LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
cur_len, n, m, key_tokens.size(), sampled, match_pos);
}
if (match_pos == 0) {
return;
}
// We have a match, now we look for the statistics of the key.
size_t key_offset = map.keys.size(); // offset in the map
// We iterate through the std::vector<common_ngram_map_key> map->keys.
for (size_t i = 0; i < map.keys.size(); ++i) {
bool match = true;
for (size_t j = 0; j < n; ++j) {
if (inp[map.keys[i].key_idx + j] != key_tokens[j]) {
match = false;
break;
}
}
if (match) {
key_offset = i;
break;
}
}
if (key_offset == map.keys.size()) {
// We create a new key-entry, it will get offset key_offset.
common_ngram_map_key new_key;
new_key.key_idx = match_pos;
new_key.stat_idx = 0;
new_key.key_num = 0;
for (int i = 0; i < COMMON_NGRAM_MAX_VALUES; ++i) {
new_key.values[i].value_num = 0;
new_key.values[i].n_accepted = m;
}
map.keys.push_back(new_key);
}
// our key n-gram:
common_ngram_map_key & curr_key = map.keys[key_offset];
// update number of key hits
curr_key.key_num = (uint16_t) std::min((int) map.keys[key_offset].key_num + 1,
(int) COMMON_NGRAM_MAX_VALUE_COUNT);
if (map.key_only) {
// simple mode:
// Fill in the draft with the m tokens following the key.
// Only the statistics slot values[0] is used in this mode.
int n_draft_tokens = std::min((int) m, (int) curr_key.values[0].n_accepted);
for (int i = 0; i < n_draft_tokens; ++i) {
draft.push_back(inp[match_pos + n + i]);
}
LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = 0; // value 0 is used for simple mode
return;
}
if (curr_key.key_num < map.min_hits) {
// not enough hits to consider this a good draft
LOG_DBG("%s: key_offset = %zu, key_num = %d, min_hits = %d, no draft\n", __func__,
key_offset, curr_key.key_num, map.min_hits);
return;
}
// complex mode: examine the different m-grams after this key n-gram.
//
// determine all (max COMMON_NGRAM_MAX_VALUES) m-grams after the key n-gram.
for (size_t i = curr_key.stat_idx; i <= match_pos; ++i) {
// begins the key n-gram at index i?
bool match_key = true;
for (size_t k = 0; k < n; ++k) {
if (inp[i + k] != key_tokens[k]) {
match_key = false;
break;
}
}
if (!match_key) {
continue;
}
// Do we have an existing value m-gram or a new one after the key at index i?
size_t idx_begin_value_key = i + n;
int idx_value = -1;
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
size_t idx_begin_value_v = curr_key.values[v].value_idx;
if (idx_begin_value_v == 0) {
// We found an empty value slot => we found a new value m-gram after the key n-gram.
curr_key.values[v].value_idx = idx_begin_value_key;
curr_key.values[v].value_num = 0;
curr_key.values[v].n_accepted = m;
idx_value = v;
break;
}
bool match = true;
for (size_t j = 0; j < m; ++j) {
if (inp[idx_begin_value_key + j] != inp[idx_begin_value_v + j]) {
match = false;
break;
}
}
if (match) {
// We found an existing value m-gram after the key n-gram.
idx_value = v;
break;
}
}
if (idx_value >= 0) {
// We found a value m-gram of the key n-gram.
curr_key.values[idx_value].value_num = (uint16_t) std::min((int) curr_key.values[idx_value].value_num + 1,
(int) COMMON_NGRAM_MAX_VALUE_COUNT);
}
}
// the statistics are updated up to match_pos.
curr_key.stat_idx = match_pos;
// Do we have a value we could use for the draft?
uint16_t max_occur = 0;
int slot_max = 0;
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
uint16_t curr_occur = curr_key.values[v].value_num;
if (curr_occur > max_occur) {
max_occur = curr_occur;
slot_max = v;
}
}
// What is the sum of the other occurrences?
uint32_t sum_occur = 0;
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
if (v == slot_max) {
continue;
}
uint16_t curr_occur = curr_key.values[v].value_num;
sum_occur += curr_occur;
}
LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
key_offset,
max_occur, sum_occur, slot_max,
curr_key.values[0].value_idx, curr_key.values[0].value_num,
curr_key.values[1].value_idx, curr_key.values[1].value_num,
curr_key.values[2].value_idx, curr_key.values[2].value_num,
curr_key.values[3].value_idx, curr_key.values[3].value_num
);
// Print the tokens of the four values (if idx != 0), use LOG_INF
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
if (curr_key.values[v].value_idx != 0) {
LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
}
}
if (sum_occur > 0 && max_occur < 3 * sum_occur) {
// The most frequent value is not much more frequent than the other values.
// We do not use the draft.
return;
}
// We use the most frequent value values[slot_max] for the draft.
// Fill in the draft with the m tokens following the key.
int n_draft_tokens = std::min((int) m, (int) curr_key.values[slot_max].n_accepted);
for (int i = 0; i < n_draft_tokens; ++i) {
draft.push_back(inp[match_pos + n + i]);
}
LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
key_offset, slot_max,
curr_key.key_num, draft.size());
map.last_draft_created = true;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = slot_max; // value used for draft generation.
}
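To make the acceptance heuristic above concrete, a self-contained restatement (illustrative only; the counts are invented):

```cpp
#include <array>
#include <cstdint>

// the dominant m-gram must occur at least 3x as often as all others combined
static bool use_draft(const std::array<uint16_t, 4> & value_num) {
    uint16_t max_occur = 0;
    int      slot_max  = 0;
    for (int v = 0; v < 4; ++v) {
        if (value_num[v] > max_occur) {
            max_occur = value_num[v];
            slot_max  = v;
        }
    }
    uint32_t sum_occur = 0;
    for (int v = 0; v < 4; ++v) {
        if (v != slot_max) {
            sum_occur += value_num[v];
        }
    }
    return !(sum_occur > 0 && max_occur < 3 * sum_occur);
}

// use_draft({7, 1, 1, 0}) == true   (7 >= 3 * (1 + 1 + 0))
// use_draft({3, 2, 1, 0}) == false  (3 <  3 * (2 + 1 + 0))
```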
void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
if (!map.last_draft_created) {
return;
}
// find the key and its chosen value.
const size_t key_idx = map.last_draft_key_idx;
const size_t val_idx = map.last_draft_value_idx;
// find key corresponding to key_idx.
common_ngram_map_key & curr_key = map.keys[key_idx];
// find value corresponding to val_idx.
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
// update the value statistics
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}
// Helper functions.
//
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}

common/ngram-map.h Normal file

@@ -0,0 +1,105 @@
#pragma once
//
// common/ngram-map.h: structures used to manage a map from n-grams to a list of m-grams
//
// These structures are used to do a lookup of n-grams followed by m-grams in token history.
//
// There are two algorithms implemented:
// 1. ngram_simple: lookup of n-grams followed by m-grams in token history.
// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
// The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
//
#include "llama.h"
#include <vector>
// n-gram simple
//
// config of n-gram simple.
struct common_ngram_simple_config {
uint16_t size_ngram; // size of n-grams to lookup in self-mode
uint16_t size_mgram; // size of m-grams to draft in self-mode
uint16_t check_rate; // run the draftless speculative check once every check_rate tokens
};
// current state (and config) of n-gram simple.
struct common_ngram_simple_state {
common_ngram_simple_config config;
size_t idx_last_check = 0; // index of last check in context history (mutable)
common_ngram_simple_state(const common_ngram_simple_config & config)
: config(config) {}
};
// Searches for an n-gram in the history and checks whether a draft sequence should be generated.
// state: the ngram simple state to search in.
// tokens: the tokens generated so far.
// sampled: the token that was just sampled.
// returns: the draft tokens, empty if no matching pattern is found.
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const llama_tokens & tokens, llama_token sampled);
// n-gram map
//
// maximum number of m-gram values stored for each key n-gram.
#define COMMON_NGRAM_MAX_VALUES 4
// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
// statistics of a n-gram
struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
};
// map from n-grams to following m-grams in token-history
struct common_ngram_map {
uint16_t size_key; // size of key n-grams
uint16_t size_value; // size of value m-grams
bool key_only; // true if only key n-grams are used, no values.
// first draft: vector only, no map.
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // run the draftless speculative check once every check_rate tokens
uint16_t min_hits; // minimum number of key hits to consider a draft
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t check_rate, uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
check_rate(check_rate), min_hits(min_hits) {}
bool last_draft_created = false; // true if a draft was created at last call.
size_t last_draft_key_idx = 0; // index of last key used for draft generation.
uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
size_t idx_last_check = 0; // index of last check in context history
};
// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
// map: the ngram map to search in.
// inp: the tokens generated so far.
// sampled: the token that was just sampled.
// draft: vector to store the draft tokens, initially empty.
void common_ngram_map_draft(
common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
llama_tokens & draft);
// Update the statistics of a value after a draft was processed.
void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted);
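A hypothetical driver sketch for the API declared above (the target-model verification step is elided; `n_accepted` stands in for its result):

```cpp
#include "ngram-map.h"
#include <cstdint>

void ngram_map_step(const llama_tokens & history, llama_token sampled) {
    // key-only mode: draft the m tokens that followed the key the last time
    static common_ngram_map map(/*sz_key=*/12, /*sz_value=*/48, /*only_keys=*/true,
                                /*check_rate=*/1, /*min_hits=*/1);

    llama_tokens draft;
    common_ngram_map_draft(map, history, sampled, draft);

    if (!draft.empty()) {
        // ... verify `draft` against the target model here ...
        const uint16_t n_accepted = 0; // hypothetical verification result
        common_ngram_map_accept(map, n_accepted); // lets the statistics adapt
    }
}
```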

File diff suppressed because it is too large


@@ -5,31 +5,33 @@
struct common_speculative;
struct common_speculative_params {
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
// comma separated list of all types
std::string common_speculative_type_name_str();
float p_min = 0.75f; // min probability required to accept a token in the draft
};
// convert string to type
enum common_speculative_type common_speculative_type_from_name(const std::string & name);
struct common_speculative * common_speculative_init(
struct llama_context * ctx_tgt,
struct llama_context * ctx_dft
);
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
void common_speculative_free(struct common_speculative * spec);
common_speculative * common_speculative_init(
const common_params_speculative & params,
llama_context * ctx_tgt);
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft);
void common_speculative_free(common_speculative * spec);
void common_speculative_add_replacement_tgt_dft(
struct common_speculative * spec,
const char *source, const char *dest);
// optionally call once at the beginning of a new generation
void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt,
llama_token id_last);
llama_tokens common_speculative_draft(
common_speculative * spec,
const common_params_speculative & params,
const llama_tokens & prompt,
llama_token id_last);
// informs the speculative decoder that n_accepted tokens were accepted by the target model
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
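A rough lifecycle sketch of the refactored interface above (parameter setup and the verification loop are elided; `n_accepted` is a hypothetical result):

```cpp
#include "common.h"
#include "speculative.h"

void speculative_lifecycle(const common_params & params, llama_context * ctx_tgt,
                           const llama_tokens & prompt_tgt, llama_token id_last) {
    common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
    common_speculative_begin(spec, prompt_tgt); // once per generation

    // per decoding step:
    llama_tokens draft = common_speculative_draft(spec, params.speculative, prompt_tgt, id_last);
    // ... evaluate [id_last, draft0, ..., draftN-1] with the target model ...
    common_speculative_accept(spec, /*n_accepted=*/0);

    common_speculative_print_stats(spec);
    common_speculative_free(spec);
}
```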


@@ -8912,13 +8912,16 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
name.endswith("block_sparse_moe.input_linear.weight")
or "shared_mlp" in name
):
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
return
# Determine whether this is a mamba layer or an attention layer
if bid in self._ssm_layers:
return Mamba2Model.modify_tensors(self, data_torch, name, bid)
yield from Mamba2Model.modify_tensors(self, data_torch, name, bid)
return
elif bid in self._attn_layers:
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
return
yield from ModelBase.modify_tensors(self, data_torch, name, bid)
def set_gguf_parameters(self):


@@ -144,7 +144,7 @@ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- (there are no supported CUDA packages for these systems)
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your your host operating system)
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
@@ -495,6 +495,37 @@ Finally, after finishing your build, you should be able to do something like thi
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### For Mac users:
Generally, follow LunarG's [Getting Started with the MacOS Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/mac/getting_started.html) guide for installation and setup of the Vulkan SDK. There are two Vulkan driver options on macOS, both of which implement translation layers that map Vulkan to Metal. They can be hot-swapped by setting the `VK_ICD_FILENAMES` environment variable to point to the respective ICD JSON file.
Check the box for "KosmicKrisp" during the LunarG Vulkan SDK installation.
Set environment variable for the LunarG Vulkan SDK after installation (and optionally add to your shell profile for persistence):
```bash
source /path/to/vulkan-sdk/setup-env.sh
```
#### Using MoltenVK
MoltenVK is the default Vulkan driver installed with the LunarG Vulkan SDK on macOS, so you can use the above environment variable settings as is.
#### Using KosmicKrisp
Override the environment variable for KosmicKrisp:
```bash
export VK_ICD_FILENAMES=$VULKAN_SDK/share/vulkan/icd.d/libkosmickrisp_icd.json
export VK_DRIVER_FILES=$VULKAN_SDK/share/vulkan/icd.d/libkosmickrisp_icd.json
```
#### Build
This is the only step that differs from the instructions [above](#common-steps).
```bash
cmake -B build -DGGML_VULKAN=1 -DGGML_METAL=OFF
cmake --build build --config Release
```
## CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a set of hierarchical APIs that helps you quickly build AI applications and services based on Ascend NPUs.

File diff suppressed because it is too large

docs/speculative.md Normal file

@@ -0,0 +1,120 @@
# Speculative Decoding
llama.cpp supports speculative decoding, a technique that can significantly accelerate token generation by predicting multiple tokens ahead of the main model.
[Speculative decoding](https://en.wikipedia.org/wiki/Transformer_(deep_learning)#Speculative_decoding) leverages the fact that computing n tokens in a batch (as in prompt processing) is more efficient than computing n tokens sequentially (as in response generation). By generating draft tokens quickly and then verifying them with the target model in a single batch, this approach can achieve substantial speedups when the draft predictions are frequently correct.
## Implementations
The `llama-server` application supports several implementations of speculative decoding:
### Draft Model (`draft`)
A much smaller model (called the _draft model_) generates drafts.
Using a draft model is the most common approach to speculative decoding.
### n-gram Cache (`ngram-cache`)
An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
A draft is computed using probabilities derived from these statistics. External statistics can also be loaded from files for improved accuracy.
See:
- #5479, #6828, #6848
### n-gram Map (`ngram-simple`, `ngram-map-*`)
These implementations search the token history for patterns and use matching sequences as draft candidates.
They require no additional model but rely on patterns that have already appeared in the generated text.
A typical use case for this approach is rewriting source code with an LLM.
#### n-gram Simple (`ngram-simple`)
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
#### n-gram Map Key (`ngram-map-k`)
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
The number of accepted tokens is stored for each used n-gram.
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
The number of accepted tokens is stored for each used n-gram.
**Example:** Server options to use when the text contains many long repetitions.
```bash
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
```
## Command-Line Options
If a draft model is combined with draftless decoding, the draftless decoding takes precedence.
```
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
type of speculative decoding to use when no draft model is provided
(default: none)
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding
(default: 1)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
```
### `--spec-type TYPE`
Specifies a type of speculative decoding without a draft model.
| Type | Description |
|------|-------------|
| `none` | No speculative decoding (default) |
| `ngram-cache` | Use n-gram cache lookup |
| `ngram-simple` | Use simple n-gram pattern matching |
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
**Example:** Server instance used to refactor source code.
```bash
./llama-server [...] --spec-type ngram-simple
```
### `--spec-ngram-size-n N`
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
### `--spec-ngram-size-m M`
Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.
### `--spec-ngram-check-rate R`
This option helps performance when the n-gram lookup in the history is too costly. A lookup is executed only once every R tokens (default is 1, i.e. every token).
### `--spec-ngram-min-hits H`
This option defines how often a key has to appear in the token history before it is used for a draft (default is 1).
## Statistics
Each speculative decoding implementation prints statistics.
```
draft acceptance rate = 0.57576 ( 171 accepted / 297 generated)
statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tokens = 187, #acc tokens = 73
statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
```
- `#calls`: number of calls of this implementation
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
- `#acc tokens`: number of tokens accepted by the main model


@@ -32,9 +32,9 @@ int main(int argc, char ** argv){
common_ngram_cache ngram_cache;
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());
common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);
return 0;
}


@@ -46,18 +46,18 @@ int main(int argc, char ** argv){
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.lookup_cache_static.empty()) {
if (!params.speculative.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
if (!params.speculative.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}


@@ -51,18 +51,18 @@ int main(int argc, char ** argv){
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) {
if (!params.speculative.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
if (!params.speculative.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
@@ -210,7 +210,7 @@ int main(int argc, char ** argv){
// Update dynamic ngram cache with context ngram cache and save it to disk:
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);
LOG("\n\n");


@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.speculative.model.path.empty()) {
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
@@ -34,10 +34,8 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
llama_model * model_tgt = NULL;
//llama_model * model_dft = NULL;
llama_context * ctx_tgt = NULL;
llama_context * ctx_dft = NULL;
// load the target model
auto llama_init_tgt = common_init_from_params(params);
@@ -48,26 +46,38 @@ int main(int argc, char ** argv) {
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model;
params.n_ctx = params.speculative.n_ctx;
params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
params.n_gpu_layers = params.speculative.n_gpu_layers;
llama_model_ptr model_dft;
if (params.speculative.cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
}
// TODO: simplify this logic
{
const auto & params_spec = params.speculative;
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
auto params_dft = params;
auto llama_init_dft = common_init_from_params(params);
params_dft.n_parallel = 1;
params_dft.n_ctx = params_spec.n_ctx;
params_dft.n_batch = llama_n_ctx_seq(ctx_tgt);
params_dft.devices = params_spec.devices;
params_dft.model = params_spec.mparams_dft;
params_dft.n_gpu_layers = params_spec.n_gpu_layers;
//model_dft = llama_init_dft->model();
ctx_dft = llama_init_dft->context();
if (params_spec.cpuparams.n_threads > 0) {
params_dft.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
}
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
auto mparams_dft = common_model_params_to_llama(params_dft);
model_dft.reset(llama_model_load_from_file(params_dft.model.path.c_str(), mparams_dft));
if (model_dft == nullptr) {
LOG_ERR("failed to load draft model, '%s'\n", params_dft.model.path.c_str());
return 1;
}
params.speculative.model_dft = model_dft.get();
params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
}
// Tokenize the prompt
@@ -92,12 +102,6 @@ int main(int argc, char ** argv) {
LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
}
// how many tokens to draft each time
int n_draft = params.speculative.n_max;
int n_draft_min = params.speculative.n_min;
float p_min = params.speculative.p_min;
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
@@ -127,15 +131,11 @@ int main(int argc, char ** argv) {
int n_past = inp.size() - 1;
// init the speculator
struct common_speculative_params params_spec;
params_spec.n_draft = n_draft;
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
params_spec.p_min = p_min;
const auto & params_spec = params.speculative;
struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
for (auto &pair : params.speculative.replacements) {
common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
}
struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
common_speculative_begin(spec, prompt_tgt);
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
// from a cache or lookup tables.
//
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
@@ -162,7 +162,7 @@ int main(int argc, char ** argv) {
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
{
// do not waste time on small drafts
if (draft.size() < (size_t) n_draft_min) {
if (draft.size() < (size_t) params_spec.n_min) {
draft.clear();
}
@@ -240,7 +240,7 @@ int main(int argc, char ** argv) {
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_draft = %d\n", params_spec.n_max);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
@@ -249,8 +249,6 @@ int main(int argc, char ** argv) {
LOG_INF("\n");
LOG_INF("draft:\n\n");
llama_perf_context_print(ctx_dft);
LOG_INF("\n");
LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl);
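Taken together, the refactor shrinks the example's draft loop to roughly the following shape (a sketch assuming only the API visible in this diff; the target-verification step is elided):

// Configuration now travels inside params.speculative and the speculator owns
// its draft context, so the caller no longer builds common_speculative_params.
struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
common_speculative_begin(spec, prompt_tgt);

while (n_predict < params.n_predict) {
    // propose up to params.speculative.n_max tokens following id_last
    llama_tokens draft = common_speculative_draft(spec, params.speculative, prompt_tgt, id_last);

    // do not waste time on small drafts
    if (draft.size() < (size_t) params.speculative.n_min) {
        draft.clear();
    }

    // ... evaluate the target model on [id_last, draft...] and accept a prefix ...
}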

View File

@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.speculative.model.path.empty()) {
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
// load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model;
params.model = params.speculative.mparams_dft;
params.n_gpu_layers = params.speculative.n_gpu_layers;
if (params.speculative.cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;

View File

@@ -228,6 +228,8 @@ option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)
option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_VIRTGPU "ggml: use the VirtGPU/Virglrenderer API Remoting frontend" OFF)
option(GGML_VIRTGPU_BACKEND "ggml: build the VirtGPU/Virglrenderer API Remoting backend" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
@@ -320,6 +322,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-virtgpu.h
include/ggml-sycl.h
include/ggml-vulkan.h
include/ggml-webgpu.h

View File

@@ -0,0 +1,16 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg();
#ifdef __cplusplus
}
#endif

View File

@@ -451,6 +451,7 @@ ggml_add_backend(HIP)
ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(VirtGPU)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)

View File

@@ -69,6 +69,10 @@
#include "ggml-rpc.h"
#endif
#ifdef GGML_USE_VIRTGPU_FRONTEND
#include "ggml-virtgpu.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
@@ -180,7 +184,12 @@ struct ggml_backend_registry {
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
// Allow disabling the Vulkan backend at runtime via the GGML_DISABLE_VULKAN environment variable
if (getenv("GGML_DISABLE_VULKAN") == nullptr) {
register_backend(ggml_backend_vk_reg());
} else {
GGML_LOG_DEBUG("Vulkan backend disabled by GGML_DISABLE_VULKAN environment variable\n");
}
#endif
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
@@ -188,6 +197,10 @@ struct ggml_backend_registry {
#ifdef GGML_USE_ZDNN
register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_VIRTGPU_FRONTEND
register_backend(ggml_backend_virtgpu_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
@@ -604,6 +617,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("virtgpu", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("hexagon", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);

View File

@@ -1,3 +1,4 @@
#pragma once
// Rename `_generic` functions if no native implementation is available.
@@ -42,6 +43,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -53,6 +55,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -73,6 +76,7 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
@@ -80,6 +84,7 @@
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
@@ -102,6 +107,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -113,6 +119,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -136,6 +143,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -147,6 +155,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -177,6 +186,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -187,6 +197,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -216,6 +227,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -227,6 +239,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -258,6 +271,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -269,6 +283,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
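As the header's opening comment says, this file is an alias layer: when no native kernel exists for the target, the `_generic` definition is renamed to the public symbol, and the new q6_K entries follow the existing pattern (illustration only, using the macro added above):

// With no native override compiled, the macro
//   #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
// makes the translation unit that defines ggml_gemv_q6_K_8x8_q8_K_generic(...)
// emit the public symbol ggml_gemv_q6_K_8x8_q8_K(...) instead, so callers and
// the repack dispatch tables link unchanged on every architecture.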

View File

@@ -1055,10 +1055,10 @@ void ggml_gemv_q5_K_8x8_q8_K(int n,
// FUSED BIAS: Compute and subtract bias immediately
// bias = (bsums_lo * mins_lo + bsums_hi * mins_hi) * sb_min
int32x4_t bias = vmull_s16(bsums_vec_lo, group_mins_lo);
bias = vmlal_s16(bias, bsums_vec_hi, group_mins_hi);
float32x4_t bias_f32 = vcvtq_f32_s32(bias);
acc_f32[i] = vmlsq_f32(acc_f32[i], sb_min, bias_f32);
}
} // for sb
} // for b
@@ -1072,6 +1072,208 @@ void ggml_gemv_q5_K_8x8_q8_K(int n,
ggml_gemv_q5_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q6_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int col_pairs = ncols_interleaved / 2;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mask_lo = vdupq_n_u8(0x03);
const uint8x16_t mask_hi = vdupq_n_u8(0x30);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[2];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * GGML_RESTRICT q6_ptr = (const block_q6_Kx8 *) vx + (x * nb);
acc_f32[0] = vdupq_n_f32(0);
acc_f32[1] = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) {
float32x4_t q6_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q6_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0 = vmulq_f32(q6_d_0, q8_d);
float32x4_t sb_scale_1 = vmulq_f32(q6_d_1, q8_d);
int32x2_t acc[col_pairs];
for (int i = 0; i < col_pairs; i++) {
acc[i] = vdup_n_s32(0);
}
// Load all 16 scales once and widen to int16 (Q6_K has 16 scales per block)
// Reused for bias and dequantization later
int16_t q6_scales[16 * 8];
for (int i = 0; i < 16; i++) {
int16x8_t scales = vmovl_s8(vld1_s8(q6_ptr[b].scales + i * 8));
vst1q_s16(q6_scales + i * 8, scales);
}
// Compute bias per column using q8 bsums and preloaded scales to skip the -32 shift
int32x4_t bias_lo = vdupq_n_s32(0);
int32x4_t bias_hi = vdupq_n_s32(0);
// Load bsums in chunks of 4 to process with vectorized operations
for (int i = 0; i < 16; i += 4) {
int16x4_t bsums_vec = vld1_s16(q8_ptr[b].bsums + i);
int16x4_t scales_lo_0 = vld1_s16(q6_scales + (i + 0) * 8);
int16x4_t scales_hi_0 = vld1_s16(q6_scales + (i + 0) * 8 + 4);
int16x4_t scales_lo_1 = vld1_s16(q6_scales + (i + 1) * 8);
int16x4_t scales_hi_1 = vld1_s16(q6_scales + (i + 1) * 8 + 4);
int16x4_t scales_lo_2 = vld1_s16(q6_scales + (i + 2) * 8);
int16x4_t scales_hi_2 = vld1_s16(q6_scales + (i + 2) * 8 + 4);
int16x4_t scales_lo_3 = vld1_s16(q6_scales + (i + 3) * 8);
int16x4_t scales_hi_3 = vld1_s16(q6_scales + (i + 3) * 8 + 4);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_0, bsums_vec, 0);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_0, bsums_vec, 0);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_1, bsums_vec, 1);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_1, bsums_vec, 1);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_2, bsums_vec, 2);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_2, bsums_vec, 2);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_3, bsums_vec, 3);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_3, bsums_vec, 3);
}
bias_lo = vshlq_n_s32(bias_lo, 5);
bias_hi = vshlq_n_s32(bias_hi, 5);
// Process two 128-value halves per superblock
for (int half = 0; half < 2; half++) {
const uint8_t * ql_base = q6_ptr[b].ql + half * 512;
const uint8_t * qh_base = q6_ptr[b].qh + half * 256;
// A subblock (sb) is a set of weights that share a scale
// q6_K has one scale per 16 elements and each ql byte packs 2 elements, so
// with 2 halves per block: 256 / (16 * 2 * 2) = 4 sb iterations per half
for (int sb = 0; sb < QK_K / 64; sb++) {
const int8_t * q8_base_l = q8_ptr[b].qs + half * 128 + sb * 16;
const int8_t * q8_base_h = q8_base_l + 64;
// Load and duplicate q8 values (each register covers two interleaved columns of q6)
int8x16_t q8_l[2];
int8x16_t q8_h[2];
for (int i = 0; i < 2; i++) {
q8_l[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base_l + i * 8));
q8_h[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base_h + i * 8));
}
// TODO: Test other qh repack patterns to reduce loads
const int ql_off_base = sb * QK_K / 2;
const int qh_off_base = ql_off_base & 255; // wraps after 256 bytes
// Load 4 vectors at once (64 bytes each for ql_0, ql_1, qh_0, qh_1)
ggml_uint8x16x4_t q6_ql_0 = ggml_vld1q_u8_x4(ql_base + ql_off_base);
ggml_uint8x16x4_t q6_ql_1 = ggml_vld1q_u8_x4(ql_base + ql_off_base + 64);
ggml_uint8x16x4_t q6_qh_0 = ggml_vld1q_u8_x4(qh_base + qh_off_base);
ggml_uint8x16x4_t q6_qh_1 = ggml_vld1q_u8_x4(qh_base + qh_off_base + 64);
// Adjust qh for subblocks 2 and 3 (shift right by 2)
if (sb > 1) {
q6_qh_0.val[0] = vshrq_n_u8(q6_qh_0.val[0], 2);
q6_qh_0.val[1] = vshrq_n_u8(q6_qh_0.val[1], 2);
q6_qh_0.val[2] = vshrq_n_u8(q6_qh_0.val[2], 2);
q6_qh_0.val[3] = vshrq_n_u8(q6_qh_0.val[3], 2);
q6_qh_1.val[0] = vshrq_n_u8(q6_qh_1.val[0], 2);
q6_qh_1.val[1] = vshrq_n_u8(q6_qh_1.val[1], 2);
q6_qh_1.val[2] = vshrq_n_u8(q6_qh_1.val[2], 2);
q6_qh_1.val[3] = vshrq_n_u8(q6_qh_1.val[3], 2);
}
// Process column pairs (0-1, 2-3, 4-5, 6-7)
for (int cp = 0; cp < col_pairs; cp++) {
const uint8x16_t q6_qs_cp_0_l = q6_ql_0.val[cp];
const uint8x16_t q6_qs_cp_1_l = q6_ql_1.val[cp];
const uint8x16_t q6_qs_cp_0_h = q6_qh_0.val[cp];
const uint8x16_t q6_qs_cp_1_h = q6_qh_1.val[cp];
// Extract high 2 bits for upper nibble reconstruction
const uint8x16_t q6_qs_cp_0_hh = vandq_u8(q6_qs_cp_0_h, mask_hi);
const uint8x16_t q6_qs_cp_1_hh = vandq_u8(q6_qs_cp_1_h, mask_hi);
// q6 = (low4 | high2<<4), without -32 bias (handled via bsums)
const int8x16_t q6_l0 = vreinterpretq_s8_u8(
vsliq_n_u8(vandq_u8(q6_qs_cp_0_l, m4b), vandq_u8(q6_qs_cp_0_h, mask_lo), 4));
const int8x16_t q6_l1 = vreinterpretq_s8_u8(
vsliq_n_u8(vandq_u8(q6_qs_cp_1_l, m4b), vandq_u8(q6_qs_cp_1_h, mask_lo), 4));
const int8x16_t q6_h0 =
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_qs_cp_0_l, 4), q6_qs_cp_0_hh));
const int8x16_t q6_h1 =
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_qs_cp_1_l, 4), q6_qs_cp_1_hh));
int32x4_t sb_acc_l = vdupq_n_s32(0);
sb_acc_l = vdotq_s32(sb_acc_l, q6_l0, q8_l[0]);
sb_acc_l = vdotq_s32(sb_acc_l, q6_l1, q8_l[1]);
int32x4_t sb_acc_h = vdupq_n_s32(0);
sb_acc_h = vdotq_s32(sb_acc_h, q6_h0, q8_h[0]);
sb_acc_h = vdotq_s32(sb_acc_h, q6_h1, q8_h[1]);
// Pairwise add to get per-column sums: [col0, col1]
int32x2_t sum_l = vpadd_s32(vget_low_s32(sb_acc_l), vget_high_s32(sb_acc_l));
int32x2_t sum_h = vpadd_s32(vget_low_s32(sb_acc_h), vget_high_s32(sb_acc_h));
const int scale_idx_l = half * 8 + sb;
const int scale_idx_h = half * 8 + sb + 4;
// Access scales using array indexing (scales are interleaved by column)
const int32x2_t scale_vec_l = { (int32_t) q6_scales[scale_idx_l * 8 + cp * 2],
(int32_t) q6_scales[scale_idx_l * 8 + cp * 2 + 1] };
const int32x2_t scale_vec_h = { (int32_t) q6_scales[scale_idx_h * 8 + cp * 2],
(int32_t) q6_scales[scale_idx_h * 8 + cp * 2 + 1] };
// Accumulate scaled results
acc[cp] = vmla_s32(acc[cp], sum_l, scale_vec_l);
acc[cp] = vmla_s32(acc[cp], sum_h, scale_vec_h);
}
}
} // for half
// Bias correction
acc[0] = vsub_s32(acc[0], vget_low_s32(bias_lo));
acc[1] = vsub_s32(acc[1], vget_high_s32(bias_lo));
acc[2] = vsub_s32(acc[2], vget_low_s32(bias_hi));
acc[3] = vsub_s32(acc[3], vget_high_s32(bias_hi));
// Apply superblock scale (no mins for q6_K)
// acc[cp] has [c0, c1]
float32x2_t w_01 = vmul_f32(vcvt_f32_s32(acc[0]), vget_low_f32(sb_scale_0));
float32x2_t w_23 = vmul_f32(vcvt_f32_s32(acc[1]), vget_high_f32(sb_scale_0));
float32x2_t w_45 = vmul_f32(vcvt_f32_s32(acc[2]), vget_low_f32(sb_scale_1));
float32x2_t w_67 = vmul_f32(vcvt_f32_s32(acc[3]), vget_high_f32(sb_scale_1));
acc_f32[0] = vaddq_f32(acc_f32[0], vcombine_f32(w_01, w_23));
acc_f32[1] = vaddq_f32(acc_f32[1], vcombine_f32(w_45, w_67));
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q6_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
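The bias path above rests on a small identity: q6_K stores unsigned quants q in [0, 63] with true weight q - 32, so the -32 can be pulled out of every dot product using the q8 block sums (bsums) that are already available. A scalar sketch of what the NEON code computes (the vshlq_n_s32(..., 5) is the multiply by 32):

#include <cstdint>

// For one 16-value group with scale s, quants q[] and activations a[]:
//   sum_i s*(q[i]-32)*a[i] = s*sum_i q[i]*a[i] - 32*s*sum_i a[i]
// where bsum = sum_i a[i] comes precomputed from the q8_K block.
int dot_q6_group(const uint8_t q[16], const int8_t a[16], int s, int bsum) {
    int sumq = 0;
    for (int i = 0; i < 16; ++i) {
        sumq += q[i] * a[i]; // unsigned quants, no -32 applied per element
    }
    return s * sumq - ((s * bsum) << 5); // equals s * sum((q - 32) * a)
}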
void ggml_gemv_q8_0_4x4_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,
@@ -2946,16 +3148,17 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
// Scales[i] corresponds to column i
const int scale_offset = cp * 2;
for (int blk = 0; blk < 2; blk++) {
const int32x4_t block_scale = {
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset + 1],
(int32_t) q4sb_scales[blk][scale_offset + 1],
};
acc[cp] = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
}
const int32_t scale_00 = q4sb_scales[0][scale_offset];
const int32_t scale_01 = q4sb_scales[0][scale_offset + 1];
const int32_t scale_10 = q4sb_scales[1][scale_offset];
const int32_t scale_11 = q4sb_scales[1][scale_offset + 1];
const int32x4_t block_scale_0 = vcombine_s32(vdup_n_s32(scale_00), vdup_n_s32(scale_01));
const int32x4_t block_scale_1 = vcombine_s32(vdup_n_s32(scale_10), vdup_n_s32(scale_11));
acc[cp] = vmlaq_s32(acc[cp], sb_acc[0], block_scale_0);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[2], block_scale_0);
acc[cp] = vmlaq_s32(acc[cp], sb_acc[1], block_scale_1);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[3], block_scale_1);
}
// Multiply Acc bsum + mins
@@ -3146,8 +3349,8 @@ void ggml_gemm_q5_K_8x8_q8_K(int n,
const int8x16_t qs_lo_0 = vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(qs_cp_0, m4b), hbit_lo_0, 4));
int32x4_t acc_0 = sb_acc[0];
acc_0 = vmmlaq_s32(acc_0, qs_lo_0, q8s[0][0]);
int32x4_t acc_2 = sb_acc[2];
acc_2 = vmmlaq_s32(acc_2, qs_lo_0, q8s[1][0]);
const int8x16_t qs_hi_0 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(qs_cp_0, 4), hbit_hi_0));
int32x4_t acc_1 = sb_acc[1];
acc_1 = vmmlaq_s32(acc_1, qs_hi_0, q8s[0][4]);
@@ -3271,6 +3474,223 @@ void ggml_gemm_q5_K_8x8_q8_K(int n,
ggml_gemm_q5_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q6_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
constexpr int q8_k_blocklen = 4;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mask_lo = vdupq_n_u8(0x03);
const uint8x16_t mask_hi = vdupq_n_u8(0x30);
const int8x16_t m32s = vdupq_n_s8(32);
// 8 accumulators: 4 q8 rows × 2 col groups (0-3, 4-7)
float32x4_t acc_f32[blocklen];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * GGML_RESTRICT q6_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int i = 0; i < blocklen; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
int32x4_t acc[8]; // rows 01 stored in [0][1][2][3], rows 23 stored in [4][5][6][7]
for (int i = 0; i < 8; i++) {
acc[i] = vdupq_n_s32(0);
}
// Q6_K has simple 8-bit scales, 16 per block (one per 16 values)
// Reused for bias and dequantization later
int16_t q6_scales[16 * 8];
for (int i = 0; i < 16; ++i) {
int16x8_t s16 = vmovl_s8(vld1_s8(q6_ptr[b].scales + i * 8));
vst1q_s16(q6_scales + i * 8, s16);
}
// Process two 128-value halves per superblock
for (int half = 0; half < 2; half++) {
const uint8_t * ql_base = q6_ptr[b].ql + half * 512;
const uint8_t * qh_base = q6_ptr[b].qh + half * 256;
// A subblock (sb) is a set of weights that share a scale
// q6_K has one scale per 16 elements and each ql byte packs 2 elements, so
// with 2 halves per block: 256 / (16 * 2 * 2) = 4 sb iterations per half
for (int sb = 0; sb < QK_K / 64; sb++) {
// Q6_K weight indices advance by 64 (not 32), so the matching q8
// activations must be loaded from two separate memory regions
const int8_t * q8_base_l = q8_ptr[b].qs + half * 512 + sb * 64;
const int8_t * q8_base_h = q8_ptr[b].qs + half * 512 + 256 + sb * 64;
int8x16_t q8_l_01[2];
int8x16_t q8_l_23[2];
for (int i = 0; i < 2; i++) {
const int offset = i * 32;
q8_l_01[i] = vld1q_s8(q8_base_l + offset); // 0..7 & 8..15 (r01)
q8_l_23[i] = vld1q_s8(q8_base_l + offset + 16); // 0..7 & 8..15 (r23)
}
int8x16_t q8_h_01[2];
int8x16_t q8_h_23[2];
for (int i = 0; i < 2; i++) {
const int offset = i * 32;
q8_h_01[i] = vld1q_s8(q8_base_h + offset);
q8_h_23[i] = vld1q_s8(q8_base_h + offset + 16);
}
const int ql_off_base = sb * QK_K / 2;
uint8x16_t q6_ql_0[4];
uint8x16_t q6_ql_1[4];
for (int k = 0; k < 4; k++) {
q6_ql_0[k] = vld1q_u8(ql_base + ql_off_base + 16 * k);
q6_ql_1[k] = vld1q_u8(ql_base + ql_off_base + 64 + 16 * k);
}
const int qh_off_base = (sb * QK_K / 2) & 255; // wrap after 256 bytes
uint8x16_t q6_qh_0[4];
uint8x16_t q6_qh_1[4];
for (int k = 0; k < 4; k++) {
q6_qh_0[k] = vld1q_u8(qh_base + qh_off_base + 16 * k);
q6_qh_1[k] = vld1q_u8(qh_base + qh_off_base + 64 + 16 * k);
}
// Adjust for the proper high bits (Sb 2 and 3)
if (sb > 1) {
for (int k = 0; k < 4; k++) {
q6_qh_0[k] = vshrq_n_u8(q6_qh_0[k], 2);
q6_qh_1[k] = vshrq_n_u8(q6_qh_1[k], 2);
}
}
// Process column pairs (0-1, 2-3, 4-5, 6-7)
for (int cp = 0; cp < ncols_interleaved / 2; cp++) {
const uint8x16_t q6_qs_cp_0_l = q6_ql_0[cp];
const uint8x16_t q6_qs_cp_1_l = q6_ql_1[cp];
const uint8x16_t q6_qs_cp_0_h = q6_qh_0[cp];
const uint8x16_t q6_qs_cp_1_h = q6_qh_1[cp];
// Extract high 2 bits for upper nibble reconstruction
const uint8x16_t q6_qs_cp_0_hh = vandq_u8(q6_qs_cp_0_h, mask_hi);
const uint8x16_t q6_qs_cp_1_hh = vandq_u8(q6_qs_cp_1_h, mask_hi);
// q6 = (low4 | high2<<4) - 32
// Use vsliq_n_u8 to combine shift-left-insert in one instruction (like Q5_K)
const int8x16_t q6_l0 = vsubq_s8(
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q6_qs_cp_0_l, m4b), vandq_u8(q6_qs_cp_0_h, mask_lo), 4)),
m32s);
const int8x16_t q6_l1 = vsubq_s8(
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q6_qs_cp_1_l, m4b), vandq_u8(q6_qs_cp_1_h, mask_lo), 4)),
m32s);
const int8x16_t q6_h0 = vsubq_s8(
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_qs_cp_0_l, 4), q6_qs_cp_0_hh)), m32s);
const int8x16_t q6_h1 = vsubq_s8(
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_qs_cp_1_l, 4), q6_qs_cp_1_hh)), m32s);
// row pair 0, base_l
int32x4_t sb_acc_0l = vmmlaq_s32(vdupq_n_s32(0), q6_l0, q8_l_01[0]);
sb_acc_0l = vmmlaq_s32(sb_acc_0l, q6_l1, q8_l_01[1]);
// row pair 0, base_h
int32x4_t sb_acc_0h = vmmlaq_s32(vdupq_n_s32(0), q6_h0, q8_h_01[0]);
sb_acc_0h = vmmlaq_s32(sb_acc_0h, q6_h1, q8_h_01[1]);
// row pair 1, base_l
int32x4_t sb_acc_1l = vmmlaq_s32(vdupq_n_s32(0), q6_l0, q8_l_23[0]);
sb_acc_1l = vmmlaq_s32(sb_acc_1l, q6_l1, q8_l_23[1]);
// row pair 1, base_h
int32x4_t sb_acc_1h = vmmlaq_s32(vdupq_n_s32(0), q6_h0, q8_h_23[0]);
sb_acc_1h = vmmlaq_s32(sb_acc_1h, q6_h1, q8_h_23[1]);
const int scale_idx_l = half * 8 + sb;
const int scale_idx_h = half * 8 + sb + 4;
const int32x4_t scale_vec_l = {
q6_scales[scale_idx_l * 8 + cp * 2 + 0],
q6_scales[scale_idx_l * 8 + cp * 2 + 0],
q6_scales[scale_idx_l * 8 + cp * 2 + 1],
q6_scales[scale_idx_l * 8 + cp * 2 + 1],
};
const int32x4_t scale_vec_h = {
q6_scales[scale_idx_h * 8 + cp * 2 + 0],
q6_scales[scale_idx_h * 8 + cp * 2 + 0],
q6_scales[scale_idx_h * 8 + cp * 2 + 1],
q6_scales[scale_idx_h * 8 + cp * 2 + 1],
};
acc[cp] = vmlaq_s32(acc[cp], sb_acc_0l, scale_vec_l);
acc[cp] = vmlaq_s32(acc[cp], sb_acc_0h, scale_vec_h);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc_1l, scale_vec_l);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc_1h, scale_vec_h);
}
}
} // for half
// Reorder i8mm output to match memory layout
for (int i = 0; i < 8; i++) {
int32x2x2_t aux = vzip_s32(vget_low_s32(acc[i]), vget_high_s32(acc[i]));
acc[i] = vcombine_s32(aux.val[0], aux.val[1]);
}
int32x4_t reorder_acc[8] = {
vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1])),
vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3])),
vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1])),
vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3])),
vcombine_s32(vget_low_s32(acc[4]), vget_low_s32(acc[5])),
vcombine_s32(vget_low_s32(acc[6]), vget_low_s32(acc[7])),
vcombine_s32(vget_high_s32(acc[4]), vget_high_s32(acc[5])),
vcombine_s32(vget_high_s32(acc[6]), vget_high_s32(acc[7])),
};
// Apply superblock scale (no mins for q6_K)
for (int i = 0; i < q8_k_blocklen; i++) {
for (int j = 0; j < 2; j++) {
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d[i]);
float32x4_t q6_d = vcvt_f32_f16(vld1_f16((const __fp16 *) (q6_ptr[b].d + j * 4)));
const float32x4_t scale = vmulq_f32(q6_d, q8_d);
acc_f32[2 * i + j] =
vmlaq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(reorder_acc[2 * i + j]), scale);
}
}
} // for b
// Store results
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
ggml_gemm_q6_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
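The reorder block above exists because vmmlaq_s32 (SMMLA) accumulates a 2x2 int32 tile per call rather than a contiguous row: with the two interleaved q6 columns as the first operand and the two q8 rows as the second, the lanes hold { c0*r0, c0*r1, c1*r0, c1*r1 }. A comment sketch of the shuffle, assuming that lane layout:

// vzip_s32(low, high) regroups each tile into per-row pairs:
//   { c0*r0, c1*r0 } and { c0*r1, c1*r1 }
// and the vcombine_s32 table then stitches pairs from adjacent acc[] entries
// so each reorder_acc[k] holds four consecutive columns of a single output
// row, ready for the per-row vst1q_f32 stores.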
void ggml_gemm_q8_0_4x4_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,

View File

@@ -703,6 +703,97 @@ void ggml_gemv_q5_K_8x8_q8_K_generic(int n,
}
}
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0f;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < 16; k++) {
// k = 0.. 7 weights 0-63 low, 64-127 high
// k = 8..15 weights 128-191 low, 192-255 high
const int base_l = (k / 8) * 128 + (k % 8) * 8;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
// Bit shift cycles 0,2,4,6 for each 32-value group within a 128-value half
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
// qh_half: offset to the correct 32-byte half (0 or 32)
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
for (int j = 0; j < ncols_interleaved; j++) {
// Interleaved scales
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * 8 + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * 8 + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * 64 + j * 8 + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
// qh indexing with 8-byte interleaving (like q5_K)
const int qh_byte_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_byte_l / 8;
const int qh_pos_l = qh_byte_l % 8;
const int qh_offset_l = qh_chunk_l * 64 + j * 8 + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_byte_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_byte_h / 8;
const int qh_pos_h = qh_byte_h % 8;
const int qh_offset_h = qh_chunk_h * 64 + j * 8 + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t a_l = a_ptr[l].qs[base_l + i];
const int8_t a_h = a_ptr[l].qs[base_h + i];
sumi_l += q_l * a_l;
sumi_h += q_h * a_h;
}
sumf[j] +=
(sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j];
}
}
}
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -1133,15 +1224,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
assert (nr % 4 == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
float sumf[4][8];
float sum_minf[4][8];
@@ -1402,6 +1485,111 @@ void ggml_gemm_q5_K_8x8_q8_K_generic(int n,
}
}
void ggml_gemm_q6_K_8x8_q8_K_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
float sumf[4][8];
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0f;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < 16; k++) {
// k = 0.. 7 weights 0-63 low, 64-127 high
// k = 8..15 weights 128-191 low, 192-255 high
const int base_l = (k / 8) * 128 + (k % 8) * 8;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
// Bit shift cycles 0,2,4,6 for each 32-value group within a 128-value half
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
// qh_half: offset to the correct 32-byte half (0 or 32)
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
// Activation base indices for q8_Kx4 interleaved format
// Layout: 128-value halves (k/8), then 8-value sub-blocks (k%8) with stride 32
const int q8_base = (k / 8) * 512 + (k % 8) * 32;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
// Interleaved scales
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * 8 + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * 8 + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * 64 + j * 8 + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_idx_l / 8;
const int qh_pos_l = qh_idx_l % 8;
const int qh_offset_l = qh_chunk_l * 64 + j * 8 + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_idx_h / 8;
const int qh_pos_h = qh_idx_h % 8;
const int qh_offset_h = qh_chunk_h * 64 + j * 8 + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t q8_l = a_ptr[l].qs[q8_base + m * 8 + i];
const int8_t q8_h = a_ptr[l].qs[q8_base + m * 8 + i + 256];
sumi_l += q_l * q8_l;
sumi_h += q_h * q8_h;
}
sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
}
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -1801,8 +1989,7 @@ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_in
// Every 16 bytes is packed so that it contains the scales and mins for the corresponding sub-blocks of the Q2_K structures
// E.g. the first 16 bytes contain 16 scales and 16 mins - the first and second sub-blocks, each from a different Q2_K structure
for(int i = 0; i < 128; i++){
for (int i = 0; i < 128; i++) {
// Index for selecting which q2k super block
int src1 = (i % 16) / 2;
// Index for selecting scale
@@ -1902,6 +2089,52 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in
return out;
}
static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
block_q6_Kx8 out;
constexpr int n_blocks = 8; // Kx8
for (int i = 0; i < n_blocks; i++) {
out.d[i] = in[i].d;
}
const int end_ls = QK_K * 4 / blck_size_interleave;
// Interleave Q6_K quants by taking 8 bytes at a time
for (int i = 0; i < end_ls; ++i) {
int src_id = i % n_blocks;
int src_offset = (i / n_blocks) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elem_ls;
memcpy(&elem_ls, &in[src_id].ql[src_offset], sizeof(uint64_t));
memcpy(&out.ql[dst_offset], &elem_ls, sizeof(uint64_t));
}
// Interleave high bits using same 8-byte pattern as low bits
const int end_hs = end_ls / 2;
for (int i = 0; i < end_hs; ++i) {
int src_id = i % n_blocks;
int src_offset = (i / n_blocks) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elem_hs;
memcpy(&elem_hs, &in[src_id].qh[src_offset], sizeof(uint64_t));
memcpy(&out.qh[dst_offset], &elem_hs, sizeof(uint64_t));
}
// Unpack and rearrange the Q6_K scales
// The output Q6_Kx8 structure interleaves the 8-bit scales in the same fashion as the quants
// Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
// scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
constexpr int n_scales = QK_K / 16;
for (int i = 0; i < n_blocks; i++) {
for (int j = 0; j < n_scales; j++) {
out.scales[j * n_blocks + i] = in[i].scales[j];
}
}
return out;
}
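A worked example of the low-bit interleave, with blck_size_interleave = 8: the destination is filled 8 bytes at a time, cycling through the 8 source blocks before advancing the source offset.

// Indices produced by the ql loop above (end_ls = QK_K*4/8 = 128 iterations):
//   i = 0: src_id = 0, src_offset = 0 -> out.ql[ 0.. 7] = in[0].ql[0..7]
//   i = 1: src_id = 1, src_offset = 0 -> out.ql[ 8..15] = in[1].ql[0..7]
//   ...
//   i = 7: src_id = 7, src_offset = 0 -> out.ql[56..63] = in[7].ql[0..7]
//   i = 8: src_id = 0, src_offset = 8 -> out.ql[64..71] = in[0].ql[8..15]
// i.e. 128 * 8 = 1024 bytes in total; the qh loop applies the same pattern
// over half as many iterations (end_hs = 64), giving 512 bytes of high bits.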
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@@ -1983,7 +2216,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
@@ -2027,6 +2260,35 @@ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
return 0;
}
static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
GGML_ASSERT(interleave_block == 8);
constexpr int nrows_interleaved = 8;
block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
const block_q6_K * src = (const block_q6_K *) data;
block_q6_K dst_tmp[8];
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK_K;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
}
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 8);
@@ -2249,6 +2511,10 @@ template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * da
return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}
@@ -2286,7 +2552,14 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
template <>
void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
float * s,
size_t bs,
const void * vx,
const void * vy,
int nr,
int nc) {
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@@ -2302,6 +2575,10 @@ template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@@ -2330,7 +2607,14 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
template <>
void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
float * s,
size_t bs,
const void * vx,
const void * vy,
int nr,
int nc) {
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
@@ -2350,6 +2634,10 @@ template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@@ -2714,20 +3002,19 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
for (int ir1 = 0; ir1 < nr1; ir1++) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
const int id = row_mapping.i1; // selected expert index
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i1 = id;  // selected expert index
const int64_t i2 = i12; // row
const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
(float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
src0_cur + src0_cur_start * nb01,
src1_col, 1, src0_cur_end - src0_cur_start);
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
}
}
#undef MMID_MATRIX_ROW
@@ -2743,7 +3030,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
} // namespace ggml::cpu::repack
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
// instance for Q4
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
@@ -2756,6 +3042,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
// instance for Q5_K
static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
// instance for Q6_K
static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
// instance for Q2
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
@@ -2812,6 +3101,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q5_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q6_K) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 8 == 0) {
return &q6_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {

View File

@@ -65,6 +65,16 @@ struct block_q5_Kx8 {
static_assert(sizeof(block_q5_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 5,
"wrong q5_K block size/padding");
struct block_q6_Kx8 {
ggml_half d[8];
int8_t scales[QK_K / 16 * 8];
uint8_t ql[QK_K / 2 * 8]; // low bits of 6-bit quants (groups of 2)
uint8_t qh[QK_K / 4 * 8]; // high bits of 6-bit quants (groups of 4)
};
static_assert(sizeof(block_q6_Kx8) == sizeof(ggml_half) * 8 + QK_K / 16 * 8 + 3 * QK_K / 4 * 8,
"wrong q6_K block size/padding");
struct block_q8_Kx4 {
float d[4]; // delta
int8_t qs[QK_K * 4]; // quants
@@ -95,13 +105,14 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -111,6 +122,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -130,6 +142,7 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -139,6 +152,7 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

View File

@@ -53,6 +53,7 @@
// While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to the 1200 family, see
// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
#define GGML_CUDA_CC_BLACKWELL 1200
#define GGML_CUDA_CC_DGX_SPARK 1210
#define GGML_CUDA_CC_RUBIN 1300
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000

View File

@@ -789,7 +789,7 @@ void launch_fattn(
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs));
const ggml_tensor * mask = dst->src[3];
const ggml_tensor * sinks = dst->src[4];
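The relaxed predicate treats V as an alias of K whenever V views K directly, or when both view the same source tensor from the same byte offset; the old form instead required V->view_offs == 0 and never compared it against K's offset. As a standalone sketch (field names as in ggml_tensor):

// Sketch: the aliasing test after this change.
static bool v_is_k_view(const ggml_tensor * K, const ggml_tensor * V) {
    return V->view_src &&
           (V->view_src == K ||
            (V->view_src == K->view_src && V->view_offs == K->view_offs));
}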

View File

@@ -147,7 +147,19 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];
if (gqa_ratio == 20) { // GLM 4.7 Flash
if (cc >= GGML_CUDA_CC_DGX_SPARK) {
if (Q->ne[1] <= 8) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
break;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
break;
}
if (cc >= GGML_CUDA_CC_BLACKWELL) {
if (Q->ne[1] <= 4 && K->ne[1] >= 65536) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
break;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
break;
}
@@ -161,6 +173,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
}
if (cc >= GGML_CUDA_CC_TURING) {
if (Q->ne[1] <= 4) {
if (K->ne[1] <= 16384) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
break;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 32>(ctx, dst);
break;
}
@@ -294,7 +310,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
}
const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs));
const int cc = ggml_cuda_info().devices[device].cc;
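Summarizing the kernel selection added above for the gqa_ratio == 20 (GLM 4.7 Flash) path, for the 576/512 head-size instantiation (constants per the new definitions in the previous file):

// ncols1 template argument chosen by compute capability:
//   cc >= GGML_CUDA_CC_DGX_SPARK : 16 if Q->ne[1] <= 8, else 4
//   cc >= GGML_CUDA_CC_BLACKWELL : 16 if Q->ne[1] <= 4 && K->ne[1] >= 65536, else 4
//   cc >= GGML_CUDA_CC_TURING    : for Q->ne[1] <= 4, 16 if K->ne[1] <= 16384, else 32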

View File

@@ -3080,63 +3080,166 @@ static bool ggml_cuda_should_fuse_rope_set_rows(const ggml_tensor * rope,
return true;
}
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
static bool ggml_cuda_topk_moe_fusion(const struct ggml_cgraph * cgraph, int node_idx, ggml_cuda_topk_moe_args & args) {
args.sigmoid = false;
args.softmax = false;
args.delayed_softmax = false;
args.prob_bias = false;
args.norm = false;
const int n_nodes = cgraph->n_nodes;
ggml_tensor ** nodes = cgraph->nodes;
if (nodes[node_idx]->op == GGML_OP_SOFT_MAX) {
args.softmax = true;
}
if (nodes[node_idx]->op == GGML_OP_UNARY) {
if (ggml_get_unary_op(nodes[node_idx]) != GGML_UNARY_OP_SIGMOID) {
return false;
}
args.sigmoid = true;
}
if (nodes[node_idx]->op == GGML_OP_ARGSORT) {
args.delayed_softmax = true;
}
node_idx++;
if (args.sigmoid || args.softmax) {
// SOFTMAX -> RESHAPE
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_RESHAPE ||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
return false;
}
ggml_tensor * probs_reshaped = nodes[node_idx];
node_idx++;
if (node_idx >= n_nodes) {
return false;
}
// src of bias add is the unreshaped probs (-2 instead of -1)
if (nodes[node_idx]->op == GGML_OP_ADD && nodes[node_idx]->src[0] == nodes[node_idx - 2]) {
args.prob_bias = true;
node_idx++;
}
// RESHAPE/ADD -> ARGSORT
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_ARGSORT) {
return false;
}
if (args.prob_bias && nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
return false;
} else if (!args.prob_bias && nodes[node_idx]->src[0] != nodes[node_idx - 2]) {
return false;
}
node_idx++;
// ARGSORT -> VIEW
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_VIEW ||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
return false;
}
node_idx++;
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_GET_ROWS) {
return false;
}
// GET_ROWS
if (nodes[node_idx]->src[0] != probs_reshaped || nodes[node_idx]->src[1] != nodes[node_idx - 1]) {
return false;
}
node_idx++;
} else if (args.delayed_softmax) {
if (node_idx - 2 < 0) {
return false;
}
ggml_tensor * probs_reshaped = nodes[node_idx - 2];
// VIEW -> ARGSORT
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_VIEW ||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
return false;
}
node_idx++;
// GET_ROWS
if (node_idx >= n_nodes || nodes[node_idx]->src[1] != nodes[node_idx - 1] ||
nodes[node_idx]->src[0] != probs_reshaped) {
return false;
}
node_idx++;
static const std::vector<ggml_op> remaining_ops = { GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
for (const ggml_op op : remaining_ops) {
if (node_idx >= n_nodes || nodes[node_idx]->op != op || nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
return false;
}
node_idx++;
}
}
// At this point we can check for norm + scale; the pattern is valid up to here and the remaining ops are optional
if (node_idx >= n_nodes) {
return true;
}
if (nodes[node_idx]->op == GGML_OP_RESHAPE) {
// check RESHAPE -> SUM_ROWS -> CLAMP -> DIV -> RESHAPE
static const std::vector<ggml_op> norm_ops = { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP };
args.norm = true;
for (const ggml_op op : norm_ops) {
if (nodes[node_idx]->op == op && nodes[node_idx]->src[0] == nodes[node_idx - 1]) {
node_idx++;
} else {
args.norm = false;
return true;
}
}
// DIV <- CLAMP, RESHAPE
if (nodes[node_idx]->op != GGML_OP_DIV || nodes[node_idx]->src[1] != nodes[node_idx - 1] ||
nodes[node_idx]->src[0] != nodes[node_idx - 3]) {
args.norm = false;
return true;
}
node_idx++;
if (nodes[node_idx]->op != GGML_OP_RESHAPE || nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
args.norm = false;
return true;
}
node_idx++;
}
if (nodes[node_idx]->op == GGML_OP_SCALE && nodes[node_idx]->src[0] == nodes[node_idx - 1]) {
args.scale = true;
}
return true;
}
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
int node_idx,
std::initializer_list<enum ggml_op> ops,
std::initializer_list<enum ggml_unary_op> unary_ops) {
#ifndef NDEBUG
const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
GGML_ASSERT(unary_ops.size() == num_unary);
#endif
//TODO: remove special case once ggml_can_fuse can handle empty nodes
std::initializer_list<enum ggml_op> topk_moe_ops =
ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
const auto is_equal = [](const std::initializer_list<enum ggml_op> & list1,
const std::initializer_list<enum ggml_op> & list2) {
return std::equal(list1.begin(), list1.end(), list2.begin(), list2.end());
};
if (is_equal(topk_moe_ops_with_norm, ops) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}
if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}
if (is_equal(topk_moe_ops_delayed_softmax, ops) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}
std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
@@ -3398,35 +3501,75 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
// start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
ggml_cuda_topk_moe_args args;
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i + 9];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_tensor * clamp = cgraph->nodes[i + 7];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
/*delayed softmax*/ false, clamp);
i += 9;
continue;
}
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
ggml_tensor * weights = cgraph->nodes[i + 4];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
/*delayed softmax*/ false);
i += 4;
continue;
}
std::vector<ggml_op> ops;
if (ggml_cuda_can_fuse(cgraph, i,
ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i + 5];
ggml_tensor * ids = cgraph->nodes[i + 1];
if (can_fuse) {
const ggml_tensor * logits = node->src[0];
ggml_tensor * weights = nullptr;
ggml_tensor * ids = nullptr;
const ggml_tensor * bias = nullptr;
const ggml_tensor * clamp = nullptr;
const ggml_tensor * scale = nullptr;
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
/*delayed_softmax*/ true);
i += 5;
continue;
if (!args.delayed_softmax) {
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
int out_nodes[2]; // nodes which can't be elided
if (args.prob_bias) {
bias = cgraph->nodes[i + 2]->src[1];
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS });
out_nodes[0] = i + 4;
ids = cgraph->nodes[i + 4];
} else {
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS });
out_nodes[0] = i + 3;
ids = cgraph->nodes[i + 3];
}
if (args.norm) {
ops.insert(ops.end(), { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
GGML_OP_DIV, GGML_OP_RESHAPE });
clamp = cgraph->nodes[i + ops.size() - 3];
}
if (args.scale) {
ops.insert(ops.end(), { GGML_OP_SCALE });
scale = cgraph->nodes[i + ops.size() - 1];
}
weights = cgraph->nodes[i + ops.size() - 1];
out_nodes[1] = i + ops.size() - 1;
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(node, logits, weights, ids)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
i += ops.size() - 1;
continue;
}
} else if (!args.norm && !args.prob_bias) {
// special case gpt-oss: no norm, no bias.
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
weights = cgraph->nodes[i + 5];
ids = cgraph->nodes[i + 1];
const ggml_tensor * softmax = cgraph->nodes[i + 4];
int out_nodes[2] = { i + 1, i + 5 };
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
i += ops.size() - 1;
continue;
}
}
}
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
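For reference, the top-k MoE subgraphs assembled in the hunk above match the following shapes (a schematic summary of the code, with optional stages in brackets):
// softmax- or sigmoid-gated routing:
//   SOFT_MAX|UNARY(SIGMOID) -> RESHAPE [-> ADD(bias)] -> ARGSORT -> VIEW -> GET_ROWS
//     [-> RESHAPE -> SUM_ROWS -> CLAMP -> DIV -> RESHAPE] [-> SCALE]
// delayed softmax (gpt-oss: softmax applied after top-k; no bias/norm):
//   ARGSORT -> VIEW -> GET_ROWS -> RESHAPE -> SOFT_MAX -> RESHAPE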

View File

@@ -5,6 +5,13 @@
#include <cmath>
#include <initializer_list>
// Kernel config struct - passed by value to CUDA kernel
struct topk_moe_config {
bool use_sigmoid;
bool with_norm;
bool delayed_softmax;
};
// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
template <int experts_per_thread, bool use_limit>
__device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
@@ -50,6 +57,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
}
}
template <int experts_per_thread, bool use_limit>
__device__ void sigmoid_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const int idx = lane + i * WARP_SIZE;
const bool active = !use_limit || (idx < limit);
vals[i] = active ? 1.f / (1.f + expf(-vals[i])) : -INFINITY;
}
}
/*
This kernel does the following:
1. optionally softmax over the logits per token [n_experts, n_tokens]
@@ -59,13 +76,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
*/
template <int n_experts, bool with_norm, bool delayed_softmax = false>
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
float * weights,
int32_t * ids,
const int n_rows,
const int n_expert_used,
const float clamp_val) {
template <int n_experts, bool has_bias>
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
float * weights,
int32_t * ids,
float * bias,
const int n_rows,
const int n_expert_used,
const float clamp_val,
const float scale_val,
const topk_moe_config config) {
const int row = blockIdx.x * blockDim.y + threadIdx.y;
if (row >= n_rows) {
return;
@@ -79,14 +99,41 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
float wt[experts_per_thread];
// Initialize all slots to -INFINITY
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
wt[i] = -INFINITY;
}
#pragma unroll
for (int i = 0; i < n_experts; i += WARP_SIZE) {
const int expert = i + threadIdx.x;
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[expert] : -INFINITY;
}
if constexpr (!delayed_softmax) {
softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
if (!config.delayed_softmax) {
if (config.use_sigmoid) {
sigmoid_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
} else {
softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
}
}
// selection_wt is only needed when bias is present (selection uses wt + bias)
// when no bias, we use wt directly for both selection and weight values
float selection_wt[has_bias ? experts_per_thread : 1];
if constexpr (has_bias) {
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
selection_wt[i] = -INFINITY;
}
#pragma unroll
for (int i = 0; i < n_experts; i += WARP_SIZE) {
const int expert = i + threadIdx.x;
selection_wt[i / WARP_SIZE] =
(n_experts % WARP_SIZE == 0 || expert < n_experts) ? wt[i / WARP_SIZE] + bias[expert] : -INFINITY;
}
}
// at this point, each thread holds either a portion of the softmax distribution
@@ -106,22 +153,56 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
float max_val = wt[0];
int max_expert = threadIdx.x;
#pragma unroll
for (int i = 1; i < experts_per_thread; i++) {
const int expert = threadIdx.x + i * WARP_SIZE;
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
max_val = wt[i];
max_expert = expert;
}
}
if constexpr (has_bias) {
float max_val_s = selection_wt[0];
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
if (val > max_val || (val == max_val && expert < max_expert)) {
max_val = val;
max_expert = expert;
for (int i = 1; i < experts_per_thread; i++) {
const int expert = threadIdx.x + i * WARP_SIZE;
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_wt[i] > max_val_s) {
max_val = wt[i];
max_val_s = selection_wt[i];
max_expert = expert;
}
}
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
const float val_s = __shfl_xor_sync(0xFFFFFFFF, max_val_s, mask, WARP_SIZE);
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
max_val = val;
max_val_s = val_s;
max_expert = expert;
}
}
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
selection_wt[max_expert / WARP_SIZE] = -INFINITY;
}
} else {
#pragma unroll
for (int i = 1; i < experts_per_thread; i++) {
const int expert = threadIdx.x + i * WARP_SIZE;
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
max_val = wt[i];
max_expert = expert;
}
}
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
if (val > max_val || (val == max_val && expert < max_expert)) {
max_val = val;
max_expert = expert;
}
}
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
wt[max_expert / WARP_SIZE] = -INFINITY;
}
}
@@ -130,16 +211,14 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
}
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
wt[max_expert / WARP_SIZE] = -INFINITY;
ids[k] = max_expert;
if constexpr (with_norm) {
if (config.with_norm) {
wt_sum += max_val;
}
}
}
if constexpr (with_norm) {
if (config.with_norm) {
wt_sum = warp_reduce_sum(wt_sum);
wt_sum = max(wt_sum, clamp_val);
const float inv_sum = 1.0f / wt_sum;
@@ -149,7 +228,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
}
}
if constexpr (delayed_softmax) {
if (config.delayed_softmax) {
softmax_warp_inplace<experts_per_thread, true>(output_weights, n_expert_used, threadIdx.x);
}
@@ -157,25 +236,25 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
for (int i = 0; i < experts_per_thread; i++) {
const int idx = i * WARP_SIZE + threadIdx.x;
if (idx < n_expert_used) {
weights[idx] = output_weights[i];
weights[idx] = output_weights[i] * scale_val;
}
}
if (!with_norm) {
GGML_UNUSED(clamp_val);
}
}
template <bool with_norm, bool delayed_softmax = false>
template<bool has_bias>
static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
const float * logits,
float * weights,
int32_t * ids,
float * bias,
const int n_rows,
const int n_expert,
const int n_expert_used,
const float clamp_val) {
static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
const float clamp_val,
const float scale_val,
const topk_moe_config config) {
GGML_ASSERT(!(config.with_norm && config.delayed_softmax) &&
"delayed softmax is not supported with weight normalization");
const int rows_per_block = 4;
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
@@ -183,44 +262,48 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
switch (n_expert) {
case 1:
topk_moe_cuda<1, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<1, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 2:
topk_moe_cuda<2, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<2, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 4:
topk_moe_cuda<4, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<4, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 8:
topk_moe_cuda<8, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<8, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 16:
topk_moe_cuda<16, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<16, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 32:
topk_moe_cuda<32, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<32, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 64:
topk_moe_cuda<64, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<64, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 128:
topk_moe_cuda<128, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<128, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 256:
topk_moe_cuda<256, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<256, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 512:
topk_moe_cuda<512, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
topk_moe_cuda<512, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
case 576:
topk_moe_cuda<576, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
clamp_val, scale_val, config);
break;
default:
GGML_ASSERT(false && "fatal error");
@@ -228,13 +311,14 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
}
}
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax,
ggml_tensor * clamp) {
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const ggml_tensor * clamp,
const ggml_tensor * scale,
const ggml_tensor * bias,
const ggml_cuda_topk_moe_args & args) {
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -245,107 +329,75 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const float * logits_d = (const float *) logits->data;
float * weights_d = (float *) weights->data;
int32_t * ids_d = (int32_t *) ids->data;
float * bias_d = bias ? (float *) bias->data : nullptr;
float scale_val = scale ? ggml_get_op_params_f32(scale, 0) : 1.0f;
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
const int n_expert_used = weights->ne[1];
const bool with_norm = clamp != nullptr;
float clamp_val = -INFINITY;
if (with_norm) {
if (clamp) {
clamp_val = ggml_get_op_params_f32(clamp, 0);
}
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
if (clamp) {
clamp_val = ggml_get_op_params_f32(clamp, 0);
}
topk_moe_config config;
config.use_sigmoid = args.sigmoid;
config.with_norm = with_norm;
config.delayed_softmax = args.delayed_softmax;
if (bias) {
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
scale_val, config);
} else {
GGML_ASSERT(clamp == nullptr);
if (delayed_softmax) {
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
} else {
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
}
launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
scale_val, config);
}
}
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
const ggml_tensor * weights,
const ggml_tensor * get_rows,
const ggml_tensor * argsort,
const ggml_tensor * clamp,
int n_expert) {
ggml_tensor * probs = get_rows->src[0];
if (probs->op != GGML_OP_RESHAPE) {
return false;
}
probs = probs->src[0];
ggml_tensor * selection_probs = argsort->src[0];
if (probs != selection_probs) {
const ggml_tensor * logits,
const ggml_tensor * ids) {
const int n_expert = ids->nb[1] / ids->nb[0];
if (((n_expert & (n_expert - 1)) != 0 || n_expert > 512) && n_expert != 576) {
return false;
}
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
if (!ggml_is_contiguous(weights) || !ggml_is_contiguous(logits)) {
return false;
}
if (scale != 1.0f || max_bias != 0.0f) {
return false;
}
if (gating_op->op == GGML_OP_SOFT_MAX) {
const ggml_tensor * softmax = gating_op;
float scale = 1.0f;
float max_bias = 0.0f;
// don't fuse when masks or sinks are present
if (softmax->src[1] || softmax->src[2]) {
return false;
}
memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
// n_expert must be a power of 2
if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
return false;
}
if (clamp) {
if (clamp->op != GGML_OP_CLAMP) {
if (!ggml_is_contiguous(softmax->src[0])) {
return false;
}
float max_val = ggml_get_op_params_f32(clamp, 1);
if (max_val != INFINITY) {
if (scale != 1.0f || max_bias != 0.0f) {
return false;
}
// don't fuse when masks or sinks are present
if (softmax->src[1] || softmax->src[2]) {
return false;
}
} else if (gating_op->op == GGML_OP_UNARY) {
ggml_unary_op op = ggml_get_unary_op(gating_op);
if (op != GGML_UNARY_OP_SIGMOID) {
return false;
}
}
return true;
}
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
GGML_OP_RESHAPE };
static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS };
static std::initializer_list<enum ggml_op> delayed_softmax_ops = { GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
GGML_ASSERT(!norm || !delayed_softmax);
if (delayed_softmax) {
return delayed_softmax_ops;
}
if (norm) {
return norm_ops;
}
return no_norm_ops;
}
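To pin down what the fused kernel computes for one row of logits, here is a small host-side reference. It is a sketch under stated assumptions: std::partial_sort with an index tie-break stands in for the kernel's k sequential warp-wide argmax reductions, and the delayed-softmax path (softmax over the selected experts after top-k) is omitted.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>
// Reference semantics for one row of logits (illustrative, not the kernel).
static void topk_moe_reference(const std::vector<float> & logits,
                               const float * bias, // optional selection bias (may be null)
                               int n_expert_used, bool use_sigmoid, bool with_norm,
                               float clamp_val, float scale_val,
                               std::vector<float> & weights, std::vector<int> & ids) {
    const int n_experts = (int) logits.size();
    std::vector<float> wt(n_experts);
    if (use_sigmoid) {
        for (int i = 0; i < n_experts; i++) {
            wt[i] = 1.0f / (1.0f + std::exp(-logits[i]));
        }
    } else { // softmax over all experts
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int i = 0; i < n_experts; i++) {
            wt[i] = std::exp(logits[i] - mx);
            sum += wt[i];
        }
        for (float & w : wt) { w /= sum; }
    }
    // selection ranks wt + bias when a bias is present, wt otherwise;
    // ties break toward the lower expert index, like the warp reduction
    std::vector<int> order(n_experts);
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + n_expert_used, order.end(),
        [&](int a, int b) {
            const float sa = bias ? wt[a] + bias[a] : wt[a];
            const float sb = bias ? wt[b] + bias[b] : wt[b];
            return sa > sb || (sa == sb && a < b);
        });
    ids.assign(order.begin(), order.begin() + n_expert_used);
    weights.resize(n_expert_used);
    float wt_sum = 0.0f;
    for (int k = 0; k < n_expert_used; k++) {
        weights[k] = wt[ids[k]]; // output weights always come from wt, not wt + bias
        wt_sum += weights[k];
    }
    if (with_norm) { // normalize, clamping the sum like the fused CLAMP op
        wt_sum = std::max(wt_sum, clamp_val);
        for (float & w : weights) { w /= wt_sum; }
    }
    for (float & w : weights) { w *= scale_val; } // scale_val defaults to 1.0f
}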

View File

@@ -3,19 +3,25 @@
#include <initializer_list>
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax = false,
ggml_tensor * weight_clamp = nullptr);
struct ggml_cuda_topk_moe_args {
bool sigmoid{};
bool softmax{};
bool delayed_softmax{};
bool prob_bias{};
bool norm{};
bool scale{};
};
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const ggml_tensor * clamp,
const ggml_tensor * scale,
const ggml_tensor * bias,
const ggml_cuda_topk_moe_args & args);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
const ggml_tensor * weights,
const ggml_tensor * get_rows,
const ggml_tensor * argsort,
const ggml_tensor * clamp,
int n_expert);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
const ggml_tensor * logits,
const ggml_tensor * ids);

View File

@@ -15,7 +15,6 @@
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <syclcompat/math.hpp>
#include <map>
#ifdef GGML_SYCL_USE_INTEL_ONEMKL

View File

@@ -4606,14 +4606,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
#endif
case GGML_OP_NORM:
return true;
case GGML_OP_L2_NORM:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_RMS_NORM:
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
return true;
case GGML_OP_RMS_NORM_BACK:
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SCALE:
return true;
case GGML_OP_CONT:

View File

@@ -251,7 +251,6 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
const float eps, queue_ptr stream, int device) {
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
@@ -334,7 +333,6 @@ static void group_norm_f32_sycl(const float* x, float* dst,
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
@@ -374,7 +372,6 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
const int nrows, const float eps,
queue_ptr stream, int device) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);

View File

@@ -0,0 +1,70 @@
cmake_minimum_required(VERSION 3.19)
cmake_policy(SET CMP0114 NEW)
include(ExternalProject)
message(STATUS "Including the VirtGPU/Virglrenderer API Remoting")
# Download venus_hw.h from virglrenderer repository
ExternalProject_Add(
venus_hw_header
URL https://gitlab.freedesktop.org/virgl/virglrenderer/-/raw/virglrenderer-1.2.0/src/venus_hw.h
DOWNLOAD_NO_EXTRACT YES
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include
DOWNLOAD_NAME venus_hw.h
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
LOG_DOWNLOAD ON
)
if (NOT GGML_VIRTGPU_BACKEND STREQUAL "ONLY")
message(STATUS "Enable the VirtGPU/Virglrenderer API Remoting frontend library")
find_package(PkgConfig REQUIRED)
pkg_check_modules(DRM REQUIRED libdrm)
if (NOT GGML_BACKEND_DL)
# cannot simply use USE_VIRTGPU, as in the 'else()' case the
# frontend isn't compiled
target_compile_definitions(ggml PUBLIC "GGML_USE_VIRTGPU_FRONTEND")
endif()
ggml_add_backend_library(ggml-virtgpu
ggml-backend-buffer.cpp
ggml-backend.cpp
ggml-backend-device.cpp
ggml-backend-reg.cpp
ggml-backend-buffer-type.cpp
virtgpu-apir.h
virtgpu-forward.gen.h
virtgpu.cpp
virtgpu-shm.cpp
virtgpu-utils.cpp
virtgpu-forward-device.cpp
virtgpu-forward-buffer-type.cpp
virtgpu-forward-buffer.cpp
virtgpu-forward-backend.cpp
virtgpu-forward-impl.h
apir_cs_ggml-rpc-front.cpp
../../include/ggml-virtgpu.h)
target_include_directories(ggml-virtgpu PUBLIC /usr/include/libdrm/)
target_link_libraries(ggml-virtgpu PUBLIC ${DRM_LIBRARIES})
target_include_directories(ggml-virtgpu PUBLIC ${DRM_INCLUDE_DIRS})
target_compile_options(ggml-virtgpu PUBLIC ${DRM_CFLAGS_OTHER})
target_include_directories(ggml-virtgpu PUBLIC ./include)
target_include_directories(ggml-virtgpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
# Ensure venus_hw.h is downloaded before building ggml-virtgpu
add_dependencies(ggml-virtgpu venus_hw_header)
target_compile_options(ggml-virtgpu PRIVATE -std=c++20)
else()
message(STATUS "Not building the VirtGPU/Virglrenderer API Remoting frontend library")
endif()
if (NOT GGML_VIRTGPU_BACKEND STREQUAL "OFF")
add_subdirectory("backend")
endif()

View File

@@ -0,0 +1,87 @@
#include "backend/shared/apir_cs_rpc.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-remoting.h"
#include <cinttypes>
#include <unordered_map>
#include <unordered_set>
#include <vector>
apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) {
apir_rpc_tensor result;
result.id = reinterpret_cast<uint64_t>(tensor);
result.type = tensor->type;
if (tensor->buffer) {
ggml_backend_buffer_t buffer = tensor->buffer;
result.buffer = BUFFER_TO_HOST_HANDLE(buffer);
} else {
result.buffer = 0;
}
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
result.ne[i] = tensor->ne[i];
result.nb[i] = tensor->nb[i];
}
result.op = tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result.op_params[i] = tensor->op_params[i];
}
result.flags = tensor->flags;
for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
}
result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
result.view_offs = tensor->view_offs;
result.data = reinterpret_cast<uint64_t>(tensor->data);
if (tensor->data) {
if (!tensor->buffer) {
GGML_ABORT("tensor has data but not buffer");
}
// tensor->data is serialized as an offset to the buffer base address
result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
}
snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
return result;
}
void apir_add_tensor(ggml_tensor * tensor,
std::vector<apir_rpc_tensor> & tensors,
std::unordered_set<ggml_tensor *> & visited) {
if (tensor == nullptr) {
return;
}
if (visited.find(tensor) != visited.end()) {
return;
}
visited.insert(tensor);
for (int i = 0; i < GGML_MAX_SRC; i++) {
apir_add_tensor(tensor->src[i], tensors, visited);
}
apir_add_tensor(tensor->view_src, tensors, visited);
tensors.push_back(apir_serialize_tensor(tensor));
}
void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
uint32_t n_nodes = cgraph->n_nodes;
std::vector<apir_rpc_tensor> tensors;
std::unordered_set<ggml_tensor *> visited;
for (uint32_t i = 0; i < n_nodes; i++) {
apir_add_tensor(cgraph->nodes[i], tensors, visited);
}
// serialization format:
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(apir_rpc_tensor)) |
uint32_t n_tensors = tensors.size();
int output_size =
sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(apir_rpc_tensor);
output.resize(output_size, 0);
memcpy(output.data(), &n_nodes, sizeof(n_nodes));
for (uint32_t i = 0; i < n_nodes; i++) {
memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
}
uint32_t * out_ntensors = (uint32_t *) (output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
*out_ntensors = n_tensors;
apir_rpc_tensor * out_tensors =
(apir_rpc_tensor *) (output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(apir_rpc_tensor));
}
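As a reading aid, here is a sketch of decoding that header on the receiving side; the helper name is hypothetical, and the backend-side apir_deserialize_graph further down walks the same layout.
#include <cstring>
// Hypothetical helper, assuming the serialization format documented above.
static void apir_read_graph_header(const uint8_t * data,
                                   uint32_t & n_nodes, const uint64_t *& nodes,
                                   uint32_t & n_tensors, const apir_rpc_tensor *& tensors) {
    memcpy(&n_nodes, data, sizeof(n_nodes));
    nodes = (const uint64_t *) (data + sizeof(uint32_t));
    // node ids are 8-byte values at a 4-byte offset, so individual elements
    // should be read with memcpy (as apir_deserialize_graph does), not dereferenced
    memcpy(&n_tensors, (const uint8_t *) (nodes + n_nodes), sizeof(n_tensors));
    tensors = (const apir_rpc_tensor *) ((const uint8_t *) (nodes + n_nodes) + sizeof(uint32_t));
}
The four outputs are exactly the arguments apir_deserialize_graph expects.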

View File

@@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.19)
cmake_policy(SET CMP0114 NEW)
message(STATUS "Enable the VirtGPU/Virglrenderer backend library")
ggml_add_backend_library(ggml-virtgpu-backend
backend.cpp
backend-dispatched.cpp
backend-dispatched-backend.cpp
backend-dispatched-device.cpp
backend-dispatched-buffer.cpp
backend-dispatched-buffer-type.cpp
shared/api_remoting.h
shared/apir_backend.h
shared/apir_cs.h
apir_cs_ggml-rpc-back.cpp)
target_compile_options(ggml-virtgpu-backend PRIVATE -std=c++20)
# Add include directory for ggml-backend-impl.h and other core headers
target_include_directories(ggml-virtgpu-backend PRIVATE ../..)

View File

@@ -0,0 +1,115 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "shared/apir_cs_rpc.h"
#include <cinttypes>
#include <unordered_map>
#include <unordered_set>
#include <vector>
std::unordered_set<ggml_backend_buffer_t> backend_buffers;
void apir_track_backend_buffer(ggml_backend_buffer_t buffer) {
backend_buffers.insert(buffer);
}
bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer) {
auto it = backend_buffers.find(buffer);
if (it == backend_buffers.end()) {
return false;
}
backend_buffers.erase(it);
return true;
}
std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers() {
return backend_buffers;
}
ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor) {
ggml_tensor * result =
ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
result->nb[i] = tensor->nb[i];
}
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
printf("WARNING: HOST BUFFER NOT FOUND | %p\n", (void *) result->buffer);
result->buffer = nullptr;
}
uint64_t tensor_data = tensor->data;
if (result->buffer) {
// require that the tensor data does not go beyond the buffer end
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
// tensor->data is serialized as an offset to the buffer base address
tensor_data += buffer_start;
GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow
GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size);
}
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result->op_params[i] = tensor->op_params[i];
}
result->flags = tensor->flags;
result->data = reinterpret_cast<void *>(tensor_data);
ggml_set_name(result, tensor->name);
return result;
}
ggml_tensor * apir_create_node(uint64_t id,
ggml_context * ctx,
const std::unordered_map<uint64_t, const apir_rpc_tensor *> & tensor_ptrs,
std::unordered_map<uint64_t, ggml_tensor *> & tensor_map) {
if (id == 0) {
return nullptr;
}
if (tensor_map.find(id) != tensor_map.end()) {
return tensor_map[id];
}
const apir_rpc_tensor * tensor = tensor_ptrs.at(id);
ggml_tensor * result = apir_deserialize_tensor(ctx, tensor);
if (result == nullptr) {
return nullptr;
}
tensor_map[id] = result;
for (int i = 0; i < GGML_MAX_SRC; i++) {
result->src[i] = apir_create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
}
result->view_src = apir_create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
result->view_offs = tensor->view_offs;
return result;
}
ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes,
uint32_t n_tensors,
const apir_rpc_tensor * tensors,
const uint64_t * nodes) {
size_t buf_size = ggml_tensor_overhead() * (n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/true,
};
ggml_context * ctx = ggml_init(params);
ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
graph->n_nodes = n_nodes;
std::unordered_map<uint64_t, const apir_rpc_tensor *> tensor_ptrs;
for (uint32_t i = 0; i < n_tensors; i++) {
tensor_ptrs[tensors[i].id] = &tensors[i];
}
std::unordered_map<uint64_t, ggml_tensor *> tensor_map;
for (uint32_t i = 0; i < n_nodes; i++) {
int64_t id;
memcpy(&id, &nodes[i], sizeof(id));
graph->nodes[i] = apir_create_node(id, ctx, tensor_ptrs, tensor_map);
}
return graph;
}

View File

@@ -0,0 +1,13 @@
#include "shared/apir_backend.h"
#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
// in the backend, the buffer handle is the buffer pointer
return (apir_buffer_host_handle_t) buffer;
}
static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
// in the backend, the buffer-type handle is the buffer-type pointer
return (apir_buffer_type_host_handle_t) buft;
}
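Because both handles are just the backend-side pointers, decoding is the inverse cast. A minimal sketch (the helper name is an assumption; handles are only ever dereferenced inside the backend process):
static inline ggml_backend_buffer_t apir_handle_to_ggml_buffer(apir_buffer_host_handle_t handle) {
    // inverse of ggml_buffer_to_apir_handle
    return (ggml_backend_buffer_t) handle;
}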

View File

@@ -0,0 +1,65 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "shared/apir_backend.h"
#include <cstdint>
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
static bool async_backend_initialized = false;
static bool async_backend;
if (!async_backend_initialized) {
ggml_backend_dev_props props;
dev->iface.get_props(dev, &props);
async_backend = props.caps.async;
async_backend_initialized = true;
}
uint32_t shmem_res_id;
apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
apir_decoder_set_fatal(dec);
return 1;
}
size_t cgraph_size;
apir_decode_size_t(dec, &cgraph_size);
apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);
ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);
ggml_status status;
#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
for (int idx = 0; idx < cgraph->n_nodes; idx++) {
ggml_tensor * op = ggml_graph_node(cgraph, idx);
if (dev->iface.supports_op(dev, op)) {
continue;
}
GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
status = GGML_STATUS_ABORTED;
apir_encode_ggml_status(enc, &status);
return 0;
}
#endif
status = bck->iface.graph_compute(bck, cgraph);
if (async_backend) {
bck->iface.synchronize(bck);
}
apir_encode_ggml_status(enc, &status);
return 0;
}

View File

@@ -0,0 +1,89 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <cstdint>
uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
const char * string = buft->iface.get_name(buft);
const size_t string_size = strlen(string) + 1;
apir_encode_array_size(enc, string_size);
apir_encode_char_array(enc, string, string_size);
return 0;
}
uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
size_t value = buft->iface.get_alignment(buft);
apir_encode_size_t(enc, &value);
return 0;
}
uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
size_t value = buft->iface.get_max_size(buft);
apir_encode_size_t(enc, &value);
return 0;
}
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
bool is_host = buft->iface.is_host(buft);
apir_encode_bool_t(enc, &is_host);
return 0;
}
uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
size_t size;
apir_decode_size_t(dec, &size);
ggml_backend_buffer_t buffer;
buffer = buft->iface.alloc_buffer(buft, size);
apir_encode_ggml_buffer(enc, buffer);
if (buffer) {
apir_track_backend_buffer(buffer);
}
return 0;
}
uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
size_t value = buft->iface.get_alloc_size(buft, op);
apir_encode_size_t(enc, &value);
return 0;
}

View File

@@ -0,0 +1,131 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <cstdint>
uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
apir_encode_uintptr_t(enc, &base);
return 0;
}
uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
uint32_t shmem_res_id;
apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
size_t offset;
apir_decode_size_t(dec, &offset);
size_t size;
apir_decode_size_t(dec, &size);
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
return 1;
}
buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size);
return 0;
}
uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
const ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = apir_decode_ggml_tensor(dec);
uint32_t shmem_res_id;
apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
size_t offset;
apir_decode_size_t(dec, &offset);
size_t size;
apir_decode_size_t(dec, &size);
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
return 1;
}
buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size);
return 0;
}
uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
const ggml_tensor * src;
// safe to remove the const qualifier here
src = apir_decode_ggml_tensor(dec);
ggml_tensor * dst = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
bool ret = buffer->iface.cpy_tensor(buffer, src, (ggml_tensor *) dst);
apir_encode_bool_t(enc, &ret);
return 0;
}
uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
uint8_t value;
apir_decode_uint8_t(dec, &value);
buffer->iface.clear(buffer, value);
return 0;
}
uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!apir_untrack_backend_buffer(buffer)) {
GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer);
return 1;
}
buffer->iface.free_buffer(buffer);
return 0;
}

View File

@@ -0,0 +1,148 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <cstdint>
uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
int32_t dev_count = reg->iface.get_device_count(reg);
apir_encode_int32_t(enc, &dev_count);
return 0;
}
uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
int32_t dev_count = reg->iface.get_device_count(reg);
apir_encode_int32_t(enc, &dev_count);
return 0;
}
uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
const char * string = dev->iface.get_name(dev);
const size_t string_size = strlen(string) + 1;
apir_encode_array_size(enc, string_size);
apir_encode_char_array(enc, string, string_size);
return 0;
}
uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
const char * string = dev->iface.get_description(dev);
const size_t string_size = strlen(string) + 1;
apir_encode_array_size(enc, string_size);
apir_encode_char_array(enc, string, string_size);
return 0;
}
uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
uint32_t type = dev->iface.get_type(dev);
apir_encode_uint32_t(enc, &type);
return 0;
}
uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
size_t free, total;
dev->iface.get_memory(dev, &free, &total);
apir_encode_size_t(enc, &free);
apir_encode_size_t(enc, &total);
return 0;
}
uint32_t backend_device_supports_op(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
bool supports_op = dev->iface.supports_op(dev, op);
apir_encode_bool_t(enc, &supports_op);
return 0;
}
uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev);
apir_encode_ggml_buffer_type(enc, bufft);
return 0;
}
uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
ggml_backend_dev_props props;
dev->iface.get_props(dev, &props);
apir_encode_bool_t(enc, &props.caps.async);
apir_encode_bool_t(enc, &props.caps.host_buffer);
apir_encode_bool_t(enc, &props.caps.buffer_from_host_ptr);
apir_encode_bool_t(enc, &props.caps.events);
return 0;
}
uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(dec);
uint32_t shmem_res_id;
apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_ptr) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
apir_decoder_set_fatal(dec);
return 1;
}
size_t size;
apir_decode_size_t(dec, &size);
size_t max_tensor_size;
apir_decode_size_t(dec, &max_tensor_size);
ggml_backend_buffer_t buffer;
buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size);
apir_encode_ggml_buffer(enc, buffer);
apir_encode_ggml_buffer_type(enc, buffer ? buffer->buft : nullptr); // guard the dereference: buffer_from_host_ptr may fail (assumes a null buft encodes as a zero handle)
if (buffer) {
apir_track_backend_buffer(buffer);
}
return 0;
}

View File

@@ -0,0 +1,46 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <cstdint>
ggml_backend_reg_t reg = NULL;
ggml_backend_dev_t dev = NULL;
ggml_backend_t bck = NULL;
uint64_t timer_start = 0;
uint64_t timer_total = 0;
uint64_t timer_count = 0;
uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
if (reg != NULL) {
GGML_LOG_WARN("%s: already initialized\n", __func__);
return APIR_BACKEND_INITIALIZE_ALREADY_INITED;
}
ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p;
reg = ggml_backend_reg_fct();
if (reg == NULL) {
GGML_LOG_ERROR("%s: backend registration failed\n", __func__);
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
}
if (!reg->iface.get_device_count(reg)) {
GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
dev = reg->iface.get_device(reg, 0);
if (!dev) {
GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
bck = dev->iface.init_backend(dev, NULL);
return APIR_BACKEND_INITIALIZE_SUCCESS;
}

View File

@@ -0,0 +1,130 @@
#pragma once
/* device */
uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_supports_op(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
/* buffer-type */
uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
/* buffer */
uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
/* backend */
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) {
switch (type) {
/* device */
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
return "backend_device_get_device_count";
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
return "backend_device_get_count";
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
return "backend_device_get_name";
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
return "backend_device_get_description";
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
return "backend_device_get_type";
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
return "backend_device_get_memory";
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
return "backend_device_supports_op";
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
return "backend_device_get_buffer_type";
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
return "backend_device_get_props";
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
return "backend_device_buffer_from_ptr";
/* buffer-type */
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
return "backend_buffer_type_get_name";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
return "backend_buffer_type_get_alignment";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "backend_buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "backend_buffer_type_is_host";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "backend_buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
return "backend_buffer_type_get_alloc_size";
/* buffer */
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
return "backend_buffer_get_base";
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
return "backend_buffer_set_tensor";
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
return "backend_buffer_get_tensor";
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
return "backend_buffer_cpy_tensor";
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
return "backend_buffer_clear";
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
return "backend_buffer_free_buffer";
/* backend */
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
return "backend_backend_graph_compute";
default:
return "unknown";
}
}
extern "C" {
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {
/* device */
/* APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = */ backend_device_get_device_count,
/* APIR_COMMAND_TYPE_DEVICE_GET_COUNT = */ backend_device_get_count,
/* APIR_COMMAND_TYPE_DEVICE_GET_NAME = */ backend_device_get_name,
/* APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = */ backend_device_get_description,
/* APIR_COMMAND_TYPE_DEVICE_GET_TYPE = */ backend_device_get_type,
/* APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = */ backend_device_get_memory,
/* APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = */ backend_device_supports_op,
/* APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = */ backend_device_get_buffer_type,
/* APIR_COMMAND_TYPE_DEVICE_GET_PROPS = */ backend_device_get_props,
/* APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = */ backend_device_buffer_from_ptr,
/* buffer-type */
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size,
/* buffer */
/* APIR_COMMAND_TYPE_BUFFER_GET_BASE = */ backend_buffer_get_base,
/* APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = */ backend_buffer_set_tensor,
/* APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = */ backend_buffer_get_tensor,
/* APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = */ backend_buffer_cpy_tensor,
/* APIR_COMMAND_TYPE_BUFFER_CLEAR = */ backend_buffer_clear,
/* APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = */ backend_buffer_free_buffer,
/* backend */
/* APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = */ backend_backend_graph_compute,
};
}
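A minimal sketch of how the table is meant to be consumed; the wrapper below is an assumption, since apir_backend_dispatcher's body is not part of this diff.
static uint32_t dispatch_one(ApirBackendCommandType cmd_type, apir_encoder * enc,
                             apir_decoder * dec, virgl_apir_context * ctx) {
    if ((uint32_t) cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
        GGML_LOG_ERROR("%s: invalid command type %d\n", __func__, (int) cmd_type);
        return 1;
    }
    // backend_dispatch_command_name(cmd_type) yields a readable name for tracing
    return apir_backend_dispatch_table[cmd_type](enc, dec, ctx);
}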

View File

@@ -0,0 +1,23 @@
#pragma once
#include <cstdint>
#include <cstddef>
#include <ggml-backend.h>
#include "backend-convert.h"
#include "backend-virgl-apir.h"
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
#include "shared/apir_cs_ggml.h"
struct virgl_apir_context {
uint32_t ctx_id;
virgl_apir_callbacks * iface;
};
typedef uint32_t (*backend_dispatch_t)(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
#include "backend-dispatched.gen.h"
uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p);

View File

@@ -0,0 +1,32 @@
#pragma once
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "shared/api_remoting.h"
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
extern ggml_backend_reg_t reg;
extern ggml_backend_dev_t dev;
extern ggml_backend_t bck;
struct virgl_apir_callbacks {
const char * (*get_config)(uint32_t virgl_ctx_id, const char * key);
void * (*get_shmem_ptr)(uint32_t virgl_ctx_id, uint32_t res_id);
};
extern "C" {
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs);
void apir_backend_deinit(uint32_t virgl_ctx_id);
uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_callbacks * virgl_cbs,
uint32_t cmd_type,
char * dec_cur,
const char * dec_end,
char * enc_cur,
const char * enc_end,
char ** enc_cur_after);
}

View File

@@ -0,0 +1,148 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "shared/api_remoting.h"
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
#include <dlfcn.h>
#include <ggml-backend.h>
#include <iostream>
#define APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH"
#define APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG"
#define APIR_LLAMA_CPP_LOG_TO_FILE_ENV "APIR_LLAMA_CPP_LOG_TO_FILE"
#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init"
static void * backend_library_handle = NULL;
static FILE * apir_logfile = NULL;
static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) {
FILE * logfile = (FILE *)user_data;
fprintf(logfile, "[%d] %s", level, text);
fflush(logfile);
}
extern "C" {
void apir_backend_deinit(uint32_t virgl_ctx_id) {
GGML_UNUSED(virgl_ctx_id);
auto buffers = apir_get_track_backend_buffers();
for (const auto & buffer : buffers) {
apir_untrack_backend_buffer(buffer);
buffer->iface.free_buffer(buffer);
}
if (dev) {
size_t free, total;
dev->iface.get_memory(dev, &free, &total);
GGML_LOG_INFO("%s: free memory: %ld MB\n", __func__, (size_t) free / 1024 / 1024);
}
if (backend_library_handle) {
GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__);
dlclose(backend_library_handle);
backend_library_handle = NULL;
}
if (apir_logfile) {
fclose(apir_logfile);
apir_logfile = NULL;
}
}
#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path"
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) {
const char * dlsym_error;
const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
if (apir_log_to_file) {
apir_logfile = fopen(apir_log_to_file, "w");
if (apir_logfile) {
ggml_log_set(log_to_file_callback, apir_logfile);
} else {
GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file);
}
}
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY);
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
if (!library_name) {
GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
backend_library_handle = dlopen(library_name, RTLD_LAZY);
if (!backend_library_handle) {
GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror());
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
}
if (!library_reg) {
GGML_LOG_ERROR("cannot register the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
dlsym_error = dlerror();
if (dlsym_error) {
GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg,
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
}
uint32_t ret = backend_dispatch_initialize(ggml_backend_reg_fct);
return (ApirLoadLibraryReturnCode) (APIR_LOAD_LIBRARY_INIT_BASE_INDEX + ret);
}
uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_callbacks * virgl_cbs,
uint32_t cmd_type,
char * dec_cur,
const char * dec_end,
char * enc_cur,
const char * enc_end,
char ** enc_cur_after) {
apir_encoder enc = {
.cur = enc_cur,
.start = enc_cur,
.end = enc_end,
.fatal = false,
};
apir_decoder dec = {
.cur = dec_cur,
.end = dec_end,
.fatal = false,
};
virgl_apir_context ctx = {
.ctx_id = virgl_ctx_id,
.iface = virgl_cbs,
};
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
GGML_LOG_ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
return APIR_BACKEND_FORWARD_INDEX_INVALID;
}
backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type];
uint32_t ret = forward_fct(&enc, &dec, &ctx);
*enc_cur_after = enc.cur;
return ret;
}
}

View File

@@ -0,0 +1,90 @@
#pragma once
/* the rest of this file must match virglrenderer/src/apir-protocol.h */
#include <unistd.h>
#include <cstdint>
#define APIR_PROTOCOL_MAJOR 0
#define APIR_PROTOCOL_MINOR 1
#define APIR_HANDSHAKE_MAGIC 0xab1e
enum ApirCommandType {
APIR_COMMAND_TYPE_HANDSHAKE = 0,
APIR_COMMAND_TYPE_LOADLIBRARY = 1,
APIR_COMMAND_TYPE_FORWARD = 2,
APIR_COMMAND_TYPE_LENGTH = 3,
};
typedef uint64_t ApirCommandFlags;
enum ApirLoadLibraryReturnCode {
APIR_LOAD_LIBRARY_SUCCESS = 0,
APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is an APIR backend library initialization return code
};
enum ApirForwardReturnCode {
APIR_FORWARD_SUCCESS = 0,
APIR_FORWARD_NO_DISPATCH_FCT = 1,
APIR_FORWARD_TIMEOUT = 2,
APIR_FORWARD_BASE_INDEX = 3, // anything above this is an APIR backend library forward return code
};
__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
switch (type) {
case APIR_COMMAND_TYPE_HANDSHAKE:
return "HandShake";
case APIR_COMMAND_TYPE_LOADLIBRARY:
return "LoadLibrary";
case APIR_COMMAND_TYPE_FORWARD:
return "Forward";
default:
return "unknown";
}
}
__attribute__((unused)) static const char * apir_load_library_error(ApirLoadLibraryReturnCode code) {
#define APIR_LOAD_LIBRARY_ERROR(code_name) \
do { \
if (code == code_name) \
return #code_name; \
} while (0)
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SUCCESS);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ALREADY_LOADED);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ENV_VAR_MISSING);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_CANNOT_OPEN);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SYMBOL_MISSING);
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
return "Unknown APIR_COMMAND_TYPE_LoadLibrary error";
#undef APIR_LOAD_LIBRARY_ERROR
}
__attribute__((unused)) static const char * apir_forward_error(ApirForwardReturnCode code) {
#define APIR_FORWARD_ERROR(code_name) \
do { \
if (code == code_name) \
return #code_name; \
} while (0)
APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
return "Unknown APIR_COMMAND_TYPE_FORWARD error";
#undef APIR_FORWARD_ERROR
}

View File

@@ -0,0 +1,36 @@
typedef enum ApirBackendCommandType {
/* device */
APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = 0,
APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 1,
APIR_COMMAND_TYPE_DEVICE_GET_NAME = 2,
APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 3,
APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 4,
APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 5,
APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 6,
APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 7,
APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 8,
APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 9,
/* buffer-type */
APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 10,
APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 11,
APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 12,
APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 13,
APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 14,
APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = 15,
/* buffer */
APIR_COMMAND_TYPE_BUFFER_GET_BASE = 16,
APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 17,
APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 18,
APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = 19,
APIR_COMMAND_TYPE_BUFFER_CLEAR = 20,
APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 21,
/* backend */
APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 22,
// last command_type index + 1
APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
} ApirBackendCommandType;

View File

@@ -0,0 +1,46 @@
#pragma once
#include "apir_backend.gen.h"
#include <stdint.h> // for uintptr_t
#include <time.h> // for timespec, clock_gettime
#define APIR_BACKEND_INITIALIZE_SUCCESS 0
#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8
// new entries here need to be added to the apir_backend_initialize_error function below
#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
// 0 is fast, 1 prevents the backend from crashing when an unsupported tensor is received
#define APIR_BACKEND_CHECK_SUPPORTS_OP 0
typedef uintptr_t apir_buffer_type_host_handle_t;
typedef uintptr_t apir_buffer_host_handle_t;
static const char * apir_backend_initialize_error(int code) {
#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \
do { \
if (code == code_name) \
return #code_name; \
} while (0)
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);
return "Unknown APIR_BACKEND_INITIALIZE error :/";
#undef APIR_BACKEND_INITIALIZE_ERROR
}

View File

@@ -0,0 +1,383 @@
#pragma once
#include "ggml-impl.h"
#include <cassert>
#include <cstring>
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
struct apir_encoder {
char * cur;
const char * start;
const char * end;
bool fatal;
};
struct apir_decoder {
const char * cur;
const char * end;
bool fatal;
};
/*
* new encoder and decoder
*/
static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
apir_decoder dec = {
.cur = ptr,
.end = ptr + size,
.fatal = false,
};
return dec;
}
static apir_encoder apir_new_encoder(char * ptr, size_t size) {
apir_encoder enc = {
.cur = ptr,
.start = ptr,
.end = ptr + size,
.fatal = false,
};
return enc;
}
/*
* fatal flag handling
*/
static inline void apir_encoder_reset_fatal(apir_encoder * enc) {
enc->fatal = false;
}
static inline void apir_encoder_set_fatal(apir_encoder * enc) {
enc->fatal = true;
}
static inline bool apir_encoder_get_fatal(const apir_encoder * enc) {
return enc->fatal;
}
static inline void apir_decoder_reset_fatal(apir_decoder * dec) {
dec->fatal = false;
}
static inline void apir_decoder_set_fatal(apir_decoder * dec) {
dec->fatal = true;
}
static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
return dec->fatal;
}
/*
* decode peek
*/
static inline bool apir_decoder_peek_internal(apir_decoder * dec,
size_t size,
void * val,
size_t val_size) {
assert(val_size <= size);
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
GGML_LOG_ERROR("reading too much from the decoder ...\n");
apir_decoder_set_fatal(dec);
memset(val, 0, val_size);
return false;
}
/* we should not rely on the compiler to optimize away memcpy... */
memcpy(val, dec->cur, val_size);
return true;
}
static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val, size_t val_size) {
apir_decoder_peek_internal(dec, size, val, val_size);
}
static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) {
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
GGML_LOG_ERROR("reading too much from the decoder ...\n");
apir_decoder_set_fatal(dec);
return NULL;
}
const void * addr = dec->cur;
dec->cur += size;
return addr;
}
/*
* read/write
*/
static inline void apir_decoder_read(apir_decoder * dec, size_t size, void * val, size_t val_size) {
if (apir_decoder_peek_internal(dec, size, val, val_size)) {
dec->cur += size;
}
}
static inline char * apir_encoder_write(apir_encoder * enc, size_t size, const void * val, size_t val_size) {
assert(val_size <= size);
assert(size <= ((size_t) (enc->end - enc->cur)));
char * write_addr = enc->cur;
/* we should not rely on the compiler to optimize away memcpy... */
memcpy(write_addr, val, val_size);
enc->cur += size;
return write_addr;
}
/*
* encode/decode
*/
static inline void apir_decode(apir_decoder * dec, size_t size, void * data, size_t data_size) {
assert(size % 4 == 0);
apir_decoder_read(dec, size, data, data_size);
}
static inline void apir_encode(apir_encoder * enc, size_t size, const void * data, size_t data_size) {
assert(size % 4 == 0);
apir_encoder_write(enc, size, data, data_size);
}
/*
* typed encode/decode
*/
/* uint8_t */
static inline void apir_encode_uint8_t(apir_encoder * enc, const uint8_t * val) {
apir_encode(enc, sizeof(int), val, sizeof(*val));
}
static inline void apir_decode_uint8_t(apir_decoder * dec, uint8_t * val) {
apir_decode(dec, sizeof(int), val, sizeof(*val));
}
/* uint64_t */
static inline void apir_encode_uint64_t(apir_encoder * enc, const uint64_t * val) {
apir_encode(enc, 8, val, sizeof(*val));
}
static inline void apir_decode_uint64_t(apir_decoder * dec, uint64_t * val) {
apir_decode(dec, 8, val, sizeof(*val));
}
static inline void apir_encode_uint64_t_array(apir_encoder * enc, const uint64_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_encode(enc, size, val, size);
}
static inline void apir_decode_uint64_t_array(apir_decoder * dec, uint64_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_decode(dec, size, val, size);
}
static inline const uint64_t * apir_decode_uint64_t_array_inplace(apir_decoder * dec, uint32_t count) {
return (uint64_t *) (uintptr_t) apir_decoder_use_inplace(dec, count * sizeof(uint64_t));
}
/* int32_t */
static inline void apir_encode_int32_t(apir_encoder * enc, const int32_t * val) {
apir_encode(enc, 4, val, sizeof(*val));
}
static inline void apir_decode_int32_t(apir_decoder * dec, int32_t * val) {
apir_decode(dec, 4, val, sizeof(*val));
}
static inline void apir_encode_int32_t_array(apir_encoder * enc, const int32_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_encode(enc, size, val, size);
}
static inline void apir_decode_int32_t_array(apir_decoder * dec, int32_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_decode(dec, size, val, size);
}
/* array size (uint64_t) */
static inline void apir_encode_array_size(apir_encoder * enc, uint64_t size) {
apir_encode_uint64_t(enc, &size);
}
static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expected_size) {
uint64_t size;
apir_decode_uint64_t(dec, &size);
if (size != expected_size) {
GGML_LOG_ERROR("Couldn't decode array from the decoder\n");
apir_decoder_set_fatal(dec);
size = 0;
}
return size;
}
static inline uint64_t apir_decode_array_size_unchecked(apir_decoder * dec) {
uint64_t size;
apir_decode_uint64_t(dec, &size);
return size;
}
/* non-array pointer */
static inline bool apir_encode_simple_pointer(apir_encoder * enc, const void * val) {
apir_encode_array_size(enc, val ? 1 : 0);
return val != NULL;
}
static inline bool apir_decode_simple_pointer(apir_decoder * dec) {
return apir_decode_array_size_unchecked(dec) != 0;
}
/* uint32_t */
static inline void apir_encode_uint32_t(apir_encoder * enc, const uint32_t * val) {
apir_encode(enc, 4, val, sizeof(*val));
}
static inline void apir_decode_uint32_t(apir_decoder * dec, uint32_t * val) {
apir_decode(dec, 4, val, sizeof(*val));
}
static inline void apir_encode_uint32_t_array(apir_encoder * enc, const uint32_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_encode(enc, size, val, size);
}
static inline void apir_decode_uint32_t_array(apir_decoder * dec, uint32_t * val, uint32_t count) {
const size_t size = sizeof(*val) * count;
assert(size >= count);
apir_decode(dec, size, val, size);
}
/* size_t */
static inline void apir_encode_size_t(apir_encoder * enc, const size_t * val) {
const uint64_t tmp = *val;
apir_encode_uint64_t(enc, &tmp);
}
static inline void apir_decode_size_t(apir_decoder * dec, size_t * val) {
uint64_t tmp;
apir_decode_uint64_t(dec, &tmp);
*val = tmp;
}
static inline void apir_encode_size_t_array(apir_encoder * enc, const size_t * val, uint32_t count) {
if (sizeof(size_t) == sizeof(uint64_t)) {
apir_encode_uint64_t_array(enc, (const uint64_t *) val, count);
} else {
for (uint32_t i = 0; i < count; i++) {
apir_encode_size_t(enc, &val[i]);
}
}
}
static inline void apir_decode_size_t_array(apir_decoder * dec, size_t * val, uint32_t count) {
if (sizeof(size_t) == sizeof(uint64_t)) {
apir_decode_uint64_t_array(dec, (uint64_t *) val, count);
} else {
for (uint32_t i = 0; i < count; i++) {
apir_decode_size_t(dec, &val[i]);
}
}
}
/* opaque blob */
static inline void apir_encode_blob_array(apir_encoder * enc, const void * val, size_t size) {
apir_encode(enc, (size + 3) & ~3, val, size);
}
static inline void apir_decode_blob_array(apir_decoder * dec, void * val, size_t size) {
apir_decode(dec, (size + 3) & ~3, val, size);
}
/* string */
static inline void apir_encode_char_array(apir_encoder * enc, const char * val, size_t size) {
assert(size && strlen(val) < size);
apir_encode_blob_array(enc, val, size);
}
static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t size) {
apir_decode_blob_array(dec, val, size);
if (size) {
val[size - 1] = '\0';
} else {
GGML_LOG_ERROR("Couldn't decode the blog array\n");
apir_decoder_set_fatal(dec);
}
}
/* (temp) buffer allocation */
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
size_t alloc_size;
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, count);
return NULL;
}
return malloc(alloc_size);
}
/* bool */
static inline void apir_encode_bool_t(apir_encoder * enc, const bool * val) {
apir_encode(enc, sizeof(int), val, sizeof(bool));
}
static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
apir_decode(dec, sizeof(int), val, sizeof(bool));
}
/* apir_buffer_type_host_handle_t */
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
const apir_buffer_type_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
apir_buffer_type_host_handle_t * val) {
apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
/* apir_buffer_host_handle_t */
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
const apir_buffer_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}
static inline void apir_decode_apir_buffer_host_handle_t(apir_decoder * dec, apir_buffer_host_handle_t * val) {
apir_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}
/* uintptr_t */
static inline void apir_encode_uintptr_t(apir_encoder * enc, const uintptr_t * val) {
apir_encode(enc, sizeof(*val), val, sizeof(*val));
}
static inline void apir_decode_uintptr_t(apir_decoder * dec, uintptr_t * val) {
apir_decode(dec, sizeof(*val), val, sizeof(*val));
}

View File

@@ -0,0 +1,211 @@
#include "ggml-impl.h"
#include "apir_cs.h"
#include "apir_cs_rpc.h"
/* forward declarations */
static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc,
const apir_buffer_host_handle_t * handle);
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
/* apir_rpc_tensor */
static inline void apir_encode_rpc_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) {
size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor);
apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size);
}
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) {
size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor);
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
uint32_t n_tensors) {
size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}
/* ggml_tensor */
static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) {
apir_rpc_tensor serialized = apir_serialize_tensor(tensor);
apir_encode_rpc_tensor(enc, &serialized);
}
static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
ggml_init_params params{
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
return tensor;
}
/* *** ggml_backend_buffer_type_t *** */
// ggml_backend_buffer_type_t is a POINTER (to a struct).
// Only the host pointer is shared between the host and guest.
// The guest stores it in `buft->context`.
// The host simply casts the transmitted value back to the pointer.
static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) {
apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}
static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) {
apir_buffer_type_host_handle_t handle;
apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
return (ggml_backend_buffer_type_t) handle;
}
static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
apir_buffer_type_host_handle_t handle;
apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
return handle;
}
/* *** ggml_backend_buffer_t *** */
// ggml_backend_buffer_t is a POINTER.
// same logic as for ggml_backend_buffer_type_t
static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) {
apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) {
ggml_backend_buffer_t buffer;
size_t buffer_ptr_size = sizeof(buffer);
apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
return buffer;
}
/* enum ggml_status */
static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) {
apir_encoder_write(enc, sizeof(*status), status, sizeof(*status));
}
static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) {
apir_decoder_read(dec, sizeof(*status), status, sizeof(*status));
}
/* virtgpu_shmem */
static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) {
apir_encode_uint32_t(enc, &shmem_res_id);
}
static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) {
apir_decode_uint32_t(dec, shmem_res_id);
}
/* ggml_cgraph */
static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) {
apir_serialize_graph(cgraph, cgraph_data);
return cgraph_data.size();
}
static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) {
size_t cgraph_size = cgraph_data.size();
apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
}
static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) {
GGML_UNUSED(cgraph_size);
uint32_t n_nodes;
apir_decode_uint32_t(dec, &n_nodes);
const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes);
uint32_t n_tensors;
apir_decode_uint32_t(dec, &n_tensors);
const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors);
return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes);
}
static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) {
apir_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle));
}
static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) {
size_t tensor_size = sizeof(*tensor);
if (tensor->extra) {
GGML_ABORT("Cannot pass tensors with extra");
}
if (tensor->src[0] && tensor->buffer) {
static int first = 1;
if (first) {
GGML_LOG_WARN("Cannot pass tensors with src and buffer\n");
first = 0;
}
}
apir_encoder_write(enc, tensor_size, tensor, tensor_size);
// tensor->data is a pointer inside the device buffer. No need to touch it
// tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence.
// (could also make a copy of the tensor, and update locally.)
if (tensor->buffer) {
apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
apir_encode_ggml_buffer_handle(enc, &buffer_handle);
}
if (tensor->view_src) {
apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
}
for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) {
const ggml_tensor * tensor_src = tensor->src[i];
apir_encoder_write(enc, tensor_size, tensor_src, tensor_size);
}
}
static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) {
// it is safe to remove the `const` qualifier here: we *do* want to
// modify the shared memory data to fix the `src` pointers.
ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
// tensor->data is a pointer inside the device buffer. No need to touch it
// tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
if (tensor->buffer) {
tensor->buffer = apir_decode_ggml_buffer(dec);
}
if (tensor->view_src) {
ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
tensor->view_src = tensor_view_src;
}
for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) {
ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
}
return tensor;
}

View File

@@ -0,0 +1,54 @@
#include "ggml.h"
#include "ggml-backend-impl.h"
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <cstdint>
// ggml_tensor is serialized into apir_rpc_tensor
struct apir_rpc_tensor {
uint64_t id;
uint32_t type;
uint64_t buffer;
uint32_t ne[GGML_MAX_DIMS];
uint32_t nb[GGML_MAX_DIMS];
uint32_t op;
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
int32_t flags;
uint64_t src[GGML_MAX_SRC];
uint64_t view_src;
uint64_t view_offs;
uint64_t data;
char name[GGML_MAX_NAME];
char padding[4];
};
/* frontend */
apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor);
void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
/* backend */
void apir_track_backend_buffer(ggml_backend_buffer_t buffer);
bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer);
std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers();
void apir_add_tensor(ggml_tensor * tensor,
std::vector<apir_rpc_tensor> & tensors,
std::unordered_set<ggml_tensor *> & visited);
ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor);
ggml_tensor * apir_create_node(uint64_t id,
ggml_context * ctx,
const std::unordered_map<uint64_t, const apir_rpc_tensor *> & tensor_ptrs,
std::unordered_map<uint64_t, ggml_tensor *> & tensor_map);
ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes,
uint32_t n_tensors,
const apir_rpc_tensor * tensors,
const uint64_t * nodes);

View File

@@ -0,0 +1,98 @@
#include "ggml-remoting.h"
static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) {
virtgpu * gpu = BUFT_TO_GPU(buft);
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
if (!context) {
GGML_ABORT("Couldn't allocate the buffer context ...");
}
context->gpu = gpu;
bool async__unused, host_buffer__unused, events__unused;
bool buffer_from_host_ptr;
apir_device_get_props(gpu, &async__unused, &host_buffer__unused, &buffer_from_host_ptr, &events__unused);
if (buffer_from_host_ptr) {
context->apir_context = apir_device_buffer_from_ptr(gpu, size, size);
context->base = context->apir_context.shmem.mmap_ptr;
context->is_from_ptr = true;
} else {
context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
context->is_from_ptr = false;
context->base = NULL;
}
ggml_backend_buffer_t buffer =
ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size);
return buffer;
}
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
return apir_buffer_type_get_name(gpu, buft);
}
static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
static size_t align = 0;
if (align == 0) {
align = apir_buffer_type_get_alignment(gpu, buft);
}
return align;
}
static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
static size_t max_size = 0;
if (max_size == 0) {
max_size = apir_buffer_type_get_max_size(gpu, buft);
}
return max_size;
}
static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
return apir_buffer_type_is_host(gpu, buft);
}
static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
const ggml_tensor * tensor) {
virtgpu * gpu = BUFT_TO_GPU(buft);
if (tensor->buffer == NULL
|| !tensor->buffer->context
|| !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
return ggml_nbytes(tensor);
}
return apir_buffer_type_get_alloc_size(gpu, buft, tensor);
}
const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
/* .get_name = */ ggml_backend_remoting_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
};
const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = {
/* .get_name = */ ggml_backend_remoting_buffer_type_get_name,
/* .alloc_buffer = */ NULL,
/* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
};

View File

@@ -0,0 +1,119 @@
#include "ggml-remoting.h"
#define BUFFER_TO_GPU(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->gpu
static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) buffer->context;
if (context->base) {
return context->base;
}
context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer));
return context->base;
}
static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size) {
virtgpu * gpu = BUFFER_TO_GPU(buffer);
ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
if (context->is_from_ptr) {
memcpy((char *) tensor->data + offset, data, size);
} else {
apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
}
return;
}
static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size) {
virtgpu * gpu = BUFFER_TO_GPU(buffer);
ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
if (context->is_from_ptr) {
memcpy(data, (const char *) tensor->data + offset, size);
} else {
apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
}
}
static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size) {
UNUSED(buffer);
memcpy((char *) tensor->data + offset, data, size);
return;
}
static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size) {
UNUSED(buffer);
memcpy(data, (const char *) tensor->data + offset, size);
}
static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * src,
ggml_tensor * dst) {
virtgpu * gpu = BUFFER_TO_GPU(buffer);
bool ret = apir_buffer_cpy_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), src, dst);
return ret;
}
static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
virtgpu * gpu = BUFFER_TO_GPU(buffer);
apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value);
return;
}
static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) {
virtgpu * gpu = BUFFER_TO_GPU(buffer);
apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer));
ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer);
free(context);
buffer->context = NULL;
}
const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = {
/* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer,
/* .get_base = */ ggml_backend_remoting_buffer_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,
};
const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = {
/* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer,
/* .get_base = */ ggml_backend_remoting_buffer_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,
};

View File

@@ -0,0 +1,144 @@
#include "ggml-remoting.h"
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_name(gpu);
}
static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_description(gpu);
}
static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static enum ggml_backend_dev_type type;
static bool has_type = false;
if (!has_type) {
has_type = true;
type = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
}
return type;
}
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_memory(gpu, free, total);
}
static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
#if USE_ALWAYS_TRUE_SUPPORTS_OP == 1
/* ggml-rpc takes the same shortcut */
/* with the current implementation of serialize_tensor, the src/view aren't properly passed */
UNUSED(dev);
UNUSED(op);
return true;
#else
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_supports_op(gpu, op);
#endif
}
static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
bool supported = buft->device == dev;
return supported;
}
static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
UNUSED(dev);
UNUSED(op);
return false;
}
static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_remoting_device_get_name(dev);
props->description = ggml_backend_remoting_device_get_description(dev);
props->type = ggml_backend_remoting_device_get_type(dev);
ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total);
virtgpu * gpu = DEV_TO_GPU(dev);
apir_device_get_props(gpu, &props->caps.async, &props->caps.host_buffer, &props->caps.buffer_from_host_ptr,
&props->caps.events);
props->caps.buffer_from_host_ptr = false;
props->caps.async = false;
props->caps.events = false;
}
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
static ggml_backend_buffer_type buft{
/* .iface = */ ggml_backend_remoting_buffer_type_interface,
/* .device = */ dev,
/* .context = */ (void *) ctx,
};
return &buft;
}
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
static ggml_backend_buffer_type buft{
/* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
/* .device = */ dev,
/* .context = */ (void *) ctx,
};
return &buft;
}
static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev,
void * ptr,
size_t size,
size_t max_tensor_size) {
virtgpu * gpu = DEV_TO_GPU(dev);
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
if (!context) {
GGML_ABORT("Couldn't allocate the buffer context ...");
}
context->gpu = gpu;
context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size);
context->base = ptr;
context->is_from_ptr = true;
ggml_backend_buffer_t buffer =
ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev),
ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size);
return buffer;
}
const ggml_backend_device_i ggml_backend_remoting_device_interface = {
/* .get_name = */ ggml_backend_remoting_device_get_name,
/* .get_description = */ ggml_backend_remoting_device_get_description,
/* .get_memory = */ ggml_backend_remoting_device_get_memory,
/* .get_type = */ ggml_backend_remoting_device_get_type,
/* .get_props = */ ggml_backend_remoting_device_get_props,
/* .init_backend = */ ggml_backend_remoting_device_init,
/* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr,
/* .supports_op = */ ggml_backend_remoting_device_supports_op,
/* .supports_buft = */ ggml_backend_remoting_device_supports_buft,
/* .offload_op = */ ggml_backend_remoting_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};

View File

@@ -0,0 +1,137 @@
#include "ggml-remoting.h"
#include "ggml-virtgpu.h"
#include <iostream>
#include <mutex>
static virtgpu * apir_initialize() {
static virtgpu * apir_gpu_instance = NULL;
static bool apir_initialized = false;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (apir_initialized) {
return apir_gpu_instance;
}
apir_gpu_instance = create_virtgpu();
if (!apir_gpu_instance) {
GGML_ABORT("failed to initialize the virtgpu");
}
apir_initialized = true;
}
return apir_gpu_instance;
}
static int ggml_backend_remoting_get_device_count() {
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_WARN("apir_initialize failed\n");
return 0;
}
return apir_device_get_count(gpu);
}
static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
UNUSED(reg);
return ggml_backend_remoting_get_device_count();
}
static std::vector<ggml_backend_dev_t> devices;
ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
GGML_ASSERT(device < devices.size());
return devices[device];
}
static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
if (devices.size() > 0) {
GGML_LOG_INFO("%s: already initialized\n", __func__);
return;
}
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_ERROR("apir_initialize failed\n");
return;
}
static bool initialized = false;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
char desc[256] = "API Remoting device";
ctx->device = i;
ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
ctx->description = desc;
ctx->gpu = gpu;
ggml_backend_dev_t dev = new ggml_backend_device{
/* .iface = */ ggml_backend_remoting_device_interface,
/* .reg = */ reg,
/* .context = */ ctx,
};
devices.push_back(dev);
}
initialized = true;
}
}
}
static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
UNUSED(reg);
return ggml_backend_remoting_get_device(device);
}
static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
UNUSED(reg);
return GGML_REMOTING_FRONTEND_NAME;
}
static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
/* .get_name = */ ggml_backend_remoting_reg_get_name,
/* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
/* .get_device = */ ggml_backend_remoting_reg_get_device,
/* .get_proc_address = */ NULL,
};
ggml_backend_reg_t ggml_backend_virtgpu_reg() {
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_ERROR("virtgpu_apir_initialize failed\n");
return NULL;
}
static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_remoting_reg_i,
/* .context = */ gpu,
};
static bool initialized = false;
if (initialized) {
return &reg;
}
initialized = true;
ggml_backend_remoting_reg_init_devices(&reg);
GGML_LOG_INFO("%s: initialized\n", __func__);
return &reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)

View File

@@ -0,0 +1,69 @@
#include "ggml-remoting.h"
#include "../../include/ggml-virtgpu.h"
static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
UNUSED(backend);
return "API Remoting backend";
}
static void ggml_backend_remoting_free(ggml_backend_t backend) {
delete backend;
}
static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
virtgpu * gpu = DEV_TO_GPU(backend->device);
return apir_backend_graph_compute(gpu, cgraph);
}
static void ggml_backend_remoting_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
virtgpu * gpu = DEV_TO_GPU(backend->device);
#if true
UNUSED(gpu);
UNUSED(cgraph);
#else
// not working yet
apir_backend_graph_optimize(gpu, cgraph);
#endif
}
static ggml_backend_i ggml_backend_remoting_interface = {
/* .get_name = */ ggml_backend_remoting_get_name,
/* .free = */ ggml_backend_remoting_free,
/* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_remoting_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ ggml_backend_remoting_graph_optimize,
};
static ggml_guid_t ggml_backend_remoting_guid() {
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02,
0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
return &guid;
}
ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) {
UNUSED(params);
ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *) dev->context;
ggml_backend_t remoting_backend = new ggml_backend{
/* .guid = */ ggml_backend_remoting_guid(),
/* .interface = */ ggml_backend_remoting_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device),
/* .context = */ ctx,
};
return remoting_backend;
}

View File

@@ -0,0 +1,68 @@
#pragma once
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "virtgpu.h"
#include <memory>
#include <string>
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoids micro-benchmark crashes
#define USE_ALWAYS_TRUE_SUPPORTS_OP 1
#define USE_METAL_GUEST_SUPPORTS_OP 0
#define DEV_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->context)->gpu
#define BUFFER_TO_GGML_CONTEXT(name) ((ggml_backend_remoting_buffer_context *) (name)->context)
#define BUFFER_TO_APIR_CONTEXT(name) &((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context
#define BUFFER_TO_HOST_HANDLE(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle
#define GET_DEVICE_CONTEXT() (ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context
#define BUFT_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->device->context)->gpu
struct ggml_backend_remoting_device_context {
size_t device;
std::string name;
std::string description;
std::vector<std::tuple<void *, size_t, virtgpu_shmem *>> shared_memory;
virtgpu * gpu;
};
struct ggml_backend_remoting_buffer_context {
apir_buffer_context_t apir_context;
virtgpu * gpu;
void * base;
bool is_from_ptr;
};
extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface;
extern const ggml_backend_device_i ggml_backend_remoting_device_interface;
extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface;
extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface;
ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device);
ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params);
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev);
static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
// in the backend, the buffer handle is the buffer pointer
return (apir_buffer_type_host_handle_t) buft->context;
}
static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
if (!buffer->context) {
GGML_ABORT("%s: no context available :/", __func__);
}
return BUFFER_TO_HOST_HANDLE(buffer);
}

View File

@@ -0,0 +1,168 @@
# YAML schema for GGML remoting API functions
# This defines the structure for generating the remoting layer code
# Configuration for the generated files
config:
# Base path for the generated files
base_path: "ggml/src"
# Header files to update
files:
apir_backend_header: "ggml-virtgpu-apir/backend/shared/apir_backend.gen.h"
backend_dispatched_header: "ggml-virtgpu-apir/backend/backend-dispatched.gen.h"
virtgpu_forward_header: "ggml-virtgpu-apir/virtgpu-forward.gen.h"
# Simplified function definitions with grouping and metadata combined
functions:
device:
group_description: "device"
functions:
get_device_count:
# No specific metadata - uses default void return and base params
get_count:
frontend_return: "int"
get_name:
frontend_return: "const char *"
get_description:
frontend_return: "const char *"
get_type:
frontend_return: "uint32_t"
get_memory:
frontend_return: "void"
frontend_extra_params:
- "size_t *free"
- "size_t *total"
supports_op:
frontend_return: "bool"
frontend_extra_params:
- "const ggml_tensor *op"
get_buffer_type:
frontend_return: "apir_buffer_type_host_handle_t"
get_props:
frontend_return: "void"
frontend_extra_params:
- "bool *async"
- "bool *host_buffer"
- "bool *buffer_from_host_ptr"
- "bool *events"
buffer_from_ptr:
frontend_return: "apir_buffer_context_t"
frontend_extra_params:
- "size_t size"
- "size_t max_tensor_size"
buffer_type:
group_description: "buffer-type"
functions:
get_name:
frontend_return: "const char *"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
get_alignment:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
get_max_size:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
is_host:
frontend_return: "bool"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
alloc_buffer:
frontend_return: "apir_buffer_context_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buffer_buft"
- "size_t size"
get_alloc_size:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
- "const ggml_tensor *op"
buffer:
group_description: "buffer"
functions:
get_base:
frontend_return: "void *"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
set_tensor:
frontend_return: "void"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
- "ggml_tensor *tensor"
- "const void *data"
- "size_t offset"
- "size_t size"
get_tensor:
frontend_return: "void"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
- "const ggml_tensor *tensor"
- "void *data"
- "size_t offset"
- "size_t size"
cpy_tensor:
frontend_return: "bool"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
- "const ggml_tensor *src"
- "const ggml_tensor *dst"
clear:
frontend_return: "void"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
- "uint8_t value"
free_buffer:
frontend_return: "void"
frontend_extra_params:
- "apir_buffer_context_t *buffer_context"
backend:
group_description: "backend"
functions:
graph_compute:
frontend_return: "ggml_status"
frontend_extra_params:
- "ggml_cgraph *cgraph"
graph_optimize:
frontend_return: "ggml_cgraph *"
frontend_extra_params:
- "ggml_cgraph *cgraph"
enabled: false
# Naming patterns used for code generation
naming_patterns:
# How to generate enum names
enum_prefix: "APIR_COMMAND_TYPE_"
# How to generate backend function names
backend_function_prefix: "backend_"
# How to generate frontend function names
frontend_function_prefix: "apir_"
# Standard frontend first parameter
frontend_base_param: "struct virtgpu *gpu"

View File

@@ -0,0 +1,9 @@
#pragma once
#include <stdint.h>
struct virgl_renderer_capset_apir {
uint32_t apir_version;
uint32_t supports_blob_resources;
uint32_t reserved[4]; // For future expansion
};

View File

@@ -0,0 +1,322 @@
#!/usr/bin/env python3
"""
# Generated by Claude AI
Script to completely regenerate the GGML remoting codebase from YAML configuration.
This script reads ggmlremoting_functions.yaml and regenerates all the header files and
implementation templates for the GGML remoting layer.
Usage:
python regenerate_remoting.py
The script will:
1. Read ggmlremoting_functions.yaml configuration
2. Generate updated header files
3. Generate implementation templates in dedicated files
4. Show a summary of what was generated
"""
import yaml
from typing import Dict, List, Any
from pathlib import Path
import os
import subprocess
import shutil
import logging
NL = '\n' # can't have f"{'\n'}" in f-strings
class RemotingCodebaseGenerator:
def __init__(self, yaml_path: str = "ggmlremoting_functions.yaml"):
"""Initialize the generator with the YAML configuration."""
self.yaml_path = yaml_path
if not Path(yaml_path).exists():
raise FileNotFoundError(f"Configuration file {yaml_path} not found")
with open(yaml_path, 'r') as f:
self.config = yaml.safe_load(f)
self.functions = self.config['functions']
self.naming_patterns = self.config['naming_patterns']
self.config_data = self.config['config']
# Check if clang-format is available
self.clang_format_available = self._check_clang_format_available()
def _check_clang_format_available(self) -> bool:
"""Check if clang-format is available in the system PATH."""
return shutil.which("clang-format") is not None
def _format_file_with_clang_format(self, file_path: Path) -> bool:
"""Format a file with clang-format -i. Returns True if successful, False otherwise."""
if not self.clang_format_available:
return False
try:
subprocess.run(
["clang-format", "-i", str(file_path)],
check=True,
capture_output=True,
text=True
)
return True
except subprocess.CalledProcessError:
logging.exception(f" ⚠️ clang-format failed for {file_path}")
return False
except Exception as e:
logging.exception(f" ⚠️ Unexpected error formatting {file_path}: {e}")
return False
def generate_enum_name(self, group_name: str, function_name: str) -> str:
"""Generate the APIR_COMMAND_TYPE enum name for a function."""
prefix = self.naming_patterns['enum_prefix']
return f"{prefix}{group_name.upper()}_{function_name.upper()}"
def generate_backend_function_name(self, group_name: str, function_name: str) -> str:
"""Generate the backend function name."""
function_key = f"{group_name}_{function_name}"
overrides = self.naming_patterns.get('backend_function_overrides', {})
if function_key in overrides:
return overrides[function_key]
prefix = self.naming_patterns['backend_function_prefix']
return f"{prefix}{group_name}_{function_name}"
def generate_frontend_function_name(self, group_name: str, function_name: str) -> str:
"""Generate the frontend function name."""
prefix = self.naming_patterns['frontend_function_prefix']
return f"{prefix}{group_name}_{function_name}"
def get_enabled_functions(self) -> List[Dict[str, Any]]:
"""Get all enabled functions with their metadata."""
functions = []
enum_value = 0
for group_name, group_data in self.functions.items():
group_description = group_data['group_description']
for function_name, func_metadata in group_data['functions'].items():
# Handle case where func_metadata is None or empty (functions with only comments)
if func_metadata is None:
func_metadata = {}
# Functions are enabled by default unless explicitly disabled
if func_metadata.get('enabled', True):
functions.append({
'group_name': group_name,
'function_name': function_name,
'enum_name': self.generate_enum_name(group_name, function_name),
'enum_value': enum_value,
'backend_function': self.generate_backend_function_name(group_name, function_name),
'frontend_function': self.generate_frontend_function_name(group_name, function_name),
'frontend_return': func_metadata.get('frontend_return', 'void'),
'frontend_extra_params': func_metadata.get('frontend_extra_params', []),
'group_description': group_description,
'newly_added': func_metadata.get('newly_added', False)
})
enum_value += 1
return functions
def generate_apir_backend_header(self) -> str:
"""Generate the complete apir_backend.h file."""
functions = self.get_enabled_functions()
# Generate the enum section
enum_lines = ["typedef enum ApirBackendCommandType {"]
current_group = None
for func in functions:
# Add comment for new group
if func['group_name'] != current_group:
enum_lines.append("")
enum_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
enum_lines.append(f" {func['enum_name']} = {func['enum_value']},")
# Add the count
total_count = len(functions)
enum_lines.append("\n // last command_type index + 1")
enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},")
enum_lines.append("} ApirBackendCommandType;")
# Full header template
header_content = NL.join(enum_lines) + "\n"
return header_content
def generate_backend_dispatched_header(self) -> str:
"""Generate the complete backend-dispatched.h file."""
functions = self.get_enabled_functions()
# Function declarations
decl_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
decl_lines.append(f"\n/* {func['group_description']} */")
current_group = func['group_name']
signature = "uint32_t"
params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx"
decl_lines.append(f"{signature} {func['backend_function']}({params});")
# Switch cases
switch_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
switch_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}\";")
# Dispatch table
table_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
table_lines.append(f"\n /* {func['group_description']} */")
table_lines.append("")
current_group = func['group_name']
table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']},")
header_content = f'''\
#pragma once
{NL.join(decl_lines)}
static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
{{
switch (type) {{
{NL.join(switch_lines)}
default: return "unknown";
}}
}}
extern "C" {{
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{
{NL.join(table_lines)}
}};
}}
'''
return header_content
def generate_virtgpu_forward_header(self) -> str:
"""Generate the complete virtgpu-forward.gen.h file."""
functions = self.get_enabled_functions()
decl_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
decl_lines.append("")
decl_lines.append(f"/* {func['group_description']} */")
current_group = func['group_name']
# Build parameter list
params = [self.naming_patterns['frontend_base_param']]
params.extend(func['frontend_extra_params'])
param_str = ', '.join(params)
decl_lines.append(f"{func['frontend_return']} {func['frontend_function']}({param_str});")
header_content = f'''\
#pragma once
{NL.join(decl_lines)}
'''
return header_content
def regenerate_codebase(self) -> None:
"""Regenerate the entire remoting codebase."""
logging.info("🔄 Regenerating GGML Remoting Codebase...")
logging.info("=" * 50)
# Detect if we're running from frontend directory
current_dir = os.getcwd()
is_frontend_dir = current_dir.endswith('ggml-virtgpu')
if is_frontend_dir:
# Running from ggml/src/ggml-virtgpu
logging.info("📍 Detected frontend directory execution")
frontend_base = Path(".")
else:
# Running from project root (fallback to original behavior)
logging.info("📍 Detected project root execution")
base_path = self.config_data.get('base_path', 'ggml/src')
frontend_base = Path(base_path) / "ggml-virtgpu"
# Compute final file paths
backend_base = frontend_base / "backend"
apir_backend_path = backend_base / "shared" / "apir_backend.gen.h"
backend_dispatched_path = backend_base / "backend-dispatched.gen.h"
virtgpu_forward_path = frontend_base / "virtgpu-forward.gen.h"
# Create output directories for each file
apir_backend_path.parent.mkdir(parents=True, exist_ok=True)
backend_dispatched_path.parent.mkdir(parents=True, exist_ok=True)
virtgpu_forward_path.parent.mkdir(parents=True, exist_ok=True)
# Generate header files
logging.info("📁 Generating header files...")
apir_backend_content = self.generate_apir_backend_header()
apir_backend_path.write_text(apir_backend_content)
logging.info(f"{apir_backend_path.resolve()}")
backend_dispatched_content = self.generate_backend_dispatched_header()
backend_dispatched_path.write_text(backend_dispatched_content)
logging.info(f"{backend_dispatched_path.resolve()}")
virtgpu_forward_content = self.generate_virtgpu_forward_header()
virtgpu_forward_path.write_text(virtgpu_forward_content)
logging.info(f"{virtgpu_forward_path.resolve()}")
# Format generated files with clang-format
generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path]
if not self.clang_format_available:
logging.warning("\nclang-format not found in PATH. Generated files will not be formatted."
" Install clang-format to enable automatic code formatting.")
else:
logging.info("\n🎨 Formatting files with clang-format...")
for file_path in generated_files:
if self._format_file_with_clang_format(file_path):
logging.info(f" ✅ Formatted {file_path.name}")
else:
logging.warning(f" ❌ Failed to format {file_path.name}")
# Generate summary
functions = self.get_enabled_functions()
total_functions = len(functions)
logging.info("\n📊 Generation Summary:")
logging.info("=" * 50)
logging.info(f" Total functions: {total_functions}")
logging.info(f" Function groups: {len(self.functions)}")
logging.info(" Header files: 3")
logging.info(f" Working directory: {current_dir}")
def main():
try:
generator = RemotingCodebaseGenerator()
generator.regenerate_codebase()
except Exception as e:
logging.exception(f"❌ Error: {e}")
exit(1)
if __name__ == "__main__":
main()
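
For reference, a minimal sketch of the configuration structure that RemotingCodebaseGenerator consumes, reconstructed from the attribute accesses above; the key names come from the code, while the concrete values (prefixes, group and function names) are illustrative assumptions only.

# Hypothetical example config; only the key names are taken from the generator code.
example_config = {
    'base_path': 'ggml/src',
    'naming_patterns': {
        'enum_prefix': 'APIR_COMMAND_TYPE_',    # matches the generated enums below
        'backend_function_prefix': 'backend_',  # illustrative
        'frontend_function_prefix': 'apir_',    # matches the frontend declarations below
        'frontend_base_param': 'struct virtgpu * gpu',
        'backend_function_overrides': {},       # optional per-function name overrides
    },
    'functions': {
        'device': {
            'group_description': 'device',
            'functions': {
                'get_count': {'frontend_return': 'int'},
                'get_name': {'frontend_return': 'const char *'},
                'get_memory': {'frontend_extra_params': ['size_t * free', 'size_t * total']},
            },
        },
    },
}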

View File

@@ -0,0 +1,15 @@
#include "backend/shared/apir_backend.h"
#include "ggml-alloc.h"
#include "ggml-impl.h"
#include "ggml.h"
#include "virtgpu-shm.h"
#include "virtgpu-utils.h"
struct apir_buffer_context_t {
apir_buffer_host_handle_t host_handle;
struct virtgpu_shmem shmem;
apir_buffer_type_host_handle_t buft_host_handle;
};
#include "virtgpu-forward.gen.h"

View File

@@ -0,0 +1,50 @@
#include "virtgpu-forward-impl.h"
static long long current_time_ms() {
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts); // CLOCK_MONOTONIC would be preferable for measuring elapsed time
    return (long long) ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}
ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE);
std::vector<uint8_t> cgraph_data;
size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
if (cgraph_size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
}
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
apir_encode_size_t(encoder, &cgraph_size);
char * shmem_data = (char *) shmem->mmap_ptr;
apir_encoder secondary_enc = apir_new_encoder(shmem_data, cgraph_size);
apir_encode_cgraph_data(&secondary_enc, cgraph_data);
REMOTE_CALL(gpu, encoder, decoder, ret);
ggml_status status = GGML_STATUS_ABORTED;
apir_decode_ggml_status(decoder, &status);
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
virtgpu_shmem_destroy(gpu, shmem);
}
return status;
}

View File

@@ -0,0 +1,125 @@
#include "virtgpu-forward-impl.h"
const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
apir_encode_ggml_buffer_type(encoder, buft);
REMOTE_CALL(gpu, encoder, decoder, ret);
    const size_t string_size = apir_decode_array_size_unchecked(decoder);
    char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
    if (!string) {
        GGML_LOG_ERROR("%s: Could not allocate the buffer type name buffer\n", __func__);
        apir_decoder_set_fatal(decoder);
        remote_call_finish(gpu, encoder, decoder);
        return NULL;
    }
    apir_decode_char_array(decoder, string, string_size);
remote_call_finish(gpu, encoder, decoder);
return string;
}
size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
apir_encode_ggml_buffer_type(encoder, buft);
REMOTE_CALL(gpu, encoder, decoder, ret);
size_t alignment;
apir_decode_size_t(decoder, &alignment);
remote_call_finish(gpu, encoder, decoder);
return alignment;
}
size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
apir_encode_ggml_buffer_type(encoder, buft);
REMOTE_CALL(gpu, encoder, decoder, ret);
size_t max_size;
apir_decode_size_t(decoder, &max_size);
remote_call_finish(gpu, encoder, decoder);
return max_size;
}
bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
apir_encode_ggml_buffer_type(encoder, buft);
REMOTE_CALL(gpu, encoder, decoder, ret);
bool is_host;
apir_decode_bool_t(decoder, &is_host);
remote_call_finish(gpu, encoder, decoder);
return is_host;
}
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
apir_buffer_context_t buffer_context;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_size_t(encoder, &size);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
remote_call_finish(gpu, encoder, decoder);
return buffer_context;
}
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_ggml_tensor_inline(encoder, op);
REMOTE_CALL(gpu, encoder, decoder, ret);
size_t alloc_size;
apir_decode_size_t(decoder, &alloc_size);
remote_call_finish(gpu, encoder, decoder);
return alloc_size;
}

View File

@@ -0,0 +1,157 @@
#include "virtgpu-forward-impl.h"
void * apir_buffer_get_base(virtgpu * gpu, apir_buffer_context_t * buffer_context) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
REMOTE_CALL(gpu, encoder, decoder, ret);
uintptr_t base;
apir_decode_uintptr_t(decoder, &base);
remote_call_finish(gpu, encoder, decoder);
return (void *) base;
}
void apir_buffer_set_tensor(virtgpu * gpu,
apir_buffer_context_t * buffer_context,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
if (size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
}
memcpy(shmem->mmap_ptr, data, size);
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
apir_encode_size_t(encoder, &offset);
apir_encode_size_t(encoder, &size);
REMOTE_CALL(gpu, encoder, decoder, ret);
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
virtgpu_shmem_destroy(gpu, shmem);
}
}
void apir_buffer_get_tensor(virtgpu * gpu,
apir_buffer_context_t * buffer_context,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
if (size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
}
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
apir_encode_size_t(encoder, &offset);
apir_encode_size_t(encoder, &size);
REMOTE_CALL(gpu, encoder, decoder, ret);
memcpy(data, shmem->mmap_ptr, size);
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
virtgpu_shmem_destroy(gpu, shmem);
}
}
bool apir_buffer_cpy_tensor(virtgpu * gpu,
apir_buffer_context_t * buffer_context,
const ggml_tensor * src,
const ggml_tensor * dst) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
apir_encode_ggml_tensor(encoder, src);
apir_encode_ggml_tensor(encoder, dst);
REMOTE_CALL(gpu, encoder, decoder, ret);
bool ret_val;
apir_decode_bool_t(decoder, &ret_val);
remote_call_finish(gpu, encoder, decoder);
return ret_val;
}
void apir_buffer_clear(virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
apir_encode_uint8_t(encoder, &value);
REMOTE_CALL(gpu, encoder, decoder, ret);
remote_call_finish(gpu, encoder, decoder);
}
void apir_buffer_free_buffer(virtgpu * gpu, apir_buffer_context_t * buffer_context) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER);
apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
REMOTE_CALL(gpu, encoder, decoder, ret);
remote_call_finish(gpu, encoder, decoder);
}

View File

@@ -0,0 +1,200 @@
#include "virtgpu-forward-impl.h"
#include "virtgpu-shm.h"
int apir_device_get_count(virtgpu * gpu) {
static int32_t dev_count = -1;
if (dev_count != -1) {
return dev_count;
}
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_decode_int32_t(decoder, &dev_count);
remote_call_finish(gpu, encoder, decoder);
return dev_count;
}
const char * apir_device_get_name(virtgpu * gpu) {
static char * string = nullptr;
if (string) {
return string;
}
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME);
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
    if (!string) {
        GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
        remote_call_finish(gpu, encoder, decoder);
        return NULL;
    }
apir_decode_char_array(decoder, string, string_size);
remote_call_finish(gpu, encoder, decoder);
return string;
}
const char * apir_device_get_description(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION);
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
    if (!string) {
        GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__);
        remote_call_finish(gpu, encoder, decoder);
        return NULL;
    }
apir_decode_char_array(decoder, string, string_size);
remote_call_finish(gpu, encoder, decoder);
return string;
}
uint32_t apir_device_get_type(virtgpu * gpu) {
static uint32_t dev_type = 255;
if (dev_type != 255) {
return dev_type;
}
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_decode_uint32_t(decoder, &dev_type);
remote_call_finish(gpu, encoder, decoder);
return dev_type;
}
void apir_device_get_memory(virtgpu * gpu, size_t * free, size_t * total) {
    apir_encoder * encoder;
    apir_decoder * decoder;
    ApirForwardReturnCode ret;
    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY);
    REMOTE_CALL(gpu, encoder, decoder, ret);
    apir_decode_size_t(decoder, free);
    apir_decode_size_t(decoder, total);
    remote_call_finish(gpu, encoder, decoder);
}
bool apir_device_supports_op(virtgpu * gpu, const ggml_tensor * op) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP);
apir_encode_ggml_tensor_inline(encoder, op);
REMOTE_CALL(gpu, encoder, decoder, ret);
bool supports_op;
apir_decode_bool_t(decoder, &supports_op);
remote_call_finish(gpu, encoder, decoder);
return supports_op;
}
apir_buffer_type_host_handle_t apir_device_get_buffer_type(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_buffer_type_host_handle_t buft_handle;
apir_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle);
remote_call_finish(gpu, encoder, decoder);
return buft_handle;
}
void apir_device_get_props(virtgpu * gpu,
bool * async,
bool * host_buffer,
bool * buffer_from_host_ptr,
bool * events) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_decode_bool_t(decoder, async);
apir_decode_bool_t(decoder, host_buffer);
apir_decode_bool_t(decoder, buffer_from_host_ptr);
apir_decode_bool_t(decoder, events);
remote_call_finish(gpu, encoder, decoder);
}
apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, size_t max_tensor_size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
apir_buffer_context_t buffer_context;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
}
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
apir_encode_size_t(encoder, &size);
apir_encode_size_t(encoder, &max_tensor_size);
REMOTE_CALL(gpu, encoder, decoder, ret);
apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
buffer_context.buft_host_handle = apir_decode_apir_buffer_type_host_handle(decoder);
remote_call_finish(gpu, encoder, decoder);
return buffer_context;
}

View File

@@ -0,0 +1,29 @@
#include "virtgpu.h"
#include "ggml-remoting.h"
#include "backend/shared/apir_backend.h"
#include "backend/shared/apir_cs_ggml.h"
#include "ggml-backend-impl.h"
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
do { \
int32_t forward_flag = (int32_t) apir_command_type__; \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
if (!encoder_name) { \
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \
} \
} while (0)
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT("%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
} while (0)

View File

@@ -0,0 +1,51 @@
#pragma once
/* device */
int apir_device_get_count(struct virtgpu * gpu);
const char * apir_device_get_name(struct virtgpu * gpu);
const char * apir_device_get_description(struct virtgpu * gpu);
uint32_t apir_device_get_type(struct virtgpu * gpu);
void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total);
bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op);
apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu * gpu);
void apir_device_get_props(struct virtgpu * gpu,
bool * async,
bool * host_buffer,
bool * buffer_from_host_ptr,
bool * events);
apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size);
/* buffer-type */
const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
                                                    ggml_backend_buffer_type_t buft,
                                                    size_t size);
size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op);
/* buffer */
void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
void apir_buffer_set_tensor(struct virtgpu * gpu,
apir_buffer_context_t * buffer_context,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size);
void apir_buffer_get_tensor(struct virtgpu * gpu,
apir_buffer_context_t * buffer_context,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size);
bool apir_buffer_cpy_tensor(struct virtgpu * gpu,
apir_buffer_context_t * buffer_context,
const ggml_tensor * src,
const ggml_tensor * dst);
void apir_buffer_clear(struct virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value);
void apir_buffer_free_buffer(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
/* backend */
ggml_status apir_backend_graph_compute(struct virtgpu * gpu, ggml_cgraph * cgraph);

View File

@@ -0,0 +1,99 @@
#include "virtgpu-shm.h"
#include "virtgpu.h"
#include <assert.h>
static uint32_t virtgpu_ioctl_resource_create_blob(virtgpu * gpu,
uint32_t blob_mem,
uint32_t blob_flags,
size_t blob_size,
uint64_t blob_id,
uint32_t * res_id) {
#ifdef SIMULATE_BO_SIZE_FIX
blob_size = align64(blob_size, 4096);
#endif
drm_virtgpu_resource_create_blob args = {
.blob_mem = blob_mem,
.blob_flags = blob_flags,
.bo_handle = 0,
.res_handle = 0,
.size = blob_size,
.pad = 0,
.cmd_size = 0,
.cmd = 0,
.blob_id = blob_id,
};
if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) {
return 0;
}
*res_id = args.res_handle;
return args.bo_handle;
}
static void virtgpu_ioctl_gem_close(virtgpu * gpu, uint32_t gem_handle) {
drm_gem_close args = {
.handle = gem_handle,
.pad = 0,
};
const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args);
assert(!ret);
#ifdef NDEBUG
UNUSED(ret);
#endif
}
static void * virtgpu_ioctl_map(virtgpu * gpu, uint32_t gem_handle, size_t size) {
drm_virtgpu_map args = {
.offset = 0,
.handle = gem_handle,
.pad = 0,
};
if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) {
return NULL;
}
void * ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, args.offset);
if (ptr == MAP_FAILED) {
return NULL;
}
return ptr;
}
void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem) {
munmap(shmem->mmap_ptr, shmem->mmap_size);
virtgpu_ioctl_gem_close(gpu, shmem->gem_handle);
}
int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) {
size = align64(size, 16384);
uint32_t res_id;
uint32_t gem_handle = virtgpu_ioctl_resource_create_blob(gpu, VIRTGPU_BLOB_MEM_HOST3D,
VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, &res_id);
if (!gem_handle) {
return 1;
}
    void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
    if (!ptr) {
        GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n");
        virtgpu_ioctl_gem_close(gpu, gem_handle);
        return 1;
    }
shmem->res_id = res_id;
shmem->mmap_size = size;
shmem->mmap_ptr = ptr;
shmem->gem_handle = gem_handle;
return 0;
}

View File

@@ -0,0 +1,23 @@
#pragma once
#include "virtgpu-utils.h"
#include <sys/mman.h>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
struct virtgpu;
struct virtgpu_shmem {
uint32_t res_id;
size_t mmap_size;
void * mmap_ptr;
uint32_t gem_handle;
};
int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem);
void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem);

View File

@@ -0,0 +1,179 @@
#include "virtgpu-utils.h"
#include <stdlib.h>
#include <cstring>
#define NODE_ALLOC_ALIGN 64
#define NODE_PTR_MASK (~((uintptr_t) NODE_ALLOC_ALIGN - 1))
#define NODE_LEVEL_MASK ((uintptr_t) NODE_ALLOC_ALIGN - 1)
#define NULL_NODE 0
#define os_free_aligned(_ptr) free(_ptr)
#define p_atomic_cmpxchg(v, old, _new) __sync_val_compare_and_swap((v), (old), (_new))
static inline uint64_t util_logbase2_64(uint64_t n) {
#if defined(HAVE___BUILTIN_CLZLL)
return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
#else
uint64_t pos = 0ull;
if (n >= 1ull << 32) {
n >>= 32;
pos += 32;
}
if (n >= 1ull << 16) {
n >>= 16;
pos += 16;
}
if (n >= 1ull << 8) {
n >>= 8;
pos += 8;
}
if (n >= 1ull << 4) {
n >>= 4;
pos += 4;
}
if (n >= 1ull << 2) {
n >>= 2;
pos += 2;
}
if (n >= 1ull << 1) {
pos += 1;
}
return pos;
#endif
}
void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size) {
memset(arr, 0, sizeof(*arr));
arr->elem_size = elem_size;
arr->node_size_log2 = util_logbase2_64(node_size);
assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2));
}
static inline void * os_malloc_aligned(size_t size, size_t alignment) {
void * ptr;
alignment = (alignment + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
if (posix_memalign(&ptr, alignment, size) != 0) {
return NULL;
}
return ptr;
}
static inline void * _util_sparse_array_node_data(uintptr_t handle) {
return (void *) (handle & NODE_PTR_MASK);
}
static inline unsigned _util_sparse_array_node_level(uintptr_t handle) {
return handle & NODE_LEVEL_MASK;
}
static inline void _util_sparse_array_node_finish(util_sparse_array * arr, uintptr_t node) {
if (_util_sparse_array_node_level(node) > 0) {
uintptr_t * children = (uintptr_t *) _util_sparse_array_node_data(node);
size_t node_size = 1ull << arr->node_size_log2;
for (size_t i = 0; i < node_size; i++) {
if (children[i]) {
_util_sparse_array_node_finish(arr, children[i]);
}
}
}
os_free_aligned(_util_sparse_array_node_data(node));
}
static inline uintptr_t _util_sparse_array_node(void * data, unsigned level) {
assert(data != NULL);
assert(((uintptr_t) data & NODE_LEVEL_MASK) == 0);
assert((level & NODE_PTR_MASK) == 0);
return (uintptr_t) data | level;
}
static inline uintptr_t _util_sparse_array_node_alloc(util_sparse_array * arr, unsigned level) {
size_t size;
if (level == 0) {
size = arr->elem_size << arr->node_size_log2;
} else {
size = sizeof(uintptr_t) << arr->node_size_log2;
}
void * data = os_malloc_aligned(size, NODE_ALLOC_ALIGN);
memset(data, 0, size);
return _util_sparse_array_node(data, level);
}
static inline uintptr_t _util_sparse_array_set_or_free_node(uintptr_t * node_ptr, uintptr_t cmp_node, uintptr_t node) {
uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node);
if (prev_node != cmp_node) {
/* We lost the race. Free this one and return the one that was already
* allocated.
*/
os_free_aligned(_util_sparse_array_node_data(node));
return prev_node;
} else {
return node;
}
}
void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx) {
const unsigned node_size_log2 = arr->node_size_log2;
uintptr_t root = p_atomic_read(&arr->root);
if (unlikely(!root)) {
unsigned root_level = 0;
uint64_t idx_iter = idx >> node_size_log2;
while (idx_iter) {
idx_iter >>= node_size_log2;
root_level++;
}
uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level);
root = _util_sparse_array_set_or_free_node(&arr->root, NULL_NODE, new_root);
}
while (1) {
unsigned root_level = _util_sparse_array_node_level(root);
uint64_t root_idx = idx >> (root_level * node_size_log2);
if (likely(root_idx < (1ull << node_size_log2))) {
break;
}
/* In this case, we have a root but its level is low enough that the
* requested index is out-of-bounds.
*/
uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1);
uintptr_t * new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root);
new_root_children[0] = root;
/* We only add one at a time instead of the whole tree because it's
* easier to ensure correctness of both the tree building and the
* clean-up path. Because we're only adding one node we never have to
* worry about trying to free multiple things without freeing the old
* things.
*/
root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root);
}
void * node_data = _util_sparse_array_node_data(root);
unsigned node_level = _util_sparse_array_node_level(root);
while (node_level > 0) {
uint64_t child_idx = (idx >> (node_level * node_size_log2)) & ((1ull << node_size_log2) - 1);
uintptr_t * children = (uintptr_t *) node_data;
uintptr_t child = p_atomic_read(&children[child_idx]);
if (unlikely(!child)) {
child = _util_sparse_array_node_alloc(arr, node_level - 1);
child = _util_sparse_array_set_or_free_node(&children[child_idx], NULL_NODE, child);
}
node_data = _util_sparse_array_node_data(child);
node_level = _util_sparse_array_node_level(child);
}
uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1);
return (void *) ((char *) node_data + (elem_idx * arr->elem_size));
}
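
To make the traversal above concrete, here is a small sketch (illustrative only, not part of the source) of how an element index decomposes into per-level child slots of node_size_log2 bits each:

# Each tree level consumes node_size_log2 bits of the index, most significant first.
def sparse_array_path(idx: int, node_size_log2: int, root_level: int) -> list[int]:
    mask = (1 << node_size_log2) - 1
    return [(idx >> (level * node_size_log2)) & mask for level in range(root_level, -1, -1)]

# With node_size = 1024 (log2 = 10), index 3_000_000 needs a 3-level tree:
# child slot 2 at level 2, slot 881 at level 1, element slot 704 in the leaf.
assert sparse_array_path(3_000_000, 10, 2) == [2, 881, 704]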

View File

@@ -0,0 +1,86 @@
#pragma once
#include <atomic>
#include <cassert>
#include <cerrno>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#define unlikely(x) __builtin_expect(!!(x), 0)
#define likely(x) __builtin_expect(!!(x), 1)
#ifndef UNUSED
# define UNUSED(x) (void) (x)
#endif
/** Checks if a value is a power of two. Does not handle zero. */
#define IS_POT(v) (((v) & ((v) - 1)) == 0)
/** Checks if a value is a power of two. Zero handled. */
#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v))
/** Align a value to a power of two */
#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE)
static inline bool util_is_power_of_two_nonzero64(uint64_t v) {
return IS_POT_NONZERO(v);
}
static inline uint64_t align64(uint64_t value, uint64_t alignment) {
assert(util_is_power_of_two_nonzero64(alignment));
return ALIGN_POT(value, alignment);
}
struct list_head {
list_head * prev;
list_head * next;
};
struct util_sparse_array {
size_t elem_size;
unsigned node_size_log2;
uintptr_t root;
};
void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx);
void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size);
inline void os_time_sleep(int64_t usecs) {
timespec time;
time.tv_sec = usecs / 1000000;
time.tv_nsec = (usecs % 1000000) * 1000;
while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR)
;
}
struct timer_data {
long long start;
long long total;
long long count;
};
static inline void start_timer(timer_data * timer) {
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
timer->start = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec;
}
// returns the duration in ns
static inline long long stop_timer(timer_data * timer) {
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
long long timer_end = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec;
long long duration = (timer_end - timer->start);
timer->total += duration;
timer->count += 1;
return duration;
}

View File

@@ -0,0 +1,498 @@
#include "virtgpu.h"
#include <stdio.h>
#include <unistd.h>
#include <cassert>
#include <cerrno>
#include <cstdlib>
static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev);
static virt_gpu_result_t virtgpu_open(virtgpu * gpu);
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu);
static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu);
static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id);
static int virtgpu_ioctl_get_caps(virtgpu * gpu,
virgl_renderer_capset id,
uint32_t version,
void * capset,
size_t capset_size);
static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param);
static void virtgpu_init_renderer_info(virtgpu * gpu);
static void log_call_duration(long long call_duration_ns, const char * name);
const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 2 * 1000; // 2s
const uint64_t APIR_LOADLIBRARY_MAX_WAIT_MS = 60 * 1000; // 60s
static int virtgpu_handshake(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
if (!encoder) {
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__);
return 1;
}
/* write handshake props */
uint32_t guest_major = APIR_PROTOCOL_MAJOR;
uint32_t guest_minor = APIR_PROTOCOL_MINOR;
apir_encode_uint32_t(encoder, &guest_major);
apir_encode_uint32_t(encoder, &guest_minor);
/* *** */
uint32_t ret_magic;
long long call_duration_ns;
ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns);
log_call_duration(call_duration_ns, "API Remoting handshake");
if (!decoder) {
GGML_ABORT(
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
return 1;
}
/* read handshake return values */
uint32_t host_major;
uint32_t host_minor;
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
GGML_ABORT("%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
apir_backend_initialize_error(ret_magic));
} else {
apir_decode_uint32_t(decoder, &host_major);
apir_decode_uint32_t(decoder, &host_minor);
}
remote_call_finish(gpu, encoder, decoder);
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
return 1;
}
GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
    if (guest_major != host_major) {
        GGML_LOG_ERROR("Host major (%u) and guest major (%u) versions differ\n", host_major, guest_major);
    } else if (guest_minor != host_minor) {
        GGML_LOG_WARN("Host minor (%u) and guest minor (%u) versions differ\n", host_minor, guest_minor);
    }
return 0;
}
static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirLoadLibraryReturnCode ret;
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0);
if (!encoder) {
GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
long long call_duration_ns;
ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder, APIR_LOADLIBRARY_MAX_WAIT_MS,
&call_duration_ns);
log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
if (!decoder) {
GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
remote_call_finish(gpu, encoder, decoder);
if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__);
return ret;
}
// something wrong happened, find out what.
if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__,
apir_load_library_error(ret), ret);
return ret;
}
GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
    ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
    if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
        GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library (apir code=%d | %s)",
                   __func__, apir_ret, apir_load_library_error(apir_ret));
    } else {
        uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
        GGML_ABORT("%s: the API Remoting backend library failed to initialize its backend library (code=%d)",
                   __func__, lib_ret);
    }
return ret;
}
virtgpu * create_virtgpu() {
virtgpu * gpu = new virtgpu();
gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to open the virtgpu device", __func__);
return NULL;
}
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU capset", __func__);
return NULL;
}
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU context", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT("%s: failed to create the shared reply memory pages", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT("%s: failed to create the shared data memory pages", __func__);
return NULL;
}
if (virtgpu_handshake(gpu)) {
GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT("%s: failed to load the backend library", __func__);
return NULL;
}
return gpu;
}
static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}
virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED;
for (int i = 0; i < count; i++) {
result = virtgpu_open_device(gpu, devs[i]);
if (result == APIR_SUCCESS) {
break;
}
}
drmFreeDevices(devs, count);
return result;
}
static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev) {
    const char * node_path = dev->nodes[DRM_NODE_RENDER];
    int fd = open(node_path, O_RDWR | O_CLOEXEC);
    if (fd < 0) {
        GGML_LOG_ERROR("failed to open %s\n", node_path);
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
    drmVersionPtr version = drmGetVersion(fd);
    if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
        // not the virtio-gpu device we are looking for; let the caller try the next one
        if (version) {
            GGML_LOG_INFO("skipping DRM driver %s version %d\n", version->name, version->version_major);
            drmFreeVersion(version);
        } else {
            GGML_LOG_ERROR("failed to get the DRM driver version\n");
        }
        close(fd);
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
    gpu->fd = fd;
    drmFreeVersion(version);
    GGML_LOG_INFO("using DRM device %s\n", node_path);
    return APIR_SUCCESS;
}
static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
assert(!gpu->capset.version);
const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
if (ret) {
GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
return APIR_SUCCESS;
}
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
if (gpu->use_apir_capset) {
GGML_LOG_INFO("Using the APIR capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR;
} else {
GGML_LOG_INFO("Using the Venus capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS;
}
gpu->capset.version = 0;
int ret =
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
if (ret) {
GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
assert(gpu->capset.data.supports_blob_resources);
return APIR_SUCCESS;
}
static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id) {
drm_virtgpu_context_set_param ctx_set_params[3] = {
{
.param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID,
.value = capset_id,
},
{
.param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS,
.value = 1,
},
{
.param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK,
.value = 0, /* don't generate drm_events on fence signaling */
},
};
drm_virtgpu_context_init args = {
.num_params = ARRAY_SIZE(ctx_set_params),
.pad = 0,
.ctx_set_params = (uintptr_t) &ctx_set_params,
};
return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args);
}
static int virtgpu_ioctl_get_caps(virtgpu * gpu,
virgl_renderer_capset id,
uint32_t version,
void * capset,
size_t capset_size) {
drm_virtgpu_get_caps args = {
.cap_set_id = id,
.cap_set_ver = version,
.addr = (uintptr_t) capset,
.size = (__u32) capset_size,
.pad = 0,
};
return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args);
}
static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param) {
/* val must be zeroed because kernel only writes the lower 32 bits */
uint64_t val = 0;
drm_virtgpu_getparam args = {
.param = param,
.value = (uintptr_t) &val,
};
const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args);
return ret ? 0 : val;
}
apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags) {
/*
* Prepare the command encoder and its buffer
*/
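    // note: the static buffer and encoder below make this function non-reentrant;
    // a single in-flight remote call at a time is assumed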
static char encoder_buffer[4096];
static apir_encoder enc;
enc = {
.cur = encoder_buffer,
.start = encoder_buffer,
.end = encoder_buffer + sizeof(encoder_buffer),
.fatal = false,
};
/*
* Fill the command encoder with the common args:
* - cmd_type (int32_t)
* - cmd_flags (int32_t)
* - reply res id (uint32_t)
*/
int32_t cmd_type = apir_cmd_type;
// for testing during the hypervisor transition
if (!gpu->use_apir_capset) {
cmd_type += VENUS_COMMAND_TYPE_LENGTH;
}
apir_encode_int32_t(&enc, &cmd_type);
apir_encode_int32_t(&enc, &cmd_flags);
uint32_t reply_res_id = gpu->reply_shmem.res_id;
apir_encode_uint32_t(&enc, &reply_res_id);
return &enc;
}
void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) {
    UNUSED(gpu);
    if (!enc) {
        GGML_LOG_ERROR("Invalid (null) encoder\n");
    } else if (apir_encoder_get_fatal(enc)) {
        GGML_LOG_ERROR("Failed to encode the call parameters.\n");
    }
    if (!dec) {
        GGML_LOG_ERROR("Invalid (null) decoder\n");
    } else if (apir_decoder_get_fatal(dec)) {
        GGML_LOG_ERROR("Failed to decode the call reply.\n");
    }
}
uint32_t remote_call(virtgpu * gpu,
apir_encoder * encoder,
apir_decoder ** decoder,
float max_wait_ms,
long long * call_duration_ns) {
/*
* Prepare the reply notification pointer
*/
volatile std::atomic_uint * atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem.mmap_ptr;
*atomic_reply_notif = 0;
/*
* Trigger the execbuf ioctl
*/
drm_virtgpu_execbuffer args = {
.flags = VIRTGPU_EXECBUF_RING_IDX,
.size = (uint32_t) (encoder->cur - encoder->start),
.command = (uintptr_t) encoder->start,
.bo_handles = 0,
.num_bo_handles = 0,
.fence_fd = 0,
.ring_idx = 0,
.syncobj_stride = 0,
.num_in_syncobjs = 0,
.num_out_syncobjs = 0,
.in_syncobjs = 0,
.out_syncobjs = 0,
};
*decoder = NULL;
int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
if (ret != 0) {
GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
}
/*
* Wait for the response notification
*/
timer_data wait_host_reply_timer = { 0, 0, 0 };
start_timer(&wait_host_reply_timer);
timespec ts_start, ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
long long start_time = (long long) ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec;
bool timedout = false;
uint32_t notif_value = 0;
while (true) {
notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire);
if (notif_value != 0) {
break;
}
int64_t base_sleep_us = 15;
os_time_sleep(base_sleep_us);
if (max_wait_ms) {
clock_gettime(CLOCK_MONOTONIC, &ts_end);
long long end_time = (long long) ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec;
            float duration_ms = (float) (end_time - start_time) / 1e6f;
if (duration_ms > max_wait_ms) {
timedout = true;
break;
}
}
}
if (call_duration_ns) {
*call_duration_ns = stop_timer(&wait_host_reply_timer);
}
if (max_wait_ms && timedout) {
GGML_LOG_ERROR("timed out waiting for the host answer...\n");
return APIR_FORWARD_TIMEOUT;
}
/*
* Prepare the decoder
*/
static apir_decoder response_dec;
response_dec.cur = (char *) gpu->reply_shmem.mmap_ptr + sizeof(*atomic_reply_notif);
response_dec.end = (char *) gpu->reply_shmem.mmap_ptr + gpu->reply_shmem.mmap_size;
*decoder = &response_dec;
// extract the actual return value from the notif flag
uint32_t returned_value = notif_value - 1;
return returned_value;
}
static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_ms = (double) call_duration_ns / 1e6; // 1 millisecond = 1e6 nanoseconds
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
if (call_duration_s > 1) {
GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name);
} else {
GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name);
}
}
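
To summarize the command framing implemented by remote_call_prepare() and remote_call(), here is a sketch of the wire layout in Python (assumptions: little-endian, tightly packed encoding; the helper names are hypothetical):

import struct

def encode_command(cmd_type: int, cmd_flags: int, reply_res_id: int, payload: bytes = b"") -> bytes:
    # common prefix written by remote_call_prepare(): cmd_type (int32),
    # cmd_flags (int32), reply shmem res id (uint32), then per-command args
    return struct.pack("<iiI", cmd_type, cmd_flags, reply_res_id) + payload

def decode_reply_notif(reply_page: bytes) -> int:
    # the host stores (return value + 1) in the first uint32 of the reply page;
    # 0 means "no reply yet", so the guest busy-waits until it flips non-zero
    (notif,) = struct.unpack_from("<I", reply_page, 0)
    assert notif != 0
    return notif - 1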

View File

@@ -0,0 +1,92 @@
#pragma once
#include "virtgpu-utils.h"
#include "virtgpu-shm.h"
#include "virtgpu-apir.h"
#include "backend/shared/api_remoting.h"
#include "backend/shared/apir_cs.h"
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <threads.h>
#include <xf86drm.h>
#include <cstring>
#define VIRGL_RENDERER_UNSTABLE_APIS 1
#include "apir_hw.h"
#include <drm/virtgpu_drm.h>
#include "venus_hw.h"
#ifndef VIRTGPU_DRM_CAPSET_APIR
// Will be defined in include/drm/virtgpu_drm.h once
// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs
// is merged
#define VIRTGPU_DRM_CAPSET_APIR 10
#endif
// Mesa/Virglrenderer Venus internal. Only necessary during the
// Venus->APIR transition in Virglrenderer
#define VENUS_COMMAND_TYPE_LENGTH 331
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
#define VIRTGPU_DRM_CAPSET_VENUS 4
#endif
typedef uint32_t virgl_renderer_capset;
/* from src/virtio/vulkan/vn_renderer_virtgpu.c */
#define VIRTGPU_PCI_VENDOR_ID 0x1af4
#define VIRTGPU_PCI_DEVICE_ID 0x1050
#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004
#define VIRTGPU_PARAM_GUEST_VRAM 9
#define SHMEM_DATA_SIZE 0x1830000 // 24 MiB + 192 KiB
#define SHMEM_REPLY_SIZE 0x4000
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
enum virt_gpu_result_t {
APIR_SUCCESS = 0,
APIR_ERROR_INITIALIZATION_FAILED = -1,
};
#define PRINTFLIKE(f, a) __attribute__((format(__printf__, f, a)))
struct virtgpu {
bool use_apir_capset;
int fd;
struct {
virgl_renderer_capset id;
uint32_t version;
virgl_renderer_capset_apir data;
} capset;
util_sparse_array shmem_array;
/* APIR communication pages */
virtgpu_shmem reply_shmem;
virtgpu_shmem data_shmem;
};
static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) {
return drmIoctl(gpu->fd, request, args);
}
virtgpu * create_virtgpu();
apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags);
uint32_t remote_call(virtgpu * gpu,
apir_encoder * enc,
apir_decoder ** dec,
float max_wait_ms,
long long * call_duration_ns);
void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec);

View File

@@ -3162,17 +3162,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
     // For scalar, use 128 (arbitrary)
     // The same D_split value is used for both HSK and HSV, so just base it on the union of the LSBs.
     const uint32_t D = (hsk|hsv);
-    uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1)
-        ? scalar_flash_attention_workgroup_size
-        : ((small_rows && (D % 32) == 0) ? 256 : 128);
     auto rows_cols = fa_rows_cols(path, hsk, hsv, clamp, type, small_rows, small_cache);
+    uint32_t wg_size;
+    switch (path) {
+    case FA_COOPMAT2:
+        wg_size = ((small_rows && (D % 32) == 0) ? 256 : 128);
+        break;
+    case FA_COOPMAT1:
+        wg_size = (rows_cols[1] / 16) * device->subgroup_size; // enough subgroups for Bc/MatBc
+        break;
+    default:
+        wg_size = scalar_flash_attention_workgroup_size;
+        break;
+    }
     // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
     // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
     const uint32_t D_lsb = D ^ (D & (D-1));
     uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
-    return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
+    // Nvidia prefers shared memory use to load large tiles of K
+    // AMD prefers loading K directly from global memory
+    const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA ? 1 : 0;
+    return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem};
 };
#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
@@ -3187,15 +3201,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
     if (path == FAPATH) { \
         if (aligned) { \
             if (f32acc) { \
-                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
             } else { \
-                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
             } \
         } else { \
             if (f32acc) { \
-                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
             } else { \
-                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+                ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
             } \
         } \
     } \
@@ -5522,22 +5536,32 @@ static void ggml_vk_instance_init() {
         if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) {
             // Check if there are two physical devices corresponding to the same GPU
+            // This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux),
+            // see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication.
+            // However, for MoltenVK on macOS, multiple GPUs on the same card may report the same UUID,
+            // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Until this is fixed, we'll only deduplicate
+            // when drivers differ (same driver + same UUID = likely different GPUs)
             auto old_device = std::find_if(
                 vk_instance.device_indices.begin(),
                 vk_instance.device_indices.end(),
-                [&devices, &new_id](const size_t k){
+                [&devices, &new_id, &new_driver](const size_t k){
                     vk::PhysicalDeviceProperties2 old_props;
+                    vk::PhysicalDeviceDriverProperties old_driver;
                     vk::PhysicalDeviceIDProperties old_id;
-                    old_props.pNext = &old_id;
+                    old_props.pNext = &old_driver;
+                    old_driver.pNext = &old_id;
                     devices[k].getProperties2(&old_props);
-                    bool equals = std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
-                    equals = equals || (
+                    bool same_uuid = std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    same_uuid = same_uuid || (
                         old_id.deviceLUIDValid && new_id.deviceLUIDValid &&
                         std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID))
                     );
-                    return equals;
+                    // Only deduplicate if same UUID AND different drivers
+                    // (same driver + same UUID on MoltenVK = likely different GPUs on multi-GPU card)
+                    bool different_driver = (old_driver.driverID != new_driver.driverID);
+                    return same_uuid && different_driver;
                 }
             );
             if (old_device == vk_instance.device_indices.end()) {
@@ -8334,41 +8358,49 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf;
const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", total_size=" << total_size << ", supported=" << supported);
VK_LOG_DEBUG("ggml_vk_flash_attn_scalar_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", total_size=" << total_size << ", supported=" << supported);
return supported;
}
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv, bool f32acc) {
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type kv_type) {
// Needs to be kept up to date on shader changes
GGML_UNUSED(hsv);
const uint32_t wg_size = scalar_flash_attention_workgroup_size;
const uint32_t Br = coopmat1_flash_attention_num_large_rows;
const uint32_t Bc = scalar_flash_attention_Bc;
const auto rows_cols = fa_rows_cols(FA_COOPMAT1, hsk, hsv, 0, kv_type, false, false);
const uint32_t Br = rows_cols[0];
const uint32_t Bc = rows_cols[1];
const uint32_t MatBr = 16, MatBc = 16;
const uint32_t row_split = Bc / MatBc;
const uint32_t hsk_pad = ROUNDUP_POW2(hsk, 16);
const uint32_t acctype = f32acc ? 4 : 2;
const uint32_t f16vec4 = 8;
const uint32_t tmpsh = wg_size * sizeof(float);
const uint32_t tmpshv4 = wg_size * 4 * acctype;
const uint32_t tmpsh = (Bc / MatBc) * sizeof(float);
const uint32_t qstride = hsk_pad / 4 + 2;
const uint32_t Qf = Br * qstride * f16vec4;
const uint32_t psh_stride = Br / 4 + 2;
const uint32_t Psh = Bc * psh_stride * f16vec4;
const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
const uint32_t sfsh = Bc * sfshstride * acctype;
const uint32_t kshstride = hsk_pad / 4 + 2;
const uint32_t ksh = Bc * kshstride * f16vec4;
const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA;
const uint32_t kshstride = (k_load_shmem ? hsk_pad : MatBr) / 4 + 2;
const uint32_t vsh_stride = MatBc / 4 * row_split;
const uint32_t ksh = ((kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)) * f16vec4;
const uint32_t slope = Br * sizeof(float);
const uint32_t slope = Br * acctype;
const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope;
const uint32_t total_size = tmpsh + Qf + Psh + sfsh + ksh + slope;
const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", kv_type=" << kv_type << ", total_size=" << total_size << ", supported=" << supported);
return supported;
}
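
As a sanity check of this accounting, here is a standalone sketch that evaluates the same byte counts for one illustrative configuration (Br = 32, Bc = 64, hsk = 128, f16 accumulators, K tile kept in shared memory); the real Br/Bc come from fa_rows_cols() and vary per device:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative parameters; the real Br/Bc come from fa_rows_cols().
    const uint32_t Br = 32, Bc = 64, hsk = 128;
    const uint32_t MatBr = 16, MatBc = 16;
    const uint32_t row_split = Bc / MatBc;        // 4 subgroups
    const uint32_t hsk_pad   = (hsk + 15) & ~15u; // 128
    const uint32_t acctype   = 2;                 // f16 accumulators
    const uint32_t f16vec4   = 8;                 // bytes per f16vec4

    const uint32_t tmpsh      = (Bc / MatBc) * sizeof(float);     // 16
    const uint32_t Qf         = Br * (hsk_pad / 4 + 2) * f16vec4; // 8704
    const uint32_t Psh        = Bc * (Br / 4 + 2) * f16vec4;      // 5120
    const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
    const uint32_t sfsh       = Bc * sfshstride * acctype;        // 5120
    const bool     k_load_shmem = true;                           // NVIDIA path
    const uint32_t kshstride  = (k_load_shmem ? hsk_pad : MatBr) / 4 + 2;
    const uint32_t vsh_stride = MatBc / 4 * row_split;            // 16
    const uint32_t ksh        = ((kshstride >= vsh_stride) ? Bc * kshstride : Bc * vsh_stride) * f16vec4; // 17408
    const uint32_t slope      = Br * acctype;                     // 64

    // 36432 bytes in total -> comfortably under a 48 KiB shared memory limit.
    printf("total shmem: %u bytes\n", tmpsh + Qf + Psh + sfsh + ksh + slope);
    return 0;
}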
@@ -8432,7 +8464,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
(dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, HSK, HSV, dst->op_params[3] == GGML_PREC_F32);
const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, HSK, HSV, dst->op_params[3] == GGML_PREC_F32, k->type);
if (!coopmat_shape_supported || !coopmat_shmem_supported) {
path = FA_SCALAR;

View File

@@ -8,6 +8,8 @@ layout (constant_id = 3) const uint32_t HSK = 32;
layout (constant_id = 4) const uint32_t HSV = 32;
layout (constant_id = 5) const uint32_t Clamp = 0;
layout (constant_id = 6) const uint32_t D_split = 16;
layout (constant_id = 7) const uint32_t SubGroupSize = 32;
layout (constant_id = 8) const uint32_t K_LOAD_SHMEM = 0;
// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
const uint32_t HSK_pad = (HSK + 15) & ~15;
@@ -74,6 +76,10 @@ layout (binding = 1) readonly buffer K_PACKED16 {A_TYPE_PACKED16 k_data_packed16
layout (binding = 2) readonly buffer V_PACKED16 {A_TYPE_PACKED16 v_data_packed16[];} v_packed;
#endif
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 1
#endif
#if defined(DATA_A_F32)
#undef BLOCK_SIZE
#define BLOCK_SIZE 4

View File

@@ -7,6 +7,7 @@
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_vote : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
@@ -14,12 +15,13 @@
#include "types.glsl"
#include "flash_attn_base.glsl"
const uint32_t HSK_per_thread = HSK / D_split;
const uint32_t HSV_per_thread = HSV / D_split;
// These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
const uint32_t MatBr = 16;
const uint32_t MatBc = 16;
const uint32_t row_split = 4;
const uint32_t row_split = Bc / MatBc;
const uint32_t rows_per_thread = Br / row_split;
const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split;
const uint32_t cols_per_iter = gl_WorkGroupSize.x / row_split;
const uint32_t cols_per_thread = Bc / cols_per_iter;
@@ -40,24 +42,24 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
return elem;
}
// These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
const uint32_t MatBr = 16;
const uint32_t MatBc = 16;
shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
shared float tmpsh[row_split];
const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
shared f16vec4 Qf[Br * qstride];
const uint psh_stride = Br / 4 + 2;
shared f16vec4 Psh[Bc * psh_stride];
// Avoid padding for hsk==256 to make it fit in 48KB shmem.
const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
shared ACC_TYPE sfsh[Bc * sfshstride];
const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4;
shared ACC_TYPEV4 sfsh[Bc * sfshstride];
const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4
shared f16vec4 ksh[Bc * kshstride];
const uint32_t kshstride = (K_LOAD_SHMEM != 0 ? HSK_pad : MatBr) / 4 + 2; // in units of f16vec4
const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups
const uint vsh_stride = v_cols;
shared f16vec4 ksh[(kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)];
shared float slope[Br];
shared ACC_TYPE slope[Br];
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
@@ -69,9 +71,9 @@ void main() {
const uint32_t tid = gl_LocalInvocationIndex;
const uint32_t threads_per_rowgroup = gl_WorkGroupSize.x / row_split;
const uint32_t d_per_thread = (HSV/4 + threads_per_rowgroup - 1) / threads_per_rowgroup;
const uint32_t row_tid = gl_LocalInvocationIndex / threads_per_rowgroup;
const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
const uint32_t col_tid = (gl_LocalInvocationIndex % threads_per_rowgroup) / D_split;
const uint32_t col_tid = gl_LocalInvocationIndex % threads_per_rowgroup;
#define tile_row(r) (row_tid * rows_per_thread + (r))
@@ -102,9 +104,9 @@ void main() {
}
barrier();
ACC_TYPEV4 Of[rows_per_thread][HSV_per_thread / 4];
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
ACC_TYPEV4 Of[rows_per_thread][d_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) {
Of[r][d] = ACC_TYPEV4(0.0);
}
}
@@ -125,13 +127,11 @@ void main() {
uint r = tid;
slope[r] = perElemOpComputeSlope(r, col_tid, ACC_TYPE(0), iq2);
}
barrier();
} else {
if (tid < Br) {
uint r = tid;
slope[r] = 1.0;
slope[r] = ACC_TYPE(1.0);
}
barrier();
}
#if BLOCK_SIZE > 1
@@ -149,19 +149,45 @@ void main() {
[[dont_unroll]]
for (uint32_t j = start_j; j < end_j; ++j) {
float mask_cache[Bc * Br / WorkGroupSize];
f16vec4 mask_cache[Bc * Br / 4 / WorkGroupSize];
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
float max_mask = NEG_FLT_MAX_OVER_2;
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) % Bc;
uint32_t r = (idx + tid) / Bc;
if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
float m = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);
if (idx + tid < Bc * Br / 4 || idx + gl_WorkGroupSize.x <= Bc * Br / 4) {
if ((!KV_bounds_check || j * Bc + c < KV)) {
f16vec4 m;
if (!nem1_bounds_check || i * Br + r * 4 + 3 < p.nem1) {
m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 3) * m_stride + (j * Bc + c)]);
max_mask = max(max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2])), float(m[3]));
} else if (i * Br + r * 4 + 2 < p.nem1) {
m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)],
0.0);
max_mask = max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2]));
} else if (i * Br + r * 4 + 1 < p.nem1) {
m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)],
data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)],
0.0,
0.0);
max_mask = max(max(max_mask, float(m[0])), float(m[1]));
} else if (i * Br + r * 4 < p.nem1) {
m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)],
0.0,
0.0,
0.0);
max_mask = max(max_mask, float(m[0]));
} else {
m = f16vec4(0.0);
}
mask_cache[idx / WorkGroupSize] = m;
max_mask = max(max_mask, m);
}
}
}
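
The if/else ladder above is a hand-unrolled, zero-padded 4-wide load with a tail bounds check on the mask rows; in scalar C++ form it is roughly the following (illustrative only — the shader unrolls the cases by hand and also folds the valid lanes into max_mask):

#include <algorithm>
#include <array>
#include <cstdint>

// Load up to 4 consecutive mask values starting at `base`, zero-filling
// any lanes that fall past the end of the valid range `n`.
std::array<float, 4> load_mask4(const float * m, uint32_t base, uint32_t n) {
    std::array<float, 4> v = {0.0f, 0.0f, 0.0f, 0.0f};
    const uint32_t count = base < n ? std::min<uint32_t>(4, n - base) : 0;
    for (uint32_t k = 0; k < count; ++k) {
        v[k] = m[base + k];
    }
    return v;
}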
@@ -180,26 +206,28 @@ void main() {
}
}
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSK / 4);
uint32_t c = (idx + tid) / (HSK / 4);
if (c < Bc && d < HSK / 4) {
f16vec4 K_Tf = f16vec4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
if (K_LOAD_SHMEM != 0) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSK / 4);
uint32_t c = (idx + tid) / (HSK / 4);
if (c < Bc && d < HSK / 4) {
f16vec4 K_Tf = f16vec4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
#else
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
#endif
}
}
ksh[c * kshstride + d] = K_Tf;
ksh[c * kshstride + d] = K_Tf;
}
}
barrier();
}
barrier();
// K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br
// Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
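
The quantized K path above addresses elements through a flat coordinate that is split into a block index (ib) and an in-block offset (iqs); a minimal sketch of that arithmetic, assuming a 32-wide quant block such as Q8_0:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t BLOCK_SIZE = 32; // e.g. 32 elements per Q8_0 block
    const uint32_t coord = 100;     // flat element index into the K tensor
    const uint32_t ib  = coord / BLOCK_SIZE; // block index        -> 3
    const uint32_t iqs = coord % BLOCK_SIZE; // offset in the block -> 4
    printf("ib=%u iqs=%u\n", ib, iqs);
    return 0;
}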
@@ -208,11 +236,55 @@ void main() {
coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
[[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
if (K_LOAD_SHMEM == 0) {
#if BLOCK_SIZE == 1
if (KV_bounds_check || d * 16 + 16 > HSK) {
#endif
barrier();
[[unroll]] for (uint32_t idx = 0; idx < Bc * MatBr / 4; idx += gl_WorkGroupSize.x) {
uint32_t col_vec = (idx + tid) % (MatBr / 4);
uint32_t row = (idx + tid) / (MatBr / 4);
if (idx + tid < Bc * MatBr / 4) {
f16vec4 K_Tf = f16vec4(0);
if ((!KV_bounds_check || j * Bc + row < KV) && (HSK == HSK_pad || d * 16 + col_vec * 4 < HSK)) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE + d * 16 + col_vec * 4;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
#else
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
#endif
}
uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
ksh[row * kshstride + col_vec] = K_Tf;
}
}
barrier();
#if BLOCK_SIZE == 1
}
#endif
#if BLOCK_SIZE == 1
if (KV_bounds_check || d * 16 + 16 > HSK)
#endif
{
uint coord = (gl_SubgroupID * MatBc) * kshstride;
coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
}
#if BLOCK_SIZE == 1
else {
const uint coord = k_offset / 4 + (j * Bc + gl_SubgroupID * MatBc) * k_stride / 4 + d * 16 / 4;
coopMatLoad(KMat, data_kv4, coord, k_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
}
#endif
} else {
uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
}
coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
SfMat = coopMatMulAdd(KMat, QMat, SfMat);
}
@@ -222,26 +294,26 @@ void main() {
barrier();
if (p.logit_softcap != 0.0f) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / Br;
uint32_t r = (idx + tid) % Br;
if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
sfsh[c * sfshstride + r] = ACC_TYPE(p.logit_softcap * tanh(sfsh[c * sfshstride + r]));
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);
if (idx + tid < Bc * Br / 4 || idx + gl_WorkGroupSize.x <= Bc * Br / 4) {
sfsh[c * sfshstride + r] = ACC_TYPEV4(p.logit_softcap * tanh(sfsh[c * sfshstride + r]));
}
}
barrier();
}
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) % Bc;
uint32_t r = (idx + tid) / Bc;
if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
float f = mask_cache[idx / WorkGroupSize];
sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * f);
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);
if (idx + tid < Bc * Br / 4 || idx + gl_WorkGroupSize.x <= Bc * Br / 4) {
if (!KV_bounds_check || j * Bc + c < KV) {
// Mask nem1 bounds check is handled when loading masks
ACC_TYPEV4 masks = ACC_TYPEV4(mask_cache[idx / WorkGroupSize]);
ACC_TYPEV4 slopes = ACC_TYPEV4(slope[r * 4], slope[r * 4 + 1], slope[r * 4 + 2], slope[r * 4 + 3]);
sfsh[c * sfshstride + r] += slopes * masks;
}
}
}
@@ -250,51 +322,145 @@ void main() {
float eMf[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint r_vec = tile_row(r) / 4;
const uint r_comp = tile_row(r) % 4;
float rowmaxf = NEG_FLT_MAX_OVER_2;
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
rowmaxf = max(rowmaxf, float(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride]));
rowmaxf = max(rowmaxf, float(sfsh[r_vec + (c * cols_per_iter + col_tid) * sfshstride][r_comp]));
}
float Moldf = Mf[r];
// Compute max across the row
rowmaxf = subgroupMax(rowmaxf);
// M = max(rowmax, Mold)
// P = e^(S - M)
// eM = e^(Mold - M)
Mf[r] = max(rowmaxf, Moldf);
eMf[r] = exp(Moldf - Mf[r]);
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] = ACC_TYPE(eMf[r]) * Of[r][d];
}
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Lf[r] = eMf[r]*Lf[r];
}
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
float Pf[rows_per_thread];
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Pf[r] = exp(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride] - Mf[r]);
Lf[r] += Pf[r];
Of[r][d_local] = ACC_TYPE(eMf[r]) * Of[r][d_local];
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
vec4 Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
}
// Calculate and store Pf in Psh
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
const uint col = c * cols_per_iter + col_tid;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; r += 4) {
const uint row = tile_row(r);
if (KV_bounds_check && j * Bc + col >= KV) {
Psh[col * psh_stride + row / 4] = f16vec4(0.0f);
} else {
const vec4 mfvec = vec4(Mf[r], Mf[r + 1], Mf[r + 2], Mf[r + 3]);
const f16vec4 Pf = f16vec4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
[[unroll]] for (uint32_t vec_idx = 0; vec_idx < 4; ++vec_idx) {
Lf[r + vec_idx] += Pf[vec_idx];
}
Psh[col * psh_stride + row / 4] = Pf;
}
}
}
const uint num_hsv_tiles = (HSV + MatBc * row_split - 1) / (MatBc * row_split); // round up
// Each subgroup handles HSV/4 columns
[[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) {
const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16;
SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
// Preload V tiles for [Bc, 16 * num subgroups]
const uint v_rows = Bc;
const uint v_total = v_rows * v_cols;
const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x;
#if BLOCK_SIZE == 1
// For f16, only preload if not aligned
if (KV_bounds_check) {
#endif
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] += ACC_TYPE(Pf[r]) * ACC_TYPEV4(Vf);
[[unroll]] for (uint32_t i = 0; i < v_loads_per_thread; ++i) {
const uint idx = i * gl_WorkGroupSize.x + tid;
const uint row = idx / v_cols;
const uint col = idx % v_cols;
const uint v_row = j * Bc + row;
const uint v_col = hsv_tile * MatBc * row_split + col * 4;
const uint coord = v_row * v_stride * BLOCK_SIZE + v_col;
const uint ib = coord / BLOCK_SIZE;
const uint iqs = coord % BLOCK_SIZE;
if (!KV_bounds_check || (v_row < KV && v_col < HSV)) {
#if BLOCK_SIZE > 1
ksh[row * vsh_stride + col] = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V));
#else
ksh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
#endif
} else {
ksh[row * vsh_stride + col] = f16vec4(0.0f);
}
}
#if BLOCK_SIZE == 1
}
#endif
barrier();
[[unroll]] for (uint32_t bc_chunk = 0; bc_chunk < Bc / MatBc; ++bc_chunk) {
coopMatLoad(KMat, Psh, bc_chunk * MatBc * psh_stride, psh_stride, gl_CooperativeMatrixLayoutColumnMajor);
#if BLOCK_SIZE == 1
if (!KV_bounds_check) {
// F16 values can be loaded directly from global memory
const uint v_tile_row = j * Bc + bc_chunk * MatBc;
const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
} else
#endif
{
const uint v_tile_offset = bc_chunk * MatBr * v_cols + gl_SubgroupID * (MatBc / 4);
coopMatLoad(QMat, ksh, v_tile_offset, vsh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
SfMat = coopMatMulAdd(KMat, QMat, SfMat);
}
// Store SfMat to sfsh and load into Of
const uint osh_stride = row_split * MatBc / 4;
const uint o_offset = gl_SubgroupID * MatBc / 4;
coopMatStore(SfMat, sfsh, o_offset, osh_stride, gl_CooperativeMatrixLayoutRowMajor);
barrier();
const uint hsv_per_tile = row_split * MatBc;
const uint hsv_base = hsv_tile * hsv_per_tile;
const uint d_values_per_tile = hsv_per_tile / 4;
const uint d_start = hsv_tile * d_values_per_tile;
const uint d_end = min(d_start + d_values_per_tile, HSV / 4);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
[[unroll]] for (uint32_t d_local = 0; d_local < d_per_thread; ++d_local) {
const uint d = d_local * threads_per_rowgroup + col_tid;
const uint hsv_col = 4 * d;
if (hsv_col >= hsv_base && hsv_col < hsv_base + hsv_per_tile && hsv_col < HSV) {
const uint local_hsv = (hsv_col - hsv_base) / 4;
Of[r][d_local] += ACC_TYPEV4(sfsh[row * osh_stride + local_hsv]);
}
}
}
}
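
The Mf/Lf bookkeeping in this loop is the standard streaming-softmax update used by flash attention; as a scalar C++ reference, folding one incoming score block into the running state of a single row looks roughly like this (toy layout, not the shader's):

#include <algorithm>
#include <cmath>
#include <vector>

// One streaming-softmax step for a single row: fold a new block of raw
// scores S and their value rows V into the running state (M, L, O).
// Assumes S.size() == V.size() and every V[c] has O.size() elements.
void online_softmax_step(float & M, float & L, std::vector<float> & O,
                         const std::vector<float> & S,
                         const std::vector<std::vector<float>> & V) {
    float rowmax = -1e30f;
    for (float s : S) rowmax = std::max(rowmax, s);

    const float Mold = M;
    M = std::max(rowmax, Mold);          // M  = max(rowmax, Mold)
    const float eM = std::exp(Mold - M); // eM = e^(Mold - M)

    // Rescale the running sum and output, then accumulate P = e^(S - M).
    L *= eM;
    for (float & o : O) o *= eM;
    for (size_t c = 0; c < S.size(); ++c) {
        const float P = std::exp(S[c] - M);
        L += P;
        for (size_t d = 0; d < O.size(); ++d) {
            O[d] += P * V[c][d];
        }
    }
    // After the last block, the normalized output is O / L.
}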
@@ -302,69 +468,8 @@ void main() {
barrier();
}
// prevent race on tmpsh
barrier();
// reduce across threads
float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
FLOAT_TYPE M = Mf[r];
tmpsh[tid] = M;
// Compute max across the row
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
M = max(M, tmpsh[tid ^ s]);
barrier();
tmpsh[tid] = M;
barrier();
}
rowmaxf[r] = tmpsh[d_tid + row_tid * threads_per_rowgroup];
barrier();
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Moldf[r] = Mf[r];
// M = max(rowmax, Mold)
// eM = e^(Mold - M)
Mf[r] = max(rowmaxf[r], Moldf[r]);
eMf[r] = exp(Moldf[r] - Mf[r]);
Lf[r] = eMf[r]*Lf[r];
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
FLOAT_TYPE L = Lf[r];
tmpsh[tid] = L;
// Compute sum across the row
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
L += tmpsh[tid ^ s];
barrier();
tmpsh[tid] = L;
barrier();
}
Lf[r] = tmpsh[d_tid + row_tid * threads_per_rowgroup];
barrier();
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
Of[r][d] = ACC_TYPE(eMf[r]) * Of[r][d];
tmpshv4[tid] = Of[r][d];
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
Of[r][d] += tmpshv4[tid ^ s];
barrier();
tmpshv4[tid] = Of[r][d];
barrier();
}
Of[r][d] = tmpshv4[d_tid + row_tid * threads_per_rowgroup];
barrier();
}
Lf[r] = subgroupAdd(Lf[r]);
}
// If there is split_k, then the split_k resolve shader does the final
@@ -375,9 +480,12 @@ void main() {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV/4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
perElemOpGqaStore(tile_row(r), 4 * d + comp, float(Of[r][d_local][comp]), o_offset, iq2, N);
}
}
}
@@ -404,8 +512,9 @@ void main() {
if (sink > Mf[r]) {
ms = exp(Mf[r] - sink);
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
Of[r][d] *= ACC_TYPE(ms);
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
Of[r][d_local] *= ACC_TYPE(ms);
}
} else {
vs = exp(sink - Mf[r]);
@@ -420,11 +529,12 @@ void main() {
Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] *= ACC_TYPE(Lfrcp[r]);
Of[r][d_local] *= ACC_TYPE(Lfrcp[r]);
#if defined(ACC_TYPE_MAX)
Of[r][d] = clamp(Of[r][d], -ACC_TYPE_MAX, ACC_TYPE_MAX);
Of[r][d_local] = clamp(Of[r][d_local], -ACC_TYPE_MAX, ACC_TYPE_MAX);
#endif
}
}
@@ -434,9 +544,12 @@ void main() {
if (p.gqa_ratio > 1) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV / 4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
perElemOpGqaStore(tile_row(r), 4 * d + comp, float(Of[r][d_local][comp]), o_offset, iq2, N);
}
}
}
@@ -444,9 +557,12 @@ void main() {
} else {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (i * Br + tile_row(r) < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV / 4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
data_o[o_offset + iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
data_o[o_offset + iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV + 4 * d + comp] = D_TYPE(Of[r][d_local][comp]);
}
}
}

View File

@@ -55,7 +55,7 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
return max(elem0, elem1);
}
#if defined(BLOCK_SIZE)
#if BLOCK_SIZE > 1
#define DECODEFUNC , DEQUANTFUNC
#else
#define DECODEFUNC
@@ -85,7 +85,7 @@ void main() {
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
#if defined(BLOCK_SIZE)
#if BLOCK_SIZE > 1
tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE);
tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
#endif
@@ -98,7 +98,7 @@ void main() {
if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
{
q_stride &= ~7;
#if !defined(BLOCK_SIZE)
#if BLOCK_SIZE == 1
k_stride &= ~7;
v_stride &= ~7;
#endif

File diff suppressed because it is too large

View File

@@ -21,7 +21,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
ExternalProject_Add(
zendnn
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
GIT_TAG zendnnl
GIT_TAG 21ce8f7879c86bf3637f707fae6f29e0951db5fe
PREFIX ${ZENDNN_PREFIX}
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
BINARY_DIR ${ZENDNN_BUILD_DIR}

View File

@@ -2,7 +2,6 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "zendnnl.hpp"
#include <cstring>
@@ -122,8 +121,8 @@ static void ggml_zendnn_compute_forward_mul_mat(
GGML_TENSOR_BINARY_OP_LOCALS
ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
ggml_type const vec_dot_type = src0->type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);

View File

@@ -309,7 +309,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct io, takes precedence over use_mmap
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
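
A minimal usage sketch for this flag through the public llama.h API (note that direct I/O now defaults to off, per the llama_model_default_params() change further down):

#include "llama.h"

// Opt in to direct I/O explicitly; when the platform does not support it,
// the loader falls back to mmap (see the llama_model_loader change below).
int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    llama_model_params params = llama_model_default_params();
    params.use_direct_io = true; // takes precedence over use_mmap when supported
    llama_model * model = llama_model_load_from_file(argv[1], params);
    if (!model) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}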

View File

@@ -253,11 +253,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
// Create a dummy batch for initialization.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@@ -1225,7 +1221,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_queued_tokens += n_tokens;
// reserve output buffer
if (output_reserve(n_tokens, batch_inp) < n_tokens) {
if (output_reserve(n_tokens) < n_tokens) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
return -2;
};
@@ -1456,6 +1452,23 @@ static void copy_tensor_async_candidates(
}
}
static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_seq_id, llama_sampler *> & samplers) {
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
if (!ubatch.output[i]) {
continue;
}
// Check if the output token has at least one sequence without a backend sampler.
for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
llama_seq_id seq_id = ubatch.seq_id[i][j];
if (samplers.find(seq_id) == samplers.end()) {
return true;
}
}
}
return false; // all sequences use backend sampling
}
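
To make the predicate concrete, here is a toy stand-alone version with hypothetical simplified types in place of llama_ubatch and the sampler map:

#include <cstdint>
#include <map>
#include <vector>

using llama_seq_id = int32_t;

// Hypothetical, simplified stand-in for llama_ubatch, just to
// illustrate the predicate above.
struct toy_ubatch {
    std::vector<bool>                      output;  // which tokens produce output
    std::vector<std::vector<llama_seq_id>> seq_ids; // sequences per token
};

static bool toy_needs_raw_logits(const toy_ubatch & ub, const std::map<llama_seq_id, int> & samplers) {
    for (size_t i = 0; i < ub.output.size(); ++i) {
        if (!ub.output[i]) continue;
        for (llama_seq_id s : ub.seq_ids[i]) {
            if (samplers.find(s) == samplers.end()) {
                return true; // some sequence still needs CPU-side logits
            }
        }
    }
    return false; // every output token is covered by a backend sampler
}

int main() {
    toy_ubatch ub { {true, true}, {{0}, {1}} };
    std::map<llama_seq_id, int> samplers { {0, 1} }; // only seq 0 has a backend sampler
    // seq 1 has no backend sampler -> raw logits must still be copied
    return toy_needs_raw_logits(ub, samplers) ? 0 : 1;
}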
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
@@ -1588,7 +1601,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2;
};
@@ -1661,10 +1674,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// extract logits
// For multi-sequence batches that mix backend samplers and CPU sampler
// this is currently inefficient as we copy all logits even for the
// backend sampled tokens.
if (logits && t_logits && n_outputs > 0) {
if (logits && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
@@ -1734,11 +1744,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
}
// This flag indicates whether a backend sampler has actually sampled a specific
// token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
if (has_samplers && has_sampled) {
// Copy backend sampling output if this ubatch produced any sampling tensors.
if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
const auto stride = n_vocab;
@@ -1813,7 +1820,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
// output
//
uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
@@ -1832,45 +1840,16 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
has_embd = true;
}
// Check which sampling modes are needed for the current batch.
// TODO: avoid this branching by working with the worst-case
bool has_sampling = false;
bool cpu_logits = false;
if (batch.logits) {
for (int32_t i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
llama_seq_id seq_id = batch.seq_id[i][j];
if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
has_sampling = true;
} else {
cpu_logits = true;
}
}
}
} else {
// When batch.logits is nullptr (when loading state with a dummy batch),
// allocate CPU logits.
cpu_logits = true;
}
size_t backend_float_count = 0;
size_t backend_token_count = 0;
// Allocate CPU logits buffer only if needed by sequences in this batch
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
logits_size = has_logits ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
// TODO: avoid this branching by working with the worst-case
if (!has_sampling) {
sampling.logits_size = 0;
sampling.probs_size = 0;
sampling.sampled_size = 0;
sampling.candidates_size = 0;
} else {
// Allocate backend sampling output buffers if there are backend samplers configured.
const bool has_sampling = !sampling.samplers.empty();
if (has_sampling) {
sampling.logits_size = n_vocab*n_outputs_max;
sampling.probs_size = n_vocab*n_outputs_max;
sampling.sampled_size = n_outputs_max;
@@ -1928,7 +1907,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
logits = (has_logits && cpu_logits) ? output_base : nullptr;
logits = has_logits ? output_base : nullptr;
offset += logits_size * sizeof(float);
embd = has_embd ? (float *) (base + offset) : nullptr;
@@ -2614,10 +2593,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
auto n_outputs = this->n_outputs;
io.read_to(&n_outputs, sizeof(n_outputs));
// Create a dummy batch for state loading.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
if (n_outputs > output_reserve(n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
@@ -2862,7 +2838,7 @@ void llama_context::opt_epoch_iter(
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error");
};

View File

@@ -212,7 +212,7 @@ private:
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
uint32_t output_reserve(int32_t n_outputs);
void output_reorder();

View File

@@ -541,15 +541,15 @@ llama_model_loader::llama_model_loader(
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
// Disable mmap, as DirectIO is available
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
use_mmap = false;
} else {
// Disable DirectIO and reopen file using std::fopen for mmap
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
use_direct_io = false;
// reopen file using std::fopen for mmap
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
}
}

View File

@@ -8125,7 +8125,7 @@ llama_model_params llama_model_default_params() {
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
/*.use_direct_io =*/ true,
/*.use_direct_io =*/ false,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,

View File

@@ -545,7 +545,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());

Some files were not shown because too many files have changed in this diff