vulkan: split mul_mat into multiple dispatches to avoid overflow (#19509)

* vulkan: split mul_mat into multiple dispatches to avoid overflow
  The batch dimensions can be greater than the max workgroup count limit, in which case we need to split into multiple dispatches and pass the base index through a push constant. Fall back for the less common p021 and nc variants.
* address feedback
common : make small string helpers as inline functions (#19693)
2026-05-22 08:54:06 +00:00 · 2026-02-18 10:47:10 +01:00 · 2026-02-18 08:03:01 +01:00 · 2026-02-17 14:47:18 -08:00 · 2026-02-17 13:56:09 -08:00 · 2026-02-17 20:43:22 +01:00
38 changed files with 958 additions and 605 deletions
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -17,7 +17,7 @@ jobs:

      - name: Install komac
        run: |
-          cargo binstall komac@2.11.2 -y
+          cargo binstall komac@2.15.0 -y

      - name: Find latest release
        id: find_latest_release
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -5,7 +5,6 @@ find_package(Threads REQUIRED)
 llama_add_compile_flags()

 # Build info header
-#

 if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
@@ -110,29 +109,16 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
-set(LLAMA_COMMON_EXTRA_LIBS build_info)
-set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
+target_link_libraries(${TARGET} PRIVATE
+    build_info
+    cpp-httplib
+)

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
-
-    # Set the correct library file extension based on platform
-    if (WIN32)
-        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
-        # Add Windows-specific libraries
-        set(LLGUIDANCE_PLATFORM_LIBS
-            ws2_32    # Windows Sockets API
-            userenv   # For GetUserProfileDirectoryW
-            ntdll     # For NT functions
-            bcrypt    # For BCryptGenRandom
-        )
-    else()
-        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
-        set(LLGUIDANCE_PLATFORM_LIBS "")
-    endif()
+    set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
@@ -154,8 +140,10 @@ if (LLAMA_LLGUIDANCE)
    add_dependencies(llguidance llguidance_ext)

    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    # Add platform libraries to the main target
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
-endif ()
+    target_link_libraries(${TARGET} PRIVATE llguidance)
+    if (WIN32)
+        target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
+    endif()
+endif()

-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -452,34 +452,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

-bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
-
-bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
-    bool has_suffix = string_ends_with(str, suffix);
-    if (has_suffix) {
-        str = str.substr(0, str.size() - suffix.size());
-    }
-    return has_suffix;
-}
-
-size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
-    if (!str.empty() && !stop.empty()) {
-        const char text_last_char = str.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
-            if (stop[char_index] == text_last_char) {
-                const auto current_partial = stop.substr(0, char_index + 1);
-                if (string_ends_with(str, current_partial)) {
-                    return str.size() - char_index - 1;
-                }
-            }
-        }
-    }
-
-    return std::string::npos;
-}
-
 std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$&");
--- a/common/common.h
+++ b/common/common.h
@@ -670,30 +670,55 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 }

 template<>
-inline std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
 {
    std::vector<std::string> parts;
    size_t begin_pos = 0;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+    size_t delim_pos = str.find(delim);
+    while (delim_pos != std::string::npos) {
+        std::string part = str.substr(begin_pos, delim_pos - begin_pos);
        parts.emplace_back(part);
-        begin_pos = separator_pos + 1;
-        separator_pos = input.find(separator, begin_pos);
+        begin_pos = delim_pos + 1;
+        delim_pos = str.find(delim, begin_pos);
    }
-    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    parts.emplace_back(str.substr(begin_pos));
    return parts;
 }

-inline bool string_starts_with(const std::string & str,
-                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
+// remove when moving to c++20
+inline bool string_starts_with(std::string_view str, std::string_view prefix) {
+    return str.size() >= prefix.size() &&
+           str.compare(0, prefix.size(), prefix) == 0;
 }

-// While we wait for C++20's std::string::ends_with...
-bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
-bool string_remove_suffix(std::string & str, const std::string_view & suffix);
-size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
+// remove when moving to c++20
+inline bool string_ends_with(std::string_view str, std::string_view suffix) {
+    return str.size() >= suffix.size() &&
+           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
+    if (string_ends_with(str, suffix)) {
+        str.resize(str.size() - suffix.size());
+        return true;
+    }
+    return false;
+}
+
+inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
+    if (!str.empty() && !stop.empty()) {
+        const size_t max_len = std::min(str.size(), stop.size());
+        const char last_char = str.back();
+        for (size_t len = max_len; len > 0; --len) {
+            if (stop[len - 1] == last_char) {
+                if (string_ends_with(str, stop.substr(0, len))) {
+                    return str.size() - len;
+                }
+            }
+        }
+    }
+    return std::string::npos;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1049,6 +1049,9 @@ class TextModel(ModelBase):
        if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
            # ref: https://huggingface.co/zai-org/GLM-4.5-Air
            res = "glm4"
+        if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
+            # ref: https://huggingface.co/zai-org/GLM-4.7-Flash
+            res = "glm4"
        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
            res = "minerva-7b"
@@ -1082,9 +1085,6 @@ class TextModel(ModelBase):
        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
            # ref: https://huggingface.co/aari1995/German_Semantic_V3
            res = "jina-v2-de"
-        if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
-            # ref: https://huggingface.co/zai-org/GLM-4.7-Flash
-            res = "glm4"
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
@@ -1268,6 +1268,9 @@ class TextModel(ModelBase):
        if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
            # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
            res = "qwen35"
+        if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
+            # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash
+            res = "joyai-llm"

        if res is None:
            logger.warning("\n")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -149,7 +149,8 @@ models = [
    {"name": "youtu",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
    {"name": "solar-open",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
    {"name": "exaone-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
-    {"name": "qwen35",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }
+    {"name": "qwen35",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
+    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -159,6 +160,7 @@ pre_computed_hashes = [
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
@@ -172,7 +174,6 @@ pre_computed_hashes = [
    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
    # jina-v2-de variants
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
-    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
 ]


--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -42,11 +42,15 @@ def load_model_and_tokenizer(model_path, device="auto"):
        config = config.text_config
        multimodal = True

-    print("Vocab size:       ", config.vocab_size)
-    print("Hidden size:      ", config.hidden_size)
-    print("Number of layers: ", config.num_hidden_layers)
-    print("BOS token id:     ", config.bos_token_id)
-    print("EOS token id:     ", config.eos_token_id)
+    def print_if_exists(label, obj, attr, default="N/A"):
+        val = getattr(obj, attr) if hasattr(obj, attr) else default
+        print(f"{label}", val)
+
+    print_if_exists("Vocab size:       ", config, "vocab_size")
+    print_if_exists("Hidden size:      ", config, "hidden_size")
+    print_if_exists("Number of layers: ", config, "num_hidden_layers")
+    print_if_exists("BOS token id:     ", config, "bos_token_id")
+    print_if_exists("EOS token id:     ", config, "eos_token_id")

    unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
    if unreleased_model_name:
--- a/examples/model-conversion/scripts/utils/tensor-info.py
+++ b/examples/model-conversion/scripts/utils/tensor-info.py
@@ -78,7 +78,7 @@ def list_all_tensors(model_path: Path, unique: bool = False):
            print(tensor_name)


-def print_tensor_info(model_path: Path, tensor_name: str):
+def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
    tensor_file = find_tensor_file(model_path, tensor_name)

    if tensor_file is None:
@@ -96,6 +96,12 @@ def print_tensor_info(model_path: Path, tensor_name: str):
                print(f"Tensor: {tensor_name}")
                print(f"File:   {tensor_file}")
                print(f"Shape:  {shape}")
+                if num_values is not None:
+                    tensor = f.get_tensor(tensor_name)
+                    print(f"Dtype:  {tensor.dtype}")
+                    flat = tensor.flatten()
+                    n = min(num_values, flat.numel())
+                    print(f"Values: {flat[:n].tolist()}")
            else:
                print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
                sys.exit(1)
@@ -127,6 +133,15 @@ def main():
        action="store_true",
        help="List unique tensor patterns in the model (layer numbers replaced with #)"
    )
+    parser.add_argument(
+        "-n", "--num-values",
+        nargs="?",
+        const=10,
+        default=None,
+        type=int,
+        metavar="N",
+        help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
+    )

    args = parser.parse_args()

@@ -152,7 +167,7 @@ def main():
        if args.tensor_name is None:
            print("Error: tensor_name is required when not using --list")
            sys.exit(1)
-        print_tensor_info(model_path, args.tensor_name)
+        print_tensor_info(model_path, args.tensor_name, args.num_values)


 if __name__ == "__main__":
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -9,6 +9,11 @@ function(ggml_add_cpu_backend_features cpu_name arch)
    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
    set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    # Disable LTO for the feature detection code to prevent cross-module optimization
+    # from inlining architecture-specific instructions into the score function.
+    # Without this, LTO can cause SIGILL when loading backends on older CPUs
+    # (e.g., loading power10 backend on power9 crashes before feature check runs).
+    target_compile_options(${GGML_CPU_FEATS_NAME} PRIVATE -fno-lto)
    target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
 endfunction()

--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2278,11 +2278,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

+    // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
        if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
            if (ggml_is_quantized(src0->type)) {
-                if (ne2 <= 4) {
+                if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
                    ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
                    return;
                }
@@ -2305,6 +2306,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        }
    }

+    // note: this path should not be reached when recording CUDA graphs, because it requires stream synchronization
+    // TODO: add asserts to verify this. should work with CUDA, HIP, etc.
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(nb12 % nb11 == 0);
@@ -2865,15 +2868,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
    bool use_cuda_graph = true;
    // Loop over nodes in GGML graph to obtain info needed for CUDA graph

-    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
-    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
-    const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
-    const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
-    const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
-    const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
-    const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
-    const std::string delta_net_prefix = "dnet_add";
-
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

@@ -2888,31 +2882,14 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
 #endif
        }

-        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
-            use_cuda_graph = false; // This node type is not supported by CUDA graph capture
-#ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
-#endif
-        }
-
-        if (node->op == GGML_OP_ADD &&
-            node->src[1] && node->src[1]->ne[1] > 1 &&
-            (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
-            (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
-            strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
-            strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
-            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
-            strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
-            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
-            // by means of matching node names. See
-            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
-            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
-            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+        // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
+        if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
+            // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
+            // TODO: figure out a way to enable for larger batch sizes, without hurting performance
+            // ref: https://github.com/ggml-org/llama.cpp/pull/18958
            use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
 #endif
        }

--- a/ggml/src/ggml-cuda/mmvq.cuh
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@@ -1,6 +1,7 @@
 #include "common.cuh"

 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID

 void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -484,7 +484,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
    cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
    cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
-    cl_kernel kernel_mean_f32;
+    cl_kernel kernel_mean_f32, kernel_mean_f32_4;
    cl_kernel kernel_silu, kernel_silu_4;
    cl_kernel kernel_gelu, kernel_gelu_4;
    cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
@@ -543,15 +543,15 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_solve_tri_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
-    cl_kernel kernel_sum_rows_f32;
+    cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
    cl_kernel kernel_repeat_f32;
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
-    cl_kernel kernel_expm1_f32_nd;
-    cl_kernel kernel_expm1_f16_nd;
-    cl_kernel kernel_softplus_f32_nd;
-    cl_kernel kernel_softplus_f16_nd;
+    cl_kernel kernel_expm1_f32, kernel_expm1_f32_4, kernel_expm1_f32_nc;
+    cl_kernel kernel_expm1_f16, kernel_expm1_f16_4, kernel_expm1_f16_nc;
+    cl_kernel kernel_softplus_f32, kernel_softplus_f32_4, kernel_softplus_f32_nc;
+    cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
    cl_kernel kernel_concat_f32;
@@ -1837,6 +1837,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_mean_f32_4 = clCreateKernel(prog, "kernel_mean_f32_4", &err), err));

        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
@@ -1874,6 +1875,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sum_rows_f32_4 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32_4", &err), err));
        GGML_LOG_CONT(".");
    }

@@ -1978,20 +1980,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
        const std::string kernel_src = read_file("expm1.cl");
 #endif
-        cl_program prog;
-        if (!kernel_src.empty()) {
-            prog =
-                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-            CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
-            CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
-            prog = nullptr;
-            backend_ctx->kernel_expm1_f32_nd = nullptr;
-            backend_ctx->kernel_expm1_f16_nd = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_expm1_f32    = clCreateKernel(prog, "kernel_expm1_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f32_4  = clCreateKernel(prog, "kernel_expm1_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f32_nc = clCreateKernel(prog, "kernel_expm1_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16    = clCreateKernel(prog, "kernel_expm1_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16_4  = clCreateKernel(prog, "kernel_expm1_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16_nc = clCreateKernel(prog, "kernel_expm1_f16_nc", &err), err));
        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
    }

    // softplus
@@ -2003,20 +2001,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
        const std::string kernel_src = read_file("softplus.cl");
 #endif
-        cl_program prog;
-        if (!kernel_src.empty()) {
-            prog =
-                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-            CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
-            CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
-            prog = nullptr;
-            backend_ctx->kernel_softplus_f32_nd = nullptr;
-            backend_ctx->kernel_softplus_f16_nd = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_softplus_f32    = clCreateKernel(prog, "kernel_softplus_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f32_4  = clCreateKernel(prog, "kernel_softplus_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f32_nc = clCreateKernel(prog, "kernel_softplus_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16    = clCreateKernel(prog, "kernel_softplus_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16_4  = clCreateKernel(prog, "kernel_softplus_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16_nc = clCreateKernel(prog, "kernel_softplus_f16_nc", &err), err));
        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
    }

    // upscale
@@ -3463,11 +3457,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                case GGML_UNARY_OP_TANH:
                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                case GGML_UNARY_OP_EXPM1:
-                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
-                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                case GGML_UNARY_OP_SOFTPLUS:
-                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
-                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                default:
                    return false;
            }
@@ -3587,7 +3579,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
        }
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
-            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                const ggml_tensor * q = op->src[0];
@@ -6400,7 +6392,6 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -6423,7 +6414,14 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];

-    cl_kernel kernel = backend_ctx->kernel_mean_f32;
+    cl_kernel kernel;
+
+    const bool is_c4 = ne00 % 4 == 0;
+    if (is_c4) {
+        kernel = backend_ctx->kernel_mean_f32_4;
+    } else {
+        kernel = backend_ctx->kernel_mean_f32;
+    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -6440,7 +6438,7 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));

-    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -7388,18 +7386,8 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

-    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
-    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
-
-    cl_kernel kernel;
-    if (dst->type == GGML_TYPE_F32) {
-        kernel = backend_ctx->kernel_expm1_f32_nd;
-    } else if (dst->type == GGML_TYPE_F16) {
-        kernel = backend_ctx->kernel_expm1_f16_nd;
-    } else {
-        GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
-    }
-    GGML_ASSERT(kernel != nullptr);
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
@@ -7411,70 +7399,74 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

-    const int ne10 = dst->ne[0];
-    const int ne11 = dst->ne[1];
-    const int ne12 = dst->ne[2];
-    const int ne13 = dst->ne[3];
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];

-    const cl_ulong nb10 = dst->nb[0];
-    const cl_ulong nb11 = dst->nb[1];
-    const cl_ulong nb12 = dst->nb[2];
-    const cl_ulong nb13 = dst->nb[3];
+    cl_kernel kernel;

-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
-
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
-
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-    size_t global_work_size[3];
-    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-        return;
-    }
-    global_work_size[0] = (size_t)ne10;
-    global_work_size[1] = (size_t)ne11;
-    global_work_size[2] = (size_t)ne12;
-
-    size_t lws0 = 16, lws1 = 4, lws2 = 1;
-    if (ne10 < 16) lws0 = ne10;
-    if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
-
-    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
-
-
-    size_t local_work_size[] = {lws0, lws1, lws2};
-
-    size_t* local_work_size_ptr = local_work_size;
-    if (!backend_ctx->non_uniform_workgroups) {
-        if (global_work_size[0] % local_work_size[0] != 0 ||
-            global_work_size[1] % local_work_size[1] != 0 ||
-            global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL;
+    if (ggml_is_contiguous(src0)) {
+        // Handle contiguous input
+        int n = ggml_nelements(dst);
+        if (n % 4 == 0) {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_expm1_f32_4;
+            } else {
+                kernel = backend_ctx->kernel_expm1_f16_4;
+            }
+            n /= 4;
+        } else {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_expm1_f32;
+            } else {
+                kernel = backend_ctx->kernel_expm1_f16;
+            }
        }
-    }
-    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;
+        }
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    } else {
+        // Handle non-contiguous input
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_expm1_f32_nc;
+        } else {
+            kernel = backend_ctx->kernel_expm1_f16_nc;
+        }
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+        int nth = 64;
+
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
 }

 static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7490,18 +7482,8 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

-    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
-    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
-
-    cl_kernel kernel;
-    if (dst->type == GGML_TYPE_F32) {
-        kernel = backend_ctx->kernel_softplus_f32_nd;
-    } else if (dst->type == GGML_TYPE_F16) {
-        kernel = backend_ctx->kernel_softplus_f16_nd;
-    } else {
-        GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
-    }
-    GGML_ASSERT(kernel != nullptr);
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
@@ -7513,70 +7495,74 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

-    const int ne10 = dst->ne[0];
-    const int ne11 = dst->ne[1];
-    const int ne12 = dst->ne[2];
-    const int ne13 = dst->ne[3];
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];

-    const cl_ulong nb10 = dst->nb[0];
-    const cl_ulong nb11 = dst->nb[1];
-    const cl_ulong nb12 = dst->nb[2];
-    const cl_ulong nb13 = dst->nb[3];
+    cl_kernel kernel;

-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
-
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
-
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-    size_t global_work_size[3];
-    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-        return;
-    }
-    global_work_size[0] = (size_t)ne10;
-    global_work_size[1] = (size_t)ne11;
-    global_work_size[2] = (size_t)ne12;
-
-    size_t lws0 = 16, lws1 = 4, lws2 = 1;
-    if (ne10 < 16) lws0 = ne10;
-    if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
-
-    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
-
-
-    size_t local_work_size[] = {lws0, lws1, lws2};
-
-    size_t* local_work_size_ptr = local_work_size;
-    if (!backend_ctx->non_uniform_workgroups) {
-        if (global_work_size[0] % local_work_size[0] != 0 ||
-            global_work_size[1] % local_work_size[1] != 0 ||
-            global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL;
+    if (ggml_is_contiguous(src0)) {
+        // Handle contiguous input
+        int n = ggml_nelements(dst);
+        if (n % 4 == 0) {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_softplus_f32_4;
+            } else {
+                kernel = backend_ctx->kernel_softplus_f16_4;
+            }
+            n /= 4;
+        } else {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_softplus_f32;
+            } else {
+                kernel = backend_ctx->kernel_softplus_f16;
+            }
        }
-    }
-    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;
+        }
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    } else {
+        // Handle non-contiguous input
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_softplus_f32_nc;
+        } else {
+            kernel = backend_ctx->kernel_softplus_f16_nc;
+        }
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+        int nth = 64;
+
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
 }

 static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -11088,7 +11074,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -11111,7 +11096,14 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];

-    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+    cl_kernel kernel;
+
+    const bool is_c4 = ne00 % 4 == 0;
+    if (is_c4) {
+        kernel = backend_ctx->kernel_sum_rows_f32_4;
+    } else {
+        kernel = backend_ctx->kernel_sum_rows_f32;
+    }

    CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong), &offset0));
@@ -11128,7 +11120,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    CL_CHECK(clSetKernelArg(kernel,  12, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel,  13, sizeof(cl_ulong), &nb3));

-    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
--- a/ggml/src/ggml-opencl/kernels/expm1.cl
+++ b/ggml/src/ggml-opencl/kernels/expm1.cl
@@ -3,80 +3,111 @@
 //------------------------------------------------------------------------------
 // expm1
 //------------------------------------------------------------------------------
-kernel void kernel_expm1_f32_nd(
-        global void * p_src0_base,
-        ulong off_src0_abs,
-        global void * p_dst_base,
-        ulong off_dst_abs,
-        int ne00,
-        int ne01,
-        int ne02,
-        int ne03,
+
+kernel void kernel_expm1_f32(
+        global const float * src0,
+        ulong                offset0,
+        global       float * dst,
+        ulong                offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst  = (global float*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
+}
+
+kernel void kernel_expm1_f32_4(
+        global const float4 * src0,
+        ulong                 offset0,
+        global       float4 * dst,
+        ulong                 offsetd
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst  = (global float4*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
+}
+
+kernel void kernel_expm1_f16(
+        global const half * src0,
+        ulong               offset0,
+        global       half * dst,
+        ulong               offsetd
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst  = (global half*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
+}
+
+kernel void kernel_expm1_f16_4(
+        global const half4 * src0,
+        ulong                offset0,
+        global       half4 * dst,
+        ulong                offsetd
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    dst  = (global half4*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
+}
+
+kernel void kernel_expm1_f32_nc(
+        global const char * src0,
+        ulong               offset0,
+        global       char * dst,
+        ulong               offsetd,
+        int   ne00,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;

-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);

-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
+        global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global       float * y = (global       float *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

-            *dst_val_ptr = exp(*src_val_ptr) - 1;
-        }
+        *y = exp(*x) - 1.0f;
    }
 }

-kernel void kernel_expm1_f16_nd(
-        global void * p_src0_base,
-        ulong off_src0_abs,
-        global void * p_dst_base,
-        ulong off_dst_abs,
-        int ne00,
-        int ne01,
-        int ne02,
-        int ne03,
+kernel void kernel_expm1_f16_nc(
+        global const char * src0,
+        ulong               offset0,
+        global       char * dst,
+        ulong               offsetd,
+        int   ne00,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;

-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);

-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
+        global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global       half * y = (global       half *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

-            *dst_val_ptr = exp(*src_val_ptr) - 1;
-        }
+        *y = exp(*x) - 1.0f;
    }
 }
--- a/ggml/src/ggml-opencl/kernels/mean.cl
+++ b/ggml/src/ggml-opencl/kernels/mean.cl
@@ -1,8 +1,13 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable

+// Most devices have a max workgroup size of 1024, so this is enough for subgroup
+// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroup sizes.
+#define MAX_SUBGROUPS 64
 kernel void kernel_mean_f32(
-    global float *  src0,
+    global char *  src0,
    ulong           offset0,
-    global float *  dst,
+    global char *  dst,
    ulong           offsetd,
    int             ne00,
    int             ne01,
@@ -15,25 +20,121 @@ kernel void kernel_mean_f32(
    ulong           nb2,
    ulong           nb3
 ) {
-    src0 = (global float *)((global char *)src0 + offset0);
-    dst  = (global float *)((global char *)dst  + offsetd);
+    src0 = src0 + offset0;
+    dst  = dst  + offsetd;

-    int i3 = get_global_id(2);
-    int i2 = get_global_id(1);
-    int i1 = get_global_id(0);
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
+
+    const int lid = get_local_id(0);
+    const int lsize = get_local_size(0);
+
+    const uint sg_size = get_sub_group_size();
+    const uint sg_id = get_sub_group_id();
+    const uint sg_lid = get_sub_group_local_id();
+
+    __local float lmem[MAX_SUBGROUPS];

    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

-    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
-    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
-
-    float row_sum = 0;
-
-    for (int i0 = 0; i0 < ne00; i0++) {
-        row_sum += src_row[i0];
+    if(sg_id == 0){
+        lmem[sg_lid] = 0.0f;
    }

-    dst_row[0] = row_sum / ne00;
+    global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float * dst_row = (global float *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float sumf = 0.0f;
+
+    for (int i0 = lid; i0 < ne00; i0 += lsize) {
+        sumf += src_row[i0];
+    }
+
+    sumf = sub_group_reduce_add(sumf);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(sg_lid == 0){
+        lmem[sg_id] = sumf;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    sumf = lmem[sg_lid];
+    sumf = sub_group_reduce_add(sumf);
+
+    if (lid == 0) {
+        dst_row[0] = sumf / ne00;
+    }
+}
+
+kernel void kernel_mean_f32_4(
+    global char *  src0,
+    ulong           offset0,
+    global char *  dst,
+    ulong           offsetd,
+    int             ne00,
+    int             ne01,
+    int             ne02,
+    int             ne03,
+    ulong           nb01,
+    ulong           nb02,
+    ulong           nb03,
+    ulong           nb1,
+    ulong           nb2,
+    ulong           nb3
+) {
+    src0 = src0 + offset0;
+    dst  = dst  + offsetd;
+
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
+
+    const int lid = get_local_id(0);
+    const int lsize = get_local_size(0);
+
+    const uint sg_size = get_sub_group_size();
+    const uint sg_id = get_sub_group_id();
+    const uint sg_lid = get_sub_group_local_id();
+
+    __local float lmem[MAX_SUBGROUPS];
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    if(sg_id == 0){
+        lmem[sg_lid] = 0.0f;
+    }
+
+    global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float  * dst_row = (global float  *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float4 sum_vec = (float4)0.0f;
+
+    for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
+        sum_vec += src_row[i0];
+    }
+
+    float sumf = dot(sum_vec, (float4)(1.0f));
+    sumf = sub_group_reduce_add(sumf);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(sg_lid == 0){
+        lmem[sg_id] = sumf;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    sumf = lmem[sg_lid];
+    sumf = sub_group_reduce_add(sumf);
+
+    if (lid == 0) {
+        dst_row[0] = sumf / ne00;
+    }
 }
--- a/ggml/src/ggml-opencl/kernels/softplus.cl
+++ b/ggml/src/ggml-opencl/kernels/softplus.cl
@@ -3,86 +3,114 @@
 //------------------------------------------------------------------------------
 // softplus
 //------------------------------------------------------------------------------
-inline float softplus_f32(float x){
-    float ax = fabs(x);
-    float m = fmax(x, 0.0f);
-    return log1p(exp(-ax)) + m;
+
+kernel void kernel_softplus_f32(
+        global const float * src0,
+        ulong                offset0,
+        global       float * dst,
+        ulong                offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst  = (global float*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
 }

-kernel void kernel_softplus_f32_nd(
-        global void * p_src0_base,
-        ulong off_src0_abs,
-        global void * p_dst_base,
-        ulong off_dst_abs,
-        int ne00,
-        int ne01,
-        int ne02,
-        int ne03,
+kernel void kernel_softplus_f32_4(
+        global const float4 * src0,
+        ulong                 offset0,
+        global       float4 * dst,
+        ulong                 offsetd
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst  = (global float4*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
+}
+
+kernel void kernel_softplus_f16(
+        global const half * src0,
+        ulong               offset0,
+        global       half * dst,
+        ulong               offsetd
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst  = (global half*)((global char*)dst + offsetd);
+
+    const float x = convert_float(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+}
+
+kernel void kernel_softplus_f16_4(
+        global const half4 * src0,
+        ulong                offset0,
+        global       half4 * dst,
+        ulong                offsetd
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    dst  = (global half4*)((global char*)dst + offsetd);
+
+    const float4 x = convert_float4(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half4_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+}
+
+kernel void kernel_softplus_f32_nc(
+        global const char * src0,
+        ulong               offset0,
+        global       char * dst,
+        ulong               offsetd,
+        int   ne00,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;

-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);

-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
+        global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global       float * y = (global       float *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

-            *dst_val_ptr = softplus_f32(*src_val_ptr);
-        }
+        *y = (*x > 20.0f) ? *x : log(1.0f + exp(*x));
    }
 }

-kernel void kernel_softplus_f16_nd(
-        global void * p_src0_base,
-        ulong off_src0_abs,
-        global void * p_dst_base,
-        ulong off_dst_abs,
-        int ne00,
-        int ne01,
-        int ne02,
-        int ne03,
+kernel void kernel_softplus_f16_nc(
+        global const char * src0,
+        ulong               offset0,
+        global       char * dst,
+        ulong               offsetd,
+        int   ne00,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;

-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);

-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
+        global const half * hx = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global       half * hy = (global       half *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

-            *dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
-        }
+        const float x = convert_float(*hx);
+        *hy = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
    }
 }
--- a/ggml/src/ggml-opencl/kernels/sum_rows.cl
+++ b/ggml/src/ggml-opencl/kernels/sum_rows.cl
@@ -1,8 +1,13 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable

+// Most devices have a max workgroup size of 1024, so this is enough for subgroup
+// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroup sizes.
+#define MAX_SUBGROUPS 64
 kernel void kernel_sum_rows_f32(
-    global float *  src0,
+    global char *  src0, // byte pointer: offset0 and the nb0x strides are added to it directly
    ulong           offset0,
-    global float *  dst,
+    global char *  dst, // byte pointer: offsetd and the nbx strides are added to it directly
    ulong           offsetd,
    int             ne00,
    int             ne01,
@@ -15,25 +20,121 @@ kernel void kernel_sum_rows_f32(
    ulong           nb2,
    ulong           nb3
 ) {
-    src0 = (global float *)((global char *)src0 + offset0);
-    dst  = (global float *)((global char *)dst  + offsetd);
+    src0 = src0 + offset0;
+    dst  = dst  + offsetd;

-    int i3 = get_global_id(2);
-    int i2 = get_global_id(1);
-    int i1 = get_global_id(0);
+    const int i3 = get_group_id(2); // one work-group per row: the group ids select (i1, i2, i3)
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
+
+    const int lid = get_local_id(0);
+    const int lsize = get_local_size(0);
+
+    const uint sg_size = get_sub_group_size(); // NOTE(review): sg_size is not read anywhere in this kernel
+    const uint sg_id = get_sub_group_id();
+    const uint sg_lid = get_sub_group_local_id();
+
+    __local float lmem[MAX_SUBGROUPS]; // holds one partial sum per subgroup

    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

-    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
-    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
-
-    float row_sum = 0;
-
-    for (int i0 = 0; i0 < ne00; i0++) {
-        row_sum += src_row[i0];
+    if(sg_id == 0){ // subgroup 0 zeroes the slots that are read back by lane id in the final step
+        lmem[sg_lid] = 0.0f;
    }

-    dst_row[0] = row_sum;
+    global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float * dst_row = (global float *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float sumf = 0.0f;
+
+    for (int i0 = lid; i0 < ne00; i0 += lsize) { // work-items step through the row by the local size
+        sumf += src_row[i0];
+    }
+
+    sumf = sub_group_reduce_add(sumf); // combine the per-work-item partials within each subgroup
+
+    barrier(CLK_LOCAL_MEM_FENCE); // separates the zeroing of lmem above from the per-subgroup stores below
+
+    if(sg_lid == 0){ // lane 0 stores its subgroup's sum in the slot indexed by subgroup id
+        lmem[sg_id] = sumf;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // make the stored subgroup sums visible before they are read back
+
+    sumf = lmem[sg_lid]; // NOTE(review): indexed by lane id - only slots [0, subgroup size) are summed and the subgroup size must not exceed MAX_SUBGROUPS; confirm against the host-side local size
+    sumf = sub_group_reduce_add(sumf); // add up the slots read by this subgroup's lanes
+
+    if (lid == 0) { // a single work-item writes the row total
+        dst_row[0] = sumf;
+    }
+}
+
+kernel void kernel_sum_rows_f32_4( // float4-load variant: each work-group reduces one row of src0 into dst_row[0]
+    global char *  src0, // byte pointer: offset0 and the nb0x strides are added to it directly
+    ulong           offset0,
+    global char *  dst, // byte pointer: offsetd and the nbx strides are added to it directly
+    ulong           offsetd,
+    int             ne00,
+    int             ne01,
+    int             ne02,
+    int             ne03,
+    ulong           nb01,
+    ulong           nb02,
+    ulong           nb03,
+    ulong           nb1,
+    ulong           nb2,
+    ulong           nb3
+) {
+    src0 = src0 + offset0;
+    dst  = dst  + offsetd;
+
+    const int i3 = get_group_id(2); // one work-group per row: the group ids select (i1, i2, i3)
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
+
+    const int lid = get_local_id(0);
+    const int lsize = get_local_size(0);
+
+    const uint sg_size = get_sub_group_size(); // NOTE(review): sg_size is not read anywhere in this kernel
+    const uint sg_id = get_sub_group_id();
+    const uint sg_lid = get_sub_group_local_id();
+
+    __local float lmem[MAX_SUBGROUPS]; // holds one partial sum per subgroup
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { // ids come from get_group_id, so the whole work-group takes the same branch
+        return;
+    }
+
+    if(sg_id == 0){ // subgroup 0 zeroes the slots that are read back by lane id in the final step
+        lmem[sg_lid] = 0.0f;
+    }
+
+    global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03); // row read as float4 vectors
+    global float  * dst_row = (global float  *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float4 sum_vec = (float4)0.0f;
+
+    for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) { // NOTE(review): ne00 / 4 truncates, so a tail of ne00 % 4 floats is not read; presumably the host only picks this kernel when ne00 % 4 == 0 - confirm
+        sum_vec += src_row[i0];
+    }
+
+    float sumf = dot(sum_vec, (float4)(1.0f)); // dot with all-ones adds the four components together
+    sumf = sub_group_reduce_add(sumf); // combine the per-work-item partials within each subgroup
+
+    barrier(CLK_LOCAL_MEM_FENCE); // separates the zeroing of lmem above from the per-subgroup stores below
+
+    if(sg_lid == 0){ // lane 0 stores its subgroup's sum in the slot indexed by subgroup id
+        lmem[sg_id] = sumf;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // make the stored subgroup sums visible before they are read back
+
+    sumf = lmem[sg_lid]; // NOTE(review): indexed by lane id - only slots [0, subgroup size) are summed and the subgroup size must not exceed MAX_SUBGROUPS; confirm against the host-side local size
+    sumf = sub_group_reduce_add(sumf); // add up the slots read by this subgroup's lanes
+
+    if (lid == 0) { // a single work-item writes the row total
+        dst_row[0] = sumf;
+    }
 }
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -944,6 +944,7 @@ struct vk_mat_mat_push_constants {
    uint32_t M; uint32_t N; uint32_t K;
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
+    uint32_t base_work_group_z; uint32_t num_batches;
    uint32_t k_split;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
    uint32_t padded_N;
@@ -963,6 +964,7 @@ struct vk_mat_vec_push_constants {
    uint32_t batch_stride_b;
    uint32_t batch_stride_d;
    uint32_t fusion_flags;
+    uint32_t base_work_group_y;
    uint32_t ne02;
    uint32_t ne12;
    uint32_t broadcast2;
@@ -6773,8 +6775,16 @@ static void ggml_vk_matmul(
        uint32_t padded_n) {
        VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
    if (split_k == 1) {
-        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
+
+        uint32_t base_work_group_z = 0;
+        while (base_work_group_z < batch) {
+            uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
+
+            const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n };
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z });
+            base_work_group_z += groups_z;
+        }
        return;
    }

@@ -6788,9 +6798,17 @@ static void ggml_vk_matmul(
    uint32_t k_split = CEIL_DIV(k, split_k);
    k_split = ROUNDUP_POW2(k_split, 256);

-    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
-    // Make sure enough workgroups get assigned for split k to work
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
+
+    uint32_t base_work_group_z = 0;
+    while (base_work_group_z < batch) {
+        uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
+
+        const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
+        // Make sure enough workgroups get assigned for split k to work
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z });
+        base_work_group_z += groups_z;
+    }
    ggml_vk_sync_buffers(ctx, subctx);
    const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
@@ -7186,7 +7204,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        }

        // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
        if (qx_needs_dequant) {
            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
        }
@@ -7484,7 +7501,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        if (quantize_y) {
            ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
        }
-        ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
    }

    vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]);
@@ -7579,22 +7595,29 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1;
    }

-    // compute
-    const vk_mat_vec_push_constants pc = {
-        (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
-        stride_batch_x, stride_batch_y, stride_batch_d,
-        fusion_flags,
-        (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
-    };
-    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-                              {
-                                d_X,
-                                d_Y,
-                                d_D,
-                                d_F0,
-                                d_F1,
-                              },
-                              pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
+    ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1]));
+
+    uint32_t base_work_group_y = 0;
+    while (base_work_group_y < ne12 * ne13) {
+
+        uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
+        const vk_mat_vec_push_constants pc = {
+            (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
+            stride_batch_x, stride_batch_y, stride_batch_d,
+            fusion_flags, base_work_group_y,
+            (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
+        };
+        ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
+                                  {
+                                    d_X,
+                                    d_Y,
+                                    d_D,
+                                    d_F0,
+                                    d_F1,
+                                  },
+                                  pc, { groups_x, groups_y, groups_z });
+        base_work_group_y += groups_y;
+    }

    if (x_non_contig) {
        ctx->prealloc_x_need_sync = true;
@@ -7832,10 +7855,15 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
        src1->nb[2] <= src1->nb[1] &&
        src1->nb[1] <= src1->nb[3] &&
        src0->ne[3] == 1 &&
-        src1->ne[3] == 1) {
+        src1->ne[3] == 1 &&
+        src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
+        src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx);
    } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
-               !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
+               !ggml_is_permuted(src0) && !ggml_is_permuted(src1) &&
+               src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
+               src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
+               src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx);
    // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
    // when ne12 and ne13 are one.
@@ -11560,7 +11588,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
        }
    }

-    ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
    if (split_k > 1) {
        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

@@ -12069,7 +12096,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
        // y[i] = i % k;
    }

-    ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
    if (split_k > 1) {
        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl
@@ -32,6 +32,7 @@ layout (push_constant) uniform parameter
    uint expert_i1;
    uint nbi1;
 #else
+    uint base_work_group_y;
    uint ne02;
    uint ne12;
    uint broadcast2;
@@ -45,9 +46,9 @@ uint expert_id;

 void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
 #ifdef MUL_MAT_ID
-    const uint expert_i0 = gl_GlobalInvocationID.y;
+    const uint expert_i0 = gl_WorkGroupID.y;
 #else
-    const uint batch_idx = gl_GlobalInvocationID.y;
+    const uint batch_idx = gl_WorkGroupID.y + p.base_work_group_y;
 #endif

 #ifndef MUL_MAT_ID
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -90,6 +90,8 @@ layout (push_constant) uniform parameter
    uint nbi1;
    uint ne11;
 #else
+    uint base_work_group_z;
+    uint num_batches;
    uint k_split;
    uint ne02;
    uint ne12;
@@ -139,7 +141,7 @@ void main() {
    const uint ic = gl_WorkGroupID.y;

 #ifdef MUL_MAT_ID
-    const uint expert_idx = gl_GlobalInvocationID.z;
+    const uint expert_idx = gl_WorkGroupID.z;
    if (ic * BN >= data_expert_count[expert_idx]) {
        return;
    }
@@ -149,7 +151,7 @@ void main() {
 #endif

 #ifndef MUL_MAT_ID
-    const uint batch_idx = gl_GlobalInvocationID.z;
+    const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;

    const uint i13 = batch_idx / p.ne12;
    const uint i12 = batch_idx % p.ne12;
@@ -366,7 +368,7 @@ void main() {
    const uint dc = ic * BN + warp_c * WN;

 #ifndef MUL_MAT_ID
-    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
 #endif

 #ifdef COOPMAT
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -53,6 +53,8 @@ layout (push_constant) uniform parameter
    uint nbi1;
    uint ne11;
 #else
+    uint base_work_group_z;
+    uint num_batches;
    uint k_split;
    uint ne02;
    uint ne12;
@@ -197,7 +199,7 @@ void main() {
    const uint ic = gl_WorkGroupID.y;

 #ifdef MUL_MAT_ID
-    const uint expert_idx = gl_GlobalInvocationID.z;
+    const uint expert_idx = gl_WorkGroupID.z;
    if (ic * BN >= data_expert_count[expert_idx]) {
        return;
    }
@@ -215,7 +217,7 @@ void main() {
 #endif

 #ifndef MUL_MAT_ID
-    const uint batch_idx = gl_GlobalInvocationID.z;
+    const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;

    const uint i13 = batch_idx / p.ne12;
    const uint i12 = batch_idx % p.ne12;
@@ -255,7 +257,7 @@ void main() {
 #else
    uint pos_a = batch_idx_a * (p.batch_stride_a / QUANT_K);
    uint pos_b = batch_idx * p.batch_stride_b;
-    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
 #endif

    uint stride_a = p.stride_a / QUANT_K;
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -308,6 +308,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
+            case LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
@@ -2051,6 +2052,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan-dense") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "joyai-llm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM;
+                clean_spaces = false;
            } else if (
                tokenizer_pre == "kimi-k2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -56,6 +56,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE      = 45,
    LLAMA_VOCAB_PRE_TYPE_QWEN35          = 46,
    LLAMA_VOCAB_PRE_TYPE_TINY_AYA        = 47,
+    LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM       = 48,
 };

 struct LLM_KV;
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -347,7 +347,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
    int count = 0;
    double nll = 0.0;

-    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    const int n_seq = std::max(1, n_batch / n_ctx);
+    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.ppl_stride;
@@ -1737,11 +1738,21 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
    }

    const int n_batch = params.n_batch;
-    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
+    const int num_batches = (static_cast<int>(n_ctx) + n_batch - 1) / n_batch;
+    // Calculate n_seq based on the logits file's n_ctx, but cap it at what the context supports
+    const int n_seq_max = llama_n_seq_max(ctx);
+    int n_seq = std::max(1, n_batch / static_cast<int>(n_ctx));
+    if (n_seq > n_seq_max) {
+        LOG_WRN("%s: calculated n_seq=%d exceeds context's n_seq_max=%d, capping at %d\n",
+                __func__, n_seq, n_seq_max, n_seq_max);
+        n_seq = n_seq_max;
+    }
    const int nv = 2*((n_vocab + 1)/2) + 4;
    const bool add_bos = llama_vocab_get_add_bos(vocab);
    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

+    llama_batch batch = llama_batch_init(std::min(n_batch, static_cast<int>(n_ctx)*n_seq), 0, 1);
+
    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
    std::vector<float>    kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1750,6 +1761,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
        logits.reserve(size_t(n_ctx) * n_vocab);
    }

+    LOG_INF("%s: computing over %d chunks, n_ctx=%u, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

    auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
@@ -1774,107 +1787,122 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
    auto    kld_ptr =    kld_values.data();
    auto p_diff_ptr = p_diff_values.data();

-    for (int i = 0; i < n_chunk; ++i) {
+    const int first = n_ctx/2;
+
+    for (int i = 0; i < n_chunk; i += n_seq) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

-        const auto t_start = std::chrono::high_resolution_clock::now();
+        const int n_seq_batch = std::min(n_seq, n_chunk - i);

-        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
-            return;
-        }
+        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_memory_clear(llama_get_memory(ctx), true);

-        llama_batch batch = llama_batch_init(n_batch, 0, 1);
-
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_vocab_bos(vocab);
-            }
+            int n_outputs = 0;

            common_batch_clear(batch);
-            for (int i = 0; i < batch_size; i++) {
-                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                int seq_start = batch_start + seq*n_ctx;
+
+                // save original token and restore it after eval
+                const auto token_org = tokens[seq_start];
+
+                // add BOS token for the first batch of each chunk
+                if (add_bos && j == 0) {
+                    tokens[seq_start] = llama_vocab_bos(vocab);
+                }
+
+                for (int k = 0; k < batch_size; ++k) {
+                    const int pos = j*n_batch + k;
+                    const bool need_logits = pos >= first;
+                    common_batch_add(batch, tokens[seq_start + k], pos, { seq }, need_logits);
+                    n_outputs += need_logits;
+                }
+
+                // restore the original token in case it was set to BOS
+                tokens[seq_start] = token_org;
            }

            if (llama_decode(ctx, batch)) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+                LOG_ERR("%s : failed to decode\n", __func__);
                llama_batch_free(batch);
                return;
            }

-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            if (num_batches > 1) {
+            if (num_batches > 1 && n_outputs > 0) {
                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
            }
        }

-        llama_batch_free(batch);
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
        if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
+            int total_seconds = (int)(t_total * n_chunk / n_seq);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            LOG("%.2f minutes\n", total_seconds / 60.0);
+            LOG("\n");
+            LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
        }
-        LOG("\n");
-        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");

-        const int first = n_ctx/2;
-        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
-        p_diff_ptr += n_ctx - 1 - first;
-        kld_ptr    += n_ctx - 1 - first;
+        // Read log probs for each sequence in the batch
+        for (int seq = 0; seq < n_seq_batch; seq++) {
+            if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
+                LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i + seq);
+                llama_batch_free(batch);
+                return;
+            }

-        LOG("%4d", i+1);
+            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);

-        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        const double ppl_val = exp(log_ppl.first);
-        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+            process_logits(n_vocab, all_logits, tokens.data() + start + seq*n_ctx + first, n_ctx - 1 - first,
+                    workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
+            p_diff_ptr += n_ctx - 1 - first;
+            kld_ptr    += n_ctx - 1 - first;

-        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
-        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
-        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+            LOG("%4d", i + seq + 1);

-        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+            auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+            const double ppl_val = exp(log_ppl.first);
+            const double ppl_unc = ppl_val * log_ppl.second;
+            LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);

-        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-        const double p_diff_rms_val = sqrt(p_diff_mse.first);
-        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+            auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+            const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+            const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+            const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+            LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);

-        double p_top_val = 1.*kld.n_same_top/kld.count;
-        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+            auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+            LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);

-        LOG("\n");
+            auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+            const double p_diff_rms_val = sqrt(p_diff_mse.first);
+            const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
+            LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+            double p_top_val = 1.*kld.n_same_top/kld.count;
+            double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
+            LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+
+            LOG("\n");
+        }

        logits.clear();
    }
+
+    llama_batch_free(batch);
    LOG("\n");

    if (kld.count < 100) return; // we do not wish to do statistics on so few values
@@ -1996,7 +2024,7 @@ int main(int argc, char ** argv) {

    const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;

-    if (ppl) {
+    if (ppl || params.kl_divergence) {
        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
        const int32_t n_kv = n_seq * n_ctx;

@@ -2006,12 +2034,8 @@ int main(int argc, char ** argv) {
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
-        if (params.kl_divergence) {
-            params.n_parallel = 1;
-        } else {
-            // ensure there's at least enough seq_ids for HellaSwag
-            params.n_parallel = std::max(4, params.n_parallel);
-        }
+        // ensure there's at least enough seq_ids for HellaSwag
+        params.n_parallel = std::max(4, params.n_parallel);
    }

    if (params.ppl_stride > 0) {
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -59,8 +59,4 @@ target_include_directories(${TARGET} PRIVATE ../mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})

-if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
-endif()
-
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { RemoveButton } from '$lib/components/app';
+	import { ActionIconRemove } from '$lib/components/app';
 	import { formatFileSize, getFileTypeLabel, getPreviewText, isTextFile } from '$lib/utils';
 	import { AttachmentType } from '$lib/enums';

@@ -104,7 +104,7 @@
 			onclick={onClick}
 		>
 			<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
-				<RemoveButton {id} {onRemove} />
+				<ActionIconRemove {id} {onRemove} />
 			</div>

 			<div class="pr-8">
@@ -158,7 +158,7 @@

 		{#if !readonly}
 			<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
-				<RemoveButton {id} {onRemove} />
+				<ActionIconRemove {id} {onRemove} />
 			</div>
 		{/if}
 	</button>
--- a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { RemoveButton } from '$lib/components/app';
+	import { ActionIconRemove } from '$lib/components/app';

 	interface Props {
 		id: string;
@@ -58,7 +58,7 @@
 		<div
 			class="absolute top-1 right-1 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
 		>
-			<RemoveButton {id} {onRemove} class="text-white" />
+			<ActionIconRemove {id} {onRemove} class="text-white" />
 		</div>
 	{/if}
 </div>
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte
@@ -5,6 +5,7 @@
 	interface Props {
 		class?: string;
 		disabled?: boolean;
+		onInput?: () => void;
 		onKeydown?: (event: KeyboardEvent) => void;
 		onPaste?: (event: ClipboardEvent) => void;
 		placeholder?: string;
@@ -14,6 +15,7 @@
 	let {
 		class: className = '',
 		disabled = false,
+		onInput,
 		onKeydown,
 		onPaste,
 		placeholder = 'Ask anything...',
@@ -52,7 +54,10 @@
 		class:cursor-not-allowed={disabled}
 		{disabled}
 		onkeydown={onKeydown}
-		oninput={(event) => autoResizeTextarea(event.currentTarget)}
+		oninput={(event) => {
+			autoResizeTextarea(event.currentTarget);
+			onInput?.();
+		}}
 		onpaste={onPaste}
 		{placeholder}
 	></textarea>
--- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte
@@ -14,12 +14,17 @@
 </script>

 <header
-	class="md:background-transparent pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end bg-background/40 p-4 backdrop-blur-xl duration-200 ease-linear {sidebar.open
+	class="pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end p-4 duration-200 ease-linear {sidebar.open
 		? 'md:left-[var(--sidebar-width)]'
 		: ''}"
 >
 	<div class="pointer-events-auto flex items-center space-x-2">
-		<Button variant="ghost" size="sm" onclick={toggleSettings}>
+		<Button
+			variant="ghost"
+			size="icon"
+			onclick={toggleSettings}
+			class="rounded-full backdrop-blur-lg"
+		>
 			<Settings class="h-4 w-4" />
 		</Button>
 	</div>
--- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte
@@ -11,7 +11,7 @@
 	let isCurrentConversationLoading = $derived(isLoading());
 	let isStreaming = $derived(isChatStreaming());
 	let hasProcessingData = $derived(processingState.processingState !== null);
-	let processingDetails = $derived(processingState.getProcessingDetails());
+	let processingDetails = $derived(processingState.getTechnicalDetails());

 	let showProcessingInfo = $derived(
 		isCurrentConversationLoading || isStreaming || config().keepStatsVisible || hasProcessingData
@@ -63,7 +63,7 @@
 <div class="chat-processing-info-container pointer-events-none" class:visible={showProcessingInfo}>
 	<div class="chat-processing-info-content">
 		{#each processingDetails as detail (detail)}
-			<span class="chat-processing-info-detail pointer-events-auto">{detail}</span>
+			<span class="chat-processing-info-detail pointer-events-auto backdrop-blur-sm">{detail}</span>
 		{/each}
 	</div>
 </div>
@@ -73,7 +73,7 @@
 		position: sticky;
 		top: 0;
 		z-index: 10;
-		padding: 1.5rem 1rem;
+		padding: 0 1rem 0.75rem;
 		opacity: 0;
 		transform: translateY(50%);
 		transition:
@@ -100,7 +100,6 @@
 		color: var(--muted-foreground);
 		font-size: 0.75rem;
 		padding: 0.25rem 0.75rem;
-		background: var(--muted);
 		border-radius: 0.375rem;
 		font-family:
 			ui-monospace, SFMono-Regular, 'SF Mono', Consolas, 'Liberation Mono', Menlo, monospace;
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte
@@ -1,11 +1,10 @@
 <script lang="ts">
 	import { Download, Upload, Trash2 } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
-	import { DialogConversationSelection } from '$lib/components/app';
+	import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
 	import { createMessageCountMap } from '$lib/utils';
 	import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
 	import { toast } from 'svelte-sonner';
-	import DialogConfirmation from '$lib/components/app/dialogs/DialogConfirmation.svelte';

 	let exportedConversations = $state<DatabaseConversation[]>([]);
 	let importedConversations = $state<DatabaseConversation[]>([]);
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte
@@ -9,7 +9,7 @@
 	import Input from '$lib/components/ui/input/input.svelte';
 	import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
 	import { chatStore } from '$lib/stores/chat.svelte';
-	import { getPreviewText } from '$lib/utils/text';
+	import { getPreviewText } from '$lib/utils';
 	import ChatSidebarActions from './ChatSidebarActions.svelte';

 	const sidebar = Sidebar.useSidebar();
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
@@ -1,6 +1,6 @@
 <script lang="ts">
 	import { Trash2, Pencil, MoreHorizontal, Download, Loader2, Square } from '@lucide/svelte';
-	import { ActionDropdown } from '$lib/components/app';
+	import { DropdownMenuActions } from '$lib/components/app';
 	import * as Tooltip from '$lib/components/ui/tooltip';
 	import { getAllLoadingChats } from '$lib/stores/chat.svelte';
 	import { conversationsStore } from '$lib/stores/conversations.svelte';
@@ -128,7 +128,7 @@

 	{#if renderActionsDropdown}
 		<div class="actions flex items-center">
-			<ActionDropdown
+			<DropdownMenuActions
 				triggerIcon={MoreHorizontal}
 				triggerTooltip="More actions"
 				bind:open={dropdownOpen}
--- a/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte
+++ b/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte
@@ -616,7 +616,7 @@
 					code={incompleteCodeBlock.code}
 					language={incompleteCodeBlock.language || 'text'}
 					disabled={true}
-					onPreview={(code: string, lang: string) => {
+					onPreview={(code, lang) => {
 						previewCode = code;
 						previewLanguage = lang;
 						previewDialogOpen = true;
--- a/tools/server/webui/src/lib/stores/server.svelte.ts
+++ b/tools/server/webui/src/lib/stores/server.svelte.ts
@@ -18,9 +18,13 @@ import { ServerRole } from '$lib/enums';
 * - **Default Params**: Server-wide generation defaults
 */
 class ServerStore {
-	// ─────────────────────────────────────────────────────────────────────────────
-	// State
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * State
+	 *
+	 *
+	 */

 	props = $state<ApiLlamaCppServerProps | null>(null);
 	loading = $state(false);
@@ -28,16 +32,22 @@ class ServerStore {
 	role = $state<ServerRole | null>(null);
 	private fetchPromise: Promise<void> | null = null;

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Getters
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Getters
+	 *
+	 *
+	 */

 	get defaultParams(): ApiLlamaCppServerProps['default_generation_settings']['params'] | null {
 		return this.props?.default_generation_settings?.params || null;
 	}

 	get contextSize(): number | null {
-		return this.props?.default_generation_settings?.n_ctx ?? null;
+		const nCtx = this.props?.default_generation_settings?.n_ctx;
+
+		return typeof nCtx === 'number' ? nCtx : null;
 	}

 	get webuiSettings(): Record<string, string | number | boolean> | undefined {
@@ -52,9 +62,13 @@ class ServerStore {
 		return this.role === ServerRole.MODEL;
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Data Handling
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Data Handling
+	 *
+	 *
+	 */

 	async fetch(): Promise<void> {
 		if (this.fetchPromise) return this.fetchPromise;
@@ -115,9 +129,13 @@ class ServerStore {
 		this.fetchPromise = null;
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Utilities
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Utilities
+	 *
+	 *
+	 */

 	private detectRole(props: ApiLlamaCppServerProps): void {
 		const newRole = props?.role === ServerRole.ROUTER ? ServerRole.ROUTER : ServerRole.MODEL;
--- a/tools/server/webui/src/lib/stores/settings.svelte.ts
+++ b/tools/server/webui/src/lib/stores/settings.svelte.ts
@@ -47,18 +47,26 @@ import {
 } from '$lib/constants/localstorage-keys';

 class SettingsStore {
-	// ─────────────────────────────────────────────────────────────────────────────
-	// State
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * State
+	 *
+	 *
+	 */

 	config = $state<SettingsConfigType>({ ...SETTING_CONFIG_DEFAULT });
 	theme = $state<string>('auto');
 	isInitialized = $state(false);
 	userOverrides = $state<Set<string>>(new Set());

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Utilities (private helpers)
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Utilities (private helpers)
+	 *
+	 *
+	 */

 	/**
 	 * Helper method to get server defaults with null safety
@@ -76,9 +84,13 @@ class SettingsStore {
 		}
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Lifecycle
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Lifecycle
+	 *
+	 *
+	 */

 	/**
 	 * Initialize the settings store by loading from localStorage
@@ -130,9 +142,13 @@ class SettingsStore {

 		this.theme = localStorage.getItem('theme') || 'auto';
 	}
-	// ─────────────────────────────────────────────────────────────────────────────
-	// Config Updates
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Config Updates
+	 *
+	 *
+	 */

 	/**
 	 * Update a specific configuration setting
@@ -234,9 +250,13 @@ class SettingsStore {
 		}
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Reset
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Reset
+	 *
+	 *
+	 */

 	/**
 	 * Reset configuration to defaults
@@ -285,9 +305,13 @@ class SettingsStore {
 		this.saveConfig();
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Server Sync
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Server Sync
+	 *
+	 *
+	 */

 	/**
 	 * Initialize settings with props defaults when server properties are first loaded
@@ -349,9 +373,13 @@ class SettingsStore {
 		this.saveConfig();
 	}

-	// ─────────────────────────────────────────────────────────────────────────────
-	// Utilities
-	// ─────────────────────────────────────────────────────────────────────────────
+	/**
+	 *
+	 *
+	 * Utilities
+	 *
+	 *
+	 */

 	/**
 	 * Get a specific configuration value
--- a/tools/server/webui/tests/stories/ChatScreenForm.stories.svelte
+++ b/tools/server/webui/tests/stories/ChatScreenForm.stories.svelte
@@ -44,8 +44,7 @@
 <Story
 	name="Default"
 	args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
-	play={async (context) => {
-		const { canvas, userEvent } = context;
+	play={async ({ canvas, userEvent }) => {
 		const textarea = await canvas.findByRole('textbox');
 		const submitButton = await canvas.findByRole('button', { name: 'Send' });

@@ -75,8 +74,7 @@
 		class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
 		uploadedFiles: fileAttachments
 	}}
-	play={async (context) => {
-		const { canvas } = context;
+	play={async ({ canvas }) => {
 		const jpgAttachment = canvas.getByAltText('1.jpg');
 		const svgAttachment = canvas.getByAltText('hf-logo.svg');
 		const pdfFileExtension = canvas.getByText('PDF');
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -17,7 +17,7 @@ endif()
 target_link_libraries(${TARGET} PRIVATE Threads::Threads)

 if (WIN32 AND NOT MSVC)
-    target_link_libraries(${TARGET} PRIVATE ws2_32)
+    target_link_libraries(${TARGET} PUBLIC ws2_32)
 endif()

 target_compile_features(${TARGET} PRIVATE cxx_std_17)
Author	SHA1	Message	Date
Jeff Bolz	d0061be838	vulkan: split mul_mat into multiple dispatches to avoid overflow (#19509 ) * vulkan: split mul_mat into multiple dispatches to avoid overflow The batch dimensions can be greater than the max workgroup count limit, in which case we need to split into multiple dispatches and pass the base index through a push constant. Fall back for the less common p021 and nc variants. * address feedback	2026-02-18 10:47:10 +01:00
Adrien Gallouët	a569bda445	common : make small string helpers as inline functions (#19693 ) Also use string_view when it makes sense and fix some corner cases. Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-02-18 08:03:01 +01:00
shaofeiqi	e2f19b320f	opencl: refactor expm1 and softplus (#19404 ) * opencl: refactor expm1 * opencl: refactor softplus * opencl: use h for half literals --------- Co-authored-by: Li He <lih@qti.qualcomm.com>	2026-02-17 14:47:18 -08:00
shaofeiqi	983559d24b	opencl: optimize mean and sum_row kernels (#19614 ) * opencl: optimize mean and sum_row kernels * opencl: add comment for max subgroups * opencl: format --------- Co-authored-by: Li He <lih@qti.qualcomm.com>	2026-02-17 13:56:09 -08:00
Daniel Bevenius	2b089c7758	model-conversion : add option to print tensor values (#19692 ) This commit updates the tensor-info.py script to support the option to print the first N values of a tensor when displaying its information. The motivation for this is that it can be useful to inspect some actual values in addition to the shapes of the tensors.	2026-02-17 20:43:22 +01:00
Aleksander Grygier	afa6bfe4f7	Pre-MCP UI and architecture cleanup (#19685 ) * webui: extract non-MCP changes from mcp-mvp review split * webui: extract additional pre-MCP UI and architecture cleanup * chore: update webui build output	2026-02-17 13:47:45 +01:00
Talha Can Havadar	ae2d3f28a8	ggml: ggml-cpu: force-no-lto-for-cpu-feats (#19609 ) When LTO is enabled in a build environment, it forces all builds to have LTO in place. But the feature detection logic is fragile, and this causes illegal-instruction errors with LTO. This disables LTO for the feature detection code to prevent cross-module optimization from inlining architecture-specific instructions into the score function. Without this, LTO can cause SIGILL when loading backends on older CPUs (e.g., loading power10 backend on power9 crashes before feature check runs).	2026-02-17 13:22:46 +02:00
Georgi Gerganov	ad8207af77	cuda : enable CUDA graphs for MMID 1 <= BS <= 4 (#19645 ) * cuda : enable CUDA graphs for MMID BS <= 4 * cont : add stream capture check Co-authored-by: Oliver Simons <osimons@nvidia.com> * cont : add MMVQ_MMID_MAX_BATCH_SIZE --------- Co-authored-by: Oliver Simons <osimons@nvidia.com>	2026-02-17 12:31:49 +02:00
Daniel Bevenius	667b694278	model-conversion : make printing of config values optional (#19681 ) * model-conversion : make printing of config values optional This commit updates run-org-model.py to make the printing of model configuration values optional. The motivation for this change is that not all models have these configuration values defined and those that do not will error when running this script. With these changes we only print the values if they exist or a default value. We could optionally just remove them but it can be useful to see these values when running the original model.	2026-02-17 10:46:53 +01:00
Sigbjørn Skjæret	e48349a49d	ci : bump komac version (#19682 )	2026-02-17 09:30:31 +01:00
Adrien Gallouët	ae46a61e41	build : link ws2_32 as PUBLIC on Windows (#19666 ) Signed-off-by: Adrien Gallouët <adrien@gallouet.fr>	2026-02-17 08:37:07 +01:00
Adrien Gallouët	65cede7c70	build : cleanup library linking logic (#19665 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-02-17 08:36:45 +01:00
DAN™	05fa625eac	convert : add JoyAI-LLM-Flash (#19651 ) * convert_hf_to_gguf: add JoyAI-LLM-Flash tokenizer hash mapping to deepseek-v3 * llama-vocab: create a new pre-tokenizer name for joyai-llm. * add missing vocab type section * Update convert_hf_to_gguf_update.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-02-16 22:49:57 +01:00
AesSedai	d612901116	perplexity: add proper batching (#19661 )	2026-02-16 18:44:44 +02:00