kleidiai : fix work size and threads sync for fp16 (#16246 )

codeowners: add codeowners for opencl backend (#16344 )
tests: override test_set_rows::max_nmse_err to allow for occasional rounding differences (#16295 )
2026-05-04 08:04:07 +00:00 · 2025-09-30 10:07:20 +03:00 · 2025-09-30 08:30:16 +03:00 · 2025-09-29 19:26:34 -05:00 · 2025-09-29 18:49:47 +02:00 · 2025-09-29 17:51:48 +03:00
40 changed files with 5166 additions and 174 deletions
--- a/.github/workflows/build-amd.yml
+++ b/.github/workflows/build-amd.yml
@@ -0,0 +1,52 @@
+name: CI (AMD)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [  # only pushes touching the workflow itself or build/source files start this CI
+      '.github/workflows/build-amd.yml',
+      '**/CMakeLists.txt',
+      '**/*.cmake',  # any CMake module/toolchain file ('**/.cmake' matched only files literally named ".cmake")
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.comp'
+    ]
+
+concurrency:  # group key: workflow name plus the ref when github.head_ref is set, otherwise the run id
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true  # NOTE(review): this file only declares push/workflow_dispatch triggers, where github.head_ref is presumably empty, so each run gets its own group and nothing is ever cancelled — confirm intended
+
+jobs:
+  ggml-ci-x64-amd-vulkan:  # prints the Vulkan device summary, then runs ci/run.sh with GG_BUILD_VULKAN=1
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+  ggml-ci-x64-amd-rocm:  # prints amd-smi static info, then runs ci/run.sh with GG_BUILD_ROCM=1 for the gfx1101 target
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          amd-smi static
+          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -253,3 +253,47 @@ jobs:
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-riscv64-cpu-spacemit-ime-cross:
+    runs-on: ubuntu-24.04
+
+    env:
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+      SPACEMIT_IME_TOOLCHAIN_PATH: "spacemit-toolchain-linux-glibc-x86_64"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Cache Toolchain
+        uses: actions/cache@v4
+        id: cache-spacemit-ime-cross-toolchain
+        with:
+          path: ./${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
+          key: ${{ runner.os }}-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
+
+      - name: Setup Toolchain  # downloads and unpacks the SpacemiT cross toolchain; skipped when the cache step reported a hit
+        if: steps.cache-spacemit-ime-cross-toolchain.outputs.cache-hit != 'true'
+        run: |  # NOTE(review): wget runs with --no-check-certificate, so the toolchain archive is fetched without TLS certificate verification — confirm this is required, or verify a checksum after download
+          wget --quiet --no-check-certificate https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}.tar.xz -O ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
+          rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
+          mkdir -p ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
+          tar xf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz -C ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }} --strip-components=1
+          rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
+
+      - name: Build
+        run: |
+          export RISCV_ROOT_PATH=${PWD}/${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
+                         -DGGML_RVV=ON \
+                         -DGGML_RV_ZFH=ON \
+                         -DGGML_RV_ZICBOP=ON \
+                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -58,3 +58,63 @@ jobs:
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
+  #   runs-on: [self-hosted, RISCV64]
+
+  #   steps:
+  #     - name: Install prerequisites
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y libatomic1
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu \
+  #                 ccache \
+  #                 cmake
+  #         sudo apt-get upgrade binutils -y
+
+  #     - name: Setup ccache
+  #       run: |
+  #         mkdir -p $HOME/.ccache
+  #         ccache -M 5G -d $HOME/.ccache
+  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+  #         echo "$GITHUB_WORKSPACE"
+  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+  #     - name: Build
+  #       run: |
+  #         cmake -B build \
+  #           -DLLAMA_CURL=OFF \
+  #           -DCMAKE_BUILD_TYPE=Release \
+  #           -DGGML_OPENMP=OFF \
+  #           -DLLAMA_BUILD_EXAMPLES=ON \
+  #           -DLLAMA_BUILD_TOOLS=ON \
+  #           -DLLAMA_BUILD_TESTS=OFF \
+  #           -DCMAKE_SYSTEM_NAME=Linux \
+  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
+  #           -DGGML_RVV=ON \
+  #           -DGGML_RV_ZFH=ON \
+  #           -DGGML_RV_ZICBOP=ON \
+  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
+  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
+
+  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1461,34 +1461,6 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-#  ggml-ci-x64-amd-vulkan:
-#    runs-on: [self-hosted, Linux, X64, AMD]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v4
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          vulkaninfo --summary
-#          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-#
-#  ggml-ci-x64-amd-rocm:
-#    runs-on: [self-hosted, Linux, X64, AMD]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v4
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          amd-smi static
-#          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]

--- a/2
+++ b/2
@@ -50,6 +50,7 @@
 /ggml/src/ggml-blas/                    @slaren
 /ggml/src/ggml-common.h                 @ggerganov @slaren
 /ggml/src/ggml-cpu/                     @ggerganov @slaren
+/ggml/src/ggml-cpu/spacemit/            @alex-spacemit
 /ggml/src/ggml-cuda/common.cuh          @slaren
 /ggml/src/ggml-cuda/fattn*              @JohannesGaessler
 /ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
@@ -59,6 +60,7 @@
 /ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
 /ggml/src/ggml-impl.h                   @ggerganov @slaren
 /ggml/src/ggml-metal/                   @ggerganov
+/ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @rgerganov
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -114,6 +114,7 @@ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
 fi
+
 ## helpers

 # download a file if it does not exist or if it is outdated
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Linux)  # toolchain file: target is Linux on riscv64
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+set(CMAKE_SYSTEM_VERSION 1)
+
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")  # RISC-V host: no cross compiler is configured, the host processor is only logged
+    message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+else()  # any other host: cross-compile with the toolchain rooted at $RISCV_ROOT_PATH
+    set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
+    if (DEFINED ENV{RISCV_ROOT_PATH})
+        file(TO_CMAKE_PATH "$ENV{RISCV_ROOT_PATH}" RISCV_ROOT_PATH)  # quoted so a path containing spaces or ';' stays a single argument
+    else()
+        message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
+    endif()
+
+    set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
+    set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
+    set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
+    set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
+    set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
+    set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
+endif()
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)  # find_program() never searches the target root
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)  # libraries, headers and packages are searched only under the target root
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")  # prepend arch/ABI flags, keep caller-supplied C flags
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_CXX_FLAGS}")  # same for C++ (was ${CXX_FLAGS}, which this file never sets, so caller flags were dropped)
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")  # executables additionally link libatomic
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -0,0 +1,89 @@
+> [!IMPORTANT]
+> This build documentation applies only to RISC-V SpacemiT SoCs.
+
+## Build llama.cpp locally (for riscv64)
+
+1. Prepare Toolchain For RISCV
+~~~
+wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
+~~~
+
+2. Build
+The build script below uses RISC-V vector instructions for acceleration. Make sure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, which is selected through the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configuration is defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Before building, make sure the RISC-V compiler is installed and point the environment variable at it via `export RISCV_ROOT_PATH={your_compiler_path}`.
+```bash
+
+cmake -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_CPU_RISCV64_SPACEMIT=ON \
+    -DLLAMA_CURL=OFF \
+    -DGGML_RVV=ON \
+    -DGGML_RV_ZFH=ON \
+    -DGGML_RV_ZICBOP=ON \
+    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
+    -DCMAKE_INSTALL_PREFIX=build/installed
+
+cmake --build build --parallel $(nproc) --config Release
+
+pushd build
+make install
+popd
+```
+
+## Simulation
+You can use QEMU to perform emulation on non-RISC-V architectures.
+
+1. Download QEMU
+~~~
+wget https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
+~~~
+
+2. Run Simulation
+After building llama.cpp, you can run the resulting executables under QEMU, for example:
+~~~
+export QEMU_ROOT_PATH={your QEMU file path}
+export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
+
+${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
+~~~
+## Performance
+#### Quantization Support For Matrix
+~~~
+model name      : Spacemit(R) X60
+isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
+mmu             : sv39
+uarch           : spacemit,x60
+mvendorid       : 0x710
+marchid         : 0x8000000058000001
+~~~
+
+Q4_0
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
+Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
+Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
+Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
+Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
+Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
+
+Q4_1
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
+Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
+Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
+Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
+Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
+Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
+
+
+Q4_K
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
+Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
+Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
+Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
+Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
+Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 0)
-set(GGML_VERSION_DEV "-dev")  # "-dev" for development, "" for releases
+set(GGML_VERSION_PATCH 3)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -26,8 +25,8 @@ if(GIT_EXE)
    )
 endif()

-# Build the version string with optional -dev suffix and dirty flag
-set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
+# Build the version string with optional dirty flag
+set(GGML_VERSION "${GGML_VERSION_BASE}")
 if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
    set(GGML_VERSION "${GGML_VERSION}-dirty")
 endif()
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -135,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
    return p;
 }

+static const char * dl_error() { // this preprocessor branch collects no loader error text: callers always get an empty string
+    return "";
+}
+
 #else

 using dl_handle = void;
@@ -155,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
    return dlsym(handle, name);
 }

+static const char * dl_error() { // returns the message reported by dlerror(), or "" when dlerror() returns null
+    const char *rslt = dlerror();
+    return rslt != nullptr ? rslt : ""; // callers pass the result to a "%s" log format, so never return a null pointer
+}
+
 #endif

 using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@@ -240,7 +249,7 @@ struct ggml_backend_registry {
        dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
            }
            return nullptr;
        }
@@ -530,7 +539,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                    dl_handle_ptr handle { dl_load_library(entry) };
                    if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
                    }
                    if (handle) {
                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -74,7 +74,7 @@ if (BLAS_FOUND)

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ggml-cpu/arch/riscv/quants.c
            ggml-cpu/arch/riscv/repack.cpp
            )
+        if (GGML_CPU_RISCV64_SPACEMIT)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
+            list(APPEND GGML_CPU_SOURCES
+                ggml-cpu/spacemit/ime.cpp
+                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime_kernels.h
+            )
+        endif()
        set(MARCH_STR "rv64gc")
        if (GGML_RV_ZFH)
            string(APPEND MARCH_STR "_zfh")
@@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.13.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.14.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "d82a8de939d9814621a5ba23907bdac1")
+        set(KLEIDIAI_ARCHIVE_MD5  "45e110675d93f99f82c23a1afcca76bc")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
@@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
                ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -18,6 +18,10 @@
 #    include "kleidiai/kleidiai.h"
 #endif

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
 #if defined(_WIN32)
 #    define WIN32_LEAN_AND_MEAN
 #    ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
        }
 #endif

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+        if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
+        }
+#endif
+
 #ifdef GGML_USE_CPU_KLEIDIAI
        if (ggml_backend_cpu_kleidiai_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@@ -87,15 +87,38 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
    return tensor->ne[dim];
 }

+template <typename Variant, typename Ret, typename... Args, std::size_t... Is> // Is...: one index per variant alternative, deduced from the index_sequence argument
+constexpr bool variant_any_invocable_impl(std::index_sequence<Is...>) {
+    using V = std::remove_reference_t<Variant>; // strip the reference so variant_alternative_t sees the variant type itself
+    return (std::is_invocable_r_v<
+                Ret,
+                std::variant_alternative_t<Is, V>,
+                Args...> || ...); // fold over ||: true when at least one alternative is invocable with Args... with a result convertible to Ret
+}
+
+template <typename Variant, typename Ret, typename... Args>
+constexpr bool variant_any_invocable_v = // compile-time flag: does any alternative of Variant accept Args... and produce a result convertible to Ret
+    variant_any_invocable_impl<Variant, Ret, Args...>(
+        std::make_index_sequence< // generates the indices 0..N-1 for the N alternatives of the variant
+            std::variant_size_v<std::remove_reference_t<Variant>>>{});
+
 template<typename Ret, typename Variant, typename... Args>
-static Ret variant_call(const Variant & var, Args&&... args) {
-    return std::visit([&](auto&& func) -> Ret {
-        if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
-            return func(std::forward<Args>(args)...);
-        } else {
-            throw std::runtime_error("Invalid function type in variant_call");
-        }
-    }, var);
+static inline Ret variant_call(Variant && var, Args&&... args) {
+    static_assert(variant_any_invocable_v<std::remove_reference_t<Variant>, Ret, Args...>,
+                  "No alternative in Variant is invocable with the provided arguments and return type.");
+
+    return std::visit(
+        [&](auto && f) -> Ret {
+            using F = std::decay_t<decltype(f)>;
+            if constexpr (std::is_invocable_r_v<Ret, F, Args...>) {
+                return std::invoke(std::forward<decltype(f)>(f), std::forward<Args>(args)...);
+            } else {
+                GGML_ABORT("Invalid function type in variant_call");
+                GGML_UNREACHABLE();
+            }
+        },
+        std::forward<Variant>(var)
+    );
 }

 namespace ggml::cpu::kleidiai {
@@ -138,7 +161,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        if (kernels->rhs_type == GGML_TYPE_Q4_0) {
            size = variant_call<size_t>(lhs_info->packed_size, m, k, QK4_0, mr, kr, sr);
        } else if (kernels->rhs_type == GGML_TYPE_F16) {
-            size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr) +
+            const int64_t lhs_batch_size0 = op->src[1]->ne[2];
+            const int64_t rhs_batch_size0 = op->src[0]->ne[2];
+            const int64_t r = lhs_batch_size0 / rhs_batch_size0;
+            size = variant_call<size_t>(lhs_info->packed_size, m * r, k, mr, kr, sr) +
                   variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
                   k * n * sizeof(float) + n * sizeof(float);
        } else {
@@ -148,7 +174,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        return true;
    }

-
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
        if (dst->op == GGML_OP_MUL_MAT) {
            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
@@ -165,8 +190,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
    }

    bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
-        static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
-
        const ggml_tensor * src0 = dst->src[0];
        const ggml_tensor * src1 = dst->src[1];

@@ -175,7 +198,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
        GGML_ASSERT(kernels);

-        bool is_gemv = src1->ne[1] == 1;
+        const bool is_gemv = src1->ne[1] == 1;
        kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
        lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
        GGML_ASSERT(kernel);
@@ -185,27 +208,30 @@ class tensor_traits : public ggml::cpu::tensor_traits {

        const int64_t lhs_batch_size0 = ne12;
        const int64_t rhs_batch_size0 = ne02;
-        const int64_t batch_size      = rhs_batch_size0;
+        const int64_t batch_size      = lhs_batch_size0;

+        GGML_ASSERT(rhs_batch_size0 > 0);
+        GGML_ASSERT(lhs_batch_size0 % rhs_batch_size0 == 0);
        const int64_t r = lhs_batch_size0 / rhs_batch_size0;

-        const int64_t m = ne11 * r;
-        const int64_t n = ne01;
-        const int64_t k = ne00;
+        const int64_t m_group = ne11;
+        const int64_t m       = m_group;
+        const int64_t n       = ne01;
+        const int64_t k       = ne00;

        const size_t lhs_stride = src1->nb[1];
        const size_t rhs_stride = src0->nb[1];
        const size_t dst_stride = dst->nb[1];

-        const int64_t mr = static_cast<int64_t>(kernel->get_mr());
-        const int64_t nr = static_cast<int64_t>(kernel->get_nr());
-        const int64_t kr = static_cast<int64_t>(kernel->get_kr());
-        const int64_t sr = static_cast<int64_t>(kernel->get_sr());
+        const int64_t mr = (int64_t) kernel->get_mr();
+        const int64_t nr = (int64_t) kernel->get_nr();
+        const int64_t kr = (int64_t) kernel->get_kr();
+        const int64_t sr = (int64_t) kernel->get_sr();

-        const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr);
-        const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
-        const size_t kxn_size        = k * n * sizeof(float);
-        const size_t bias_size       = n * sizeof(float);
+        const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, (size_t)m, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
+        const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, (size_t)n, (size_t)k);
+        const size_t kxn_size        = (size_t)k * (size_t)n * sizeof(float);
+        const size_t bias_size       = (size_t)n * sizeof(float);

        const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
        GGML_ASSERT(wsize_required <= params->wsize);
@@ -216,82 +242,102 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        uint8_t * bias       = rhs_kxn + kxn_size;

        for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
-            const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
-            const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
-            uint8_t * dst_batch       = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
+            const int64_t rhs_batch_idx = batch_idx / r;
+            const uint8_t * rhs_batch_base = static_cast<const uint8_t *>(src0->data) + rhs_batch_idx * src0->nb[2];
+            uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2];

-            // LHS packing
+            // LHS packing (threaded over m, honoring mr alignment and KV groups)
            {
                const int64_t m_roundup_mr = kai_roundup(m, mr);
                const int64_t num_threads  = KAI_MIN(m_roundup_mr / mr, nth);

                if (ith < num_threads) {
-                    const int64_t num_m_per_thread0   = round_down(m_roundup_mr / num_threads, mr);
+                    const int64_t num_m_per_thread0   = round_down((size_t)(m_roundup_mr / num_threads), (size_t)mr);
                    const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;

-                    const int64_t m_start          = ith * num_m_per_thread0;
-                    const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
+                    const int64_t m_start = ith * num_m_per_thread0;
+                    const int64_t m_count = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;

-                    const size_t lhs_offset        = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
-                    const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, mr, kr, sr);
+                    // Base packed offset (aligned) and per-row stride in bytes
+                    const size_t base_packed_off = variant_call<size_t>(
+                        lhs_info->get_packed_offset, (size_t)m_start, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
+                    const size_t next_block_off = variant_call<size_t>(
+                        lhs_info->get_packed_offset, (size_t)(m_start + mr), (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
+                    const size_t row_stride_bytes = (next_block_off - base_packed_off) / (size_t)mr;

-                    const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
-                    void * dst_ptr       = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
+                    int64_t remaining = m_count;
+                    int64_t cur       = m_start;

-                    variant_call<void>(lhs_info->pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
+                    while (remaining > 0) {
+                        const int64_t row_in_group = cur;
+                        const int64_t avail        = m_group - row_in_group;
+                        const int64_t take         = std::min(avail, remaining);
+
+                        const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2];
+                        const void * src_ptr = lhs_batch_base + (size_t)row_in_group * lhs_stride;
+                        const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes;
+                        void * dst_ptr       = lhs_packed + dst_off;
+
+                        variant_call<void>(lhs_info->pack_func,
+                                        (size_t)take, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr,
+                                        /*m_idx_start*/ 0, src_ptr, lhs_stride, dst_ptr);
+
+                        cur       += take;
+                        remaining -= take;
+                    }
                }
            }

-            // RHS packing
-            if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
-                // First thread to reach this point handles RHS packing
-                memset(bias, 0, n * sizeof(float));
-                transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
-                                        reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);
+            // RHS packing (single thread), then synchronize
+            if (ith == 0) {
+                memset(bias, 0, (size_t)n * sizeof(float));
+                transpose_f32kxn_f16nxk((size_t)n, (size_t)k,
+                                        reinterpret_cast<float *>(rhs_kxn),
+                                        reinterpret_cast<const uint16_t *>(rhs_batch_base),
+                                        rhs_stride);

-                variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
-                             rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
+                variant_call<void>(kernels->rhs_info.pack_func,
+                                   /*num_groups*/ 1, (size_t)n, (size_t)k, (size_t)nr, (size_t)kr, (size_t)sr,
+                                   /*rhs_stride (bytes)*/ (size_t)(n * sizeof(float)),
+                                   rhs_kxn, bias, nullptr, rhs_packed, /*extra_bytes*/ 0, /*params*/ nullptr);
            }

            ggml_barrier(params->threadpool);

-            first_to_arrive.clear(std::memory_order_release);
-
-            // Perform the matmul
+            // Matmul (threaded over n)
            {
-                const int64_t m_to_process = m;
-                const int64_t m_start      = 0;
-
-                const int64_t n_step      = static_cast<int64_t>(kernel->get_n_step());
-                int64_t num_threads       = KAI_MIN(n / n_step, nth);
-                if (num_threads <= 0) {
-                    num_threads = 1;
+                const int64_t n_step  = (int64_t) kernel->get_n_step();
+                int64_t num_threads_n = KAI_MIN(n / n_step, nth);
+                if (num_threads_n <= 0) {
+                    num_threads_n = 1;
                }

-                if (ith < num_threads) {
-                    const int64_t num_n_per_thread0   = round_down(n / num_threads, n_step);
-                    const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
+                if (ith < num_threads_n) {
+                    const int64_t num_n_per_thread0   = round_down((size_t)(n / num_threads_n), (size_t)n_step);
+                    const int64_t num_n_per_threadN_1 = n - (num_threads_n - 1) * num_n_per_thread0;

                    const int64_t n_start      = ith * num_n_per_thread0;
-                    const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
+                    const int64_t n_to_process = (ith == num_threads_n - 1) ? num_n_per_threadN_1 : num_n_per_thread0;

-                    const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
-                    const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
-                    const size_t dst_offset        = kernel->get_dst_offset(m_start, n_start, dst_stride);
+                    // LHS packed base at row 0 (consistent with packing above)
+                    const size_t lhs_packed_offset0 = variant_call<size_t>(
+                        lhs_info->get_packed_offset, (size_t)0, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
+                    const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, (size_t)n_start, (size_t)k);
+                    const size_t dst_offset        = kernel->get_dst_offset((size_t)0, (size_t)n_start, dst_stride);

-                    const void * lhs_ptr = lhs_packed + lhs_packed_offset;
+                    const void * lhs_ptr = lhs_packed + lhs_packed_offset0;
                    const void * rhs_ptr = rhs_packed + rhs_packed_offset;
-                    float * dst_ptr      = reinterpret_cast<float *>(dst_batch + dst_offset);
+                    float * dst_ptr      = reinterpret_cast<float *>(dst_batch_base + dst_offset);

-                    variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
+                    variant_call<void>(kernel->run_kernel,
+                                       (size_t)m, (size_t)n_to_process, (size_t)k,
+                                       lhs_ptr, rhs_ptr,
+                                       dst_ptr, dst_stride, sizeof(float),
+                                       -FLT_MAX, FLT_MAX);
                }
            }

            if (batch_idx != batch_size - 1) {
-                // This barrier is necessary when the batch size is larger than 1. While processing a batch,
-                // the work data buffer (params->wdata) is used as temporary storage which means that only
-                // a single batch can be processed at any given time. No barrier is needed for the last
-                // batch since GGML inserts a barrier between the execution of every operator.
                ggml_barrier(params->threadpool);
            }
        }
--- a/ggml/src/ggml-cpu/spacemit/ime.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime.h
+++ b/ggml/src/ggml-cpu/spacemit/ime.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "ggml-alloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime_kernels.h
+++ b/ggml/src/ggml-cpu/spacemit/ime_kernels.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstddef>
+
+namespace sqnbitgemm_spacemit_ime {
+namespace ime1 {
+size_t gemm_kernel_i8i4(size_t            blk_len,
+                        const std::byte * quant_a_ptr,
+                        const std::byte * quant_b_data,
+                        const float *     quant_b_scale,
+                        const std::byte * quant_b_zp,
+                        float *           c_ptr,
+                        size_t            count_m,
+                        size_t            count_n,
+                        size_t            count_k,
+                        size_t            block_count_k,
+                        size_t            ldc,
+                        const float *     bias,
+                        const size_t      scale_stride);
+
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+}  // namespace ime1
+}  // namespace sqnbitgemm_spacemit_ime
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3639,9 +3639,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CONV_TRANSPOSE_2D:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM:
-        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
            return true;
+        case GGML_OP_ARGSORT:
+            // TODO: Support arbitrary column width
+            return op->src[0]->ne[0] <= 1024;
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
        case GGML_OP_GROUP_NORM:
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -683,9 +683,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                   (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_TIMESTEP_EMBEDDING:
-        case GGML_OP_ARGSORT:
        case GGML_OP_LEAKY_RELU:
            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_ARGSORT:
+            // keep the F32-only restriction this case had before it was split out; TODO: Support arbitrary column width
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[0]->ne[0] <= 1024;
        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp
@@ -67,30 +67,48 @@ layout (binding = 5) writeonly buffer O {D_TYPE data_o[];};
 #if defined(A_TYPE_PACKED16)
 #define BINDING_IDX_K 0
 #define BINDING_IDX_V 1
-layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
+layout (binding = 1) readonly buffer K_PACKED16 {A_TYPE_PACKED16 k_data_packed16[];} k_packed;
+layout (binding = 2) readonly buffer V_PACKED16 {A_TYPE_PACKED16 v_data_packed16[];} v_packed;
 #endif

 #if defined(DATA_A_Q4_0)
 #define BLOCK_BYTE_SIZE 18

 vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
-    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
-    uint shift = (iqs & 0x10) >> 2;
-    vui_lo >>= shift;
-    vui_hi >>= shift;
+    if (binding_idx == BINDING_IDX_K) {
+        uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
+        uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
+        uint shift = (iqs & 0x10) >> 2;
+        vui_lo >>= shift;
+        vui_hi >>= shift;

-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+        return float(k_packed.k_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+    } else {
+        uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
+        uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
+        uint shift = (iqs & 0x10) >> 2;
+        vui_lo >>= shift;
+        vui_hi >>= shift;
+
+        return float(v_packed.v_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+    }
 }
 #endif

 #if defined(DATA_A_Q8_0)
 #define BLOCK_BYTE_SIZE 34
 vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
-    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
-    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
+    if (binding_idx == BINDING_IDX_K) {
+        const i8vec2 v0 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
+        const i8vec2 v1 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;

-    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+        return float(k_packed.k_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+    } else {
+        const i8vec2 v0 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
+        const i8vec2 v1 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
+
+        return float(v_packed.v_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+    }
 }
 #endif

--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3687,6 +3687,7 @@ struct ggml_tensor * ggml_set_rows(
    result->op     = GGML_OP_SET_ROWS;
    result->src[0] = b;
    result->src[1] = c;
+    result->src[2] = a; // note: order is weird due to legacy reasons (https://github.com/ggml-org/llama.cpp/pull/16063#discussion_r2385795931)

    return result;
 }
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-978f6e1993f2eeb4e99b63d4e70b4401c0a2dae2
+83a15e113b130337a892fb6575c337754557d56f
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2140,6 +2140,27 @@ struct test_set_rows : public test_case {
            }
        }
    }
+
+    double max_nmse_err() override {  // NMSE tolerance for this test; depends on `type`
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_IQ4_NL ||
+            type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1 || type == GGML_TYPE_Q8_0) {
+            // Estimate the NMSE that results when a single quantized value is off by
+            // one step. The test values are distributed in [-1,1], so the squared error
+            // is roughly (2.0 / 2^bits)^2, divided by the sum of squares of the reference,
+            // which is roughly 0.25 times the number of elements.
+            double err_estimate = 1.0f/8.0f;  // 4-bit step: 2.0 / 2^4
+            if (type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+                err_estimate /= 2.0f;  // 5-bit step: 2.0 / 2^5
+            }
+            if (type == GGML_TYPE_Q8_0) {
+                err_estimate /= 8.0f;  // gives 1/64; NOTE(review): the formula above (2.0 / 2^8) would give 1/128 - presumably deliberate slack, confirm
+            }
+            err_estimate *= err_estimate;  // square the single-value step error
+            err_estimate /= 0.25f*float(ne[0] * r * ne[2]*nr23[0] * ne[3]*nr23[1]);  // normalize by ~0.25 * element count
+            return err_estimate;
+        }
+        return 1e-7;  // fixed tolerance for every other type
+    }
 };

 // GGML_OP_ARGMAX
@@ -2430,6 +2451,30 @@ struct test_cpy : public test_case {
    }

    double max_nmse_err() override {
+        if (type_src == type_dst) {
+            return 0.0;  // same source and destination type: tolerance is zero
+        }
+        if (type_dst == GGML_TYPE_Q4_0 || type_dst == GGML_TYPE_Q4_1 || type_dst == GGML_TYPE_IQ4_NL ||
+            type_dst == GGML_TYPE_Q5_0 || type_dst == GGML_TYPE_Q5_1 || type_dst == GGML_TYPE_Q8_0) {
+            // Estimate the NMSE that results when a single quantized value is off by
+            // one step. The test values are distributed in [-150,150], so the squared error
+            // is roughly (150*2.0 / 2^bits)^2, divided by the sum of squares of the reference,
+            // which is roughly 0.25*150^2 times the number of elements.
+            double err_estimate = 1.0f/8.0f * 150.0f;  // 4-bit step: 150*2.0 / 2^4
+            if (type_dst == GGML_TYPE_IQ4_NL) {
+                // iq4_nl values are a bit more spread out, so allow twice the step
+                err_estimate *= 2.0f;
+            }
+            if (type_dst == GGML_TYPE_Q5_0 || type_dst == GGML_TYPE_Q5_1) {
+                err_estimate /= 2.0f;  // 5-bit step: 150*2.0 / 2^5
+            }
+            if (type_dst == GGML_TYPE_Q8_0) {
+                err_estimate /= 8.0f;  // gives 150/64; NOTE(review): the formula above (150*2.0 / 2^8) would give 150/128 - presumably deliberate slack, confirm
+            }
+            err_estimate *= err_estimate;  // square the single-value step error
+            err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]);  // normalize by ~0.25*150^2 * element count
+            return err_estimate;
+        }
        return 1e-6;
    }

@@ -6567,6 +6612,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 1, 1, 1}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16384, 1, 1, 1}, order)); // bailingmoe2 (group selection)
    }

    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -707,6 +707,10 @@ int main(int argc, char ** argv) {

            embd.push_back(id);

+            if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) {
+                assistant_ss << common_token_to_piece(ctx, id, false);
+            }
+
            // echo this to console
            input_echo = true;

@@ -824,11 +828,7 @@ int main(int argc, char ** argv) {
                }
            }

-            // if current token is not EOG, we add it to current assistant message
            if (params.conversation_mode && !waiting_for_first_input) {
-                const auto id = common_sampler_last(smpl);
-                assistant_ss << common_token_to_piece(ctx, id, false);
-
                if (!prompt.empty()) {
                    prompt.clear();
                    is_interacting = false;
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -1931,11 +1931,13 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
    LOG("Maximum KLD: %10.6f\n", kld_values.back());
    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("95.0%%   KLD: %10.6f\n", percentile(kld_values, 0.950f));
    LOG("90.0%%   KLD: %10.6f\n", percentile(kld_values, 0.900f));
    LOG("Median  KLD: %10.6f\n", kld_median);
    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG(" 0.1%%   KLD: %10.6f\n", percentile(kld_values, 0.001f));
    LOG("Minimum KLD: %10.6f\n", kld_values.front());

    LOG("\n");
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/webui/src/app.css
+++ b/tools/server/webui/src/app.css
@@ -39,6 +39,7 @@
 	--sidebar-ring: oklch(0.708 0 0);
 	--code-background: oklch(0.225 0 0);
 	--code-foreground: oklch(0.875 0 0);
+	--layer-popover: 1000000;
 }

 .dark {
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
@@ -362,7 +362,8 @@

 <Dialog.Root {open} onOpenChange={handleClose}>
 	<Dialog.Content
-		class="z-999999 flex h-[100vh] flex-col gap-0 rounded-none p-0 md:h-[64vh] md:rounded-lg"
+		class="z-999999 flex h-[100dvh] max-h-[100dvh] min-h-[100dvh] flex-col gap-0 rounded-none p-0
+			md:h-[64vh] md:max-h-[64vh] md:min-h-0 md:rounded-lg"
 		style="max-width: 48rem;"
 	>
 		<div class="flex flex-1 flex-col overflow-hidden md:flex-row">
@@ -441,7 +442,7 @@
 				</div>
 			</div>

-			<ScrollArea class="max-h-[calc(100vh-13.5rem)] flex-1">
+			<ScrollArea class="max-h-[calc(100dvh-13.5rem)] flex-1 md:max-h-[calc(100vh-13.5rem)]">
 				<div class="space-y-6 p-4 md:p-6">
 					<div>
 						<div class="mb-6 flex hidden items-center gap-2 border-b border-border/30 pb-6 md:flex">
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte
@@ -5,7 +5,6 @@
 	import * as Select from '$lib/components/ui/select';
 	import { Textarea } from '$lib/components/ui/textarea';
 	import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
-	import { IsMobile } from '$lib/hooks/is-mobile.svelte';
 	import { supportsVision } from '$lib/stores/server.svelte';
 	import type { Component } from 'svelte';

@@ -17,8 +16,6 @@
 	}

 	let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
-
-	let isMobile = $state(new IsMobile());
 </script>

 {#each fields as field (field.key)}
@@ -30,10 +27,10 @@

 			<Input
 				id={field.key}
-				value={String(localConfig[field.key] || '')}
+				value={String(localConfig[field.key] ?? '')}
 				onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
-				placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
-				class={isMobile ? 'w-full' : 'max-w-md'}
+				placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
+				class="w-full md:max-w-md"
 			/>
 			{#if field.help || SETTING_CONFIG_INFO[field.key]}
 				<p class="mt-1 text-xs text-muted-foreground">
@@ -47,10 +44,10 @@

 			<Textarea
 				id={field.key}
-				value={String(localConfig[field.key] || '')}
+				value={String(localConfig[field.key] ?? '')}
 				onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
-				placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
-				class={isMobile ? 'min-h-[100px] w-full' : 'min-h-[100px] max-w-2xl'}
+				placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
+				class="min-h-[100px] w-full md:max-w-2xl"
 			/>
 			{#if field.help || SETTING_CONFIG_INFO[field.key]}
 				<p class="mt-1 text-xs text-muted-foreground">
@@ -78,7 +75,7 @@
 					}
 				}}
 			>
-				<Select.Trigger class={isMobile ? 'w-full' : 'max-w-md'}>
+				<Select.Trigger class="w-full md:w-auto md:max-w-md">
 					<div class="flex items-center gap-2">
 						{#if selectedOption?.icon}
 							{@const IconComponent = selectedOption.icon}
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFooter.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFooter.svelte
@@ -1,5 +1,6 @@
 <script lang="ts">
 	import { Button } from '$lib/components/ui/button';
+	import * as AlertDialog from '$lib/components/ui/alert-dialog';

 	interface Props {
 		onReset?: () => void;
@@ -8,8 +9,15 @@

 	let { onReset, onSave }: Props = $props();

-	function handleReset() {
+	let showResetDialog = $state(false);
+
+	function handleResetClick() {
+		showResetDialog = true;
+	}
+
+	function handleConfirmReset() {
 		onReset?.();
+		showResetDialog = false;
 	}

 	function handleSave() {
@@ -18,7 +26,23 @@
 </script>

 <div class="flex justify-between border-t border-border/30 p-6">
-	<Button variant="outline" onclick={handleReset}>Reset to default</Button>
+	<Button variant="outline" onclick={handleResetClick}>Reset to default</Button>

 	<Button onclick={handleSave}>Save settings</Button>
 </div>
+
+<AlertDialog.Root bind:open={showResetDialog}>
+	<AlertDialog.Content>
+		<AlertDialog.Header>
+			<AlertDialog.Title>Reset Settings to Default</AlertDialog.Title>
+			<AlertDialog.Description>
+				Are you sure you want to reset all settings to their default values? This action cannot be
+				undone and will permanently remove all your custom configurations.
+			</AlertDialog.Description>
+		</AlertDialog.Header>
+		<AlertDialog.Footer>
+			<AlertDialog.Cancel>Cancel</AlertDialog.Cancel>
+			<AlertDialog.Action onclick={handleConfirmReset}>Reset to Default</AlertDialog.Action>
+		</AlertDialog.Footer>
+	</AlertDialog.Content>
+</AlertDialog.Root>
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte
@@ -87,7 +87,7 @@
 		<Sidebar.GroupContent>
 			<Sidebar.Menu>
 				{#each filteredConversations as conversation (conversation.id)}
-					<Sidebar.MenuItem class="mb-1" onclick={handleMobileSidebarItemClick}>
+					<Sidebar.MenuItem class="mb-1">
 						<ChatSidebarConversationItem
 							conversation={{
 								id: conversation.id,
@@ -95,6 +95,7 @@
 								lastModified: conversation.lastModified,
 								currNode: conversation.currNode
 							}}
+							{handleMobileSidebarItemClick}
 							isActive={currentChatId === conversation.id}
 							onSelect={selectConversation}
 							onEdit={editConversation}
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
@@ -8,6 +8,7 @@
 	interface Props {
 		isActive?: boolean;
 		conversation: DatabaseConversation;
+		handleMobileSidebarItemClick?: () => void;
 		onDelete?: (id: string) => void;
 		onEdit?: (id: string, name: string) => void;
 		onSelect?: (id: string) => void;
@@ -16,6 +17,7 @@

 	let {
 		conversation,
+		handleMobileSidebarItemClick,
 		onDelete,
 		onEdit,
 		onSelect,
@@ -47,6 +49,7 @@

 	function handleConfirmEdit() {
 		if (!editedName.trim()) return;
+		showEditDialog = false;
 		onEdit?.(conversation.id, editedName);
 	}

@@ -85,7 +88,12 @@
 		: ''}"
 	onclick={handleSelect}
 >
-	<div class="text flex min-w-0 flex-1 items-center space-x-3">
+	<!-- svelte-ignore a11y_click_events_have_key_events -->
+	<!-- svelte-ignore a11y_no_static_element_interactions -->
+	<div
+		class="text flex min-w-0 flex-1 items-center space-x-3"
+		onclick={handleMobileSidebarItemClick}
+	>
 		<div class="min-w-0 flex-1">
 			<p class="truncate text-sm font-medium">{conversation.name}</p>

@@ -178,5 +186,10 @@
 		&:is(:hover) :global([data-slot='dropdown-menu-trigger']) {
 			opacity: 1;
 		}
+		@media (max-width: 768px) {
+			:global([data-slot='dropdown-menu-trigger']) {
+				opacity: 1 !important;
+			}
+		}
 	}
 </style>
--- a/tools/server/webui/src/lib/components/app/misc/ActionDropdown.svelte
+++ b/tools/server/webui/src/lib/components/app/misc/ActionDropdown.svelte
@@ -37,6 +37,7 @@
 <DropdownMenu.Root bind:open>
 	<DropdownMenu.Trigger
 		class="flex h-6 w-6 cursor-pointer items-center justify-center rounded-md p-0 text-sm font-medium transition-colors hover:bg-accent hover:text-accent-foreground focus:bg-accent focus:text-accent-foreground focus:outline-none disabled:pointer-events-none disabled:opacity-50 data-[state=open]:bg-accent data-[state=open]:text-accent-foreground {triggerClass}"
+		onclick={(e) => e.stopPropagation()}
 	>
 		{#if triggerTooltip}
 			<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
@@ -53,7 +54,7 @@
 		{/if}
 	</DropdownMenu.Trigger>

-	<DropdownMenu.Content {align} class="z-999 w-48">
+	<DropdownMenu.Content {align} class="z-[999999] w-48">
 		{#each actions as action, index (action.label)}
 			{#if action.separator && index > 0}
 				<DropdownMenu.Separator />
--- a/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-content.svelte
+++ b/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-content.svelte
@@ -19,7 +19,15 @@
 		bind:ref
 		data-slot="alert-dialog-content"
 		class={cn(
-			'fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-lg border bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg',
+			'fixed z-[999999] grid w-full gap-4 border bg-background p-6 shadow-lg duration-200',
+			// Mobile: Bottom sheet behavior
+			'right-0 bottom-0 left-0 max-h-[100dvh] translate-x-0 translate-y-0 overflow-y-auto rounded-t-lg',
+			'data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:slide-out-to-bottom-full',
+			'data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:slide-in-from-bottom-full',
+			// Desktop: Centered dialog behavior
+			'sm:top-[50%] sm:right-auto sm:bottom-auto sm:left-[50%] sm:max-h-[100vh] sm:max-w-lg sm:translate-x-[-50%] sm:translate-y-[-50%] sm:rounded-lg',
+			'sm:data-[state=closed]:slide-out-to-bottom-0 sm:data-[state=closed]:zoom-out-95',
+			'sm:data-[state=open]:slide-in-from-bottom-0 sm:data-[state=open]:zoom-in-95',
 			className
 		)}
 		{...restProps}
--- a/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-footer.svelte
+++ b/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-footer.svelte
@@ -13,7 +13,10 @@
 <div
 	bind:this={ref}
 	data-slot="alert-dialog-footer"
-	class={cn('flex flex-col-reverse gap-2 sm:flex-row sm:justify-end', className)}
+	class={cn(
+		'mt-6 flex flex-row gap-2 sm:mt-0 sm:justify-end [&>*]:flex-1 sm:[&>*]:flex-none',
+		className
+	)}
 	{...restProps}
 >
 	{@render children?.()}
--- a/tools/server/webui/src/lib/components/ui/dialog/dialog-content.svelte
+++ b/tools/server/webui/src/lib/components/ui/dialog/dialog-content.svelte
@@ -25,7 +25,7 @@
 		bind:ref
 		data-slot="dialog-content"
 		class={cn(
-			'fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-lg border border-border/30 bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg',
+			`fixed top-[50%] left-[50%] z-50 grid max-h-[100dvh] w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 overflow-y-auto rounded-lg border border-border/30 bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg md:max-h-[100vh]`,
 			className
 		)}
 		{...restProps}
--- a/tools/server/webui/src/lib/components/ui/select/select-content.svelte
+++ b/tools/server/webui/src/lib/components/ui/select/select-content.svelte
@@ -1,4 +1,5 @@
 <script lang="ts">
+	import { onDestroy, onMount } from 'svelte';
 	import { Select as SelectPrimitive } from 'bits-ui';
 	import SelectScrollUpButton from './select-scroll-up-button.svelte';
 	import SelectScrollDownButton from './select-scroll-down-button.svelte';
@@ -14,6 +15,76 @@
 	}: WithoutChild<SelectPrimitive.ContentProps> & {
 		portalProps?: SelectPrimitive.PortalProps;
 	} = $props();
+
+	let cleanupInternalListeners: (() => void) | undefined;
+
+	onMount(() => {
+		const listenerOptions: AddEventListenerOptions = { passive: false };
+
+		const blockOutsideWheel = (event: WheelEvent) => {
+			if (!ref) {
+				return;
+			}
+
+			const target = event.target as Node | null;
+
+			if (!target || !ref.contains(target)) {
+				event.preventDefault();
+				event.stopPropagation();
+			}
+		};
+
+		const blockOutsideTouchMove = (event: TouchEvent) => {
+			if (!ref) {
+				return;
+			}
+
+			const target = event.target as Node | null;
+
+			if (!target || !ref.contains(target)) {
+				event.preventDefault();
+				event.stopPropagation();
+			}
+		};
+
+		document.addEventListener('wheel', blockOutsideWheel, listenerOptions);
+		document.addEventListener('touchmove', blockOutsideTouchMove, listenerOptions);
+
+		return () => {
+			document.removeEventListener('wheel', blockOutsideWheel, listenerOptions);
+			document.removeEventListener('touchmove', blockOutsideTouchMove, listenerOptions);
+		};
+	});
+
+	$effect(() => {
+		const element = ref;
+
+		cleanupInternalListeners?.();
+
+		if (!element) {
+			return;
+		}
+
+		const stopWheelPropagation = (event: WheelEvent) => {
+			event.stopPropagation();
+		};
+
+		const stopTouchPropagation = (event: TouchEvent) => {
+			event.stopPropagation();
+		};
+
+		element.addEventListener('wheel', stopWheelPropagation);
+		element.addEventListener('touchmove', stopTouchPropagation);
+
+		cleanupInternalListeners = () => {
+			element.removeEventListener('wheel', stopWheelPropagation);
+			element.removeEventListener('touchmove', stopTouchPropagation);
+		};
+	});
+
+	onDestroy(() => {
+		cleanupInternalListeners?.();
+	});
 </script>

 <SelectPrimitive.Portal {...portalProps}>
@@ -22,7 +93,7 @@
 		{sideOffset}
 		data-slot="select-content"
 		class={cn(
-			'relative z-50 max-h-(--bits-select-content-available-height) min-w-[8rem] origin-(--bits-select-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border bg-popover text-popover-foreground shadow-md data-[side=bottom]:translate-y-1 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:-translate-x-1 data-[side=left]:slide-in-from-right-2 data-[side=right]:translate-x-1 data-[side=right]:slide-in-from-left-2 data-[side=top]:-translate-y-1 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
+			'relative z-[var(--layer-popover,1000000)] max-h-(--bits-select-content-available-height) min-w-[8rem] origin-(--bits-select-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border bg-popover text-popover-foreground shadow-md data-[side=bottom]:translate-y-1 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:-translate-x-1 data-[side=left]:slide-in-from-right-2 data-[side=right]:translate-x-1 data-[side=right]:slide-in-from-left-2 data-[side=top]:-translate-y-1 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
 			className
 		)}
 		{...restProps}
--- a/tools/server/webui/src/lib/utils/thinking.ts
+++ b/tools/server/webui/src/lib/utils/thinking.ts
@@ -1,7 +1,8 @@
 /**
- * Parses thinking content from a message that may contain <think> tags
+ * Parses thinking content from a message that may contain <think> tags or [THINK] tags
 * Returns an object with thinking content and cleaned message content
- * Handles both complete <think>...</think> blocks and incomplete <think> blocks (streaming)
+ * Handles both complete blocks and incomplete blocks (streaming)
+ * Supports formats: <think>...</think> and [THINK]...[/THINK]
 * @param content - The message content to parse
 * @returns An object containing the extracted thinking content and the cleaned message content
 */
@@ -9,12 +10,11 @@ export function parseThinkingContent(content: string): {
 	thinking: string | null;
 	cleanContent: string;
 } {
-	const incompleteMatch = content.includes('<think>') && !content.includes('</think>');
+	const incompleteThinkMatch = content.includes('<think>') && !content.includes('</think>');
+	const incompleteThinkBracketMatch = content.includes('[THINK]') && !content.includes('[/THINK]');

-	if (incompleteMatch) {
-		// Remove the entire <think>... part from clean content
+	if (incompleteThinkMatch) {
 		const cleanContent = content.split('</think>')?.[1]?.trim();
-		// Extract everything after <think> as thinking content
 		const thinkingContent = content.split('<think>')?.[1]?.trim();

 		return {
@@ -23,12 +23,40 @@ export function parseThinkingContent(content: string): {
 		};
 	}

-	const completeMatch = content.includes('</think>');
+	if (incompleteThinkBracketMatch) {
+		const cleanContent = content.split('[/THINK]')?.[1]?.trim();
+		const thinkingContent = content.split('[THINK]')?.[1]?.trim();

-	if (completeMatch) {
 		return {
-			thinking: content.split('</think>')?.[0]?.trim(),
-			cleanContent: content.split('</think>')?.[1]?.trim()
+			cleanContent,
+			thinking: thinkingContent
+		};
+	}
+
+	const completeThinkMatch = content.match(/<think>([\s\S]*?)<\/think>/);
+	const completeThinkBracketMatch = content.match(/\[THINK\]([\s\S]*?)\[\/THINK\]/);
+
+	if (completeThinkMatch) {
+		const thinkingContent = completeThinkMatch[1]?.trim() ?? '';
+		const cleanContent = `${content.slice(0, completeThinkMatch.index ?? 0)}${content.slice(
+			(completeThinkMatch.index ?? 0) + completeThinkMatch[0].length
+		)}`.trim();
+
+		return {
+			thinking: thinkingContent,
+			cleanContent
+		};
+	}
+
+	if (completeThinkBracketMatch) {
+		const thinkingContent = completeThinkBracketMatch[1]?.trim() ?? '';
+		const cleanContent = `${content.slice(0, completeThinkBracketMatch.index ?? 0)}${content.slice(
+			(completeThinkBracketMatch.index ?? 0) + completeThinkBracketMatch[0].length
+		)}`.trim();
+
+		return {
+			thinking: thinkingContent,
+			cleanContent
 		};
 	}

@@ -39,26 +67,33 @@ export function parseThinkingContent(content: string): {
 }

 /**
- * Checks if content contains an opening <think> tag (for streaming)
+ * Checks if content contains an opening thinking tag (for streaming)
+ * Supports the <think>, [THINK], and <|channel|>analysis formats
 * @param content - The message content to check
- * @returns True if the content contains an opening <think> tag
+ * @returns True if the content contains an opening thinking tag
 */
 export function hasThinkingStart(content: string): boolean {
-	return content.includes('<think>') || content.includes('<|channel|>analysis');
+	return (
+		content.includes('<think>') ||
+		content.includes('[THINK]') ||
+		content.includes('<|channel|>analysis')
+	);
 }

 /**
- * Checks if content contains a closing </think> tag (for streaming)
+ * Checks if content contains a closing thinking tag (for streaming)
+ * Supports both </think> and [/THINK] formats
 * @param content - The message content to check
- * @returns True if the content contains a closing </think> tag
+ * @returns True if the content contains a closing thinking tag
 */
 export function hasThinkingEnd(content: string): boolean {
-	return content.includes('</think>');
+	return content.includes('</think>') || content.includes('[/THINK]');
 }

 /**
 * Extracts partial thinking content during streaming
- * Used when we have <think> but not yet </think>
+ * Supports both <think> and [THINK] formats
+ * Used when we have an opening tag but not yet a closing tag
 * @param content - The message content to extract partial thinking from
 * @returns An object containing the extracted partial thinking content and the remaining content
 */
@@ -66,23 +101,41 @@ export function extractPartialThinking(content: string): {
 	thinking: string | null;
 	remainingContent: string;
 } {
-	const startIndex = content.indexOf('<think>');
-	if (startIndex === -1) {
+	const thinkStartIndex = content.indexOf('<think>');
+	const thinkEndIndex = content.indexOf('</think>');
+
+	const bracketStartIndex = content.indexOf('[THINK]');
+	const bracketEndIndex = content.indexOf('[/THINK]');
+
+	const useThinkFormat =
+		thinkStartIndex !== -1 && (bracketStartIndex === -1 || thinkStartIndex < bracketStartIndex);
+	const useBracketFormat =
+		bracketStartIndex !== -1 && (thinkStartIndex === -1 || bracketStartIndex < thinkStartIndex);
+
+	if (useThinkFormat) {
+		if (thinkEndIndex === -1) {
+			const thinkingStart = thinkStartIndex + '<think>'.length;
+
+			return {
+				thinking: content.substring(thinkingStart),
+				remainingContent: content.substring(0, thinkStartIndex)
+			};
+		}
+	} else if (useBracketFormat) {
+		if (bracketEndIndex === -1) {
+			const thinkingStart = bracketStartIndex + '[THINK]'.length;
+
+			return {
+				thinking: content.substring(thinkingStart),
+				remainingContent: content.substring(0, bracketStartIndex)
+			};
+		}
+	} else {
 		return { thinking: null, remainingContent: content };
 	}

-	const endIndex = content.indexOf('</think>');
-	if (endIndex === -1) {
-		// Still streaming thinking content
-		const thinkingStart = startIndex + '<think>'.length;
-		return {
-			thinking: content.substring(thinkingStart),
-			remainingContent: content.substring(0, startIndex)
-		};
-	}
-
-	// Complete thinking block found
 	const parsed = parseThinkingContent(content);
+
 	return {
 		thinking: parsed.thinking,
 		remainingContent: parsed.cleanContent
--- a/tools/server/webui/src/stories/ChatMessage.stories.svelte
+++ b/tools/server/webui/src/stories/ChatMessage.stories.svelte
@@ -59,6 +59,60 @@
 		thinking: '',
 		children: []
 	});
+
+	// Message with <think> format thinking content
+	const thinkTagMessage: DatabaseMessage = {
+		id: '6',
+		convId: 'conv-1',
+		type: 'message',
+		timestamp: Date.now() - 1000 * 60 * 2,
+		role: 'assistant',
+		content:
+			"<think>\nLet me analyze this step by step:\n\n1. The user is asking about thinking formats\n2. I need to demonstrate the &lt;think&gt; tag format\n3. This content should be displayed in the thinking section\n4. The main response should be separate\n\nThis is a good example of reasoning content.\n</think>\n\nHere's my response after thinking through the problem. The thinking content above should be displayed separately from this main response content.",
+		parent: '1',
+		thinking: '',
+		children: []
+	};
+
+	// Message with [THINK] format thinking content
+	const thinkBracketMessage: DatabaseMessage = {
+		id: '7',
+		convId: 'conv-1',
+		type: 'message',
+		timestamp: Date.now() - 1000 * 60 * 1,
+		role: 'assistant',
+		content:
+			'[THINK]\nThis is the DeepSeek-style thinking format:\n\n- Using square brackets instead of angle brackets\n- Should work identically to the &lt;think&gt; format\n- Content parsing should extract this reasoning\n- Display should be the same as &lt;think&gt; format\n\nBoth formats should be supported seamlessly.\n[/THINK]\n\nThis is the main response content that comes after the [THINK] block. The reasoning above should be parsed and displayed in the thinking section.',
+		parent: '1',
+		thinking: '',
+		children: []
+	};
+
+	// Streaming message for <think> format
+	let streamingThinkMessage = $state({
+		id: '8',
+		convId: 'conv-1',
+		type: 'message',
+		timestamp: 0, // No timestamp = streaming
+		role: 'assistant',
+		content: '',
+		parent: '1',
+		thinking: '',
+		children: []
+	});
+
+	// Streaming message for [THINK] format
+	let streamingBracketMessage = $state({
+		id: '9',
+		convId: 'conv-1',
+		type: 'message',
+		timestamp: 0, // No timestamp = streaming
+		role: 'assistant',
+		content: '',
+		parent: '1',
+		thinking: '',
+		children: []
+	});
 </script>

 <Story
@@ -144,3 +198,115 @@
 		await new Promise(resolve => setTimeout(resolve, 100));
 	}}
 />
+
+<Story
+	name="ThinkTagFormat"
+	args={{
+		class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+		message: thinkTagMessage
+	}}
+/>
+
+<Story
+	name="ThinkBracketFormat"
+	args={{
+		class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
+		message: thinkBracketMessage
+	}}
+/>
+
+<Story
+	name="StreamingThinkTag"
+	args={{
+		message: streamingThinkMessage
+	}}
+	parameters={{
+		test: {
+			timeout: 30000
+		}
+	}}
+	asChild
+	play={async () => {
+		// Phase 1: Stream <think> reasoning content
+		const thinkingContent =
+			'Let me work through this problem systematically:\n\n1. First, I need to understand what the user is asking\n2. Then I should consider different approaches\n3. I need to evaluate the pros and cons\n4. Finally, I should provide a clear recommendation\n\nThis step-by-step approach will ensure accuracy.';
+
+		let currentContent = '<think>\n';
+		streamingThinkMessage.content = currentContent;
+
+		for (let i = 0; i < thinkingContent.length; i++) {
+			currentContent += thinkingContent[i];
+			streamingThinkMessage.content = currentContent;
+			await new Promise((resolve) => setTimeout(resolve, 5));
+		}
+
+		// Close the thinking block
+		currentContent += '\n</think>\n\n';
+		streamingThinkMessage.content = currentContent;
+		await new Promise((resolve) => setTimeout(resolve, 200));
+
+		// Phase 2: Stream main response content
+		const responseContent =
+			"Based on my analysis above, here's the solution:\n\n**Key Points:**\n- The approach should be systematic\n- We need to consider all factors\n- Implementation should be step-by-step\n\nThis ensures the best possible outcome.";
+
+		for (let i = 0; i < responseContent.length; i++) {
+			currentContent += responseContent[i];
+			streamingThinkMessage.content = currentContent;
+			await new Promise((resolve) => setTimeout(resolve, 10));
+		}
+
+		streamingThinkMessage.timestamp = Date.now();
+	}}
+>
+	<div class="w-[56rem]">
+		<ChatMessage message={streamingThinkMessage} />
+	</div>
+</Story>
+
+<Story
+	name="StreamingThinkBracket"
+	args={{
+		message: streamingBracketMessage
+	}}
+	parameters={{
+		test: {
+			timeout: 30000
+		}
+	}}
+	asChild
+	play={async () => {
+		// Phase 1: Stream [THINK] reasoning content
+		const thinkingContent =
+			'Using the DeepSeek format now:\n\n- This demonstrates the &#91;THINK&#93; bracket format\n- Should parse identically to &lt;think&gt; tags\n- The UI should display this in the thinking section\n- Main content should be separate\n\nBoth formats provide the same functionality.';
+
+		let currentContent = '[THINK]\n';
+		streamingBracketMessage.content = currentContent;
+
+		for (let i = 0; i < thinkingContent.length; i++) {
+			currentContent += thinkingContent[i];
+			streamingBracketMessage.content = currentContent;
+			await new Promise((resolve) => setTimeout(resolve, 5));
+		}
+
+		// Close the thinking block
+		currentContent += '\n[/THINK]\n\n';
+		streamingBracketMessage.content = currentContent;
+		await new Promise((resolve) => setTimeout(resolve, 200));
+
+		// Phase 2: Stream main response content
+		const responseContent =
+			"Here's my response after using the &#91;THINK&#93; format:\n\n**Observations:**\n- Both &lt;think&gt; and &#91;THINK&#93; formats work seamlessly\n- The parsing logic handles both cases\n- UI display is consistent across formats\n\nThis demonstrates the enhanced thinking content support.";
+
+		for (let i = 0; i < responseContent.length; i++) {
+			currentContent += responseContent[i];
+			streamingBracketMessage.content = currentContent;
+			await new Promise((resolve) => setTimeout(resolve, 10));
+		}
+
+		streamingBracketMessage.timestamp = Date.now();
+	}}
+>
+	<div class="w-[56rem]">
+		<ChatMessage message={streamingBracketMessage} />
+	</div>
+</Story>
Author	SHA1	Message	Date
Charles Xu	f1eb1cb1eb	kleidiai : fix work size and threads sync for fp16 (#16246 )	2025-09-30 10:07:20 +03:00
lhez	de41f2b7bf	codeowners: add codeowners for opencl backend (#16344 )	2025-09-30 08:30:16 +03:00
Jeff Bolz	a74a0d69f3	tests: override test_set_rows::max_nmse_err to allow for occasional rounding differences (#16295 ) * tests: override test_set_rows::max_nmse_err to allow for occasional rounding differences * apply similar error bounds to test_cpy	2025-09-29 19:26:34 -05:00
Pascal	5f7e166cbf	Fix thinking blocks with quotes + add handling `[THINK]...[/THINK]` blocks (#16326 ) * fix: prevent reasoning blocks with quotes from being truncated * chore: update webui build output * feat: Improve thinking content parsing * test: Adds ChatMessage component stories for different thinking blocks * chore: update webui build output * fix: ChatMessage story fix --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>	2025-09-29 18:49:47 +02:00
Georgi Gerganov	d72f5f7ba2	ci : add AMD runners and workflows (#16249 ) * ci : add AMD runners and workflows * ci : move AMD jobs to separate workflow * cont : fix paths	2025-09-29 17:51:48 +03:00
alex-spacemit	b77e6c18e1	ggml: riscv: add riscv spacemit backend (#15288 ) * ggml: add spacemit backend Change-Id: I249bdc043485d815a9c351867137bc1e27cc2e23 * add new line at end of file Change-Id: I889ed1c85fb45e62350ecde0c06f70450cadfbe2 * add riscv zba extension limit Change-Id: I321eb200f859751727afe5cae13074dfce2bb0ce * fixed for review comments, file renamed and format Change-Id: Ia20b6ec24a36638e62e0fe07cf100916a7cce3ce * fixed for code format, after clang-format Change-Id: I5dc33a0412da3d3f2d77075d8939185d3009eca2 * use _Float16 instead of __fp16 Change-Id: I039fb02bb95270e641bc4442204e658735859d43 * add ci for riscv64-spacemit-ime-native Change-Id: I711c1033061df1a289ea77891b2997599dfe8279 * update debian-13-riscv64-spacemit-ime-native ci label Change-Id: Ifb2b891e2fca57b5da604fce2ac255f27731179a * remove license comment for spacemit ime Change-Id: If0dc3ca30a958631ccca0a28b62e0b825f9fb0c3 * upgrade binutils for gcc ime Change-Id: Ibf2fa74c1064408974cb5b45f044d40987e5fb45 * add spacemit ime cross jobs Change-Id: I80d74909941d41cb9cd09e51d8baf01c985cbfc6 * remove native compile for riscv64-spacemit-ime Change-Id: I01920afafdc73fa7424014fd648d243f8ec9e25e * ci : add caching for spacemit ime cross toolchain Change-Id: Ic54a192019a2fd982bbd58225ce3bbc38f4053de * ci: bug fixed for cache path and env Change-Id: I28c42e10b6fff053bb6580926ca2353448cb042a * Update .github/workflows/build-linux-cross.yml for cache path Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * bugfixed for build-linux-cross.yml, syntax error Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: cailinxi <linxi.cai@spacemit.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-09-29 17:50:44 +03:00
Georgi Gerganov	2ddd3f2356	sync : ggml	2025-09-29 17:43:58 +03:00
Georgi Gerganov	4d3d455d3c	sync : whisper.cpp (ggml/1359) * ggml : Fix MKL detection by quoting BLAS_INCLUDE_DIRS (whisper/3426) * sync : whisper.cpp	2025-09-29 17:43:58 +03:00
Daniel Bevenius	c9b1c06467	ggml : remove -dev suffix from release version (ggml/1355) This commit removes the `-dev` suffix from the version string in CMakeLists.txt and the release script. The version will now be just be formatted as `MAJOR.MINOR.PATCH`.	2025-09-29 17:43:58 +03:00
Daniel Bevenius	b6ae75afb4	ggml : bump version to 0.9.3 (ggml/1353)	2025-09-29 17:43:58 +03:00
Georgi Gerganov	b6dff20e2f	ggml : prepare for development of 0.9.2-dev	2025-09-29 17:43:58 +03:00
Georgi Gerganov	2db78c75e4	ggml : bump version to 0.9.1	2025-09-29 17:43:58 +03:00
Rafal Lewczuk	02463ab27b	ggml-backend : add root cause in error message if loading backend library fails (#16172 ) This PR adds additional information to an error message when loading backend library via ld_load_library() fails. This helps spotting why backend library did not load (missing library, missing dependency or unresolved symbol etc.).	2025-09-29 13:17:09 +02:00
Sigbjørn Skjæret	adc76347d7	ggml : check cuda and metal argsort limits and add test (#16323 ) * check cuda argsort limits and add test * add metal check	2025-09-29 11:09:00 +02:00
Aleksander Grygier	3a2bdcda0b	Improve Mobile UI for dialogs and action dropdowns (#16222 ) * fix: Always show conversation item actions * feat: Improve Alert Dialog and Dialog mobile UI * feat: Add settings reset to default confirmation * fix: Close Edit dialog on save * chore: update webui build output * webui: implement proper z-index system and scroll management - Add CSS variable for centralized z-index control - Fix dropdown positioning with Settings dialog conflicts - Prevent external scroll interference with proper event handling - Clean up hardcoded z-index values for maintainable architecture * webui: ensured the settings dialog enforces dynamic viewport height on mobile while retaining existing desktop sizing overrides * feat: Use `dvh` instead of computed px height for dialogs max height on mobile * chore: update webui build output * feat: Improve Settings fields UI * chore: update webui build output * chore: update webui build output --------- Co-authored-by: Pascal <admin@serveurperso.com>	2025-09-29 10:37:20 +02:00
Pascal	66bb7985c3	fix: preserved zero values in chat settings inputs and textareas by switching to nullish coalescing for field values and default placeholders (#16312 )	2025-09-29 09:08:41 +02:00
Vinkal	2f61c0f5bf	llama-cli: prevent spurious assistant token (#16202 ) * tools/main: llama-cli: prevent spurious assistant token (#13402) During prompt ingestion, prompt tokens are accepted into the sampler history (for repetition penalties). The conversation-mode path then appended `common_sampler_last(smpl)` to `assistant_ss` before any new token was sampled. At that point, "last" was a prompt-side token (e.g., an input prefix), so the assistant chat message began with an extra piece. Fix: append to `assistant_ss` only for a newly sampled (non-EOG) token. This affects only chat message assembly (`assistant_ss` / `chat_msgs` / `common_chat_format_single`); terminal stdout is unchanged. Sampling order/logits are unchanged. Fixes #13402. Signed-off-by: Vinkal Chudgar <vinkal.chudgar@gmail.com> * Update tools/main/main.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * tools/main: remove outdated comment Signed-off-by: Vinkal Chudgar <vinkal.chudgar@gmail.com> --------- Signed-off-by: Vinkal Chudgar <vinkal.chudgar@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2025-09-29 10:03:12 +03:00
ddh0	3ffd0fae47	perplexity : show more kl-divergence data (#16321 ) Adds additional percentile data for displayed in the output of `llama-perplexity --kl-divergence`: - Added 95 percentile (mirroring existing 5 percentile) - Added 0.1 percentile (mirroring existing 99.9 percentile)	2025-09-29 09:30:45 +03:00
Georgi Gerganov	a4a0aa5ea2	ggml : fix dependencies for ggml_set_rows (#16318 )	2025-09-29 08:41:28 +03:00
Jeff Bolz	92cd103f62	vulkan: Fix validation failure in quantized flash attention (#16292 )	2025-09-29 06:50:37 +02:00