Compare commits

...

136 Commits

Author SHA1 Message Date
Georgi Gerganov
fd6f79c7a4 download : prefer q8_0 when q4_k not available 2026-04-27 12:08:25 +03:00
ynankani
0f1bb602dd model : remove duplicate wo_s scale after build_attn (Qwen3, LLaMA) (#22421)
Signed-off-by: Yash Nankani <ynankani@nvidia.com>
2026-04-27 09:58:48 +02:00
Sigbjørn Skjæret
d13540becd convert : remove input_scale for dequantized fp8 modelopt (#22356) 2026-04-27 08:45:01 +02:00
Adrien Gallouët
f84270ea10 ggml : use 64-byte aligned tile buffers (#21058)
| Model                            | Test   |   t/s OLD |   t/s NEW |   Speedup |
|:---------------------------------|:-------|----------:|----------:|----------:|
| qwen35 0.8B BF16                 | pp512  |    584.59 |    595.41 |      1.02 |
| qwen35 0.8B BF16                 | tg128  |     52.23 |     52.82 |      1.01 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | pp512  |    260.64 |    261.70 |      1.00 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | tg128  |     81.17 |     80.89 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | pp512  |    302.36 |    302.56 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | tg128  |     84.93 |     85.12 |      1.00 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | pp512  |    263.22 |    260.01 |      0.99 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | tg128  |     80.29 |     78.94 |      0.98 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | pp512  |    728.65 |    742.09 |      1.02 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | tg128  |     82.39 |     84.46 |      1.03 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | pp512  |    681.33 |    677.06 |      0.99 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | tg128  |     80.18 |     79.28 |      0.99 |
| qwen35 0.8B Q2_K_M               | pp512  |    413.28 |    415.94 |      1.01 |
| qwen35 0.8B Q2_K_M               | tg128  |     81.90 |     82.78 |      1.01 |
| qwen35 0.8B Q3_K_M               | pp512  |    493.17 |    495.08 |      1.00 |
| qwen35 0.8B Q3_K_M               | tg128  |     82.75 |     83.23 |      1.01 |
| qwen35 0.8B Q3_K_S               | pp512  |    429.35 |    427.64 |      1.00 |
| qwen35 0.8B Q3_K_S               | tg128  |     86.69 |     87.02 |      1.00 |
| qwen35 0.8B Q4_0                 | pp512  |    783.46 |    782.32 |      1.00 |
| qwen35 0.8B Q4_0                 | tg128  |     88.23 |     87.90 |      1.00 |
| qwen35 0.8B Q4_1                 | pp512  |    741.71 |    729.76 |      0.98 |
| qwen35 0.8B Q4_1                 | tg128  |     85.44 |     86.01 |      1.01 |
| qwen35 0.8B Q4_K_M               | pp512  |    676.24 |    681.31 |      1.01 |
| qwen35 0.8B Q4_K_M               | tg128  |     76.59 |     77.06 |      1.01 |
| qwen35 0.8B Q4_K_S               | pp512  |    683.12 |    688.81 |      1.01 |
| qwen35 0.8B Q4_K_S               | tg128  |     80.50 |     81.19 |      1.01 |
| qwen35 0.8B Q5_K_M               | pp512  |    635.33 |    642.11 |      1.01 |
| qwen35 0.8B Q5_K_M               | tg128  |     72.07 |     72.49 |      1.01 |
| qwen35 0.8B Q5_K_S               | pp512  |    660.95 |    658.18 |      1.00 |
| qwen35 0.8B Q5_K_S               | tg128  |     72.19 |     72.95 |      1.01 |
| qwen35 0.8B Q6_K                 | pp512  |    647.97 |    638.84 |      0.99 |
| qwen35 0.8B Q6_K                 | tg128  |     72.83 |     72.49 |      1.00 |
| qwen35 0.8B Q8_0                 | pp512  |    805.01 |    785.49 |      0.98 |
| qwen35 0.8B Q8_0                 | tg128  |     70.10 |     70.13 |      1.00 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-27 09:30:55 +03:00
Max Krasnyansky
5594d13224 common: fix missing exports in llama-common (#22340)
* common: refactor common/debug to move abort_on_nan into base_callback_data

Passing bool abort_on_nan as template parameter for common_debug_cb_eval is unnecessary and creates an issue with LTO.
It should just be a member of the base_callback_data instead.

* cont : cleanup

* common : use pimpl in debug.h to reduce header dependencies

Move common_debug_cb_user_data's data members (std::regex,
std::vector<uint8_t>) into a private impl struct in debug.cpp.

This removes the includes of common.h and <regex> from debug.h,
reducing transitive dependencies for any translation unit that
includes the header.
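For reference, a minimal pimpl sketch of the pattern described above (member names are hypothetical; the actual debug.h/debug.cpp layout in the PR may differ):

```
// debug.h -- only a forward declaration is needed, so <regex> and common.h
// no longer have to be included by every user of this header
#include <memory>

struct common_debug_cb_user_data {
    common_debug_cb_user_data();
    ~common_debug_cb_user_data();

    struct impl;                  // defined in debug.cpp
    std::unique_ptr<impl> pimpl;  // heavy members live behind this pointer
};

// debug.cpp -- the heavy dependencies stay inside this translation unit
#include <regex>
#include <vector>
#include <cstdint>

struct common_debug_cb_user_data::impl {
    std::regex           tensor_filter;  // hypothetical member names
    std::vector<uint8_t> scratch;
};

common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(new impl()) {}
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
```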

Assisted-by: llama.cpp:local pi

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-27 08:06:39 +03:00
Georgi Gerganov
f535774325 pr2wt : symlink .pi (#22386) 2026-04-26 19:49:26 +03:00
Rithik Sharma
06a811d085 add performance-portable tuning for register-tile and subgroup matmul (#22241) 2026-04-26 09:26:28 -07:00
Gaurav Garg
78433f606f Fix recurrent state serialization for partial reads and writes (#22362)
The previous code worked only for full tensor reads and writes and was hitting `GGML_ASSERT(size == ggml_nbytes(tensor)); ` assert when tested with llama-server.
2026-04-26 13:34:40 +02:00
Johannes Gäßler
7ec36aa861 Github: set meta backend code owner (#22388) 2026-04-26 13:34:13 +02:00
Oliver Simons
b1a5bd4e0c CUDA: better coalesce data-access for contiguous concat (#22330)
Also, distribute all elements across CTAs evenly instead of launching
one CTA per dim
2026-04-26 09:21:45 +02:00
Sigbjørn Skjæret
0c6ee1cade ggml-cpu : re-enable fast gelu_quick_f16 (#22339) 2026-04-26 09:28:14 +03:00
Eve
2dd84169d1 ggml-cpu: optimize avx2 q6_k (#22345) 2026-04-26 09:27:50 +03:00
lhez
f454bd7eb8 opencl: add iq4_nl support (#22272)
* opencl: add general support for iq4_nl

* opencl: add iq4_nl gemm/gemv for adreno

* opencl: pack 2 lut entries into a uint
2026-04-25 21:21:58 -07:00
Trivikram Reddy
b760272f1a hexagon: guard HMX clock request for v75+ platforms (#22377) 2026-04-25 17:58:26 -07:00
Piotr Wilkin (ilintar)
dcad77cc3b chat: fix handling of space in reasoning markers (#22353)
* chat: fix handling of space in reasoning markers

* fix tests

* whitespace
2026-04-25 21:24:13 +02:00
Georgi Gerganov
98dc1418ea spec : fix vocab compat checks (#22358) 2026-04-25 20:11:35 +03:00
Johannes Gäßler
9725a313be CUDA: reduce MMQ stream-k overhead (#22298)
* CUDA: reduce MMQ stream-k overhead

* use 32 bit integers for kbc
2026-04-25 14:15:03 +02:00
Developer-Ecosystem-Engineering
d1649047a3 metal : optimize Metal Tensor API usage for GGML_OP_MUL_MAT (#20962)
* Optimize Metal Tensor API usage for matmul2d

Separates the Metal Tensor API (matmul2d) path in kernel_mul_mm into its own standalone kernel, gated by GGML_METAL_HAS_TENSOR.

The legacy simdgroup_matrix kernel is preserved under #else.

Previously both paths were interleaved via #ifdef blocks within a single kernel, forcing the tensor path to share the legacy kernel's data layout and threadgroup memory scheme. Splitting the kernel enabled memory and dispatch optimizations that weren't possible when the two paths shared code structure.

* cont : cleanup

* cont : cleanup

* cont : cleanup

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-25 15:14:28 +03:00
ddh0
9d34231bb8 llama-quant : default ftype param Q5_1 --> Q8_0 (#20828)
Change the default `ftype` in `llama_model_quantize_params` from
`LLAMA_FTYPE_MOSTLY_Q5_1` to `LLAMA_FTYPE_MOSTLY_Q8_0`.

In case some external program naively uses the default quantization
params, we should probably default to a known-good type like Q8_0 rather
than Q5_1, which is rather old.
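As a rough illustration of what changes for a naive caller (a sketch against the public llama.h quantization API; the file names are placeholders):

```
#include "llama.h"
#include <cstdio>

int main() {
    // With this change the defaults carry LLAMA_FTYPE_MOSTLY_Q8_0 instead of
    // Q5_1, so a caller that never touches params.ftype now produces Q8_0.
    llama_model_quantize_params params = llama_model_quantize_default_params();

    // An explicit override still behaves exactly as before, e.g.:
    // params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;

    if (llama_model_quantize("model-f16.gguf", "model-quant.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```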
2026-04-25 09:25:35 +03:00
Georgi Gerganov
8ea8fee966 gitignore : add .pi + personal SYSTEM.md (#22316)
* gitignore : add .pi + personal SYSTEM.md

* cont : fix requirements heading in PR template

* cont : shorten line
2026-04-25 09:20:45 +03:00
Neo Zhang
eddd7a13a5 [SYCL] Optimize Q4_0 mul_mat for Arc770, add scripts (#22291)
* opt arc770 for Q4_0

* add for Q4_0

* update the script

* add help script for windows

* update guide

* fix format issue

* convert from dos to unix for format issue

* fix missed -sm parameter
2026-04-25 09:20:14 +03:00
Reese Levine
dd2914dc81 ggml-webgpu: support for SSM_SCAN and disable set_rows error checking (#22327)
* Implement ssm_scan

* Remove blocking in graph_compute and check for set rows

* Fix bindings

* Update op support
2026-04-25 09:18:15 +03:00
Piotr Wilkin (ilintar)
0adede866d parser: fix structured output bug (#22302)
* fix very stupid structured output bug

* Things just cannot be too easy.
2026-04-24 23:19:55 +02:00
Trivikram Reddy
361fe72acb Hexagon: Bump HMX Frequency to Max Corner (#22334)
* hexagon: bump HMX freq to max corner

* hex-mm: fix error in log msg
2026-04-24 13:55:17 -07:00
Shreya Jain
a702f39597 CI Snapdragon: Switch ubuntu-latest to ubuntu-slim runner (#22303)
* switch ubuntu-latest to ubuntu-slim

* Fix the path for upload so CI doesn't fail

* Update .github/workflows/build-and-test-snapdragon.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Use -slim image for key check and consistent naming for artifact dir

Signed-off-by: Max Krasnyansky <maxk@qti.qualcomm.com>

* Remove check-secret extra job

* move QDC key check for Run QDC jobs step specifically

* add a step before to check the secret for qdc jobs

---------

Signed-off-by: Max Krasnyansky <maxk@qti.qualcomm.com>
Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-24 21:21:36 +02:00
Zheyuan Chen
13d36cf891 ggml-webgpu: enable FLASH_ATTN_EXT on browser without subgroup matrix (#22199)
* ggml-webgpu: add tile flash attention fallback

* ggml-webgpu: add new fields and discard usage of mnk for tile version

* ggml-webgpu: modify the vec path to discard the mnk parameter

* ggml-webgpu: enable flash attention vec and tile version for browser

* ggml-webgpu: staging KV for flash attention tile version

* formatting

* turn on subgroup uniformity check

* remove Q_TILE as it is always 1 for vec path

* make row_max and exp_sum to local register

* make different bindings with same underlying buffer to have the same usage flags

* move path selection into the shader library and have the host consume a single flash-attn decision object.

* turn off skip_validation and address buffer overlapping when nwg==1

* formatting

* merge binding when kv overlap
2026-04-24 10:39:09 -07:00
Mengsheng Wu
f65bc34c68 hexagon: use DIRID 13 in libggml-htp.inf for modern InfVerif (#22306) 2026-04-24 09:21:33 -07:00
Georgi Gerganov
15fa3c493b metal : print GPU description (#22318) 2026-04-24 13:56:03 +03:00
Adrien Gallouët
dc80c5252a common : fix jinja warnings with clang 21 (#22313)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-24 12:36:02 +02:00
Georgi Gerganov
e583f3b4f5 ggml : minor coding style (#22308) 2026-04-24 11:02:00 +03:00
Georgi Gerganov
017f090442 jinja : remove unused header (#22310) 2026-04-24 11:01:46 +03:00
Georgi Gerganov
ffdd983fb8 server : fix swa-full logic (#22288) 2026-04-24 10:17:37 +03:00
Yes You Can Have Your Own
793d0a7931 server: rename debug tags to match --cache-idle-slots naming (#22292) 2026-04-24 09:28:44 +03:00
Mengsheng Wu
8bc492ebb4 hexagon: add SOLVE_TRI op (#21974)
* hexagon: add SOLVE_TRI op

* ggml: fix TODO description for solve_tri

* hexagon: rm unused variable/function warnings

* hexagon: chunk vs batch processing for better thread utilization

* hexagon: vectorize partial f32 loads

* hexagon: move HVX f32 add/sub/mul wrappers to hvx-base.h

---------

Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
2026-04-23 18:39:13 -07:00
Chen Yuan
e5f070a1dc fix(shader): handle the buffer aliasing for rms fuse (#22266) 2026-04-23 16:32:59 -07:00
Ethan Turner
fa0b8a70a8 cli: Remove redundant local sampling variables (#20429) (#22264)
This change implements the third requested change in issue 20429.
Because defaults.sampling contains the reasoning budget token count and
the reasoning budget message, it's not necessary to assign them to
struct variables.
2026-04-24 00:53:23 +02:00
Max Krasnyansky
5d2b52d80d hexagon: add support for basic and extended Op profiling (#22269)
* hexagon: restore HTP_OPMASK_QUEUE

* hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul

* hex-prof: restore op profiling

* hex-prof: enable PMU

* hexagon: simplify and improve op-queuing with full profiling support

Add separate profile descriptors.

* hexagon: remove opsync and rename opmask into opstage

opsync is no longer needed since the profiler is fully async now.
opmask name was confusing and opstage is more accurate.

* hexagon: refactor opbatch queue handling

* hexagon: add iface hooks for enabling profiler from the host

Also move all the PMU setup stuff out of the hex-utils since it's not intended for normal use.

* hexagon: make profiler mode configurable

On older devices getting PMU counters is expensive so it's now optional.

* hexagon: add support for setting profiler pmu events from env

* hexagon: simplify profiler output (no need to print buffs, etc)

* hexagon: simplify pmu counter formatting

* hexagon: add a simple profile post-proc tool

* hex-prof: add support for reading logs from stdin

* hexagon: document GGML_HEXAGON_PROFILE

* hex-prof: update default width for dims field

* hex-prof: fix linter warnings and errors

* Update ggml/src/ggml-hexagon/htp/htp-ops.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/snapdragon/ggml-hexagon-profile.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 14:17:21 -07:00
Shreya Jain
187a456370 Enable testing on Snapdragon devices (#21051)
* Add the tests that we want to run on external CI

* remove extra files

* Fixes python issues, remove the deadlock on CI

* remove unnecessary changes

* use override to ty.toml

* fix pre-commit and try tests with secret in external repo not upstream

* skip if key is unavailable

* Fix feedback

* switch hexagon to snapdragon

* cleanup

* fix secrets

* remove the copyrights at the top of the files
2026-04-23 13:08:10 -07:00
srkizer
185cbff6f1 server : convert_anthropic_to_oai: also copy chat_template_kwargs (#22154) 2026-04-23 13:32:46 -05:00
Song Li
c78fb909b2 server: fix heap-buffer-overflow from negative n_discard (CVE-2026-21869) (#22267)
* server: clamp n_discard to non-negative at JSON parse boundary (CVE-2026-21869)

A negative n_discard from client JSON causes heap-buffer-overflow in
update_slots() context-shift loop (CWE-787, CVSS 8.8). Clamp to 0 at
ingress; n_discard=0 already triggers auto-discard (n_left/2).

Ref: GHSA-8947-pfff-2f3c
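A minimal sketch of the clamp-at-ingress idea, assuming a nlohmann::json request body (illustrative only, not the server's actual parsing code):

```
#include <nlohmann/json.hpp>
#include <algorithm>
#include <cstdint>

// Clamp the client-supplied n_discard to >= 0 as soon as it is parsed, so a
// negative value can never reach the context-shift loop in update_slots().
static int32_t parse_n_discard(const nlohmann::json & body) {
    const int32_t n_discard = body.value("n_discard", 0);
    // 0 keeps the existing auto-discard behavior (n_left/2)
    return std::max<int32_t>(0, n_discard);
}
```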

* cont : cleaner

* cont : cleanerer

* cont : cleanest

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-23 18:39:07 +02:00
Adrien Gallouët
12568ca8c8 vendor : update LibreSSL to 4.3.1 (#22285)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-23 17:45:56 +02:00
kvc0
c807c6e3b0 server: (anthropic API) fix prefix caching (#21793)
When testing claude code against llama.cpp, I noticed that only
n_past 18577 was used even when context was 60k or more. The log
in llama-server says:
```
slot update_slots: id  3 | task 10342 | old: ... ; cch= | defa0;You are
slot update_slots: id  3 | task 10342 | new: ... ; cch= | 1c8b4;
```
I observed that the cch value changed every time. Reading about that,
the x-anthropic-billing-header system message seems to be specially
handled inside of the anthropic api. I could remove it, but there
is a meaningful string sometimes included at the end. So instead,
I just replace the changing cch checksum with fffff.

I'm treating this as an anthropic message body API detail - I think this
is the right way to do this, but by all means please correct me!

It's always 5 hexadecimal characters, but I've written the replacement
defensively in case they change the protocol.
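A hedged sketch of the checksum normalization (the exact layout of the cch field inside the billing-header message is an assumption here):

```
#include <regex>
#include <string>

// Replace the varying cch checksum in the x-anthropic-billing-header system
// message with a fixed value so the cached prompt prefix stays byte-identical.
// Matched defensively as one-or-more hex chars, not exactly five.
static std::string normalize_cch(const std::string & text) {
    static const std::regex cch_re(R"(cch=[0-9a-fA-F]+)");
    return std::regex_replace(text, cch_re, "cch=fffff");
}
```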
2026-04-23 17:45:02 +02:00
Sigbjørn Skjæret
0949beb5a3 fix build number for sycl release (#22283) 2026-04-23 21:38:58 +08:00
Daniel Bevenius
9012c50fc8 model-conversion : fix mmproj output file name [no ci] (#22274)
* model-conversion : fix mmproj output file name [no ci]

This commit updates the convert-model.sh script to properly handle
mmproj output files.

The motivation for this that currently the same name as the original
model is used as the mmproj file, which causes the original model to
be overwritten and no mmproj-<model_name>.gguf to be created.

* model-conversion : use MODEL_NAME [no ci]
2026-04-23 15:07:38 +02:00
Matthias Straka
0dd7f915fd cli : cleanup auto-completion code (#21745) 2026-04-23 15:03:28 +02:00
Tarek Dakhran
550d684bd1 server: Enable transcriptions API for LFM2-Audio (#22000) 2026-04-23 10:47:26 +02:00
Georgi Gerganov
8635e221c8 metal : fix event synchronization (#22260) 2026-04-23 08:22:49 +03:00
Georgi Gerganov
930e0210d1 gitignore: add AGENTS.local.md (#22246)
* gitignore: add AGENTS.local

Assisted-by: llama.cpp:local pi
Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>

* gitignore: rename AGENTS.local to AGENTS.local.md

Assisted-by: llama.cpp:local pi
Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-23 08:22:24 +03:00
Georgi Gerganov
96c1db26c4 ggml-base: use MATH_LIBRARY variable instead of hardcoded 'm' (#22239)
Fixes #22237 — the find_library(MATH_LIBRARY m) result was being
discarded and the target linked against the literal 'm' string.

This prevents users from overriding the math library (e.g. for AMD AOCL)
via CMake variables. Now the discovered MATH_LIBRARY is used directly.
2026-04-23 08:22:08 +03:00
Neo Zhang Jianyu
4ead6fd957 [SYCL] Update oneAPI 2025.3.3, separate SYCL build, release Ubuntu 24 package. (#22078)
* upgrade oneAPI to 2025.3.3

* update

* separate SYCL CI and support release binary package for Ubuntu 24

* add dependence

* remove wrong copy lines

* add missed line

* remove other task to test the release for SYCL

* rm more for test release

* fix file name

* correct the error in running

* support build for fp32/fp16

* rm ubuntu-24-sycl-fp16 (duplicated)

* refactor build setting

* update guide for ubuntu 24 release package, restore the release.yml for other backend

* use docker instead to install oneAPI

* use download installation package to replace docker

* use wget to download and install oneapi, replace the apt cmd

* enable ccache for oneAPI installation

* fix format error

* enable cache for oneAPI installation

* update guide

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/build-sycl.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 08:21:36 +03:00
ynankani
5eaee65384 convert : Handle ModelOpt-produced mixed-precision model during conversion to GGUF (#22247)
* Handle ModelOpt-produced mixed-precision model during conversion to GGUF

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 08:19:51 +03:00
abotsis
60b68a6279 sycl : fused MoE mul_mat_vec_q for TG (#21920)
* sycl : fused MoE mul_mat_vec_q for TG

Create an MMVQ kernel so ggml_sycl_mul_mat_id can consolidate
n_experts_used matmuls in a single kernel launch. The kernel
also reads expert IDs directly, removing a per-call host sync.

This is similar to the CUDA backend's ggml_cuda_mul_mat_vec_q*
paths.

All types supported in the current MMVQ are supported here as well:
Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0

It will fall back to the existing per-expert path when src0 has been rewritten
by opt_for_reorder(), and for any shape the fused path doesn't handle.

test-backend-ops passes for supported type/shape combos.

Benchmark: Qwen3-Next-35B-A3B Q4_K_M on Intel Arc B70 (SYCL0),
baseline 707c0b7a6, 16k context, -fa 0.

  build/bin/llama-bench -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M \
    -p 1024 -n 128 -d 16384 -ngl 99 -fa 0 -ub 2048 -r 2 -dev SYCL0

Before (3 runs on 707c0b7a6):

  | test            |            run 1 |            run 2 |            run 3 |
  | --------------- | ----------------:| ----------------:| ----------------:|
  | pp1024 @ d16384 |   533.26 ±  4.87 |   535.20 ±  2.78 |   524.27 ±  3.10 |
  | tg128  @ d16384 |    33.47 ±  0.02 |    33.31 ±  0.02 |    33.17 ±  0.05 |

After (3 runs on 707c0b7a6 + this patch):

  | test            |            run 1 |            run 2 |            run 3 |
  | --------------- | ----------------:| ----------------:| ----------------:|
  | pp1024 @ d16384 |   534.06 ±  0.97 |   531.95 ±  0.02 |   520.94 ± 20.10 |
  | tg128  @ d16384 |    45.85 ±  0.21 |    45.95 ±  0.45 |    46.22 ±  0.12 |

disclosure: Claude wrote it, but I reviewed and understand the implementation
(albeit my C is a little rusty).

* sycl: also support nvfp4 and mxfp4 expert types

* sycl: terser comments/nested dispatch in response to review

* sycl: more comment cleanup in mmvq.cpp/hpp

---------

Co-authored-by: Debian <aaron@openllmi.net.bots.is>
2026-04-23 08:18:56 +03:00
Chen Yuan
b76429a69c ggml-webgpu: add support for im2col (#22259)
* shader(im2col): implement the im2col shader

* shader(im2col): clean the formatting issues

* shader(im2col): clean the editorconfig checker warning

* fix(shader): address the workgroup issues of im2col and conv2d
2026-04-22 20:17:41 -07:00
Anav Prasad
86db42e97f CUDA: fuse relu + sqr (#22249) 2026-04-23 10:28:56 +08:00
uvos
6217b49583 HIP: flip GGML_HIP_GRAPHS to default on (#22254)
In #11362 HIP graphs were disabled by default because, at the time, their performance impact was negative. Due to improvements in ROCm and in our usage and construction of graphs, this is no longer true, so let's change the default.
2026-04-23 02:34:31 +02:00
Nikhil Jain
0d0764dfd2 [WebGPU] Implement async tensor api and event api (#22099)
* Only run webgpu CI on my fork

* Implement set_tensor_async

* Implement synchronize api

* Implement event creation and deletion API

* Cleanup

* Cleanup

* Comment out jobs for local CI run

* Add webgpu only workflow

* Delete .github/workflows/build-webgpu.yml

* Cleanup

* Cleanup

* Update API with function handlers

* Run clang-format

* Replace one-shot buffer with a direct queue.WriteBuffer using the buffer context
2026-04-22 10:52:01 -07:00
Masashi Yoshimura
6da7168312 ggml-webgpu: Add fused RMS_NORM + MUL (#21983)
* fused rms_norm_mul + mul

* Add GGML_WEBGPU_DISABLE_FUSION for being able to disable kernel fusion.

* Decouple num_fused_ops from webgpu_context; misc cleanup

* Fix eps handling and remove disable_fusion.

* Fix not to use c++20 initializers.
2026-04-22 10:51:40 -07:00
Piotr Wilkin (ilintar)
8bccdbbff9 chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs (#22217)
* chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs

* Fix ty errors.

* Fix flake8 err
2026-04-22 18:10:56 +02:00
Georgi Gerganov
bcb5eeb645 speculative-simple : add checkpoint support (#22227)
* speculative-simple : add checkpoint support

* cont : fix build
2026-04-22 15:44:45 +03:00
Akarshan Biswas
225088ea76 sycl: Improve mul_mat_id memory efficiency and add BF16 fast path (#22119)
* sycl: size mul_mat_id staging buffers by routed rows

Previously src1_contiguous/dst_contiguous in ggml_sycl_mul_mat_id were
sized to ggml_nelements(src1/dst), which over-allocates when ne12 > 1
and can fail with UR_RESULT_ERROR_OUT_OF_HOST_MEMORY on Level Zero for
MoE models (notably with --cpu-moe). Size them by the actual number of
routed rows (ids->ne[1] * n_ids) instead.
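Roughly, the sizing change looks like the sketch below (illustrative only, using the ggml tensor fields mentioned above; the function and variable names are not the real ones):

```
#include <cstdint>
#include "ggml.h"

// Size the MoE staging buffers by the rows that are actually routed to
// experts, instead of by every element of src1/dst.
static void moe_staging_sizes(const ggml_tensor * src1, const ggml_tensor * dst,
                              const ggml_tensor * ids, int64_t n_ids,
                              int64_t & src1_elems, int64_t & dst_elems) {
    const int64_t n_routed_rows = ids->ne[1] * n_ids;  // tokens * experts used per token
    src1_elems = n_routed_rows * src1->ne[0];          // was ggml_nelements(src1)
    dst_elems  = n_routed_rows * dst->ne[0];           // was ggml_nelements(dst)
}
```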

* sycl: add bf16 mul_mat fast path via DNNL

When src0 is BF16 (commonly the case for lm_head / output.weight), the
existing f16 path is skipped because bf16 isn't covered, and the f32
fallback dequantizes the entire src0 slab to f32 in a single pool alloc
(row_diff*ne00 floats). For large-vocab models this can reach several
GB and fail with UR_RESULT_ERROR_OUT_OF_HOST_MEMORY on Level Zero.

Add a bf16xbf16 -> f32 DNNL matmul fast path that uses the bf16 storage
in place and only materializes a small src1 bf16 conversion buffer. bf16
matmul accumulates in f32, so it's correct even when the op requests
GGML_PREC_F32 (as lm_head does).

- gemm.hpp: map bfloat16 to dnnl::memory::data_type::bf16.
- convert.{hpp,cpp}: expose ggml_get_to_bf16_sycl for f32/f16/bf16 -> bf16.
- ggml-sycl.cpp: take the bf16 path early in ggml_sycl_op_mul_mat_sycl
  when DNNL and GGML_SYCL_HAS_BF16 are both available.
2026-04-22 20:32:56 +08:00
Xuan-Son Nguyen
82d3f4d3b2 mtmd: also support LLAMA_ROPE_TYPE_NONE (#22242) 2026-04-22 12:16:29 +02:00
Xuan-Son Nguyen
17f6245168 server: ignore reasoning content from transcription api (#21905) 2026-04-22 12:10:50 +02:00
manayang
7bfe60fdf9 mtmd, llama : Update HunyuanVL vision-language model support (#22037)
* mtmd, llama : add HunyuanVL vision-language model support

- add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
- add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
- add HunyuanVL-specific M-RoPE position encoding for image tokens
- add GGUF conversion for HunyuanVL vision and text models
- add smoke test in tools/mtmd/tests.sh

* fix: fix HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion
 - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error
 - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
2026-04-22 11:58:43 +02:00
Ethan Turner
750579ff14 common: Refactoring sampler parameters (#20429) (#22233)
This change refactors the reasoning_budget_message parameter from the
common params into the sampling parameters specifically. It also removes
the reasoning_budget common parameter and standardizes on the existing
reasoning_budget_tokens parameter in the sampling configuration.

Issue: https://github.com/ggml-org/llama.cpp/issues/20429
Original PR: https://github.com/ggml-org/llama.cpp/pull/20297
2026-04-22 10:40:19 +02:00
Piotr Wilkin (ilintar)
134d6e54d4 common/chat, server: refactor, move all conversion functions to common, add tests (#20690)
* Refactor conversion functions
2026-04-22 10:28:45 +02:00
Chen Yuan
ca7f7b7b94 ggml-webgpu(shader): support conv2d kernels. (#21964)
* ggml(webgpu): fix the busy-polls in Emscripten in the waitAny after #20618, and remove the busy webgpu log

* Merge with upstream

* Fix GET_ROWS packed integer NaN when using f16 as memory buffer in shader quants

* Update Unary wgsl EXP and EXPM1 for f16 stability

* Fix GET_ROWS IQ4_XS struct for NaN f16 canonicalization

* Fix numerical precision for unary sqrt when working with f16

* Fix NaN canonicalization for packed integers using f16

* Update err threshold for binary div ops when using f16

* backend: Keep one Dawn/WebGPU instance alive for the lifetime of the static backend

* clean: uncomment existing code logs

* clean: clean the unnecessary debug info

* Refactor and generalize dequant helpers

* Remove deprecated quant structs

* Refactor shader defines to reduce repetition

* Remove error override for F16 type

* fix: fix the accidental removal of the proper initialization of ctx

* clean: clean legacy and format code

* fix: did not modify tests ops

* shader(conv2d): add conv2d shader kernels and pass f32 and f16 tests

* shader(conv2d): fix the out of bounds memory access in the weight indexing

* shader(conv2d): clean unused variables and optimize the computation

* merge: use the new entries function

* clean: address the formatting issues

* clean: address the warning issues

* clear: clean the shader editorconfig-checker issues

* clear: clean the shader editorconfig-checker with utf-8

---------

Co-authored-by: Jeremy J. Hartmann <jeremy@mtion.tv>
2026-04-21 20:18:57 -07:00
Aparna M P
0dedb9ef7a hexagon: add support for FILL op (#22198)
Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
2026-04-21 16:24:20 -07:00
Masashi Yoshimura
2799d933b5 ggml-webgpu: reset CPU/GPU profiling time when freeing context (#22050)
* Reset the CPU/GPU profiling time when freeing context.

* move GPU profiling time from global context to webgpu_context.
2026-04-21 16:05:21 -07:00
Xuan-Son Nguyen
04fe84b69d server: allow cancel loading model (#21814) 2026-04-22 00:26:09 +02:00
Shreya Jain
5a4cd6741f Hexagon: DIAG op (#22195)
* hexagon: Add DIAG op

* hexagon: add HVX support and DMA double buffering

* hexagon: fix fatal error

* hexagon: remove as many pragma(s) as possible
2026-04-21 14:16:04 -07:00
Mengsheng Wu
2248799a58 hexagon: fix missing v79 entry in libggml-htp.inf (#22194) 2026-04-21 13:53:44 -07:00
Paul Dubs
72d693e4fb spec : reset i_last when low acceptance streak occurs (#22168)
By resetting i_last to zero, we will include the current context when rebuilding the speculative map.
2026-04-21 21:29:07 +03:00
Kwa Jie Hao
98d2d2884e mtmd: Add support for Reka Edge 2603 (#21616)
* feat: (vocab) fix stray text appended in llama_decode_text

Remove accidental concatenation of the full `text` string when
formatting UNK_BYTE hex escapes. Only the closing "]" should be appended.

* feat(mtmd): add Yasa2 vision encoder support

Add a Yasa2 (ConvNeXtV2-based) vision encoder for reka-edge:
- Register PROJECTOR_TYPE_YASA2 and tensor name definitions
- Add yasa2_block/yasa2_stage model structs
- Implement graph builder with ConvNeXt stages, GRN, adaptive pooling
- Wire into clip.cpp switch statements and mtmd.cpp init_vision
- Use mtmd_image_preprocessor_fixed_size for image preprocessing

* feat(chat): add reka-edge template handler (tools, thinking)

- Add chat-reka.cpp/h implementing PEG-based parser for reka-edge format
- Add Reka-Edge.jinja chat template
- Detect reka-edge template in try_specialized_template()
- Add LLAMA_EXAMPLE_MTMD to chat-template-file arg

* feat: add reka vlm to gguf conversion script

Converts Reka Yasa2 hf checkpoints to GGUF format:
- Text decoder: Llama-arch with tiktoken/BPE vocab
- Mmproj (--mmproj): ConvNeXt vision backbone + language_projection
- Generates 2D sincos positional embeddings for vision encoder

* test: add Reka Edge chat template and parser tests

- test-chat-template: oracle tests comparing Jinja engine output vs
  common_chat_templates_apply for text, tools, thinking, images, video
- test-chat: PEG parser tests for Reka Edge format, round-trip tests
  for image/video content parts, common path integration tests

* scripts: add Reka Edge mixed quantization helper

Q4_0 base quantization with Q8_0 override for the last 8 transformer
blocks (layers 24-31) via --tensor-type regex.

* fix: adapt chat-reka and tests to upstream API

- Use autoparser::generation_params (not templates_params)
- Add p.prefix(generation_prompt) to PEG parser
- Simplify reasoning parser to match LFM2 pattern
- Remove image/video oracle tests (unsupported by oaicompat parser;
  no other multimodal models test this path)

* fix: avoid duplicate tensor loading in yasa2 vision encoder

TN_YASA_PATCH_W and TN_PATCH_EMBD both resolve to "v.patch_embd.weight",
causing the same tensor to be loaded twice into ctx_data and overflowing
the memory pool. Reuse the tensors already loaded by the common section.

* chore: update image pre-processing settings

The reka-edge model depends on the following settings in an older
fork of llama.cpp:
1. Fixed square resize
2. BICUBIC
3. add_padding=false

In current llama.cpp, this means setting:
- image_resize_algo = RESIZE_ALGO_BICUBIC
- image_resize_pad = false

* chore: remove reka gguf conversion script

* chore: remove reka quantization script

* chore: remove unnecessary changes from PR scope

This commit removes a couple of unnecessary changes for the PR scope:
1. BPE decoder bug fix - this affects reka edge because there's a bug
in our tokenization that doesn't represent <think> tokens as special
tokens. However this isn't meant to be a thinking model so when run
with --reasoning off the edge case does not affect us

2. --chat-template-file support from llama-mtmd-cli - the focus is on
llama-server and the reka edge gguf contains the necessary metadata
to detect the chat template

3. reka edge oracle test cases - no other model has similar test cases,
so I removed it for standardization

* chore: remove unnecessary ggml_cast

This commit removes unnecessary ggml_cast after updating the
reka vlm -> gguf conversion script on hugging face.

* chore: remove redundant code

* chore: remove unnecessary ggml_cont calls

This commit removes all ggml_cont calls except the four that
precede ggml_reshape_3d/ggml_reshape_4d. Those are necessary
because ggml_reshape recomputes strides assuming contiguous
layout and asserts ggml_is_contiguous.

Other operations (ggml_mean, ggml_add, ggml_mul etc.) use
stride-based indexing and handle non-contiguous inputs
correctly and so we are ok to remove ggml_cont for those.

* chore: remove unnecessary ggml_repeat calls

This commit removes unnecessary ggml_repeat calls because the underlying
ops already broadcast automatically.

Every ggml_repeat in yasa2.cpp was expanding a smaller tensor to match
a larger one's shape before passing both to an elementwise op (ggml_add,
ggml_sub, ggml_mul, or ggml_div). This is unnecessary because all four
of these ops already support broadcasting internally.

* chore: restore ggml_cont needed for cpu operations

* refactor: locate reka chat template handler in chat.cpp

* chore: remove unnecessary warmup tokens

* chore: add code comments on image_resize_pad

* chore: remove custom reka parsing code

* chore: revert common/chat.cpp

* Uncomment debug logging for PEG input parsing

---------

Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>
2026-04-21 20:02:49 +02:00
Georgi Gerganov
84652b80cf arg : add --spec-default (#22223) 2026-04-21 19:52:02 +03:00
Zijun Yu
52f1096f21 openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)
* Thread safety per request only

* Fix ROPE yarn case

* Fix sticky stateful config

* Use i4/i8 directly for symmetric quant

* Use weightless caching

* Add WeightlessCacheAttribute to reduce NPU memory usage

* Gelu tanh support (#125)

* Imrope support (#126)

* fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free

* add GPU,NPU support in OV Dockerfile

* add build-openvino.yml ci

* Fix sticky stateful config

* add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml

* fix thread-safety of shared runtime context

* rope type abstraction for frontend translations

* fix editorconfig

---------

Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Dan Hoffman <dhoff749@gmail.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
2026-04-21 18:58:34 +03:00
Alessandro de Oliveira Faria (A.K.A.CABELO)
606fa42f5d vendor : update cpp-httplib to 0.43.1 (#22143)
* vendor : update cpp-httplib to 0.43.0

* vendor : update cpp-httplib to 0.43.0
2026-04-21 22:45:48 +08:00
Georgi Gerganov
7fc1c4ef78 metal : workaround macOS GPU interactivity watchdog (#22216) 2026-04-21 17:24:55 +03:00
Jeff Bolz
82209efb7e vulkan: Support F16 OP_FILL (#22177) 2026-04-21 11:01:56 +02:00
Xuan-Son Nguyen
9998d88bc8 mtmd: correct mtmd_decode_use_mrope() (#22188) 2026-04-21 10:53:37 +02:00
Georgi Gerganov
cd03ec7642 llama-ext : fix exports (#22202) 2026-04-21 11:04:46 +03:00
Georgi Gerganov
4889afba5f sync : ggml 2026-04-21 11:04:21 +03:00
Georgi Gerganov
041fe83d74 ggml : bump version to 0.10.0 (ggml/1463) 2026-04-21 11:04:21 +03:00
Georgi Gerganov
cfe9838d26 fit-params : refactor + add option to output estimated memory per device (#22171)
* fit-params : add option to output estimated memory per device

* cont : minor

* cont : refactor

* cont : move fit params implementation to libcommon

* cont : header

* cont : headers

* cont : codeowners
2026-04-21 09:54:36 +03:00
xris99
ff6b1062af server : fix hardcoded proxy connection timeout in router mode (#18760) (#22003)
Fixes: https://github.com/ggml-org/llama.cpp/issues/18760

Co-authored-by: Christian <christian@example.com>
2026-04-21 06:41:14 +02:00
leonardHONG
97895129e5 ggml-cuda: flush legacy pool on OOM and retry (#22155)
* ggml-cuda: flush legacy pool on OOM and retry

Signed-off-by: 梁厚宏 <2695316095@qq.com>

* Address review comments: add explicit sync, update destructor, clean up MUSA macros

Signed-off-by: 梁厚宏 <2695316095@qq.com>

---------

Signed-off-by: 梁厚宏 <2695316095@qq.com>
2026-04-20 23:30:38 +02:00
Xuan-Son Nguyen
86f8daacfe mtmd: correct get_n_pos / get_decoder_pos (#22175) 2026-04-20 23:29:19 +02:00
Georgi Gerganov
cf8b0dbda9 server : remove /api endpoints (#22165)
* server : remove /api endpoints

* cont : remove /api/tags
2026-04-20 20:41:19 +03:00
Gaurav Garg
fd6ae4ca1c Tensor-parallel: Fix delayed AllReduce on Gemma-4 MoE (#22129)
* Fix delayed AllReduce on Gemma-4 MoE

Skip forward past nodes that don't consume the current one, and allow a chain of MULs.

* Check for all sources before skipping nodes

* Address review comments
2026-04-20 18:25:39 +02:00
Johannes Gäßler
fb19f94c71 TP: fix 0-sized tensor slices, AllReduce fallback (#21808)
* TP: fix 0-sized tensor slices, AllReduce fallback

* fix layer structure <-> GPU count aliasing

* add missing std::fill

* fix CUDA device set, max ggml ctx size
2026-04-20 18:09:39 +02:00
pl752
7f251fdbce ggml-cpu: Optimized x86 and generic cpu q1_0 dot (follow up) (#21636)
* Implemented optimized q1_0 dot for x86 and generic

* Removed redundant helper definition

* Removed two redundant instructions from AVX q1_0 dot

* Fixed inconsistency with fp16 conversion for generic q1_0 dot and deduplicated generic fallback

* Style cleanup around AVX q1_0 dot

* Replaced explicitly unrolled blocks with inner for loop for q1_0

* Replaced scalar ARM q1_0 impl with new generic one
2026-04-20 19:02:54 +03:00
neha-ha
a6cc43c286 ggml-webgpu: updated matrix-vector multiplication (#21738)
* merged properly, but slow q3_k and q5_k with u32 indexing

* Start on new mat-vec

* New format float paths working

* Working q4_0

* Work on remaining legacy q-types

* port k-quants to new matvec

* remove old shader

* Remove old constants, format

* remove accidental file

---------

Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Co-authored-by: Reese Levine <reeselevine1@gmail.com>
2026-04-20 07:37:17 -07:00
Xuan-Son Nguyen
a678916623 mtmd: refactor mtmd_decode_use_mrope (#22161) 2026-04-20 14:45:11 +02:00
SamareshSingh
81df3f7cfa fix: GLM-DSA crash in llama-tokenize when using vocab_only (#22102)
* llama: fix crash in print_info for GLM-DSA when vocab_only is set

* addressed code review comments

* cont : simplify

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-20 10:32:46 +03:00
Georgi Gerganov
de71b5f81c server : refactor "use checkpoint" logic (#22114) 2026-04-20 08:42:37 +03:00
Katostrofik
788fcbc5dd [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes (#22035)
* [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes

The reorder mul_mat_vec_q dispatchers for Q4_0, Q8_0, Q4_K, and Q6_K
asserted that block_num_y was a multiple of 16 subgroups. Models with
a vocab size not divisible by 16 (for example HY-MT at 120818) aborted
on model load when the output projection tripped the assert.

I replaced the assert with padding: block_num_y now rounds up to a
whole number of subgroup-sized workgroups. The kernel already has the
row bounds check (`if (row >= nrows) return;`) so the extra padded
threads early-exit cleanly. Row values are uniform across a subgroup
so the collective reduce stays safe.

For aligned vocab sizes the padded block_num_y equals the old value,
so the kernel launch is identical and there is no regression. The rounding itself is a one-liner, as sketched below.
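A sketch of the idea (names are illustrative, not the actual dispatcher code):

```
#include <cstdint>

// Instead of GGML_ASSERT(block_num_y % num_subgroups == 0), round the count
// up to a whole number of subgroup-sized workgroups; padded rows early-exit
// via the kernel's existing `if (row >= nrows) return;` bounds check.
static int64_t pad_block_num_y(int64_t block_num_y, int64_t num_subgroups /* WARP_SIZE */) {
    return ((block_num_y + num_subgroups - 1) / num_subgroups) * num_subgroups;
}
```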

Thanks to @arthw for flagging the relationship to #21527.

Fixes #22020.

AI assisted coding, tested on Intel B70 hardware.

* sycl: use WARP_SIZE for num_subgroups in reorder MMVQ launches

Replaces the hardcoded 16 with WARP_SIZE in the four reorder_mul_mat_vec
launch helpers (Q4_0, Q8_0, Q4_K, Q6_K). Compile-time no-op on the Intel
target where WARP_SIZE is 16, but makes the relationship to subgroup
size explicit. Per review by @NeoZhangJianyu on #22035.

Assisted by Claude.
2026-04-20 08:39:45 +03:00
Yes You Can Have Your Own
9d49acb2a7 server: rename --clear-idle to --cache-idle-slots (#21741) 2026-04-20 08:30:24 +03:00
Alessandro de Oliveira Faria (A.K.A.CABELO)
e365e658f0 vendor : update cpp-httplib to 0.42.0 (#21781) 2026-04-20 06:41:43 +08:00
Johannes Gäßler
4eac5b4509 CUDA: refactor mma data loading for AMD (#22051)
* CUDA: refactor mma data loading for AMD

* fix CDNA MMQ occupancy

* fix CDNA3 mma

* fix RDNA3 compile
2026-04-19 18:26:59 +02:00
Aldehir Rojas
d5b780a676 common/autoparser : allow space after tool call (#22073) 2026-04-19 13:28:35 +02:00
uvos
471540ae8a HIP: Remove unnecessary NCCL_CHECK (#21914) 2026-04-19 12:59:44 +02:00
Xuan-Son Nguyen
19124078be mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos (breaking change) (#22082)
* mtmd: add pos_0 to mtmd_image_tokens_get_decoder_pos

* fix build
2026-04-19 11:57:21 +02:00
Gaurav Garg
bcdcc1044f ggml : reduce CPU overhead in meta backend (#22041)
* cache subgraph splits when cgraph is unchanged

Skip per-call subgraph construction in ggml_backend_meta_graph_compute when the same ggml_cgraph is used consecutively.

Assign uid to every sub-graph so that CUDA's fast uid check path hits too.
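Conceptually, the reuse check is something like the following sketch (all names hypothetical; the real code lives in ggml-backend-meta.cpp):

```
#include <cstdint>

// All names hypothetical: reuse the previous split result when the same graph
// (identified by uid) is submitted again back-to-back.
struct meta_split_state {
    uint64_t last_uid    = 0;  // uid of the most recently split cgraph
    int      n_subgraphs = 0;  // how many sub-graphs that split produced

    bool can_reuse(uint64_t graph_uid) const {
        // same graph as last call -> skip per-call subgraph construction;
        // per-subgraph uids also let CUDA's fast uid check path hit
        return graph_uid != 0 && graph_uid == last_uid;
    }
};
```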

* Address review comments

* Keep the scope as is

* Rename last_uid and last_n_subgraphs field. Remove last_max_tmp_size field. Refactor code.

* Address review comments

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml/src/ggml-backend-meta.cpp

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-04-19 12:48:35 +03:00
Sigbjørn Skjæret
037bfe38d0 ci : install spirv-headers for vulkan-cross (#22109) 2026-04-19 10:32:08 +03:00
Dowon
8685e7b075 convert : support sentence-transformer 5.4 config files (#22087)
* convert : support sentence-transformer 5.4 config files

* fix: embeddinggemma

* fix: mapping

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix: pooling_mode

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-19 10:25:39 +03:00
texasich
09b4efa95f cmake: remove CMP0194 policy to restore MSVC builds (#21934)
#21630 added the CMP0194 NEW policy to silence a CMake warning, but on Windows runners it caused CMake to prefer the MinGW toolchain for ASM and broke MSVC builds.

Reverting only that policy block restores the previous working behavior. The CMake 4.1+ warning comes back, but that is cosmetic and does not break any platform.

Reported-by: oobabooga

Refs: #21630

Co-authored-by: texasich <texasich@users.noreply.github.com>
2026-04-19 10:25:05 +03:00
Sascha Rogmann
455d8e4be8 server : speculative checkpointing (#19493)
* server : speculative decoding using checkpoints

* server : fix draft check with checkpoints

* server : rename spec vars

* server : log levels

* server : refactored spec logic to speculative.cpp

* server : renamed spec checkpoints option

* server : fix spec checkpoints, logging

* speculative : checkpoints with draft model, logging

* server : n_tokens_cur and create_checkpoint in draft

* server : fix server_speculative_callback (slot.id)

* spec : fix ngram-map/begin idx_last_check

* spec : init ckpt (begin() wasn't called)

* chore: update webui build output

* server : restore sampler in spec checkpoint and clear mem

* cont : avoid --spec-use-checkpoints argument

* cont : remove server_prompt_checkpoint_with_size

* spec : rename (leave_draft_state)

* cont : clean-up

* cont : do not ignore partial drafts even if they are short

* cont : spec callback owned by session

* cont : simplify

* cont : avoid empty speculative session

* cont : simplify

* cont : simplify

* cont : enable mtmd speculative decoding

* cont : keep the spec sampler alive

* cont : simplify

* cont : fix nullptr deref + draft checkpoints

* cont : remove common_speculative_accept_response

* cont : remove callback

* cont : simplify

* cont : minor

* cont : simplify

* cont : fix accepted number

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-19 10:24:06 +03:00
Radoslav Gerganov
91fef95362 rpc : refactor the RPC transport (#21998)
* rpc : refactor the RPC transport

Move all transport related code into a separate file and use the
socket_t interface to hide all transport implementation details.

* fix win32

* better socket_t construction
2026-04-19 10:21:53 +03:00
Cetarthoriphros
9e5647affa server: Expose media_tag on /props endpoint. (#22028) 2026-04-19 00:27:17 +02:00
Sigbjørn Skjæret
4f02d47339 model : refactor bias tensor variable names (#22079)
* refactor bias tensor variable names

* use create_tensor_qkv for jina-bert-v2
2026-04-18 20:12:00 +02:00
Sigbjørn Skjæret
23b8cc4991 android : libcommon -> libllama-common (#22076) 2026-04-18 11:19:40 +02:00
SamareshSingh
59accc8863 ggml-backend-meta: add multi-segment read support in get_tensor (#22063) 2026-04-18 10:04:51 +02:00
Sigbjørn Skjæret
83d58e02fc ci : free disk space for rocm release (#22012) 2026-04-18 09:37:30 +02:00
Sigbjørn Skjæret
89a5474f0e convert : fix (ignore for now) typings errors (#22002) 2026-04-18 09:36:41 +02:00
Johannes Gäßler
fd1c0ec3f0 llama: fit ctx size for CPU only (#21568) 2026-04-18 08:16:04 +02:00
Reese Levine
45cac7ca70 ggml-webgpu: fix compiler warnings and refactor FlashAttention encoding (#21052)
* Update workflows to remove dependence on llvmpipe

* Try setting Dawn_DIR

* remove c++20 initializers

* Move to proper guid

* Try avoiding segfaults on vulkan backend process exit

* Remove compiler warnings on parameter casting

* Fix soft_max and update reg_tile accumulation to f32 for better precision

* Refactor flash_attn a bit

* remove c++20 initializers and format

* Increase div precision for NVIDIA

* revert div precision and comment out ggml-ci node for now

* Formatting

* Try debugging on a failing CI node

* Revert "Try debugging on a failing CI node"

This reverts commit 1971e33cba.
2026-04-17 09:17:11 -07:00
Aman Gupta
b94050e896 CUDA: use LRU based eviction for cuda graphs (#21611)
* CUDA: use a ring-buffer for cuda graphs

* bump limit to 128

* use LRU eviction

* better naming

* do periodic clean-up
2026-04-17 23:24:21 +08:00
Yuri Khrustalev
a279d0f0f4 ci : add android arm64 build and release (#21647)
* server: respect the ignore eos flag

* ci: add android arm64 build and release

* patch

* pin android-setup actions to v4

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* lf in the suggestion

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-17 11:32:24 +02:00
65a
268d61e178 mtmd: add missing struct tag (#22023) 2026-04-17 10:48:33 +02:00
Georgi Gerganov
6990e2f1f7 libs : rename libcommon -> libllama-common (#21936)
* cmake : allow libcommon to be shared

* cmake : rename libcommon to libllama-common

* cont : set -fPIC for httplib

* cont : export all symbols

* cont : fix build_info exports

* libs : add libllama-common-base

* log : add common_log_get_verbosity_thold()
2026-04-17 11:11:46 +03:00
Eric Zhang
fcc7508759 model : Gemma4 model type detection (#22027)
* model : Gemma4 model type detection

* model : Gemma4 model type detection
2026-04-17 10:07:11 +02:00
lhez
5e6c0e18b6 opencl: refactor q8_0 set_tensor and mul_mat host side dispatch for Adreno (#21938)
* opencl: refactor q8_0 gemm/gemv Adreno dispatch

* opencl: refactor q8_0 set_tensor

* opencl: fix whitespace
2026-04-16 22:28:33 -07:00
Sigbjørn Skjæret
30dce2cf29 cli : use get_media_marker (#22017) 2026-04-17 00:12:31 +02:00
Xuan-Son Nguyen
089dd41fe3 cmake: use glob to collect src/models sources (#22005) 2026-04-16 23:25:16 +02:00
nullname
85dde8dc4a hexagon: optimize HMX matmul operations (#21071)
* optimize hmx_mat_mul functions by calculating row and column tiles upfront

* refactor core_dot_chunk_fp16 to use size_t for tile counts and improve readability

* wip

* set scale outside of loop

* wip

* refactor core_mma_chunk_fp16 and mat_mul_qk_0_d16a32 to use size_t for tile counts

* wip

* wip

* refactor transfer_output_chunk_fp16_to_fp32 to use size_t for dimensions

* refactor core_dot_chunk_fp16 to use size_t for tile row stride calculation

* wip

* refactor hmx_mat_mul functions to use hvx_vec_splat_f16 for column scales initialization

* refactor hmx_mat_mul_permuted_w16a32_batched to streamline scale setting and locking

* refactor core_dot_chunk_fp16 to improve tile stride calculations for output

* refactor hmx_mat_mul functions to use Q6_V_vsplat_R for column scales initialization

* fix compiling error

* wip

* optimize row and column tile indexing in core_mma_chunk_fp16 function

* wip

* Revert "wip"

This reverts commit cde679eff7.

* Add size limit check for HAP_mmap in htp_iface_mmap and drop_mmap functions

* wip
2026-04-16 13:48:34 -07:00
Xuan-Son Nguyen
4fbdabdc61 model: using single llm_build per arch (#21970)
* model: using single llm_build per arch

* fix merge

* nits
2026-04-16 21:10:22 +02:00
shaofeiqi
e45dbdece8 opencl: add q5_K gemm and gemv kernels for Adreno (#21595) 2026-04-16 12:08:33 -07:00
Pascal
4adac43f6f server: tests: fetch random media marker via /apply-template (#21962) (#21980)
* server: tests: fetch random media marker via /apply-template (#21962 fix)

* server: allow pinning media marker via LLAMA_MEDIA_MARKER env var

get_media_marker() checks LLAMA_MEDIA_MARKER at first call and uses it
as-is if set, falling back to the random marker otherwise.

Tests no longer need to fetch the marker dynamically via /apply-template:
the fixture sets LLAMA_MEDIA_MARKER=<__media__> so the hardcoded prompts
work as before.

Address review feedback from ngxson

* server: make get_media_marker() thread-safe via magic statics

Use a C++11 static local with a lambda initializer instead of a global
static with an empty-check. The runtime guarantees initialization exactly
once without explicit locking.
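A small sketch of the resulting pattern (the fallback value here is a placeholder; the real code falls back to the random marker):

```
#include <cstdlib>
#include <string>

// Read LLAMA_MEDIA_MARKER once, on first call, via a C++11 "magic static"
// with a lambda initializer: the runtime guarantees the initializer runs
// exactly once, with no explicit locking, even with concurrent callers.
static const std::string & get_media_marker() {
    static const std::string marker = []() -> std::string {
        if (const char * env = std::getenv("LLAMA_MEDIA_MARKER")) {
            return env;  // pinned marker, used as-is
        }
        return "<__media_placeholder__>";  // placeholder: real code generates a random marker
    }();
    return marker;
}
```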

Address review feedback from ggerganov

* nits

* nits
2026-04-16 20:46:21 +03:00
PikaPikachu
9db77a020c model : refactor QKV into common build_qkv and create_tensor_qkv helpers (#21245)
* model : refactor QKV into common build_qkv and create_tensor_qkv helpers

* model : extend build_qkv to bert/mpt/dbrx/olmo/lfm2/nemotron-h/granite-hybrid/gemma3n-iswa/t5-dec and fix wqkv_s
2026-04-16 17:41:34 +02:00
Sigbjørn Skjæret
f772f6e434 model : support NVFP4 tensors for Gemma4 (#21971)
* support nvfp4 tensors for Gemma4

* add wo_s to build_attn

* add wo_s to build_attn

* fix glm4
2026-04-16 16:51:47 +02:00
Ruben Ortlam
b572d1ecd6 codeowners: add team member comments (#21714) 2026-04-16 13:13:11 +03:00
Anav Prasad
03b3d07798 Convert: Fix NemotronH Config Parsing (#21664)
* fix NemotronH vocab loading by using trust_remote_code for unsupported config patterns

* fix NemotronH tokenizer loading by overriding set_vocab with trust_remote_code
2026-04-16 13:11:45 +03:00
Aman Gupta
3f7c29d318 ggml: add graph_reused (#21764)
* ggml: add graph_reused

* use versioning instead of reuse flag

* increment version with atomic

* use top bits for split numbering

* add assert

* move counter to ggml.c

* set uid in split_graph only

* fix windows

* address further review comments

* get next_uid rather than doing bit manipulation

* rename + add comment about uid
2026-04-16 17:21:28 +08:00
Kusha Gharahi
ae2d34899e metal: Implement ROLL op (#21946)
* nix: support unified apple-sdk

* Impl roll op for Metal

* Revert "nix: support unified apple-sdk"

This reverts commit abfa473360.

* update ops.md

* update op docs
2026-04-16 11:54:37 +03:00
rehan-10xengineer
1e796eb41f ggml-cpu: add 128-bit RVV implementation for Quantization Vector Dot (#20633)
* ggml-cpu: add 128-bit impls for i-quants, ternary quants

* ggml-cpu: add 128-bit impls for iq2_xs, iq3_s, iq3_xxs, tq2_0

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: refactor; add rvv checks

---------

Co-authored-by: taimur-10x <taimur.ahmad@10xengineers.ai>
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
2026-04-16 11:15:15 +03:00
rehan-10xengineer
5637536517 ggml : implemented simd_gemm kernel for riscv vector extension (#20627)
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
2026-04-16 11:14:26 +03:00
Yuannan
90fb96a7b3 devops : added spirv-headers to nix (#21965) 2026-04-16 11:12:52 +03:00
384 changed files with 28161 additions and 12926 deletions

View File

@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
## Build Image

View File

@@ -18,6 +18,7 @@
vulkan-loader,
openssl,
shaderc,
spirv-headers,
useBlas ?
builtins.all (x: !x) [
useCuda
@@ -145,6 +146,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
ninja
pkg-config
git
spirv-headers
]
++ optionals useCuda [
cudaPackages.cuda_nvcc

View File

@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
# Optional proxy build arguments - empty by default
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
ARG http_proxy=
ARG https_proxy=
@@ -78,13 +90,47 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
# Install GPU drivers
ARG IGC_VERSION
ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/
### Full (all binaries)


@@ -6,7 +6,7 @@
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
# Requirements
## Requirements
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->


@@ -0,0 +1,116 @@
name: CI (snapdragon)
on:
workflow_dispatch:
push:
branches:
- master
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Snapdragon Android
id: build_llama_cpp_snapdragon_android
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
cmake --preset arm64-android-snapdragon-release -B build
cmake --build build
cmake --install build --prefix pkg-snapdragon/llama.cpp
- name: Upload Llama.CPP Snapdragon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
test-snapdragon-qdc:
name: Test on QDC Android Device (${{ matrix.device }})
needs: [android-ndk-snapdragon]
runs-on: ubuntu-slim
strategy:
fail-fast: false
matrix:
device: [SM8750, SM8650, SM8850]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Download build artifact
uses: actions/download-artifact@v7
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
cache: pip
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y curl unzip
- name: Install QDC SDK wheel
run: |
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
unzip qdc_sdk.zip -d qdc_sdk
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
- name: Check QDC API key
id: check_secret
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
- name: Run QDC tests (${{ matrix.device }})
if: steps.check_secret.outputs.has-qdc-key == 'true'
run: |
python scripts/snapdragon/qdc/run_qdc_jobs.py \
--test all \
--pkg-dir pkg-snapdragon/llama.cpp \
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
--device ${{ matrix.device }}
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
- name: Cleanup
if: always()
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip


@@ -1,26 +1,24 @@
name: CI (android)
on:
workflow_dispatch: # allows manual triggering
workflow_dispatch:
push:
branches:
- master
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
paths:
- '.github/workflows/build-android.yml'
- '**/CMakeLists.txt'
- '**/.cmake'
- '**/*.h'
- '**/*.hpp'
- '**/*.c'
- '**/*.cpp'
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
paths:
- '.github/workflows/build-android.yml'
- 'examples/llama.android/**'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -51,7 +49,7 @@ jobs:
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
@@ -67,35 +65,24 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
- name: Build
id: ndk_build
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
- name: Upload Android Build Artifact
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
name: llama-cpp-android-arm64-cpu
path: pkg-adb/llama.cpp


@@ -246,6 +246,7 @@ jobs:
apt-get install -y --no-install-recommends \
build-essential \
glslc \
spirv-headers \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64

.github/workflows/build-openvino.yml

@@ -0,0 +1,120 @@
name: CI (openvino)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-openvino.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-openvino.yml',
'ggml/src/ggml-openvino/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
concurrency:
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000


@@ -97,6 +97,36 @@ jobs:
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
#ggml-ci-nvidia-webgpu:
# runs-on: [self-hosted, Linux, NVIDIA]
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# - name: Dawn Dependency
# id: dawn-depends
# run: |
# DAWN_VERSION="v20260317.182325"
# DAWN_OWNER="google"
# DAWN_REPO="dawn"
# DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
# echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
# curl -L -o artifact.tar.gz \
# "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
# mkdir dawn
# tar -xvf artifact.tar.gz -C dawn --strip-components=1
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_WEBGPU=1 \
# GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
# GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# TODO: provision AMX-compatible machine
#ggml-ci-cpu-amx:
# runs-on: [self-hosted, Linux, CPU, AMX]
@@ -235,6 +265,10 @@ jobs:
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"

.github/workflows/build-sycl.yml

@@ -0,0 +1,142 @@
name: CI (sycl)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-sycl.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-sycl.yml',
'ggml/src/ggml-sycl/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat


@@ -267,6 +267,56 @@ jobs:
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
android-arm64:
runs-on: ubuntu-latest
env:
NDK_VERSION: "29.0.14206865"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: android-arm64
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Set up JDK
uses: actions/setup-java@v5
with:
java-version: 17
distribution: temurin
- name: Setup Android SDK
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
- name: Install NDK
run: |
sdkmanager "ndk;${{ env.NDK_VERSION }}"
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-latest-rpc:
runs-on: ubuntu-latest
@@ -505,186 +555,6 @@ jobs:
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl-fp16
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
windows-latest:
runs-on: windows-2025
@@ -893,39 +763,6 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
windows-latest-hip:
runs-on: windows-2022


@@ -236,6 +236,75 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
android-arm64:
runs-on: ubuntu-latest
env:
NDK_VERSION: "29.0.14206865"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: android-arm64
evict-old-files: 1d
- name: Set up JDK
uses: actions/setup-java@v5
with:
java-version: 17
distribution: temurin
- name: Setup Android SDK
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
- name: Install NDK
run: |
sdkmanager "ndk;${{ env.NDK_VERSION }}"
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz
name: llama-bin-android-arm64.tar.gz
ubuntu-24-openvino:
runs-on: ubuntu-24.04
@@ -529,15 +598,29 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -545,10 +628,6 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build
id: cmake_build
shell: cmd
@@ -601,6 +680,82 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
runs-on: ubuntu-22.04
@@ -618,6 +773,11 @@ jobs:
with:
fetch-depth: 0
- name: Free up disk space
uses: ggml-org/free-disk-space@v1.3.1
with:
tool-cache: true
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -971,6 +1131,8 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- ubuntu-24-sycl
- android-arm64
- macOS-cpu
- ios-xcode-build
- openEuler-cann
@@ -1058,6 +1220,11 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)

.gitignore

@@ -34,7 +34,6 @@
/.vscode/
/nppBackup
# Coverage
/gcovr-report/
@@ -74,6 +73,7 @@
!/models/templates
# Zig
/zig-out/
/zig-cache/
@@ -93,6 +93,7 @@
!/examples/sycl/*.sh
# Server Web UI temporary files
/tools/server/webui/node_modules
/tools/server/webui/dist
# we no longer use gz for index.html
@@ -106,9 +107,11 @@ __pycache__/
poetry.toml
# Nix
/result
# Test binaries
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
@@ -124,6 +127,7 @@ poetry.toml
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat
# Generated by scripts
@@ -132,16 +136,24 @@ poetry.toml
/wikitext-2-raw/
# Test models for lora adapters
/lora-tests
# Local scripts
/run-vim.sh
/run-chat.sh
/run-spec.sh
/.ccache/
# IDE
/*.code-workspace
/.windsurf/
# emscripten
a.out.*
# AGENTS
AGENTS.local.md
.pi/SYSTEM.md

.pi/gg/SYSTEM.md

@@ -0,0 +1,33 @@
You are a coding agent. Here are some very important rules that you must follow:
General:
- Be very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
- C/C++ code: `// ref: <url>`
- Other (CMake, etc.): `# ref: <url>`
Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
Commits:
- On every commit that you make, include an "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)


@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
endforeach()
if (LLAMA_BUILD_COMMON)
license_generate(common)
license_generate(llama-common)
endif()
#
@@ -249,6 +249,10 @@ set_target_properties(llama
install(TARGETS llama LIBRARY PUBLIC_HEADER)
if (LLAMA_BUILD_COMMON)
install(TARGETS llama-common LIBRARY)
endif()
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake


@@ -1,5 +1,21 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
# multiplie collaborators per item can be specified
# multiple collaborators per item can be specified
#
# ggml-org/ci : CISC, danbev, ggerganov, netrunnereve, ngxson, taronaeo
# ggml-org/ggml-cann : hipudding
# ggml-org/ggml-cuda : JohannesGaessler, am17an, IMbackK, ORippler
# ggml-org/ggml-hexagon : lhez, max-krasnyansky
# ggml-org/ggml-metal : ggerganov
# ggml-org/ggml-opencl : lhez, max-krasnyansky
# ggml-org/ggml-rpc : rgerganov
# ggml-org/ggml-sycl : arthw
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
# ggml-org/ggml-webgpu : reeselevine
# ggml-org/ggml-zdnn : taronaeo
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
# ggml-org/llama-mtmd : ngxson
# ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
# ggml-org/llama-webui : allozaur
/.devops/*.Dockerfile @ngxson
/.github/actions/ @ggml-org/ci
@@ -7,6 +23,7 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/ @ggml-org/llama-common
/common/fit.* @JohannesGaessler
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
@@ -36,28 +53,29 @@
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov
/ggml/src/ggml-backend-meta.cpp @JohannesGaessler
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
/ggml/src/ggml-common.h @ggerganov
/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky


@@ -1,9 +1,11 @@
# common
find_package(Threads REQUIRED)
llama_add_compile_flags()
#
# llama-common-base
#
# Build info header
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
@@ -33,17 +35,25 @@ endif()
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
set(TARGET build_info)
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
set(TARGET llama-common-base)
add_library(${TARGET} STATIC ${OUTPUT_FILE})
target_include_directories(${TARGET} PUBLIC .)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET common)
#
# llama-common
#
add_library(${TARGET} STATIC
set(TARGET llama-common)
add_library(${TARGET}
arg.cpp
arg.h
base64.hpp
@@ -63,6 +73,8 @@ add_library(${TARGET} STATIC
debug.h
download.cpp
download.h
fit.cpp
fit.h
hf-cache.cpp
hf-cache.h
http.h
@@ -106,17 +118,24 @@ add_library(${TARGET} STATIC
jinja/caps.h
)
set_target_properties(${TARGET} PROPERTIES
VERSION ${LLAMA_INSTALL_VERSION}
SOVERSION 0
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
)
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# TODO: make fine-grained exports in the future
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
target_link_libraries(${TARGET} PRIVATE
build_info
cpp-httplib
)
target_link_libraries(${TARGET} PUBLIC llama-common-base)
target_link_libraries(${TARGET} PRIVATE cpp-httplib)
if (LLAMA_LLGUIDANCE)
include(ExternalProject)


@@ -1,5 +1,6 @@
#include "arg.h"
#include "build-info.h"
#include "chat.h"
#include "common.h"
#include "download.h"
@@ -291,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
hf_tag = "default";
}
std::string model_endpoint = get_model_endpoint();
std::string model_endpoint = common_get_model_endpoint();
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
// prepare local path for caching
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--version"},
"show version and build info",
[](common_params &) {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
exit(0);
}
));
@@ -1315,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"--clear-idle"},
{"--no-clear-idle"},
{"--cache-idle-slots"},
{"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
params.clear_idle = value;
params.cache_idle_slots = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -2425,6 +2426,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
{ "-fitp", "--fit-print" }, "[on|off]",
string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.fit_params_print = true;
} else if (is_falsey(value)) {
params.fit_params_print = false;
} else {
throw std::runtime_error(
string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
add_opt(common_arg(
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
string_format("target margin per device for --fit, comma-separated list of values, "
@@ -3107,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
params.sampling.reasoning_budget_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
params.sampling.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
@@ -3887,6 +3902,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-default"},
string_format("enable default speculative decoding config"),
[](common_params & params) {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
params.speculative.ngram_size_n = 24;
params.speculative.n_min = 48;
params.speculative.n_max = 64;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}


@@ -1,4 +1,35 @@
#include "build-info.h"
#include <cstdio>
#include <string>
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
int llama_build_number(void) {
return LLAMA_BUILD_NUMBER;
}
const char * llama_commit(void) {
return LLAMA_COMMIT;
}
const char * llama_compiler(void) {
return LLAMA_COMPILER;
}
const char * llama_build_target(void) {
return LLAMA_BUILD_TARGET;
}
const char * llama_build_info(void) {
static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
return s.c_str();
}
void llama_print_build_info(void) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, llama_build_number(), llama_commit());
fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
}

common/build-info.h

@@ -0,0 +1,11 @@
#pragma once
int llama_build_number(void);
const char * llama_commit(void);
const char * llama_compiler(void);
const char * llama_build_target(void);
const char * llama_build_info(void);
void llama_print_build_info(void);
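
The new common/build-info.h exposes the build metadata through accessor functions instead of the extern globals (LLAMA_BUILD_NUMBER, LLAMA_COMMIT, ...) that are removed from common.h further below. A minimal, hypothetical call site, assuming only that build-info.h is on the include path:

#include "build-info.h"
#include <cstdio>

int main() {
    // equivalent of the print_build_info() macro removed from common.h
    fprintf(stderr, "build = %d (%s)\n", llama_build_number(), llama_commit());
    fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
    // or use the one-call helper defined in build-info.cpp
    llama_print_build_info();
    return 0;
}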


@@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
if (!format.per_call_start.empty()) {
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
}
if (!format.section_start.empty()) {
tool_calls = p.trigger_rule("tool-calls",
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
}
} else {
std::string separator = ", "; // Default


@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
});
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
});
// try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]);
start = result.tags["pre"];
end = result.tags["post"];
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
}
}
}


@@ -397,6 +397,25 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
return render_message_to_json(msgs, c);
}
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
}
auto result = json::array();
for (const auto & tool : tools) {
result.push_back({
{ "type", "function" },
{ "function", {
{ "name", tool.name },
{ "description", tool.description },
{ "parameters", json::parse(tool.parameters) },
}},
});
}
return result;
}
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
@@ -432,56 +451,6 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
return result;
}
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
}
auto result = json::array();
for (const auto & tool : tools) {
result.push_back({
{ "type", "function" },
{ "function",
{
{ "name", tool.name },
{ "description", tool.description },
{ "parameters", json::parse(tool.parameters) },
} },
});
}
return result;
}
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
if (diff.tool_call_index != std::string::npos) {
json tool_call;
tool_call["index"] = diff.tool_call_index;
if (!diff.tool_call_delta.id.empty()) {
tool_call["id"] = diff.tool_call_delta.id;
tool_call["type"] = "function";
}
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
json function = json::object();
if (!diff.tool_call_delta.name.empty()) {
function["name"] = diff.tool_call_delta.name;
}
if (!diff.tool_call_delta.arguments.empty()) {
function["arguments"] = diff.tool_call_delta.arguments;
}
tool_call["function"] = function;
}
delta["tool_calls"] = json::array({ tool_call });
}
return delta;
}
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
@@ -575,6 +544,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
static bool is_lfm2_template(const std::string & src) {
return src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos;
}
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
common_chat_prompt_preset asr_preset;
asr_preset.system = "";
asr_preset.user = "Transcribe audio to text";
if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
asr_preset.system = "Perform ASR.";
asr_preset.user = "";
}
return asr_preset;
}
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
@@ -2084,10 +2073,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
if (is_lfm2_template(src)) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
@@ -2334,7 +2320,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
? input
: params.generation_prompt + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
@@ -2396,4 +2382,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
GGML_ASSERT(chat_templates->template_default != nullptr);
return chat_templates->template_default->caps.to_map();
}
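
The chat.cpp hunks above also extract the LFM2 detection into is_lfm2_template() and add common_chat_get_asr_prompt(), which returns a per-template prompt preset for audio transcription. A hedged sketch of a call site (tmpls is assumed to be a previously loaded common_chat_templates pointer):

// hypothetical usage, based on the defaults visible in the diff above
const common_chat_prompt_preset preset = common_chat_get_asr_prompt(tmpls);
if (!preset.system.empty()) {
    // LFM2-style template: system = "Perform ASR.", user prompt left empty
} else {
    // any other template: system empty, user = "Transcribe audio to text"
}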


@@ -256,14 +256,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
// DEPRECATED: only used in tests
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
// get template caps, useful for reporting to server /props endpoint
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
@@ -275,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
autoparser::generation_params & params);
// specialized per-task preset
struct common_chat_prompt_preset {
std::string system;
std::string user;
};
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);


@@ -1,7 +1,9 @@
#include "ggml.h"
#include "gguf.h"
#include "build-info.h"
#include "common.h"
#include "fit.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"
@@ -372,7 +374,7 @@ void common_init() {
const char * build_type = " (debug)";
#endif
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
}
std::string common_params_get_system_info(const common_params & params) {
@@ -1146,7 +1148,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split,
params.tensor_buft_overrides.data(),
params.fit_params_target.data(),
@@ -1381,7 +1383,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result::~common_init_result() = default;
std::string get_model_endpoint() {
std::string common_get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
@@ -1396,6 +1398,42 @@ std::string get_model_endpoint() {
return model_endpoint;
}
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
auto * mem = llama_get_memory(ctx);
if (mem == nullptr) {
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
}
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx);
return res;
}
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
std::vector<llama_adapter_lora *> loras;
std::vector<float> scales;
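
The new common_context_can_seq_rm() above probes a context by decoding two dummy tokens and then attempting a partial llama_memory_seq_rm(); note that the probe clears the context memory as a side effect. A rough sketch of how a caller might branch on the result (ctx is assumed to be a freshly created llama_context *):

// hypothetical call site, not part of the diff
switch (common_context_can_seq_rm(ctx)) {
    case COMMON_CONTEXT_SEQ_RM_TYPE_PART:
        // partial removal works: tail trimming / cache reuse strategies are available
        break;
    case COMMON_CONTEXT_SEQ_RM_TYPE_FULL:
        // only whole sequences can be removed: clear and re-prefill instead
        break;
    case COMMON_CONTEXT_SEQ_RM_TYPE_NO:
        // no memory module, or the test decode failed
        break;
}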


@@ -2,15 +2,15 @@
#pragma once
#include "llama-cpp.h"
#include "ggml-opt.h"
#include "ggml.h"
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>
@@ -27,11 +27,6 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
@@ -53,14 +48,6 @@ struct common_adapter_lora_info {
using llama_tokens = std::vector<llama_token>;
// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
struct common_control_vector_load_info;
//
@@ -287,6 +274,7 @@ struct common_params_sampling {
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool backend_sampling = false;
@@ -315,15 +303,15 @@ struct common_params_speculative {
// general-purpose speculative decoding parameters
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
// ngram-based speculative decoding
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
@@ -433,11 +421,12 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
bool fit_params_print = false; // print the estimated required memory to run the model
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
@@ -579,7 +568,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
bool clear_idle = true; // save and clear idle slots upon starting a new task
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
@@ -593,8 +582,6 @@ struct common_params {
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
@@ -759,6 +746,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
str.compare(0, prefix.size(), prefix) == 0;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, char prefix) {
return !str.empty() && str.front() == prefix;
}
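// usage sketch (illustrative): string_starts_with("--help", "--") and string_starts_with("--help", '-')
// both return true, mirroring the starts_with overloads that std::string_view gains in C++20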
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
@@ -859,7 +851,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
std::string get_model_endpoint();
// model endpoint from env
std::string common_get_model_endpoint();
//
// Context utils
//
enum common_context_seq_rm_type {
COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
};
// check if the llama_context can remove sequences
// note: clears the memory of the context
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
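// usage sketch (illustrative): check up front whether partial cache removal is supported
// if (common_context_can_seq_rm(ctx) != COMMON_CONTEXT_SEQ_RM_TYPE_PART) { /* fall back to reprocessing the full prompt */ }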
//
// Batch utils

View File

@@ -1,9 +1,38 @@
#include "debug.h"
#include "common.h"
#include "log.h"
#include <cmath>
#include <regex>
#include <string>
#include <vector>
struct common_debug_cb_user_data::impl {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool abort_on_nan{false};
};
common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
: pimpl(std::make_unique<impl>())
{
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
pimpl->abort_on_nan = abort_on_nan;
params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = this;
}
static std::string common_ggml_ne_string(const ggml_tensor * t) {
std::string str;
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,
#define INDENT " "
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
LOG(INDENT "sum = %f\n", sum);
}
if constexpr (abort) {
if (abort_on_nan) {
if (std::isnan(sum)) {
LOG("encountered NaN - aborting\n");
exit(0);
@@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (base_callback_data *) user_data;
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (common_debug_cb_user_data *) user_data;
auto * pimpl = cb_data->pimpl.get();
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
return true; // Always retrieve data
}
bool matches_filter = cb_data->tensor_filters.empty();
bool matches_filter = pimpl->tensor_filters.empty();
if (!matches_filter) {
for (const auto & filter : cb_data->tensor_filters) {
for (const auto & filter : pimpl->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
pimpl->data.resize(n_bytes);
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
}
return true;
}
// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

View File

@@ -1,43 +1,31 @@
#pragma once
#include "common.h"
#include <memory>
#include <string>
#include <vector>
#include <regex>
// common debug functions and structs
// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
struct common_params;
// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
struct base_callback_data {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
base_callback_data() = default;
struct common_debug_cb_user_data {
struct impl;
std::unique_ptr<impl> pimpl;
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = this;
}
common_debug_cb_user_data();
~common_debug_cb_user_data();
common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
};
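A minimal usage sketch of the new interface, with an assumed helper name and filter pattern (neither comes from examples/debug.cpp):

#include "debug.h"

// illustrative helper: print only layer-0 tensors and abort the run on the first NaN
static void common_debug_setup(common_params & params) {
    static common_debug_cb_user_data data(params, {"blk\\.0\\."}, /*abort_on_nan=*/true);
    // the constructor already set params.cb_eval and params.cb_eval_user_data to point at data,
    // so params can be forwarded to context creation as usual
    (void) data;
}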

View File

@@ -1,5 +1,6 @@
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "log.h"
#include "download.h"
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
}
if (!opts.bearer_token.empty()) {
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
}
if (params.timeout > 0) {
@@ -626,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
if (!tag.empty()) {
tags.push_back(tag);
} else {
tags = {"Q4_K_M", "Q4_0"};
tags = {"Q4_K_M", "Q8_0"};
}
for (const auto & t : tags) {

951
common/fit.cpp Normal file
View File

@@ -0,0 +1,951 @@
#include "fit.h"
#include "log.h"
#include "../src/llama-ext.h"
#include <array>
#include <cassert>
#include <stdexcept>
#include <cinttypes>
#include <set>
#include <string>
#include <vector>
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
// enum to identify part of a layer for distributing its tensors:
enum common_layer_fraction_t {
LAYER_FRACTION_NONE = 0, // nothing
LAYER_FRACTION_ATTN = 1, // attention
LAYER_FRACTION_UP = 2, // attention + up
LAYER_FRACTION_GATE = 3, // attention + up + gate
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
};
class common_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static std::vector<llama_device_memory_data> common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
ggml_log_level log_level) {
struct user_data_t {
struct {
ggml_log_callback callback;
void * user_data;
} original_logger;
ggml_log_level min_level; // prints below this log level go to debug log
};
user_data_t ud;
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
ud.min_level = log_level;
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
const user_data_t * ud = (const user_data_t *) user_data;
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to load model");
}
llama_context * ctx = llama_init_from_model(model, *cparams);
if (ctx == nullptr) {
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to create llama_context from model");
}
const size_t nd = llama_model_n_devices(model);
std::vector<llama_device_memory_data> ret(nd + 1);
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
for (const auto & [buft, mb] : memory_breakdown) {
if (ggml_backend_buft_is_host(buft)) {
ret.back().mb.model += mb.model;
ret.back().mb.context += mb.context;
ret.back().mb.compute += mb.compute;
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
continue;
}
for (size_t i = 0; i < nd; i++) {
if (dev == llama_model_get_device(model, i)) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
ret[i].mb.compute += mb.compute;
break;
}
}
}
{
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error("no CPU backend found");
}
size_t free;
size_t total;
ggml_backend_dev_memory(cpu_dev, &free, &total);
ret.back().free = free;
ret.back().total = total;
}
for (size_t i = 0; i < nd; i++) {
size_t free;
size_t total;
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
free = ret.back().free;
total = ret.back().total;
}
ret[i].free = free;
ret[i].total = total;
}
devs.clear();
for (int i = 0; i < llama_model_n_devices(model); i++) {
devs.push_back(llama_model_get_device(model, i));
}
hp_ngl = llama_model_n_layer(model);
hp_n_ctx_train = llama_model_n_ctx_train(model);
hp_n_expert = llama_model_n_expert(model);
common_memory_breakdown_print(ctx);
llama_free(ctx);
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
return ret;
}
static void common_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
}
constexpr int64_t MiB = 1024*1024;
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
// step 1: get data for default parameters and check whether any changes are necessary in the first place
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
if (nd == 0) {
margins.push_back(margins_s[0]);
} else {
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
size_t max_length = 0;
for (const auto & dev : devs) {
std::string name = ggml_backend_dev_name(dev);
name += " (";
name += ggml_backend_dev_description(dev);
name += ")";
dev_names.push_back(name);
max_length = std::max(max_length, name.length());
}
for (std::string & dn : dev_names) {
dn.insert(dn.end(), max_length - dn.length(), ' ');
}
}
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
if (nd == 0) {
sum_projected_used = dmds_full.back().mb.total();
sum_free = dmds_full.back().total;
sum_projected_free = sum_free - sum_projected_used;
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (sum_projected_free >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
__func__, sum_projected_free/MiB, margins[0]/MiB);
return;
}
} else {
if (nd > 1) {
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
if (nd > 1) {
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
}
}
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free;
if (nd == 0) {
global_surplus -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
}
if (global_surplus < 0) {
if (nd <= 1) {
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LOG_INF(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
if (nd == 0) {
sum_used_target -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
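// illustration with assumed numbers: 3 devices, dense model, 48 offloaded layers using 24 GiB in total
// -> model_per_layer = 512 MiB and the target is lowered by (3 + 1) * 512 / 2 = 1024 MiB
// to absorb the expected per-device rounding waste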
}
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
if (nd == 0) {
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
} else {
for (size_t id = 0; id < nd; id++) {
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
}
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd <= 1) {
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
if (n_ctx_min == UINT32_MAX) {
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
} else {
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
}
} else {
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
}
if (nd == 0) {
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
}
if (!tensor_buft_overrides) {
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the devices back to front with "dense" layers
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
// - for a MoE model, same as dense model but with all MoE tensors in system memory
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
constexpr size_t n_strings = 1000;
if (il >= n_strings) {
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
}
switch (lf) {
case LAYER_FRACTION_ATTN: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_UP: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_GATE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_MOE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
}
return patterns[il].c_str();
}
default:
GGML_ABORT("fatal error");
}
};
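// e.g. get_overflow_pattern(3, LAYER_FRACTION_ATTN) yields "blk\.3\.ffn_(gate|up|gate_up|down).*",
// i.e. everything past the attention tensors of layer 3 is matched and can be overridden to another buffer type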
struct ngl_t {
uint32_t n_layer = 0; // number of total layers
uint32_t n_part = 0; // number of partial layers, <= n_layer
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};
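// e.g. {n_layer = 10, n_part = 3} gives a device 7 full layers plus 3 partial layers whose
// overflowing tensors (see get_overflow_pattern) are placed in a different buffer type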
const size_t ntbo = llama_max_tensor_buft_overrides();
// utility function to set n_gpu_layers and tensor_split
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
if (nd > 1) {
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full();
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
itbo++;
}
il0 += ngl_per_device[id].n_part;
}
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
};
// utility function that returns the memory use per device for given numbers of layers per device
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
const dmds_t dmd_nl = common_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LOG_INF("%s: memory for test allocation by device:\n", func_name);
for (size_t id = 0; id < nd; id++) {
const ngl_t & n = ngl_per_device[id];
LOG_INF(
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
}
std::vector<int64_t> ret;
ret.reserve(nd);
for (size_t id = 0; id < nd; id++) {
ret.push_back(dmd_nl[id].mb.total());
}
return ret;
};
int64_t global_surplus_cpu_moe = 0;
if (hp_nex > 0) {
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
tensor_buft_overrides[1] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
}
if (global_surplus_cpu_moe > 0) {
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
__func__, global_surplus_cpu_moe/MiB);
} else {
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
__func__, -global_surplus_cpu_moe/MiB);
}
// reset
tensor_buft_overrides[0] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
}
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]);
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
}
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
// - try a "high" configuration where a device is given all unassigned layers
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
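// illustration with assumed numbers: low bound 0 layers using 2 GiB, high bound 40 layers using 22 GiB,
// target 12 GiB -> step_size = 40 * (12 - 2) / (22 - 2) = 20 layers are tried next,
// clamped to [1, delta - 1] before the test allocation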
if (hp_nex == 0) {
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
} else {
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
}
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
return;
}
// step 4: for a MoE model where all dense tensors fit,
// convert the dense-only layers in the back to full layers in the front until all devices are full
// essentially the same procedure as for the dense-only layers except front-to-back
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
size_t id_dense_start = nd;
for (int id = nd - 1; id >= 0; id--) {
if (ngl_per_device[id].n_layer > 0) {
id_dense_start = id;
continue;
}
break;
}
assert(id_dense_start < nd);
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
uint32_t n_converted_test = 0;
for (;id_dense_start_test < nd; id_dense_start_test++) {
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
id_dense_start_high = id_dense_start_test;
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}
enum common_params_fit_status common_fit_params(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams,
float * tensor_split,
llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins,
uint32_t n_ctx_min,
ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
try {
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
} catch (const common_params_fit_exception & e) {
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_ERROR;
}
const int64_t t1_us = llama_time_us();
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status;
}
void common_memory_breakdown_print(const struct llama_context * ctx) {
//const auto & devices = ctx->get_model().devices;
const auto * model = llama_get_model(ctx);
std::vector<ggml_backend_dev_t> devices;
for (int i = 0; i < llama_model_n_devices(model); i++) {
devices.push_back(llama_model_get_device(model, i));
}
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
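// for illustration (assumed device and values), a GPU row rendered with template_gpu looks like:
// common_memory_breakdown_print: |  - CUDA0 (RTX 4090) | 24564 = 1200 + (21000 = 15000 + 4000 + 2000) + 2364 |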
constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
// track seen buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
// accumulative memory breakdown for each device and for host:
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
llama_memory_breakdown_data mb_host;
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (ggml_backend_buft_is_host(buft)) {
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
i_dev = i;
break;
}
}
if (i_dev != -1) {
mb_dev[i_dev].model += mb.model;
mb_dev[i_dev].context += mb.context;
mb_dev[i_dev].compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
}
}
// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
llama_memory_breakdown_data mb = mb_dev[i];
const std::string name = ggml_backend_dev_name(dev);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;
table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
}
// print memory breakdown for host:
{
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_other,
" - Host",
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted
}
// print memory breakdown for all remaining buffer types:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_buffer_types.count(buft) == 1) {
continue;
}
const std::string name = ggml_backend_buft_name(buft);
const size_t self = mb.model + mb.context + mb.compute;
table_data.push_back({
template_other,
" - " + name,
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
""}); // unaccounted
seen_buffer_types.insert(buft);
}
for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LOG_INF(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}
void common_fit_print(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams) {
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
GGML_ASSERT(dmd.size() == devs.size() + 1);
for (size_t id = 0; id < devs.size(); id++) {
printf("%s ", ggml_backend_dev_name(devs[id]));
printf("%zu ", dmd[id].mb.model/1024/1024);
printf("%zu ", dmd[id].mb.context/1024/1024);
printf("%zu ", dmd[id].mb.compute/1024/1024);
printf("\n");
}
printf("Host ");
printf("%zu ", dmd.back().mb.model/1024/1024);
printf("%zu ", dmd.back().mb.context/1024/1024);
printf("%zu ", dmd.back().mb.compute/1024/1024);
printf("\n");
}

32
common/fit.h Normal file
View File

@@ -0,0 +1,32 @@
#pragma once
#include "ggml.h"
enum common_params_fit_status {
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns COMMON_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_model_default_params are modified
// with the exception of the context size which is modified if and only if equal to 0
enum common_params_fit_status common_fit_params(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
// print estimated memory to stdout
void common_fit_print(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams);
void common_memory_breakdown_print(const struct llama_context * ctx);
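For reference, a minimal calling sketch of common_fit_params as declared above; the model path, per-device margin and minimum context size are illustrative values:

#include "fit.h"
#include "llama.h"

#include <vector>

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
    std::vector<size_t> margins(llama_max_devices(), 1024ull*1024*1024); // leave ~1 GiB free per device

    const common_params_fit_status status = common_fit_params(
        "model.gguf", &mparams, &cparams,
        tensor_split.data(), tbo.data(), margins.data(),
        /*n_ctx_min=*/4096, GGML_LOG_LEVEL_INFO);

    // on COMMON_PARAMS_FIT_STATUS_SUCCESS, mparams/cparams describe an allocation
    // that is projected to fit into free device memory
    return status == COMMON_PARAMS_FIT_STATUS_SUCCESS ? 0 : 1;
}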

View File

@@ -1,5 +1,6 @@
#include "hf-cache.h"
#include "build-info.h"
#include "common.h"
#include "log.h"
#include "http.h"
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {
{"User-Agent", "llama-cpp/" + build_info},
{"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
{"Accept", "application/json"}
};
@@ -229,7 +230,7 @@ static nl::json api_get(const std::string & url,
static std::string get_repo_commit(const std::string & repo_id,
const std::string & token) {
try {
auto endpoint = get_model_endpoint();
auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
if (!json.is_object() ||
@@ -307,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
hf_files files;
try {
auto endpoint = get_model_endpoint();
auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
if (!json.is_array()) {

View File

@@ -1,4 +1,3 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"

View File

@@ -106,10 +106,16 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
virtual value execute_impl(context &) { throw_exec_error(); }
// execute is the public method to execute a statement with error handling
value execute(context &);
private:
[[noreturn]] void throw_exec_error() const {
throw std::runtime_error("cannot exec " + type());
}
};
// Type Checking Utilities
@@ -143,7 +149,7 @@ struct program : public statement {
program() = default;
explicit program(statements && body) : body(std::move(body)) {}
std::string type() const override { return "Program"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
}
};
@@ -195,7 +201,7 @@ struct break_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw break_statement::signal();
}
};
@@ -209,7 +215,7 @@ struct continue_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw continue_statement::signal();
}
};
@@ -509,7 +515,7 @@ struct slice_expression : public expression {
chk_type<expression>(this->step_expr);
}
std::string type() const override { return "SliceExpression"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
};

View File

@@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
}
[[noreturn]] static value string_join_not_implemented(const func_args &) {
throw not_implemented_exception("String join builtin not implemented");
}
const func_builtins & value_string_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
@@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
res->val_str.mark_input_based_on(val_input->as_string());
return res;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
}},
{"join", string_join_not_implemented},
};
return builtins;
}
@@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
return builtins;
}
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
throw not_implemented_exception("Array unique builtin not implemented");
}
const func_builtins & value_array_t::get_builtins() const {
static const func_builtins builtins = {
@@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
}},
{"unique", array_unique_not_implemented},
};
return builtins;
}
[[noreturn]] static value object_join_not_implemented(const func_args &) {
throw not_implemented_exception("object join not implemented");
}
const func_builtins & value_object_t::get_builtins() const {
if (!has_builtins) {
@@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
});
return result;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("object join not implemented");
}},
{"join", object_join_not_implemented},
};
return builtins;
}

View File

@@ -129,27 +129,25 @@ struct value_t {
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
virtual double as_float() const { throw_type_error("is not a float value"); }
virtual string as_string() const { throw_type_error("is not a string value"); }
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
virtual const func_builtins & get_builtins() const {
throw std::runtime_error("No builtins available for type " + type());
}
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
@@ -163,6 +161,11 @@ struct value_t {
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
private:
[[noreturn]] void throw_type_error(const char* expected) const {
throw std::runtime_error(type() + " " + expected);
}
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }

View File

@@ -23,6 +23,10 @@
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
int common_log_get_verbosity_thold(void) {
return common_log_verbosity_thold;
}
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}

View File

@@ -38,7 +38,7 @@ enum log_colors {
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity is lower than the threshold
// set via common_log_set_verbosity_thold()
extern int common_log_verbosity_thold;
int common_log_get_verbosity_thold(void);
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
@@ -98,7 +98,7 @@ void common_log_flush (struct common_log * log); // f
#define LOG_TMPL(level, verbosity, ...) \
do { \
if ((verbosity) <= common_log_verbosity_thold) { \
if ((verbosity) <= common_log_get_verbosity_thold()) { \
common_log_add(common_log_main(), (level), __VA_ARGS__); \
} \
} while (0)
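// usage sketch (illustrative): raising the threshold at runtime lets more verbose messages pass the check, e.g.
// common_log_set_verbosity_thold(common_log_get_verbosity_thold() + 1);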

View File

@@ -208,7 +208,7 @@ void common_ngram_map_begin(
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.idx_last_check = size_begin;
map.size_last_begin = size_begin;
}
@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
}
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_created = true;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = 0; // value 0 is used for simple mode
return;
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
// update the value statistics
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}

View File

@@ -1,10 +1,12 @@
#include "sampling.h"
#include "common.h"
#include "ggml.h"
#include "fit.h"
#include "log.h"
#include "reasoning-budget.h"
#include "ggml.h"
#include <algorithm>
#include <cctype>
#include <climits>
@@ -511,7 +513,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
llama_memory_breakdown_print(ctx);
common_memory_breakdown_print(ctx);
}
}

View File

@@ -13,6 +13,7 @@
#include <cstring>
#include <iomanip>
#include <map>
#include <cinttypes>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -60,18 +61,26 @@ static bool common_speculative_are_compatible(
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
return false;
}
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
return false;
}
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
return false;
}
@@ -144,10 +153,28 @@ struct common_speculative_state {
virtual void accept(uint16_t n_accepted) = 0;
};
struct common_speculative_checkpoint {
llama_pos pos_min = 0;
llama_pos pos_max = 0;
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
size_t ckpt_size = 0;
};
struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;
bool use_ckpt = false;
struct common_speculative_checkpoint ckpt;
common_sampler * smpl;
llama_batch batch;
@@ -160,10 +187,12 @@ struct common_speculative_state_draft : public common_speculative_state {
enum common_speculative_type type,
llama_context * ctx_tgt,
llama_context * ctx_dft,
const std::vector<std::pair<std::string, std::string>> & replacements)
const std::vector<std::pair<std::string, std::string>> & replacements,
bool use_ckpt)
: common_speculative_state(type)
, ctx_tgt(ctx_tgt)
, ctx_dft(ctx_dft)
, use_ckpt(use_ckpt)
{
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
smpl = nullptr;
@@ -218,7 +247,48 @@ struct common_speculative_state_draft : public common_speculative_state {
}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
if (use_ckpt && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
}
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
return n;
}
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
return n;
}
void draft(
@@ -236,8 +306,8 @@ struct common_speculative_state_draft : public common_speculative_state {
auto * mem_dft = llama_get_memory(ctx_dft);
int reuse_i = 0;
int reuse_n = 0;
int reuse_i = 0; // index of part to be reused in prompt_dft
int reuse_n = 0; // length of part to be reused in prompt_dft
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
@@ -287,18 +357,26 @@ struct common_speculative_state_draft : public common_speculative_state {
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
reuse_n = 0;
}
result.clear();
result.reserve(params.n_max);
if (reuse_n == 0) {
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
@@ -310,19 +388,50 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}
bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (use_ckpt) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}
if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
@@ -337,7 +446,11 @@ struct common_speculative_state_draft : public common_speculative_state {
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}
}
const llama_pos n_past = prompt_dft.size();
@@ -351,7 +464,11 @@ struct common_speculative_state_draft : public common_speculative_state {
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, ret, prompt_cur.size(), prompt_dft.size());
}
common_sampler_reset(smpl);
@@ -387,7 +504,11 @@ struct common_speculative_state_draft : public common_speculative_state {
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);
ret = llama_decode(ctx_dft, batch);
if (ret != 0) {
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
}
prompt_dft.push_back(id);
}
@@ -636,6 +757,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
mod.reset();
n_low = 0;
i_last = 0;
}
} else {
n_low = 0;
@@ -739,6 +861,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
struct common_speculative {
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
};
@@ -798,42 +921,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
}
bool res = true;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx_tgt);
return res;
}
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
@@ -908,10 +995,13 @@ common_speculative * common_speculative_init(
case COMMON_SPECULATIVE_TYPE_NONE:
break;
case COMMON_SPECULATIVE_TYPE_DRAFT: {
const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements
/* .replacements = */ params.replacements,
/* .use_ckpt = */ use_ckpt
));
break;
}
@@ -966,7 +1056,8 @@ common_speculative * common_speculative_init(
}
auto * result = new common_speculative {
/* .impls = */ std::move(impls)
/* .impls = */ std::move(impls),
/* .curr_impl = */ nullptr,
};
return result;

View File

@@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);
@@ -39,3 +35,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
struct common_speculative_deleter {
void operator()(common_speculative * s) { common_speculative_free(s); }
};
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
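A minimal usage sketch for the new `common_speculative_ptr` alias (not from the patch; `params`, `params_spec`, `ctx_tgt`, `prompt_tgt`, and `id_last` are assumed to be set up by the caller, as in the other files of this change):

```cpp
// Sketch only: the deleter above ties the handle's lifetime to the smart pointer,
// so common_speculative_free() no longer has to be called manually.
common_speculative_ptr spec(common_speculative_init(params, ctx_tgt));
if (spec) {
    // draft tokens for the current target position
    llama_tokens draft = common_speculative_draft(spec.get(), params_spec, prompt_tgt, id_last);
    // ... evaluate the draft on the target model, then report how many tokens were accepted
    common_speculative_accept(spec.get(), /* n_accepted = */ 0);
}
// common_speculative_free() runs here automatically via common_speculative_deleter
```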

View File

@@ -272,6 +272,22 @@ class ModelBase:
return tensors
@staticmethod
def _scale_is_trivial(scale: Tensor) -> bool:
return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
def _write_scale_tensor(self, scale_name: str, scale: Tensor):
if not self._scale_is_trivial(scale):
scale_f32 = scale.float().numpy().flatten()
logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale_f32)
def _write_scales_tensor(self, scale_name: str, scales: list[float]):
if not np.allclose(scales, 1.0, atol=1e-6):
scale_vals = np.array(scales, dtype=np.float32)
logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
def dequant_model(self):
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ class ModelBase:
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".k_scale", ".v_scale")):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ class ModelBase:
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
return raw, [out_features, n_super * 64]
@staticmethod
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
if "language_model." in name:
name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(scale2):
scale2_f32 = scale2.float().numpy().flatten()
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(input_scale):
input_scale_f32 = input_scale.float().numpy().flatten()
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
scales.sort(key=lambda x: x[0])
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
if not np.allclose(scale_vals, 1.0, atol=1e-6):
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
input_scales.sort(key=lambda x: x[0])
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
del experts, merged
@@ -746,7 +737,12 @@ class ModelBase:
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
quant_config = json.load(f).get("quantization") or {}
hf_quant_config = json.load(f)
quant_config = hf_quant_config.get("quantization") or {}
producer = hf_quant_config.get("producer") or {}
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
@@ -1850,20 +1846,28 @@ class TextModel(ModelBase):
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
if mod["type"].endswith("Pooling"):
pooling_path = mod["path"]
break
mode_mapping = {
"mean": gguf.PoolingType.MEAN,
"cls": gguf.PoolingType.CLS,
"lasttoken": gguf.PoolingType.LAST,
}
# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
if pooling.get("pooling_mode_mean_tokens"):
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
elif pooling.get("pooling_mode_cls_token"):
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
elif pooling.get("pooling_mode_lasttoken"):
pooling_type = gguf.PoolingType.LAST
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
pooling_type = mode_mapping[pooling_mode]
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
@@ -7180,7 +7184,7 @@ class EmbeddingGemma(Gemma3Model):
with open(modules_file, encoding="utf-8") as modules_json_file:
mods = json.load(modules_json_file)
for mod in mods:
if mod["type"] == "sentence_transformers.models.Dense":
if mod["type"].endswith("Dense"):
mod_path = mod["path"]
# check if model.safetensors file for Dense layer exists
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
@@ -10893,7 +10897,64 @@ class NemotronHModel(GraniteHybridModel):
self.gguf_writer.add_moe_latent_size(latent_size)
def set_vocab(self):
super().set_vocab()
# The NemotronH config uses pattern characters (e.g. '-') that may not
# be supported by the installed transformers version. AutoTokenizer
# internally calls AutoConfig which triggers this parsing failure.
# Using trust_remote_code=True to load the model's own config class.
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
# Pad vocab size (from Mamba2Model/GraniteHybridModel)
self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now.
# From Mamba2Model.set_vocab():
vocab_size = self.hparams["vocab_size"]
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
# ref: https://stackoverflow.com/a/17511341/22827863
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token: str = reverse_vocab[i]
if token in added_vocab:
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
if added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL)
else:
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
tokens.append(token)
# From TextModel.set_vocab_gpt2():
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
# The tokenizer _does_ add a BOS token (via post_processor type
# TemplateProcessing) but does not set add_bos_token to true in the
@@ -11790,7 +11851,7 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -11929,28 +11990,58 @@ class HunYuanModel(TextModel):
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanOCRVisionModel(MmprojModel):
class HunyuanVLVisionModel(MmprojModel):
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
# Each variant maps to a different projector type in clip.cpp so image
# preprocessing follows the correct code path.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR uses max_image_size instead of image_size
# HunyuanOCR / HunyuanVL use max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
@staticmethod
def is_ocr_variant(hparams: dict) -> bool:
"""Return True for HunyuanOCR, False for HunyuanVL.
The projector's output dim must equal the text model's hidden_size by
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
ViT -> LLM projection dim is a hard architectural signature, not a
magic number.
"""
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
return vision_out == 1024
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
vcfg = self.hparams_vision
if self.is_ocr_variant(self.global_config):
# --- HunyuanOCR ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
return
# --- HunyuanVL ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
return # skip text tensors
return
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -11958,11 +12049,66 @@ class HunyuanOCRVisionModel(MmprojModel):
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
# the config and pick the matching GGUF architecture.
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
@staticmethod
def _is_ocr_config(hparams: dict) -> bool:
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
# HunyuanVLVisionModel.is_ocr_variant.
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
def __init__(self, dir_model: Path, *args, **kwargs):
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
if self._is_ocr_config(raw_hparams):
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
else:
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
super().__init__(dir_model, *args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
# the HunYuan-Dense arch which already handles standard rope in super().
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
return
if self.rope_parameters.get("rope_type") != "xdrope":
return
# defaults for HunyuanVL. The C++ side later computes:
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
ctx_len = int(self.hparams["max_position_embeddings"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
self.gguf_writer.add_context_length(ctx_len)
self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors — they are written by HunyuanVLVisionModel
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3

View File

@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU/NPU detection in containers, the GPU/NPU user-space drivers and libraries must be present inside the image. These will be included in a future PR; until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with the OpenVINO backend in a Docker container.
Save the sample models in `~/models` as [shown above](#3-download-sample-model); this directory is mounted into the container in the examples below.
> [!NOTE]
> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
```bash
# Run Docker container

View File

@@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
### Windows
The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
@@ -39,9 +41,22 @@ The following releases are verified and recommended:
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
### Ubuntu 24.04
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have the Intel GPU drivers and oneAPI packages pre-installed in the same versions used to build the package. For the versions and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
It is recommended to use them with Intel Docker.
The FP32 and FP16 packages differ in accuracy and performance on LLMs. Please choose between them according to your test results.
## News
- 2026.04
- Optimize mul_mat with the reorder feature for data types: Q4_K, Q5_K, Q_K, Q8_0.
- Fused MoE.
- Upgrade CI and built packages to oneAPI 2025.3.3; add a built package for Ubuntu 24.04.
- 2026.03
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
@@ -229,6 +244,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|Verified release|
|-|
|2025.3.3 |
|2025.2.1|
|2025.1|
|2024.1|
@@ -339,6 +355,12 @@ Choose one of following methods to run.
./examples/sycl/test.sh
```
- Run llama-server:
```sh
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
```
2. Command line
Launch inference
@@ -627,10 +649,18 @@ Choose one of following methods to run.
1. Script
- Run test:
```
examples\sycl\win-test.bat
```
- Run llama-server:
```
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
```
2. Command line
Launch inference

View File

@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Generates a host-side profile for the ggml-hexagon Ops.
Enables Op profiling:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can either be saved to a file for post-processing or piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:

View File

@@ -22,13 +22,13 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
@@ -46,7 +46,7 @@ Legend:
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -60,7 +60,7 @@ Legend:
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
@@ -84,10 +84,10 @@ Legend:
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -105,7 +105,7 @@ Legend:
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
@@ -116,6 +116,6 @@ Legend:
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,5 +1,5 @@
set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-debug)
add_executable(${TARGET} debug.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -202,10 +202,14 @@ static bool run(llama_context * ctx, const common_params & params) {
print_tokenized_prompt(ctx, tokens, params.prompt);
if (params.save_logits) {
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
try {
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
} catch (const std::exception & e) {
LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
}
}
return true;
@@ -223,7 +227,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
std::optional<base_callback_data> cb_data;
std::optional<common_debug_cb_user_data> cb_data;
if (!params.save_logits) {
cb_data.emplace(params, params.tensor_filter);
}

View File

@@ -1,5 +1,5 @@
set(TARGET llama-diffusion-cli)
add_executable(${TARGET} diffusion-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,7 +1,7 @@
set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_BUILD_TESTS)

View File

@@ -3,7 +3,6 @@
#include "debug.h"
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include <clocale>
#include <string>
@@ -38,7 +37,7 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
base_callback_data cb_data;
common_debug_cb_user_data cb_data;
common_params params;
@@ -53,7 +52,7 @@ int main(int argc, char ** argv) {
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;

View File

@@ -1,5 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -51,6 +51,6 @@ target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE
target_link_libraries(${CMAKE_PROJECT_NAME}
llama
common
llama-common
android
log)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,23 +1,23 @@
set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -25,7 +25,11 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
if [[ -n "$MMPROJ" ]]; then
CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
else
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
fi
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
@@ -38,6 +42,7 @@ if [[ -n "$DEBUG" ]]; then
else
CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -50,7 +55,3 @@ CMD_ARGS+=("--outtype" "${TYPE}")
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
if [[ -n "$MMPROJ" ]]; then
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi

View File

@@ -1,5 +1,5 @@
set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-speculative-simple)
add_executable(${TARGET} speculative-simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -8,8 +8,24 @@
#include <clocale>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <string>
#include <vector>
#include <utility>
struct spec_checkpoint {
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
bool empty() const {
return data.empty();
}
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
@@ -46,6 +62,14 @@ int main(int argc, char ** argv) {
model_tgt = llama_init_tgt->model();
ctx_tgt = llama_init_tgt->context();
// check if the context supports partial sequence removal
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
if (use_ckpt) {
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
}
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model
@@ -119,7 +143,7 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// target model sampling context
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
// eval the prompt
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
@@ -142,21 +166,61 @@ int main(int argc, char ** argv) {
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
size_t n_draft = 0;
llama_tokens draft;
spec_checkpoint spec_ckpt;
const auto t_enc_end = ggml_time_us();
const auto t_dec_start = ggml_time_us();
while (true) {
// optionally, generate draft tokens that can be appended to the target batch
// generate or reuse draft tokens
//
// this is the most important part of the speculation. the more probable tokens that are provided here
// the better the performance will be. in theory, this computation can be performed asynchronously and even
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
// from a cache or lookup tables.
//
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
if (draft.empty()) {
// generate a new draft
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
if ((int) draft.size() > params_spec.n_max) {
LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
draft.resize(params_spec.n_max);
}
if ((int) draft.size() < params_spec.n_min) {
LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
draft.clear();
}
// save the original draft size
n_draft = draft.size();
// save a checkpoint of the target context before evaluating the draft
// this allows us to restore the state if partial draft acceptance occurs
if (!draft.empty() && use_ckpt) {
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
spec_ckpt.data.resize(ckpt_size);
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == ckpt_size);
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
}
} else {
// we have a previous (partial) draft to reuse from checkpoint restoration
if (use_ckpt) {
GGML_ASSERT(!spec_ckpt.empty());
}
}
GGML_ASSERT(n_draft > 0);
// always have a token to evaluate from before - id_last
common_batch_clear(batch_tgt);
@@ -178,6 +242,12 @@ int main(int argc, char ** argv) {
llama_decode(ctx_tgt, batch_tgt);
}
// only save the sampler state if we use checkpoints
common_sampler_ptr smpl_save;
if (use_ckpt) {
smpl_save.reset(common_sampler_clone(smpl.get()));
}
// sample from the full target batch and return the accepted tokens based on the target sampler
//
// for each token to be accepted, the sampler would have to sample that same token
@@ -185,14 +255,38 @@ int main(int argc, char ** argv) {
// available logits from the batch and sample the next token until we run out of logits or the sampler
// disagrees with the draft
//
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
// check for partial draft acceptance:
// if the context doesn't support partial sequence removal, restore the checkpoint
// and make the accepted tokens the new partial draft for the next iteration
if (use_ckpt && ids.size() - 1 < draft.size()) {
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
draft = std::move(ids);
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == spec_ckpt.size());
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
prompt_tgt.resize(spec_ckpt.n_tokens);
smpl = std::move(smpl_save);
n_past = (int) prompt_tgt.size();
continue;
}
common_speculative_accept(spec, ids.size() - 1);
// full acceptance: consume the draft and commit accepted tokens
n_past += ids.size() - 1;
n_drafted += draft.size(); // note: we ignore the discarded small drafts
n_drafted += n_draft; // note: we ignore the discarded small drafts
n_accept += ids.size() - 1;
n_predict += ids.size();
@@ -222,6 +316,9 @@ int main(int argc, char ** argv) {
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
// clear the draft since it has been consumed
draft.clear();
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@@ -254,11 +351,10 @@ int main(int argc, char ** argv) {
LOG_INF("\n");
LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl);
common_perf_print(ctx_tgt, smpl.get());
llama_batch_free(batch_tgt);
common_sampler_free(smpl);
common_speculative_free(spec);
llama_backend_free();

View File

@@ -1,5 +1,5 @@
set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -5,5 +5,5 @@
set(TARGET llama-ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/sycl/start-svr.sh (new executable file, 124 lines)
View File

@@ -0,0 +1,124 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
Help() {
cat << EOF
Usage: $(basename "$0") [OPTIONS]
This script launches llama-server with the specified options.
Options:
-h, --help Display this help message and exit.
-c, --context <value> Set the context length. Larger values need more memory.
-p, --prompt <value> Prompt to start generation with.
-m, --model <value> Full model file path.
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
EOF
}
BIN_FILE=./build/bin/llama-server
SEED=0
GPUS_SETTING=""
MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
SPLIT_MODE=layer
LOG_VERBOSE=3
while [[ $# -gt 0 ]]; do
case "$1" in
-c|--context)
CONTEXT=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-m|--model)
MODEL_FILE="$2"
# Shift twice to consume both the option flag and its value
shift
shift
;;
-mg|--main-gpu)
GGML_SYCL_DEVICE=$2
SPLIT_MODE=none
# Shift twice to consume both the option flag and its value
shift
shift
;;
-sm|--split-mode)
SPLIT_MODE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-ngl|--n-gpu-layers)
NGL=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-lv|--log-verbosity)
LOG_VERBOSE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-h|--help)
Help
exit 0
;;
*)
# Handle unknown options or stop processing options
echo "Invalid option: $1"
# Optional: exit script or shift to treat remaining as positional args
exit 1
;;
esac
done
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1 enables reading free GPU memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode = layer.
#Allow allocating more than 4GB of device memory in a single allocation.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
echo "Use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000

View File

@@ -38,7 +38,7 @@ SEED=0
GPUS_SETTING=""
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
@@ -122,9 +122,10 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap

View File

@@ -0,0 +1,179 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
@echo off
setlocal EnableExtensions EnableDelayedExpansion
set "BIN_FILE=.\build\bin\llama-server.exe"
set "SEED=0"
set "GPUS_SETTING="
set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script starts llama-server.exe with the specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. A larger value needs more memory.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
endlocal

View File

@@ -2,10 +2,200 @@
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
set LOAD_MODE="--mmap"
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
@echo off
setlocal EnableExtensions EnableDelayedExpansion
REM MIT license
REM Copyright (C) 2024 Intel Corporation
REM SPDX-License-Identifier: MIT
set "BIN_FILE=.\build\bin\llama-completion.exe"
set "SEED=0"
set "GPUS_SETTING="
set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-p" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--promote" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script runs llama-completion.exe with the specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. A larger value needs more memory.
echo -p, --promote ^<value^> Prompt to start generation with.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
endlocal

View File

@@ -1,5 +1,5 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,17 +1,11 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
# MSVC is not a valid assembler for the ASM language.
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
if (POLICY CMP0194)
cmake_policy(SET CMP0194 NEW)
endif()
project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_MINOR 10)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -219,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)

View File

@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
target_link_libraries(ggml-base PRIVATE m)
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
endif()
endif()

View File

@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
GGML_ASSERT(tensor->ne[split_dim] != 0);
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
@@ -1170,6 +1170,28 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
simple_tensors.push_back(t_ij);
}
// If one of the sources has a zero-sized slice, disable the computation:
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
continue;
}
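// If backend j's combined slice of this source is empty, the corresponding per-backend node has nothing to compute.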
for (size_t j = 0; j < n_simple_bufs; j++) {
int64_t ne_sum = 0;
for (size_t s = 0; s < split_state_src.n_segments; s++) {
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
}
if (ne_sum == 0) {
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
}
}
}
buf_ctx->simple_tensors[tensor] = simple_tensors;
return GGML_STATUS_SUCCESS;
@@ -1183,40 +1205,57 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[1] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[2] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
@@ -1270,7 +1309,62 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
GGML_ASSERT(ggml_is_contiguous(tensor));
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
GGML_ASSERT(split_state.n_segments == 1);
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
return;
}
switch (split_state.axis) {
case GGML_BACKEND_SPLIT_AXIS_0:
@@ -1404,26 +1498,32 @@ struct ggml_backend_meta_context {
struct backend_config {
ggml_backend_t backend;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
ggml_backend_buffer_ptr buf;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
std::vector<ggml_backend_buffer_ptr> bufs;
backend_config(ggml_backend_t backend) : backend(backend) {}
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
bufs.resize(n_reduce_steps);
}
};
std::string name;
std::vector<backend_config> backend_configs;
ggml_context_ptr ctx;
std::vector<ggml_cgraph *> cgraphs_aux;
std::vector<ggml_tensor *> nodes_aux;
size_t n_reduce_steps;
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
size_t n_subgraphs = 0;
uint64_t uid = 0;
void * comm_ctx = nullptr;
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
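// ceil(log2(n_devs)) pairwise-exchange steps are enough to reduce across all devices; one scratch buffer is kept per step.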
n_reduce_steps = std::ceil(std::log2(n_devs));
name = "Meta(";
std::vector<ggml_backend_t> simple_backends;
backend_configs.reserve(n_devs);
@@ -1435,7 +1535,7 @@ struct ggml_backend_meta_context {
}
name += ggml_backend_dev_name(simple_dev);
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
backend_configs.emplace_back(simple_backends.back());
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
}
name += ")";
@@ -1465,10 +1565,6 @@ struct ggml_backend_meta_context {
ggml_backend_free(bc.backend);
}
}
size_t n_reduce_steps() const {
return std::ceil(std::log2(backend_configs.size()));
}
};
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
@@ -1578,6 +1674,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
const size_t n_backends = ggml_backend_meta_n_backends(backend);
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
bool max_nnodes_raised = false;
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
for (size_t j = 0; j < n_backends; j++) {
@@ -1587,173 +1686,216 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
backend_ctx->max_nnodes = cgraph->n_nodes;
max_nnodes_raised = true;
assert(needs_rebuild);
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
if (needs_rebuild) {
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
}
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
// Skip MIRRORED nodes that don't consume node
auto skip_unrelated = [&]() {
while (id + 1 < cgraph->n_nodes) {
ggml_tensor * next = cgraph->nodes[id+1];
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
break;
}
bool safe = true;
for (int s = 0; s < GGML_MAX_SRC; s++) {
if (next->src[s] == nullptr) {
continue;
}
if (next->src[s] == node) {
safe = false;
break;
}
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
safe = false;
break;
}
}
if (!safe) {
break;
}
id++;
}
};
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
// Chain of MULs with MIRRORED src[1]
while (true) {
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
} else {
break;
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
};
};
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
i = get_i_delayed(i);
i = get_i_delayed(i);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
}
n_subgraphs++;
i_start = i + 1;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
backend_ctx->uid = cgraph->uid;
backend_ctx->n_subgraphs = n_subgraphs;
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
}
n_subgraphs++;
i_start = i + 1;
backend_ctx->max_tmp_size = max_tmp_size;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
}
cgraph_ij->uid = ggml_graph_next_uid();
}
}
}
@@ -1761,11 +1903,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
size_t iga = 0; // i graph aux
size_t ina = 0; // i node aux
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
memset(ret, 0, sizeof(ggml_tensor));
@@ -1777,75 +1914,110 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
return ret;
};
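// Lazily (re)allocate the per-reduce-step scratch buffer on backend j and point the tmp tensor at it.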
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
}
tensor->buffer = buf_ptr.get();
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
};
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
auto allreduce_fallback = [&](size_t i) -> ggml_status {
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
// Zero out nodes that were disabled due to having a zero-sized slice:
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
continue;
}
ggml_tensor * node_zero = get_node_aux(node);
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
node_zero->src[0] = node;
ggml_set_op_params_f32(node_zero, 0, 0.0f);
node_zero->data = node->data;
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
step_cgraphs[j] = get_cgraph_aux();
step_cgraphs[j]->nodes[0] = node_zero;
step_cgraphs[j]->n_nodes = 1;
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
}
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
assert(step_cgraphs[j_dst] == nullptr);
auto & bcj_src = backend_ctx->backend_configs[j_src];
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node_src));
GGML_ASSERT(ggml_is_contiguous(node_dst));
ggml_tensor * node_tmp = get_node_aux(node_dst);
set_tmp_data(node_tmp, j_dst, i_buf);
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
ggml_tensor * node_red = get_node_aux(node_dst);
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
node_red->view_offs = node_dst->view_offs;
node_red->op = GGML_OP_ADD;
node_red->src[0] = node_dst;
node_red->src[1] = node_tmp;
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red);
ggml_cgraph * cgraph_aux = get_cgraph_aux();
cgraph_aux->nodes[0] = node_red;
cgraph_aux->n_nodes = 1;
step_cgraphs[j_dst] = cgraph_aux;
};
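// offset_j_max is the largest power of two <= n_backends/2: the first 2*offset_j_max backends run the butterfly,
// any excess backends are folded in first and receive the reduced result by a copy at the end.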
size_t offset_j = n_backends/2;
while ((offset_j & (offset_j - 1)) != 0) {
offset_j--;
}
const size_t offset_j_max = offset_j;
size_t i_buf = 0;
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
const size_t j_dst = j_src - 2*offset_j_max;
push_data(j_src, j_dst, i_buf);
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
i_buf = 1;
}
// Butterfly reduction:
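// In each step a backend pairs with the one offset_j positions away (j ^ offset_j); the two exchange their
// partial sums via async copies and each adds the received data in place, so after all steps every
// participating backend holds the full reduction.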
for (; offset_j >= 1; offset_j /= 2) {
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
for (size_t j = 0; j < n_backends; j++) {
for (size_t j = 0; j < 2*offset_j_max; j++) {
const size_t j_other = j ^ offset_j;
if (j_other > j) {
if (j_other >= n_backends) {
continue;
}
auto & bcj1 = backend_ctx->backend_configs[j];
auto & bcj2 = backend_ctx->backend_configs[j_other];
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node1));
GGML_ASSERT(ggml_is_contiguous(node2));
// Tmp tensors to receive P2P copies
ggml_tensor * node_tmp_1 = get_node_aux(node1);
node_tmp_1->buffer = bcj1.buf.get();
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
ggml_tensor * node_tmp_2 = get_node_aux(node2);
node_tmp_2->buffer = bcj2.buf.get();
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
// 2 P2P copies: exchange full buffers
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
// Local ADD: node1 += tmp1 (in-place via view)
ggml_tensor * node_red_1 = get_node_aux(node1);
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
node_red_1->view_offs = node1->view_offs;
node_red_1->op = GGML_OP_ADD;
node_red_1->src[0] = node1;
node_red_1->src[1] = node_tmp_1;
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_1);
// Local ADD: node2 += tmp2 (in-place via view)
ggml_tensor * node_red_2 = get_node_aux(node2);
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
node_red_2->view_offs = node2->view_offs;
node_red_2->op = GGML_OP_ADD;
node_red_2->src[0] = node2;
node_red_2->src[1] = node_tmp_2;
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_2);
// Build 1-node cgraphs for the ADD ops
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
cgraph_aux_1->nodes[0] = node_red_1;
cgraph_aux_1->n_nodes = 1;
step_cgraphs[j] = cgraph_aux_1;
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
cgraph_aux_2->nodes[0] = node_red_2;
cgraph_aux_2->n_nodes = 1;
step_cgraphs[j_other] = cgraph_aux_2;
push_data(j, j_other, i_buf);
}
// Execute local ADDs for this step
for (size_t j = 0; j < n_backends; j++) {
for (size_t j = 0; j < 2*offset_j_max; j++) {
if (step_cgraphs[j] == nullptr) {
continue;
}
@@ -1855,12 +2027,25 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
return status;
}
}
i_buf++;
}
assert(i_buf == backend_ctx->n_reduce_steps);
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
auto & bcj_dst = backend_ctx->backend_configs[j];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
}
return GGML_STATUS_SUCCESS;
};
for (size_t i = 0; i < n_subgraphs; i++) {
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
@@ -1869,7 +2054,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
}
if (n_backends > 1 && i < n_subgraphs - 1) {
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
bool backend_allreduce_success = false;
if (backend_ctx->comm_ctx) {
std::vector<ggml_tensor *> nodes;

View File

@@ -1030,6 +1030,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
GGML_ABORT("%s: failed to initialize context\n", __func__);
}
graph->uid = ggml_graph_next_uid();
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
@@ -1477,6 +1479,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
assert(graph_copy->size > graph_copy->n_leafs);
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
}
// set ids for all splits
for (int i = 0; i < sched->n_splits; ++i) {
sched->splits[i].graph.uid = ggml_graph_next_uid();
}
}
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {

View File

@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int lda = KB * sizeof(TA);
//const int ldb = KB * sizeof(TB);
static thread_local packed_B_t Tile0[TILE_N * TILE_K];
static thread_local packed_B_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
// double buffering C to interleave avx512 and amx
int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int m1 = std::max(M - TILE_M, 0);
//const int lda = KB * sizeof(TA);
static thread_local int8_t Tile0[TILE_N * TILE_K];
static thread_local int8_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
// mat mul result for each group
static thread_local int32_t Tile4[TILE_M * TILE_N];
static thread_local int32_t Tile5[TILE_M * TILE_N];
static thread_local int32_t Tile6[TILE_M * TILE_N];
static thread_local int32_t Tile7[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];
// sum of each QK_K block, contains 8 groups, int32
static thread_local int32_t Sumi4[TILE_M * TILE_N];
static thread_local int32_t Sumi5[TILE_M * TILE_N];
static thread_local int32_t Sumi6[TILE_M * TILE_N];
static thread_local int32_t Sumi7[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];
const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
for (int i = 0; i < KB; ++i) {

View File

@@ -83,7 +83,6 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4

View File

@@ -151,8 +151,6 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
float sumf = 0.0f;
#if defined(__ARM_NEON)
float32x4_t sumv = vdupq_n_f32(0.0f);
@@ -212,31 +210,13 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
}
sumf = vaddvq_f32(sumv);
*s = vaddvq_f32(sumv);
#else
// Scalar fallback
for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);
// Process 4 Q8_0 blocks
for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
int sumi = 0;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi += xi * y[i*4 + k].qs[j];
}
sumf += d0 * d1 * sumi;
}
}
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
*s = sumf;
}

File diff suppressed because it is too large

View File

@@ -274,6 +274,18 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
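// Expand 16 packed bits into a 16-byte mask: byte i is 0xFF when bit i of the input is set, 0x00 otherwise.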
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
uint16_t x16;
memcpy(&x16, x, sizeof(uint16_t));
const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
bytes = _mm_or_si128(bytes, bit_mask);
return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
}
// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
@@ -540,6 +552,152 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0;
const int nb = n / qk;
assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__AVX2__)
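// q1_0 packs one sign bit per weight (bit set -> +1, clear -> -1). For every group of 32 q8 values the
// matching 4 sign bytes are broadcast across the register and tested against bit_masks to form a per-byte
// mask; (q XOR mask) - mask then conditionally negates q8 before the horizontal sums and scaling.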
const __m256i ones_8 = _mm256_set1_epi8(1);
const __m256i ones_16 = _mm256_set1_epi16(1);
const __m256i byte_shuf = _mm256_setr_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
const __m256i bit_masks = _mm256_setr_epi8(
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
const __m256i zero = _mm256_setzero_si256();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
{
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
}
for (int K = 1; K < 4; ++K) {
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
}
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}
*s = hsum_float_8(acc);
#elif defined(__AVX__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
{
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
}
for (int K = 1; K < 4; ++K) {
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
}
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
}
*s = hsum_float_8(acc);
#elif defined(__SSSE3__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128 acc_0 = _mm_setzero_ps();
__m128 acc_1 = _mm_setzero_ps();
__m128 acc_2 = _mm_setzero_ps();
__m128 acc_3 = _mm_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
{ \
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
}
Q1_SSSE3_BLOCK(0, 0, acc_0)
Q1_SSSE3_BLOCK(4, 1, acc_1)
Q1_SSSE3_BLOCK(8, 2, acc_2)
Q1_SSSE3_BLOCK(12, 3, acc_3)
#undef Q1_SSSE3_BLOCK
}
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
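
The SSE/SSSE3 and AVX paths above apply the sign bits without branches: comparing the expanded bit bytes against zero yields a per-lane mask of either 0x00 or 0xFF, and (q XOR mask) - mask then either leaves the q8 value unchanged or negates it. A minimal scalar sketch of that conditional-negation identity, independent of the intrinsics and meant only as an illustration:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Scalar model of _mm_cmpeq_epi8 / _mm_xor_si128 / _mm_sub_epi8 per byte:
// if the quant bit is 0 the mask is all ones (-1) and y is negated,
// if the bit is 1 the mask is 0 and y passes through unchanged.
static int8_t apply_sign(uint8_t bit, int8_t y) {
    const int8_t mask = (bit == 0) ? -1 : 0;   // _mm_cmpeq_epi8(bit, 0)
    return (int8_t)((y ^ mask) - mask);        // xor + sub == conditional negate
}

int main() {
    for (int bit = 0; bit <= 1; ++bit) {
        for (int y = -128; y <= 127; ++y) {
            const int expected = bit ? y : -y;
            // note: negating -128 wraps the same way in both forms,
            // matching the modular behaviour of _mm_sub_epi8
            assert(apply_sign((uint8_t) bit, (int8_t) y) == (int8_t) expected);
        }
    }
    printf("conditional negation identity holds for all int8 inputs\n");
    return 0;
}
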
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -2142,9 +2300,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#if defined __AVX2__
const __m256i m4 = _mm256_set1_epi8(0xF);
const __m256i m2 = _mm256_set1_epi8(3);
const __m256i m32s = _mm256_set1_epi8(32);
const __m256i m3 = _mm256_set1_epi8(3);
const __m256i m15 = _mm256_set1_epi8(15);
__m256 acc = _mm256_setzero_ps();
@@ -2156,53 +2313,45 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m256i scales_16 = _mm256_cvtepi8_epi16(scales);
const __m256i q8sclsub = _mm256_slli_epi32(_mm256_madd_epi16(q8sums, scales_16), 5);
__m256i sumi = _mm256_setzero_si256();
int is = 0;
for (int j = 0; j < QK_K/128; ++j) {
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m3), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(12)), 2);
const __m256i q4h_2 = _mm256_and_si256(q4bitsH, _mm256_set1_epi8(48));
const __m256i q4h_3 = _mm256_srli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(-64)), 2);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m15), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m15), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m15), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m15), q4h_3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
__m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
__m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
__m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
__m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
@@ -2214,6 +2363,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumi = _mm256_sub_epi32(sumi, q8sclsub);
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
}
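
A hedged note on the q8sclsub term above: Q6_K entries decode as (low nibble | high 2 bits << 4) - 32, and the constant -32 can be factored out of the dot product as 32 * sum(q8), which is what multiplying the q8 block sums by the scales and shifting left by 5 precomputes once per super-block. A small standalone check of that algebra (plain C++, per-block scale omitted, illustrative values only):

#include <cstdio>
#include <random>

int main() {
    std::mt19937 rng(0);
    std::uniform_int_distribution<int> dist6(0, 63), dist8(-128, 127);

    const int n = 128;
    int direct       = 0;   // sum((q6 - 32) * q8), the mathematically defined dot product
    int unsigned_dot = 0;   // sum(q6 * q8) with q6 kept unsigned, as the maddubs path computes it
    int bsum         = 0;   // sum(q8), the per-block "bsums" value
    for (int i = 0; i < n; ++i) {
        const int q6 = dist6(rng);   // stands in for (ql & 0xF) | ((qh & 3) << 4)
        const int q8 = dist8(rng);
        direct       += (q6 - 32) * q8;
        unsigned_dot += q6 * q8;
        bsum         += q8;
    }
    // the correction applied once per block: unsigned dot minus 32 * sum(q8)
    const int corrected = unsigned_dot - (bsum << 5);   // << 5 == * 32
    printf("direct=%d corrected=%d\n", direct, corrected);
    return direct == corrected ? 0 : 1;
}
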

View File

@@ -137,22 +137,28 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
float sumi = 0.0f;
for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
int sumi_block = 0;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
const int8_t * GGML_RESTRICT qy = yb->qs;
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi_block += xi * y[i*4 + k].qs[j];
for (int b = 0; b < 4; ++b, qy += 8) {
const unsigned mask = bits[b];
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+ ((mask & 0x02) ? qy[1] : -qy[1])
+ ((mask & 0x04) ? qy[2] : -qy[2])
+ ((mask & 0x08) ? qy[3] : -qy[3])
+ ((mask & 0x10) ? qy[4] : -qy[4])
+ ((mask & 0x20) ? qy[5] : -qy[5])
+ ((mask & 0x40) ? qy[6] : -qy[6])
+ ((mask & 0x80) ? qy[7] : -qy[7]);
}
sumi += d1 * sumi_block;
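
For a single mask byte the unrolled selects above reduce to plain signed addition; a concrete worked example (illustrative values only):

#include <cstdio>

int main() {
    // mask byte 0xB2 = 0b10110010: bits 1, 4, 5 and 7 are set
    const unsigned mask = 0xB2;
    const int qy[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int sum = 0;
    for (int j = 0; j < 8; ++j) {
        sum += (mask & (1u << j)) ? qy[j] : -qy[j];
    }
    // -1 + 2 - 3 - 4 + 5 + 6 - 7 + 8 = 6
    printf("sum = %d\n", sum);
    return 0;
}
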

View File

@@ -109,6 +109,96 @@ static void simd_gemm(
C += N;
}
}
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
template <int RM>
static inline void rvv_simd_gemm_ukernel(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, size_t vl)
{
static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");
vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);
for (int kk = 0; kk < K; kk++) {
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);
acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
}
__riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
}
template <int RM>
static inline void rvv_simd_gemm_dispatch_tail(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, int KN, int remaining_rows)
{
if constexpr (RM > 0) {
if (remaining_rows == RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
}
} else {
rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
}
}
}
static constexpr int GEMM_RM = 7;
// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int K, int N)
{
const int KN = (int)__riscv_vlenb();
int64_t ii = 0;
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
}
A += GEMM_RM * K;
C += GEMM_RM * N;
}
int remaining_rows = M - ii;
rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
}
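
For reference, the contract the RVV microkernel tiles here is row-major C[M x N] += A[M x K] * B[K x N], with N as the contiguous stride of both B and C. A naive scalar version, useful only as a sketch to check the vector path against, not part of the backend:

#include <cstdio>
#include <vector>

// naive reference: C[M x N] += A[M x K] * B[K x N], all row-major
static void gemm_ref(float * C, const float * A, const float * B, int M, int K, int N) {
    for (int i = 0; i < M; ++i) {
        for (int kk = 0; kk < K; ++kk) {
            const float a = A[i * K + kk];
            for (int j = 0; j < N; ++j) {
                C[i * N + j] += a * B[kk * N + j];
            }
        }
    }
}

int main() {
    const int M = 9, K = 5, N = 13;   // deliberately not multiples of any tile size
    std::vector<float> A(M * K), B(K * N), C(M * N, 0.0f);
    for (int i = 0; i < M * K; ++i) A[i] = 0.01f * i;
    for (int i = 0; i < K * N; ++i) B[i] = 0.02f * i;
    gemm_ref(C.data(), A.data(), B.data(), M, K, N);
    printf("C[0]=%f C[last]=%f\n", C[0], C[M * N - 1]);
    return 0;
}
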
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop

View File

@@ -1036,12 +1036,12 @@ inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
// const uint16_t * i16 = (const uint16_t *) x;
// for (int i = 0; i < n; ++i) {
// y[i] = ggml_table_gelu_quick_f16[i16[i]];
// }
//}
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
y[i] = ggml_table_gelu_quick_f16[i16[i]];
}
}
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
@@ -1060,13 +1060,6 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
}
#endif
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
}
}
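
The table variant re-enabled above trades one 128 KiB lookup table, indexed by the raw fp16 bit pattern, for the per-element exp call. A rough standalone sketch of that precompute-then-index pattern, assuming GELU_QUICK_COEF is -1.702f and a compiler with the _Float16 extension; the real table (ggml_table_gelu_quick_f16) is presumably filled once at init time rather than per call:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// assumes the _Float16 compiler extension; the upstream code uses ggml_fp16_t instead
static float gelu_quick_ref(float x) {
    return x / (1.0f + std::exp(-1.702f * x));   // assuming GELU_QUICK_COEF == -1.702f
}

int main() {
    // one entry per possible fp16 bit pattern: 65536 * 2 bytes = 128 KiB
    std::vector<_Float16> table(1u << 16);
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        const uint16_t bits = (uint16_t) i;
        _Float16 h;
        std::memcpy(&h, &bits, sizeof(h));
        table[i] = (_Float16) gelu_quick_ref((float) h);
    }
    // applying the op is then a single indexed load per element
    const _Float16 x = (_Float16) 0.5f;
    uint16_t idx;
    std::memcpy(&idx, &x, sizeof(idx));
    printf("gelu_quick(0.5) ~ %f (table) vs %f (direct)\n", (float) table[idx], gelu_quick_ref(0.5f));
    return 0;
}
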
// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));

View File

@@ -269,10 +269,6 @@ static const char * cu_get_error_str(CUresult err) {
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#if defined(TURING_MMA_AVAILABLE)
#define LDMATRIX_TRANS_AVAILABLE
#endif // defined(TURING_MMA_AVAILABLE)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
@@ -1186,6 +1182,8 @@ struct ggml_cuda_graph {
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
uint64_t uid = 0;
int64_t last_used_time = 0;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
@@ -1367,12 +1365,28 @@ struct ggml_backend_cuda_context {
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
int64_t last_graph_eviction_sweep = 0;
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
const int64_t time_now = ggml_time_us();
// sweep every 5s, evicting cuda graphs unused for >=10s
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
last_graph_eviction_sweep = time_now;
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
if (time_now - it->second->last_used_time >= 10'000'000) {
it = cuda_graphs.erase(it);
} else {
++it;
}
}
}
auto it = cuda_graphs.find(first_node_ptr);
if (it == cuda_graphs.end()) {
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
return cuda_graphs[first_node_ptr].get();
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
}
it->second->last_used_time = time_now;
return it->second.get();
}

View File

@@ -1,96 +1,79 @@
#include "concat.cuh"
// contiguous kernels
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
const int64_t n = ne0 * ne1 * ne2;
if (nidx < ne00) { // src0
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
(nidx - ne00) +
blockIdx.y * (ne0 - ne00) +
blockIdx.z * (ne0 - ne00) * gridDim.y;
dst[offset_dst] = y[offset_src];
for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
if constexpr (dim == 0) {
const int64_t row = i / ne0;
const int64_t i0 = i - row * ne0;
if (i0 < ne00) {
dst[i] = x[row * ne00 + i0];
} else {
dst[i] = y[row * (ne0 - ne00) + (i0 - ne00)];
}
} else if constexpr (dim == 1) {
const int64_t dst_plane = ne0 * ne1;
const int64_t src0_plane = ne0 * ne01;
const int64_t src1_plane = dst_plane - src0_plane;
const int64_t i2 = i / dst_plane;
const int64_t i01 = i - i2 * dst_plane;
if (i01 < src0_plane) {
dst[i] = x[i2 * src0_plane + i01];
} else {
dst[i] = y[i2 * src1_plane + (i01 - src0_plane)];
}
} else {
const int64_t src0_size = ne0 * ne1 * ne02;
if (i < src0_size) {
dst[i] = x[i];
} else {
dst[i] = y[i - src0_size];
}
}
}
}
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
static void concat_f32_cuda(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int dim,
cudaStream_t stream) {
const int64_t n = ne0 * ne1 * ne2;
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.y < (unsigned)ne01) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * ne01;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
(blockIdx.y - ne01) * ne0 +
blockIdx.z * ne0 * (gridDim.y - ne01);
dst[offset_dst] = y[offset_src];
}
}
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.z < (unsigned)ne02) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
blockIdx.y * ne0 +
(blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2);
if (dim == 0) {
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
concat_f32_cont<0>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
if (dim == 1) {
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
concat_f32_cont<1>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
}
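
The per-dim index arithmetic in concat_f32_cont is easiest to sanity-check against a plain host loop; a rough CPU reference for the dim == 1 case using the same plane decomposition (illustrative only, not part of the backend):

#include <cstdint>
#include <cstdio>
#include <vector>

// reference concat along dim 1: dst[ne0 x ne1 x ne2] from x[ne0 x ne01 x ne2] and y[ne0 x (ne1 - ne01) x ne2]
static void concat_dim1_ref(const float * x, const float * y, float * dst,
                            int64_t ne0, int64_t ne01, int64_t ne1, int64_t ne2) {
    const int64_t dst_plane  = ne0 * ne1;
    const int64_t src0_plane = ne0 * ne01;
    const int64_t src1_plane = dst_plane - src0_plane;
    for (int64_t i = 0; i < ne0 * ne1 * ne2; ++i) {
        const int64_t i2  = i / dst_plane;        // which ne2 slice
        const int64_t i01 = i - i2 * dst_plane;   // flat (i1, i0) offset inside the slice
        dst[i] = (i01 < src0_plane) ? x[i2 * src0_plane + i01]
                                    : y[i2 * src1_plane + (i01 - src0_plane)];
    }
}

int main() {
    const int64_t ne0 = 3, ne01 = 2, ne1 = 5, ne2 = 2;   // y contributes ne1 - ne01 = 3 rows per slice
    std::vector<float> x(ne0 * ne01 * ne2), y(ne0 * (ne1 - ne01) * ne2), dst(ne0 * ne1 * ne2);
    for (size_t i = 0; i < x.size(); ++i) x[i] = 100.0f + i;
    for (size_t i = 0; i < y.size(); ++i) y[i] = 200.0f + i;
    concat_dim1_ref(x.data(), y.data(), dst.data(), ne0, ne01, ne1, ne2);
    printf("dst[0]=%g dst[%lld]=%g\n", dst[0], (long long) (ne0 * ne01), dst[ne0 * ne01]);
    return 0;
}
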
// non-contiguous kernel (slow)

View File

@@ -305,12 +305,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
// The minimum granularity is 16 bytes.
constexpr int h2_per_chunk = 16/sizeof(half2);
const int chunks_per_row = D2 / h2_per_chunk;
if constexpr (use_cp_async) {
static_assert(warp_size == 32, "bad warp_size");
static_assert(!oob_check, "OOB check not compatible with cp_async");
constexpr int preload = 64;
constexpr int h2_per_chunk = 16/sizeof(half2);
const int chunks_per_row = D2 / h2_per_chunk;
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
@@ -348,11 +349,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
// 6: max 1*16= 16 bytes, 8 half
ggml_cuda_unroll<6>{}(load);
} else {
// TODO use ggml_cuda_memcpy_1
const half2 zero[4] = {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}};
auto load = [&] __device__ (const int n) {
const int stride_k = warp_size >> n;
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
const int k0_stop = D2 - D2 % (1*stride_k);
const int stride_k = 32 >> n;
const int k0_start = stride_k == 32 ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
const int stride_i = warp_size / stride_k;
if (k0_start == k0_stop) {
@@ -371,15 +372,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
ggml_cuda_memcpy_1<16>(tile_KV + i*stride_tile + k*4,
!oob_check || i < i_sup ? KV + i*stride_KV + k*h2_per_chunk : zero);
}
}
};
// 1: max 32* 4=128 bytes, 64 half
// 2: max 16* 4= 64 bytes, 32 half
// 3: max 8* 4= 32 bytes, 16 half
// 4: max 4* 4= 16 bytes, 8 half
ggml_cuda_unroll<4>{}(load);
// 1: max 32*16=512 bytes, 256 half
// 2: max 16*16=256 bytes, 128 half
// 3: max 8*16=128 bytes, 64 half
// 4: max 4*16= 64 bytes, 32 half
// 5: max 2*16= 32 bytes, 16 half
// 6: max 1*16= 16 bytes, 8 half
ggml_cuda_unroll<6>{}(load);
}
}
@@ -862,11 +866,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
T_A_VKQ A_identity;
make_identity_mat(A_identity);
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
#pragma unroll
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
@@ -897,29 +896,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
#if defined(LDMATRIX_TRANS_AVAILABLE)
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
#elif defined(AMD_MFMA_AVAILABLE)
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
// Load with transposed addressing: 4 strided half loads.
{
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
const half * xs0_h = (const half *) xs0;
const int stride_h = stride_tile_V * 2; // stride in half units
half * A_h = (half *) A.x;
#pragma unroll
for (int l = 0; l < 4; ++l) {
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
}
}
#else
// TODO: Try to transpose tile_V when loading gmem to smem.
// Use mma to transpose T_A_VKQ for RDNA.
T_A_VKQ A_trans;
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
mma(A, A_trans, A_identity);
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
if constexpr (T_B_KQ::I == 8) {
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
} else {

View File

@@ -368,15 +368,21 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}
~ggml_cuda_pool_leg() {
clear_pool();
GGML_ASSERT(pool_size == 0);
}
void clear_pool() {
ggml_cuda_set_device(device);
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
CUDA_CHECK(cudaFree(b.ptr));
pool_size -= b.size;
b.ptr = nullptr;
b.size = 0;
}
}
GGML_ASSERT(pool_size == 0);
}
void * alloc(size_t size, size_t * actual_size) override {
@@ -421,7 +427,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
ggml_cuda_set_device(device);
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaErrorMemoryAllocation) {
(void)cudaGetLastError();
const size_t cached_bytes = pool_size;
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
CUDA_CHECK(cudaDeviceSynchronize());
clear_pool();
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaSuccess) {
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
}
}
CUDA_CHECK(err);
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
@@ -1203,6 +1222,13 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
for (size_t i = 0; i < n_backends; ++i) {
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
ggml_cuda_set_device(cuda_ctx->device);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
}
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
@@ -1224,7 +1250,11 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
tmp[i].alloc(ne);
ggml_cuda_set_device(cuda_ctx->device);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
} else {
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
}
CUDA_CHECK(cudaGetLastError());
}
@@ -3108,6 +3138,15 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (cgraph->uid != 0 &&
cgraph->uid == graph->uid) {
GGML_LOG_DEBUG("CUDA Graph id %zu reused\n", cgraph->uid);
GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes);
return false;
}
graph->uid = cgraph->uid;
// Check if the graph size has changed
if ((int)graph->node_props.size() != cgraph->n_nodes) {
res = true;
@@ -3553,6 +3592,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return true;
}
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
const ggml_tensor * unary = cgraph->nodes[node_idx];
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
return false;
}
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
return false;
}
if (unary->type != sqr->type) {
return false;
}
if (!ggml_is_contiguous(unary->src[0])) {
return false;
}
return true;
}
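
The GGML_OP_UNARY(RELU) + GGML_OP_SQR pair accepted here computes max(x, 0)^2 elementwise in one pass; the fused kernel invoked further down (ggml_cuda_op_relu_sqr) is not shown in this diff, so the following is only a scalar model of what the fusion evaluates:

#include <algorithm>
#include <cstdio>

// fused form: one pass, no intermediate relu(x) tensor materialized
static float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    const float xs[4] = {-2.0f, -0.5f, 0.5f, 2.0f};
    for (float x : xs) {
        printf("relu_sqr(%g) = %g\n", x, relu_sqr(x));
    }
    return 0;
}
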
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
const ggml_tensor *scale = cgraph->nodes[node_idx];
@@ -4061,6 +4124,12 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
i += 2;
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
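
Similarly, the SCALE + TANH + SCALE chain routed to ggml_cuda_op_softcap implements logit soft-capping, y = s1 * tanh(s0 * x). A scalar sketch of that composite; the cap value below is a made-up example, not taken from this diff:

#include <cmath>
#include <cstdio>

// softcap as the composition scale(s0) -> tanh -> scale(s1)
static float softcap(float x, float s0, float s1) {
    return s1 * std::tanh(s0 * x);
}

int main() {
    const float cap = 30.0f;   // example cap; with s0 = 1/cap and s1 = cap the output stays in (-cap, cap)
    const float xs[5] = {-100.0f, -10.0f, 0.0f, 10.0f, 100.0f};
    for (float x : xs) {
        printf("softcap(%g) = %g\n", x, softcap(x, 1.0f / cap, cap));
    }
    return 0;
}
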

View File

@@ -86,17 +86,12 @@ namespace ggml_cuda_mma {
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
static constexpr bool is_i_major(const data_layout dl) {
return dl == DATA_LAYOUT_I_MAJOR ||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
}
static constexpr __device__ data_layout get_input_data_layout() {
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
return DATA_LAYOUT_I_MAJOR_MIRRORED;
#else
return DATA_LAYOUT_I_MAJOR;
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
}
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
@@ -113,7 +108,6 @@ namespace ggml_cuda_mma {
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 64 && J == 2) return true;
if (I == 16 && J == 8) return true;
if (I == 32 && J == 4) return true;
if (I == 16 && J == 16) return true;
@@ -122,7 +116,7 @@ namespace ggml_cuda_mma {
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
if constexpr (I == 16 && J == 4) {
return threadIdx.x % 16;
} else if constexpr (I == 16 && J == 8) {
return threadIdx.x % 16;
@@ -139,8 +133,8 @@ namespace ggml_cuda_mma {
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
return (2 * ((threadIdx.x / 16) % 2) + l);
if constexpr (I == 16 && J == 4) {
return threadIdx.x / 16;
} else if constexpr (I == 16 && J == 8) {
return 2 * (threadIdx.x / 16) + l;
} else if constexpr (I == 32 && J == 4) {
@@ -154,7 +148,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#elif defined(VOLTA_MMA_AVAILABLE)
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -283,7 +277,7 @@ namespace ggml_cuda_mma {
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
@@ -407,7 +401,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
};
template <int I_, int J_>
@@ -701,57 +695,12 @@ namespace ggml_cuda_mma {
}
#endif // defined(TURING_MMA_AVAILABLE)
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
#if defined(RDNA4)
const int row = t.get_i(0);
const int left_right = t.get_j(0) / 4;
const int up_down = row / 8;
const int idx = row % 8;
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
#else
GGML_UNUSED_VARS(t);
NO_DEVICE_CODE;
#endif // defined(RDNA4)
}
template <int I, int J, typename T, data_layout dl>
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE)
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
} else {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
#elif defined(AMD_WMMA_AVAILABLE)
// All wmma layout has contiguous data when i-major.
if constexpr (is_i_major(dl)) {
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
#pragma unroll
for (int i = 0; i < aligned_copy_count; ++i) {
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
}
} else {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
} else {
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
}
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
#endif // defined(AMD_MFMA_AVAILABLE)
}
template <typename T>
@@ -764,26 +713,37 @@ namespace ggml_cuda_mma {
: "=r"(xi[0]), "=r"(xi[1])
: "l"(xs));
#else
load_generic(t, xs0, stride);
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
template <typename T>
template <typename T, data_layout dl>
static __device__ __forceinline__ void load_ldmatrix(
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
tile<16, 4, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#ifdef TURING_MMA_AVAILABLE
int * xi = (int *) t.x;
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "=r"(xi[0]), "=r"(xi[1])
: "l"(xs));
#elif defined(AMD_WMMA_AVAILABLE)
#ifdef RDNA3
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
static_assert(sizeof(t.x) == 16, "bad ne");
ggml_cuda_memcpy_1<8>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<8>(t.x + 2, xs0 + t.get_i(0)*stride + 2);
#else
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
static_assert(sizeof(t.x) == 8, "bad ne");
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#endif // RDNA3
#elif defined(AMD_MFMA_AVAILABLE)
static_assert(sizeof(t.x) == 4, "bad ne");
ggml_cuda_memcpy_1<4>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
}
@@ -796,19 +756,26 @@ namespace ggml_cuda_mma {
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
: "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if 1
// TODO: more generic handling
static_assert(sizeof(T) == 4, "bad type size");
#elif defined(VOLTA_MMA_AVAILABLE)
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
#elif defined(AMD_WMMA_AVAILABLE)
#ifdef RDNA3
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
static_assert(sizeof(t.x) == 32, "bad ne");
ggml_cuda_memcpy_1<16>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<16>(t.x + 4, xs0 + t.get_i(0)*stride + 4);
#else
load_generic(t, xs0, stride);
#endif // 1
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
static_assert(sizeof(t.x) == 16, "bad ne");
ggml_cuda_memcpy_1<16>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#endif // RDNA3
#elif defined(AMD_MFMA_AVAILABLE)
static_assert(sizeof(t.x) == 8, "bad ne");
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
@@ -827,23 +794,30 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void load_ldmatrix(
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
#else
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
template <typename T>
static __device__ __forceinline__ void load_ldmatrix_trans(
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
#ifdef TURING_MMA_AVAILABLE
int * xi = (int * ) t.x;
int * xi = (int *) t.x;
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
: "l"(xs));
#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
half * xh = (half *) t.x;
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
}
#else
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
@@ -1218,73 +1192,27 @@ namespace ggml_cuda_mma {
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
B.x[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
B.x[1],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], true);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[1], true, b_vec[1], acc[0], true);
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], true);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[1], true, b_vec[1], acc[0], true);
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -1297,19 +1225,10 @@ namespace ggml_cuda_mma {
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
int32x16_t * acc = (int32x16_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
B.x[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
B.x[1],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#else
@@ -1329,7 +1248,7 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void mma(
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
@@ -1344,12 +1263,12 @@ namespace ggml_cuda_mma {
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
static __device__ __forceinline__ void mma(
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
@@ -1364,41 +1283,35 @@ namespace ggml_cuda_mma {
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
template <data_layout dl_d, data_layout dl_ab>
static __device__ __forceinline__ void mma(
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
const int64_t xA = uint32_t(A.x[0]);
const int64_t xB = uint32_t(B.x[0]);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(xA, xB, acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], false);
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], false);
#endif // RDNA4
#else
GGML_UNUSED(D);

View File

@@ -104,7 +104,7 @@ struct tile_x_sizes {
};
static int get_mmq_x_max_host(const int cc) {
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
return (turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
#ifdef GGML_CUDA_FORCE_MMQ
128 : 64;
@@ -114,9 +114,9 @@ static int get_mmq_x_max_host(const int cc) {
}
static constexpr __device__ int get_mmq_x_max_device() {
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
return 128;
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
#else // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(GGML_USE_HIP)
return 64;
@@ -1054,13 +1054,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
float dB;
const int j = j0 + tile_C::get_j(0);
@@ -1295,13 +1295,13 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
@@ -1435,57 +1435,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@@ -1510,13 +1460,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
@@ -1742,74 +1692,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2;
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
tile_C Cm;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tile_A A1;
A1.x[0] = 0x01010101;
A1.x[1] = 0x01010101;
mma(Cm, A1, B[0]);
}
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C Cd;
mma(Cd, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
float tmp = Cd.x[l]*dm.x;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tmp -= Cm.x[l]*dm.y;
}
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@@ -1834,13 +1717,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
@@ -2573,59 +2456,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@@ -2651,13 +2482,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
@@ -3647,10 +3478,10 @@ template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q(
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ncols_max) {
const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 ntx) {
// Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -3664,8 +3495,7 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
@@ -3686,8 +3516,9 @@ static __global__ void mul_mat_q(
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{
const int wt = blockIdx.z / nchannels_y;
const int zt = blockIdx.z - wt*nchannels_y;
const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
const int wt = tmp2.x;
const int zt = tmp2.y;
const int jt = blockIdx.y;
const int it = blockIdx.x;
@@ -3730,40 +3561,40 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int blocks_per_iter = ITER_K / qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int kbc = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;
// kb0 == k index when doing the matrix multiplication for an output tile.
int kb0_start = kbc % blocks_per_ne00;
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int kb0_start = fastmodulo(kbc, blocks_per_ne00);
int kb0_stop = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3781,11 +3612,11 @@ static __global__ void mul_mat_q(
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kb0_start = 0;
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
continue;
}
@@ -3810,32 +3641,34 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kb0_start = 0;
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
}
if (kbc >= kbc_stop) {
return;
}
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3877,7 +3710,7 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
@@ -3886,46 +3719,37 @@ static __global__ void mul_mat_q(
}
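The loop above recovers the tile coordinates (it, wt, zt, jt) from the linear stream-k index kbc with a chain of fastdiv / fast_div_modulo calls instead of plain divisions. A minimal host-side sketch of the same decomposition, using ordinary div/mod and illustrative names (decompose_kbc, TileCoords are not part of ggml), to make the index layout explicit:

#include <cassert>
#include <cstdint>

// Illustrative only: mirrors the div/mod chain in mul_mat_q on the host.
struct TileCoords { int it, wt, zt, jt, kb0; };

static TileCoords decompose_kbc(int64_t kbc, int nsamples_y, int nchannels_y, int ntx, int blocks_per_ne00) {
    TileCoords c;
    c.kb0 = int(kbc % blocks_per_ne00);
    int64_t tmp = kbc / blocks_per_ne00;                  // fastdiv(kbc, blocks_per_ne00)
    c.jt = int(tmp % ntx);         tmp /= ntx;            // fast_div_modulo(tmp, ntx)
    c.zt = int(tmp % nchannels_y); tmp /= nchannels_y;    // fast_div_modulo(tmp, nchannels_y)
    c.wt = int(tmp % nsamples_y);                         // fast_div_modulo(tmp, nsamples_y)
    c.it = int(tmp / nsamples_y);
    return c;
}

int main() {
    // Re-compose and check the round-trip for a sample index.
    const int nsamples_y = 2, nchannels_y = 3, ntx = 4, blocks = 5;
    const int64_t kbc = 123;
    const TileCoords c = decompose_kbc(kbc, nsamples_y, nchannels_y, ntx, blocks);
    const int64_t back = (((int64_t(c.it)*nsamples_y + c.wt)*nchannels_y + c.zt)*ntx + c.jt)*blocks + c.kb0;
    assert(back == kbc);
    return 0;
}

The round-trip in main re-composes the index in the order it > wt > zt > jt > kb0, which matches the strides used above when computing offset_x and offset_dst.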
template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int32_t * expert_bounds,
float * __restrict__ dst,
const float * __restrict__ tmp_last_tile,
const int ncols_x,
const int nrows_x,
const int ncols_dst,
const size_t stride_col_dst,
const int nchannels_y,
const size_t stride_channel_dst,
const int nsamples_y,
const size_t stride_sample_dst,
const int ncols_max) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
const int stride_sample_dst, const uint3 ntx) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
constexpr int blocks_per_iter = ITER_K / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int nwarps = mmq_get_nwarps_device()/2;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
float sum[mmq_x / nwarps] = {0.0f};
const int i = blockIdx.y*warp_size + threadIdx.x;
const int ntx = (ncols_max + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int kbc0 = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
kbc0 -= fastmodulo(kbc0, blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;
const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
const bool did_not_write_last = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
return;
}
@@ -3934,11 +3758,11 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
// All CUDA blocks that get here must have a previous block that needs a fixup.
int64_t bidx = bidx0 - 1;
int64_t kbc_stop = kbc0;
int bidx = bidx0 - 1;
int kbc_stop = kbc0;
while(true) {
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
if (kbc == kbc_stop) { // Did not have any data.
bidx--;
@@ -3948,20 +3772,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
any_fixup = true;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
// If this block started in a previous tile we are done and don't need to combine additional partial results.
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
break;
}
bidx--;
@@ -3972,14 +3792,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
int tmp = kbc0;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int tmp = fastdiv(kbc0, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
if (!ids_dst) {
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
@@ -3987,6 +3809,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_dst - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3996,16 +3821,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
dst[j*stride_col_dst + i] += sum[j0/nwarps];
}
return;
}
@@ -4025,6 +3841,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = col_diff - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -4034,16 +3853,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
}
}
@@ -4091,29 +3901,44 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int channel_ratio = args.nchannels_y / args.nchannels_x;
const int sample_ratio = args.nsamples_y / args.nsamples_x;
const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
const uint3 ntx_fd = init_fastdiv_values(ntx);
const uint3 nchannels_y_fd = init_fastdiv_values(args.nchannels_y);
const uint3 nsamples_y_fd = init_fastdiv_values(args.nsamples_y);
const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio);
const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio);
if (!args.use_stream_k) {
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
}
return;
}
const dim3 block_nums_stream_k(nsm, 1, 1);
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
// For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
// This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
const int ntiles_dst = ntx * nty * ntzw;
const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
@@ -4121,40 +3946,45 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
}
const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
}
}
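launch_mul_mat_q now packs blocks_per_ne00, ntx, nchannels_y, nsamples_y and the channel/sample ratios into uint3 fastdiv values, so the kernels can divide by these runtime constants with a multiply and a shift. A minimal host-side sketch of that multiply-and-shift scheme, assuming the <magic, shift, divisor> packing that init_fastdiv_values appears to produce (the names below are illustrative, not the actual ggml-cuda helpers):

#include <cassert>
#include <cstdint>

// Illustrative sketch of division by a runtime constant via multiply + shift.
struct fastdiv_vals { uint32_t mp, L, d; };  // magic, shift, original divisor

static fastdiv_vals make_fastdiv(uint32_t d) {
    uint32_t L = 0;
    while (L < 32 && (uint32_t(1) << L) < d) L++;
    // mp = floor(2^32 * (2^L - d) / d) + 1
    const uint32_t mp = uint32_t(((uint64_t(1) << 32) * ((uint64_t(1) << L) - d)) / d + 1);
    return { mp, L, d };
}

static uint32_t fastdiv_u32(uint32_t n, fastdiv_vals f) {
    const uint32_t hi = uint32_t((uint64_t(n) * f.mp) >> 32); // __umulhi(n, mp) on the device
    return uint32_t((uint64_t(hi) + n) >> f.L);
}

static uint32_t fastmod_u32(uint32_t n, fastdiv_vals f) {
    return n - fastdiv_u32(n, f) * f.d;
}

int main() {
    const fastdiv_vals f = make_fastdiv(7);
    for (uint32_t n = 0; n < 100000; n++) {
        assert(fastdiv_u32(n, f) == n / 7);
        assert(fastmod_u32(n, f) == n % 7);
    }
    return 0;
}

On the device the 32-bit high multiply would be __umulhi; doing the final add in 64 bits here just keeps the sketch overflow-free for any 32-bit n.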

View File

@@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) {
return x * x;
}
static __device__ __forceinline__ float op_relu_sqr(float x) {
const float r = fmaxf(x, 0.0f);
return r * r;
}
static __device__ __forceinline__ float op_sqrt(float x) {
return sqrtf(x);
}
@@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
GGML_ABORT("Unsupported unary op for fused unary+mul");
}
}
/* fused relu + sqr */
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
const ggml_tensor * src = relu_node->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src));
GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
GGML_ASSERT(src->type == sqr_node->type);
const int k = ggml_nelements(src);
if (src->type == GGML_TYPE_F16) {
unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
} else {
unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
}
}
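For reference, the fused relu + sqr path computes y = max(x, 0)^2 per element; a tiny host-side check with illustrative names only:

#include <algorithm>
#include <cassert>
#include <cmath>

// Host-side reference of the fused relu + sqr op: y = max(x, 0)^2.
static float relu_sqr_ref(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    assert(relu_sqr_ref(-2.0f) == 0.0f);  // negative inputs clamp to 0 before squaring
    assert(relu_sqr_ref( 3.0f) == 9.0f);
    assert(std::fabs(relu_sqr_ref(0.5f) - 0.25f) < 1e-6f);
    return 0;
}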

View File

@@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
return x / (1.0f + expf(-x));
}

View File

@@ -33,7 +33,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
@@ -59,6 +58,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags

View File

@@ -42,6 +42,7 @@
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags

View File

@@ -12,9 +12,12 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <sstream>
#include <iomanip>
#include <unordered_set>
#include <unordered_map>
#include <regex>
#include <queue>
#ifdef _WIN32
# include <sal.h>
@@ -41,18 +44,26 @@
#include "htp_iface.h"
#include "htp-drv.h"
using intvec = std::vector<int>;
using uintvec = std::vector<unsigned int>;
using u32vec = std::vector<uint32_t>;
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
static int opt_arch = 0; // autodetect
static int opt_etm = 0;
static int opt_verbose = 0;
static int opt_profile = 0;
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
static int opt_hostbuf = 1; // hostbuf ON by default
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
// Default PMU events, if profiling with PMU (mode=2) is enabled
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
// Enable all stages by default
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
static int opt_opbatch = 1024; // max number of ops in a batch
static int opt_opqueue = 16; // max number of pending batches
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
if (!opt_profile) return;
op_desc desc(op);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
char pmu_str[256] = "";
if (opt_profile > 1) {
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opshm;
struct ggml_hexagon_opqueue;
struct ggml_hexagon_session {
std::string name;
@@ -132,8 +150,8 @@ struct ggml_hexagon_session {
bool valid_iface;
std::atomic<int> op_pending;
ggml_hexagon_opbatch *op_batch;
ggml_hexagon_opshm *op_shm;
ggml_hexagon_opbatch* op_batch;
ggml_hexagon_opqueue* op_queue;
ggml_backend_buffer_type buffer_type = {};
ggml_backend_buffer_type repack_buffer_type = {};
@@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
// Backend session implementation
struct ggml_hexagon_opshm {
ggml_hexagon_shared_buffer *sbuf;
std::vector<bool> block_mask;
size_t block_size;
uint8_t * base() const { return this->sbuf->base; }
int fd() const { return this->sbuf->fd; }
size_t n_blocks() const { return this->block_mask.size(); }
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = max_batch;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
block_mask.resize(max_pending, true);
block_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops;
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
}
}
~ggml_hexagon_opshm() {
delete sbuf;
}
uint8_t * allocate() {
auto it = std::find(block_mask.begin(), block_mask.end(), true);
if (it == block_mask.end())
return nullptr;
unsigned int i = std::distance(block_mask.begin(), it);
uint8_t* addr = sbuf->base + (i * block_size);
block_mask[i] = false;
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
return addr;
}
void release(uint8_t * addr) {
int i = (addr - sbuf->base) / block_size;
block_mask[i] = true;
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
}
};
struct ggml_hexagon_opbatch {
const char* name;
ggml_hexagon_session* sess;
std::vector<htp_buf_desc> buffers;
std::vector<htp_tensor> tensors;
std::vector<htp_op_desc> ops;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
std::vector<htp_op_desc> h_ops; // htp op descriptors
std::unordered_map<int, int> b_map; // buffer fd to index
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
@@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
d_map.clear();
}
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
name = sess->c_name();
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
this->sess = sess;
n_bufs_max = HTP_OP_MAX_BUFS;
n_ops_max = max_batch;
n_ops_max = batch_size;
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
b_vmem_max = HTP_OP_MAX_VMEM;
buffers.resize(n_bufs_max);
tensors.resize(n_tens_max);
ops.resize(n_ops_max);
h_bufs.resize(n_bufs_max);
h_tens.resize(n_tens_max);
h_ops.resize(n_ops_max);
b_map.reserve(n_bufs_max);
t_map.reserve(n_tens_max);
d_map.reserve(n_tens_max);
@@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {
b_map.insert({sbuf->fd, bi});
htp_buf_desc &b = buffers[bi];
htp_buf_desc &b = h_bufs[bi];
b.base = (uint64_t) sbuf->base;
b.fd = sbuf->fd;
b.size = sbuf->size;
@@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
// First lookup by tensor data
auto range = d_map.equal_range(t->data);
for (auto it = range.first; it != range.second; ++it) {
htp_tensor * h = &tensors[it->second];
htp_tensor * h = &h_tens[it->second];
if (same_shape(h, t)) { return it->second; }
}
@@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
size_t t_size = ggml_nbytes(t);
htp_tensor &h = tensors[ti];
htp_tensor &h = h_tens[ti];
h.bi = add_buffer(sbuf);
h.data = t_offset;
h.size = t_size;
@@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
// Add new op
htp_op_desc &o = ops[n_ops++];
unsigned int n = n_ops++;
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
o.flags = 0;
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(name, t, o.flags);
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
}
o.dst = add_tensor(t);
}
};
size_t flush(uint8_t * mem_addr, size_t mem_size) {
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
struct ggml_hexagon_opqueue {
// Shared buffer for storing batches
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
const size_t t_size = sizeof(htp_tensor) * n_tens;
const size_t o_size = sizeof(htp_op_desc) * n_ops;
using opvec = std::vector<const ggml_tensor*>;
const size_t m_size = b_size + t_size + o_size;
GGML_ASSERT(m_size <= mem_size);
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
std::vector<uint64_t> start_usec; // per batch start time
uint8_t * b_ptr = (uint8_t *) mem_addr;
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = batch_size;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
memcpy(b_ptr, (void *) buffers.data(), b_size);
memcpy(t_ptr, (void *) tensors.data(), t_size);
memcpy(o_ptr, (void *) ops.data(), o_size);
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops +
sizeof(htp_prof_desc) * n_ops;
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
op_cache.resize(depth);
start_usec.resize(depth, 0);
// init done queue
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
}
}
~ggml_hexagon_opqueue() {
delete shm_buf;
}
// push new batch
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
if (done.empty()) { return false; }
req.id = done.front(); done.pop(); // batch id
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
op_cache[req.id] = op_batch->ops;
start_usec[req.id] = ggml_time_us();
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
dbuf.fd = shm_buf->fd;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
dbuf.size = b_size + t_size + o_size + p_size;
GGML_ASSERT(dbuf.size <= shm_blk_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
uint8_t * o_ptr = m_ptr;
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
b_size, t_size, o_size, (size_t) dbuf.size);
op_batch->reset();
if (opt_verbose > 1) {
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
for (unsigned int i=0; i < n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
for (unsigned int i=0; i < req.n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
}
htp_tensor *t = (htp_tensor*) t_ptr;
for (unsigned int i=0; i < n_tens; i++) {
for (unsigned int i=0; i < req.n_tensors; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
name, i, t[i].bi, t[i].data, t[i].size,
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
}
}
reset();
return true;
}
return m_size;
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
GGML_ASSERT(rsp.id < op_cache.size());
done.push(rsp.id);
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
const size_t m_size = b_size + t_size + o_size + p_size;
GGML_ASSERT(m_size <= shm_blk_size);
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
(size_t) dbuf.size, b_size, t_size, o_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
if (opt_profile && rsp.n_ops > 0) {
auto & ops = op_cache[rsp.id];
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
uint32_t htp_usec = 0;
GGML_ASSERT(rsp.n_ops <= ops.size());
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
for (uint32_t i = 0; i < rsp.n_ops; i++) {
htp_usec += pd[i].usecs;
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
}
}
};
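The new ggml_hexagon_opqueue hands out shared-memory blocks by batch id: free ids live in the done queue, push() takes one and pop() returns it once the response arrives. The same slot-pool pattern in isolation, with illustrative names:

#include <cassert>
#include <cstddef>
#include <optional>
#include <queue>

// Fixed-depth pool of reusable slot ids, as used for pending op batches.
class slot_pool {
  public:
    explicit slot_pool(size_t depth) {
        for (unsigned int i = 0; i < depth; i++) free_.push(i);
    }
    // Take a free slot; nullopt means the caller must drain completed work first.
    std::optional<unsigned int> acquire() {
        if (free_.empty()) return std::nullopt;
        const unsigned int id = free_.front();
        free_.pop();
        return id;
    }
    // Return a slot once the response for that batch id has been processed.
    void release(unsigned int id) { free_.push(id); }
  private:
    std::queue<unsigned int> free_;
};

int main() {
    slot_pool pool(2);
    const auto a = pool.acquire();
    const auto b = pool.acquire();
    assert(a && b);
    assert(!pool.acquire());      // depth exhausted -> caller flushes pending batches
    pool.release(*a);             // completion frees the slot for reuse
    assert(pool.acquire() == a);
    return 0;
}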
@@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
}
op_shm->release((uint8_t*) dbuf.ptr);
if (rsp.status != HTP_STATUS_OK) {
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
// TODO: handle errors
}
// FIXME: profile will be per opreq
// this->prof_usecs = rsp.prof_usecs;
// this->prof_cycles = rsp.prof_cycles;
// this->prof_pkts = rsp.prof_pkts;
op_queue->pop(rsp, dbuf);
this->op_pending--; // atomic dec
@@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
void ggml_hexagon_session::flush_batch() {
if (op_batch->empty()) { return; }
htp_opbatch_req req;
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
htp_opbatch_req req {};
dspqueue_buffer dbuf{};
dspqueue_buffer dbuf;
dbuf.fd = op_shm->fd();
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.ptr = op_shm->allocate();
if (!dbuf.ptr) {
if (!op_queue->push(req, dbuf, op_batch)) {
flush_pending(false);
dbuf.ptr = op_shm->allocate();
op_queue->push(req, dbuf, op_batch);
}
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
if (opt_etm) {
err = htp_iface_enable_etm(this->handle);
err = htp_iface_etm(this->handle, 1);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
// Start the DSP-side service. We need to pass the queue ID to the
// DSP in a FastRPC call; the DSP side will import the queue and start
// listening for packets in a callback.
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
}
}
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
// Start processing op batch requests
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
}
this->valid_iface = true;
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
}
void ggml_hexagon_session::release() noexcept(true) {
@@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
int err;
delete this->op_batch;
delete this->op_shm;
delete this->op_queue;
// Stop the DSP-side service and close the queue
if (this->valid_iface) {
@@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
}
if (opt_etm) {
err = htp_iface_disable_etm(this->handle);
err = htp_iface_etm(this->handle, 0);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
}
}
if (this->valid_queue) {
err = dspqueue_close(queue);
if (err != 0) {
@@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.device = dev;
op_batch = nullptr;
op_shm = nullptr;
op_queue = nullptr;
try {
allocate(dev_id);
@@ -2596,6 +2670,62 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
return true;
}
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * dst = op;
// diag only supports F32 currently
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
// Input must have ne[1] == 1 (vector input)
if (src0->ne[1] != 1) {
return false;
}
// Output must be square in first two dimensions
if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0]; // A
const struct ggml_tensor * src1 = op->src[1]; // B
const struct ggml_tensor * dst = op; // X
if (!src0 || !src1) {
return false;
}
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (src0->ne[0] != src0->ne[1]) {
return false;
}
if (src0->ne[1] != src1->ne[1]) {
return false;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return false;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
return sess->c_name();
@@ -2632,7 +2762,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
case GGML_OP_ROPE: return HTP_OP_ROPE;
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
case GGML_OP_FILL: return HTP_OP_FILL;
case GGML_OP_DIAG: return HTP_OP_DIAG;
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(t)) {
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
@@ -2673,7 +2805,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n)) {
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
sess->enqueue_op(op_remap_to_htp(n), n);
}
}
@@ -3029,6 +3161,17 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
return true;
}
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * dst = op;
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
@@ -3159,6 +3302,18 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
supp = ggml_hexagon_supported_cumsum(sess, op);
break;
case GGML_OP_FILL:
supp = ggml_hexagon_supported_fill(sess, op);
break;
case GGML_OP_DIAG:
supp = ggml_hexagon_supported_diag(sess, op);
break;
case GGML_OP_SOLVE_TRI:
supp = ggml_hexagon_supported_solve_tri(sess, op);
break;
default:
break;
}
@@ -3294,6 +3449,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
return NULL;
}
template<typename T> std::vector<T> str_to_vec(const char* str) {
std::stringstream ss(str);
std::vector<T> v;
std::string t;
while (std::getline(ss, t, ',')) {
v.push_back(std::stoul(t, nullptr, 0));
}
return v;
}
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
std::stringstream ss;
ss << std::setbase(BASE) << std::showbase;
for (auto i : v) { ss << i << ','; }
auto str = ss.str(); str.pop_back(); // drop last comma
return str;
}
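Round-tripping a GGML_HEXAGON_PROFILE-style event list through these helpers looks roughly like this (standalone copies of the two templates; the sample string is illustrative):

#include <cassert>
#include <cstdint>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

// Standalone copies of the helpers above, for a quick round-trip check.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::stringstream ss(str);
    std::vector<T> v;
    std::string t;
    while (std::getline(ss, t, ',')) v.push_back(std::stoul(t, nullptr, 0));
    return v;
}

template<typename T, int BASE = 10> std::string vec_to_str(std::vector<T> v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;
    for (auto i : v) ss << i << ',';
    auto str = ss.str(); str.pop_back();  // drop last comma
    return str;
}

int main() {
    // A comma-separated list of PMU event ids; base 0 auto-detects hex via the 0x prefix.
    const auto v = str_to_vec<uint32_t>("0x3,0x111,100");
    assert(v.size() == 3 && v[0] == 0x3 && v[1] == 0x111 && v[2] == 100);
    assert((vec_to_str<uint32_t, 16>(v) == "0x3,0x111,0x64"));
    return 0;
}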
static void ggml_hexagon_init(ggml_backend_reg * reg) {
// Basic sanity checks to make sure definitions match
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3307,8 +3482,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3321,19 +3495,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
auto RE_ICASE = std::regex_constants::icase;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_profile = str_profile ? atoi(str_profile) : 0;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (str_profile) {
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
auto v = str_to_vec<uint32_t>(str_profile);
switch (v.size()) {
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
case 8: opt_profile = 2; return v; // mode with custom pmu events
default: opt_profile = 0; return {}; // garbage input
}}();
if (opt_profile == 1) opt_pmu_evt = {};
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
}
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;

View File

@@ -34,6 +34,9 @@ add_library(${HTP_LIB} SHARED
argsort-ops.c
ssm-conv.c
cumsum-ops.c
fill-ops.c
diag-ops.c
solve-tri-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
