sync : ggml

ggml : bump version to 0.10.1 (ggml/1469)
webui: fix slow mic stop and WAV encode (#22480 )
2026-05-08 18:14:07 +00:00 · 2026-04-29 16:43:47 +03:00 · 2026-04-29 16:43:47 +03:00 · 2026-04-29 12:58:35 +02:00 · 2026-04-29 13:32:40 +03:00 · 2026-04-29 16:19:33 +08:00
348 changed files with 30517 additions and 13694 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04

 ## Build Image

--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -6,7 +6,7 @@

 <!-- You can provide more details and link related discussions here. Delete this section if not applicable -->

-# Requirements
+## Requirements

 <!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->

--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -0,0 +1,116 @@
+name: CI (snapdragon)
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+    paths:
+      - '.github/workflows/build-and-test-snapdragon.yml'
+      - 'ggml/include/ggml-hexagon.h'
+      - 'ggml/src/ggml-hexagon/**'
+      - 'docs/backend/snapdragon/**'
+      - 'scripts/snapdragon/**'
+      - 'CMakePresets.json'
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - '.github/workflows/build-and-test-snapdragon.yml'
+      - 'ggml/include/ggml-hexagon.h'
+      - 'ggml/src/ggml-hexagon/**'
+      - 'docs/backend/snapdragon/**'
+      - 'scripts/snapdragon/**'
+      - 'CMakePresets.json'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  android-ndk-snapdragon:
+    runs-on: ubuntu-latest
+    container:
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          lfs: false
+
+      - name: Build Llama.CPP for Snapdragon Android
+        id: build_llama_cpp_snapdragon_android
+        run: |
+          cp docs/backend/snapdragon/CMakeUserPresets.json .
+          cmake --preset arm64-android-snapdragon-release -B build
+          cmake --build build
+          cmake --install build --prefix pkg-snapdragon/llama.cpp
+
+      - name: Upload Llama.CPP Snapdragon Android Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
+        uses: actions/upload-artifact@v6
+        with:
+          name: llama-cpp-android-arm64-snapdragon
+          path: pkg-snapdragon/llama.cpp
+
+  test-snapdragon-qdc:
+    name: Test on QDC Android Device (${{ matrix.device }})
+    needs: [android-ndk-snapdragon]
+    runs-on: ubuntu-slim
+    strategy:
+      fail-fast: false
+      matrix:
+        device: [SM8750, SM8650, SM8850]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: llama-cpp-android-arm64-snapdragon
+          path: pkg-snapdragon/llama.cpp
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+          cache: pip
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y curl unzip
+
+      - name: Install QDC SDK wheel
+        run: |
+          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
+          unzip qdc_sdk.zip -d qdc_sdk
+          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
+
+      - name: Check QDC API key
+        id: check_secret
+        env:
+          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
+        run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
+
+      - name: Run QDC tests (${{ matrix.device }})
+        if: steps.check_secret.outputs.has-qdc-key == 'true'
+        run: |
+          python scripts/snapdragon/qdc/run_qdc_jobs.py \
+              --test       all \
+              --pkg-dir    pkg-snapdragon/llama.cpp \
+              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
+              --device     ${{ matrix.device }}
+        env:
+          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
+
+      - name: Cleanup
+        if: always()
+        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -1,26 +1,24 @@
 name: CI (android)

 on:
-  workflow_dispatch: # allows manual triggering
+  workflow_dispatch:
  push:
    branches:
      - master
-    paths: [
-      '.github/workflows/build-android.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
+    paths:
+      - '.github/workflows/build-android.yml'
+      - '**/CMakeLists.txt'
+      - '**/.cmake'
+      - '**/*.h'
+      - '**/*.hpp'
+      - '**/*.c'
+      - '**/*.cpp'

  pull_request:
    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-android.yml',
-      'examples/llama.android/**'
-    ]
+    paths:
+      - '.github/workflows/build-android.yml'
+      - 'examples/llama.android/**'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -67,35 +65,24 @@ jobs:
    defaults:
      run:
        shell: bash
-    strategy:
-      matrix:
-        include:
-          - build: 'arm64-cpu'
-            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
-          - build: 'arm64-snapdragon'
-            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
-        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          lfs: false

-      - name: Build Llama.CPP for Hexagon Android
-        id: build_llama_cpp_hexagon_android
+      - name: Build
+        id: ndk_build
        run: |
-          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
-            cp docs/backend/snapdragon/CMakeUserPresets.json .
-          fi
-          cmake ${{ matrix.defines }} -B build
+          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

-      - name: Upload Llama.CPP Hexagon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
+      - name: Upload Android Build Artifact
+        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
        uses: actions/upload-artifact@v6
        with:
-          name: llama-cpp-android-${{ matrix.build }}
+          name: llama-cpp-android-arm64-cpu
          path: pkg-adb/llama.cpp
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -0,0 +1,142 @@
+name: CI (sycl)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-sycl.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-sycl.yml',
+      'ggml/src/ggml-sycl/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-24-sycl:
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON
+
+    runs-on: ubuntu-24.04
+
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Use oneAPI Installation Cache
+        uses: actions/cache@v5
+        id: cache-sycl
+        with:
+          path: ${{ env.ONEAPI_ROOT }}
+          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Download & Install oneAPI
+        shell: bash
+        if: steps.cache-sycl.outputs.cache-hit != 'true'
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-24-sycl-${{ matrix.build }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Use oneAPI Installation Cache
+        uses: actions/cache@v5
+        id: cache-sycl
+        with:
+          path: ${{ env.ONEAPI_ROOT }}
+          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Download & Install oneAPI
+        shell: bash
+        if: steps.cache-sycl.outputs.cache-hit != 'true'
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: windows-latest-sycl
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -555,106 +555,6 @@ jobs:
            -DGGML_MUSA=ON
          time cmake --build build --config Release -j $(nproc)

-  ubuntu-22-sycl:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-sycl
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx
-          time cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-sycl-fp16:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-sycl-fp16
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DGGML_SYCL_F16=ON
-          time cmake --build build --config Release -j $(nproc)

  windows-latest:
    runs-on: windows-2025
@@ -863,39 +763,6 @@ jobs:
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat

  windows-latest-hip:
    runs-on: windows-2022
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -598,15 +598,29 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

+      - name: Use oneAPI Installation Cache
+        uses: actions/cache@v5
+        id: cache-sycl
+        with:
+          path: ${{ env.ONEAPI_ROOT }}
+          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Download & Install oneAPI
+        shell: bash
+        if: steps.cache-sycl.outputs.cache-hit != 'true'
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -614,10 +628,6 @@ jobs:
          variant: ccache
          evict-old-files: 1d

-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
      - name: Build
        id: cmake_build
        shell: cmd
@@ -670,6 +680,82 @@ jobs:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

+  ubuntu-24-sycl:
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON
+
+    runs-on: ubuntu-24.04
+
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Use oneAPI Installation Cache
+        uses: actions/cache@v5
+        id: cache-sycl
+        with:
+          path: ${{ env.ONEAPI_ROOT }}
+          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Download & Install oneAPI
+        shell: bash
+        if: steps.cache-sycl.outputs.cache-hit != 'true'
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-24-sycl-${{ matrix.build }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+
  ubuntu-22-rocm:
    runs-on: ubuntu-22.04

@@ -1045,6 +1131,7 @@ jobs:
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
+      - ubuntu-24-sycl
      - android-arm64
      - macOS-cpu
      - ios-xcode-build
@@ -1133,6 +1220,8 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
+            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
+            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
--- a/.gitignore
+++ b/.gitignore
@@ -34,7 +34,6 @@
 /.vscode/
 /nppBackup

-
 # Coverage

 /gcovr-report/
@@ -74,6 +73,7 @@
 !/models/templates

 # Zig
+
 /zig-out/
 /zig-cache/

@@ -93,6 +93,7 @@
 !/examples/sycl/*.sh

 # Server Web UI temporary files
+
 /tools/server/webui/node_modules
 /tools/server/webui/dist
 # we no longer use gz for index.html
@@ -106,9 +107,11 @@ __pycache__/
 poetry.toml

 # Nix
+
 /result

 # Test binaries
+
 /tests/test-backend-ops
 /tests/test-double-float
 /tests/test-grad0
@@ -124,6 +127,7 @@ poetry.toml
 /tests/test-tokenizer-1-spm

 # Scripts
+
 !/scripts/install-oneapi.bat

 # Generated by scripts
@@ -132,16 +136,24 @@ poetry.toml
 /wikitext-2-raw/

 # Test models for lora adapters
+
 /lora-tests

 # Local scripts
+
 /run-vim.sh
 /run-chat.sh
 /run-spec.sh
 /.ccache/

 # IDE
+
 /*.code-workspace
 /.windsurf/
 # emscripten
 a.out.*
+
+# AGENTS
+
+AGENTS.local.md
+.pi/SYSTEM.md
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -0,0 +1,33 @@
+You are a coding agent. Here are some very important rules that you must follow:
+
+General:
+- By very precise and concise when writing code, comments, explanations, etc.
+- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
+- Don't try to build or run the code unless you are explicitly asked to do so
+
+Coding:
+- When in doubt, always refer to the CONTRIBUTING.md file of the project
+- When referencing issues or PRs in comments, use the format:
+  - C/C++ code: `// ref: <url>`
+  - Other (CMake, etc.): `# ref: <url>`
+
+Pull requests (PRs):
+- New branch names are prefixed with "gg/"
+- Before opening a pull request, ask the user to confirm the description
+- When creating a pull request, look for the repository's PR template and follow it
+- For the AI usage disclosure section, write "YES. llama.cpp + pi"
+- Always create the pull requests in draft mode
+
+Commits:
+- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
+- Do not explicitly set the git author in commits - rely on the default git config
+
+Resources (read on demand):
+- [CONTRIBUTING.md](CONTRIBUTING.md)
+- [Build documentation](docs/build.md)
+- [Server usage documentation](tools/server/README.md)
+- [Server development documentation](tools/server/README-dev.md)
+- [PEG parser](docs/development/parsing.md)
+- [Auto parser](docs/autoparser.md)
+- [Jinja engine](common/jinja/README.md)
+- [PR template](.github/pull_request_template.md)
--- a/11
+++ b/11
@@ -53,28 +53,29 @@
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
 /ggml/include/                          @ggerganov
+/ggml/src/ggml-backend-meta.cpp         @JohannesGaessler
 /ggml/src/ggml-cann/                    @ggml-org/ggml-cann
 /ggml/src/ggml-common.h                 @ggerganov
 /ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
 /ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
-/ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
-/ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
+/ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
+/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
+/ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggml-org/ggml-metal
 /ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
+/ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
 /ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
 /ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
 /ggml/src/ggml-virtgpu/                 @kpouget
+/ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -25,7 +25,8 @@ struct common_arg {
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
+    bool is_sampling = false; // is current arg a sampling param?
+    bool is_spec = false; // is current arg a speculative decoding param?
    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -74,7 +75,8 @@ struct common_arg {
    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
-    common_arg & set_sparam();
+    common_arg & set_sampling();
+    common_arg & set_spec();
    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
            return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
        });
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
        });
        // try the more aggressive parse first, if it fails, fall back to the delimiter one
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                start = trim_leading_whitespace(result.tags["pre"]);
-                end   = trim_trailing_whitespace(result.tags["post"]);
+                start = result.tags["pre"];
+                end   = result.tags["post"];
            } else if (!result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                end = trim_trailing_whitespace(result.tags["post"]);
+                end = result.tags["post"];
            }
        }
    }
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -397,6 +397,25 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    return render_message_to_json(msgs, c);
 }

+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
+    if (tools.empty()) {
+        return json();
+    }
+
+    auto result = json::array();
+    for (const auto & tool : tools) {
+        result.push_back({
+            { "type",     "function" },
+            { "function", {
+                { "name", tool.name },
+                { "description", tool.description },
+                { "parameters", json::parse(tool.parameters) },
+            }},
+        });
+    }
+    return result;
+}
+
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
    std::vector<common_chat_tool> result;

@@ -432,56 +451,6 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    return result;
 }

-json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
-    if (tools.empty()) {
-        return json();
-    }
-
-    auto result = json::array();
-    for (const auto & tool : tools) {
-        result.push_back({
-            { "type",     "function" },
-            { "function",
-             {
-                  { "name", tool.name },
-                  { "description", tool.description },
-                  { "parameters", json::parse(tool.parameters) },
-              }                      },
-        });
-    }
-    return result;
-}
-
-json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
-    json delta = json::object();
-    if (!diff.reasoning_content_delta.empty()) {
-        delta["reasoning_content"] = diff.reasoning_content_delta;
-    }
-    if (!diff.content_delta.empty()) {
-        delta["content"] = diff.content_delta;
-    }
-    if (diff.tool_call_index != std::string::npos) {
-        json tool_call;
-        tool_call["index"] = diff.tool_call_index;
-        if (!diff.tool_call_delta.id.empty()) {
-            tool_call["id"]   = diff.tool_call_delta.id;
-            tool_call["type"] = "function";
-        }
-        if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
-            json function = json::object();
-            if (!diff.tool_call_delta.name.empty()) {
-                function["name"] = diff.tool_call_delta.name;
-            }
-            if (!diff.tool_call_delta.arguments.empty()) {
-                function["arguments"] = diff.tool_call_delta.arguments;
-            }
-            tool_call["function"] = function;
-        }
-        delta["tool_calls"] = json::array({ tool_call });
-    }
-    return delta;
-}
-
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
    if (use_jinja) {
        try {
@@ -575,6 +544,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }

+// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
+// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
+static bool is_lfm2_template(const std::string & src) {
+    return src.find("<|tool_list_start|>") != std::string::npos &&
+           src.find("<|tool_list_end|>")   != std::string::npos;
+}
+
+common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
+    common_chat_prompt_preset asr_preset;
+    asr_preset.system = "";
+    asr_preset.user   = "Transcribe audio to text";
+
+    if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
+        asr_preset.system = "Perform ASR.";
+        asr_preset.user   = "";
+    }
+
+    return asr_preset;
+}
+
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
    if (!variant.empty()) {
        if (variant == "tool_use") {
@@ -2084,10 +2073,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
-    // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
-    if (src.find("<|tool_list_start|>") != std::string::npos &&
-        src.find("<|tool_list_end|>") != std::string::npos) {
+    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params);
    }
@@ -2396,4 +2382,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
    GGML_ASSERT(chat_templates->template_default != nullptr);
    return chat_templates->template_default->caps.to_map();
 }
-
--- a/common/chat.h
+++ b/common/chat.h
@@ -256,14 +256,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
+
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

-nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
-
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

@@ -275,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);
+
+// specialized per-task preset
+struct common_chat_prompt_preset {
+    std::string system;
+    std::string user;
+};
+
+common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -70,7 +70,7 @@ common_time_meas::~common_time_meas() {
 // CPU utils
 //

-int32_t cpu_get_num_physical_cores() {
+int32_t common_cpu_get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
@@ -185,11 +185,11 @@ static int cpu_count_math_cpus(int n_cpu) {
 /**
 * Returns number of CPUs on system that are useful for math.
 */
-int32_t cpu_get_num_math() {
+int32_t common_cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
-        return cpu_get_num_physical_cores();
+        return common_cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
@@ -202,7 +202,7 @@ int32_t cpu_get_num_math() {
        }
    }
 #endif
-    return cpu_get_num_physical_cores();
+    return common_cpu_get_num_physical_cores();
 }

 // Helper for setting process priority
@@ -263,7 +263,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 //


-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
    int32_t n_set = 0;

    if (cpuparams.n_threads < 0) {
@@ -271,7 +271,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
        if (role_model != nullptr) {
            cpuparams = *role_model;
        } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            cpuparams.n_threads = common_cpu_get_num_math();
        }
    }

@@ -1521,7 +1521,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    return cparams;
 }

-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
--- a/common/common.h
+++ b/common/common.h
@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
 // CPU utils
 //

-struct cpu_params {
+struct common_cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
@@ -63,8 +63,8 @@ struct cpu_params {
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int32_t common_cpu_get_num_physical_cores();
+int32_t common_cpu_get_num_math();

 //
 // Common params
@@ -274,6 +274,7 @@ struct common_params_sampling {
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
+    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted

    bool backend_sampling = false;

@@ -296,34 +297,19 @@ struct common_params_model {

 struct common_ngram_mod;

-struct common_params_speculative {
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
+// draft-model-based speculative decoding parameters
+struct common_params_speculative_draft {
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding

-    // general-purpose speculative decoding parameters
+    float p_split = 0.1f;  // speculative decoding split probability
+    float p_min   = 0.75f; // minimum speculative decoding probability (greedy)

-    int32_t n_max   = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min   = 0;  // minimum number of draft tokens to use for speculative decoding
-    float   p_split = 0.1f; // speculative decoding split probability
-    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)
+    common_params_model mparams;

-    // ngram-based speculative decoding
+    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts

-    uint16_t ngram_size_n   = 12; // ngram size for lookup
-    uint16_t ngram_size_m   = 48; // mgram size for speculative tokens
-    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
-
-    std::shared_ptr<common_ngram_mod> ngram_mod;
-
-    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
-
-    // draft-model speculative decoding
-
-    struct common_params_model mparams_dft;
-
-    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
-
-    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+    llama_context_params cparams; // these are the parameters for the draft llama_context

    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
@@ -331,25 +317,60 @@ struct common_params_speculative {
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+};
+
+struct common_params_speculative_ngram_mod {
+    int32_t n_match = 24;
+
+    int32_t n_max = 64;
+    int32_t n_min = 48;
+
+    // shared instance of the ngram container for all speculative decoding contexts
+    std::shared_ptr<common_ngram_mod> obj;
+};
+
+struct common_params_speculative_ngram_map {
+    uint16_t size_n   = 12; // ngram size for lookup
+    uint16_t size_m   = 48; // mgram size for speculative tokens
+    uint16_t min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed
+};
+
+struct common_params_speculative_ngram_cache {
+    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
+};
+
+struct common_params_speculative {
+    // TODO: become a vector in order to support "chains of speculators"
+    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+
+    common_params_speculative_draft draft;
+
+    common_params_speculative_ngram_mod ngram_mod;
+    common_params_speculative_ngram_map ngram_simple;
+    common_params_speculative_ngram_map ngram_map_k;
+    common_params_speculative_ngram_map ngram_map_k4v;
+
+    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
+        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
    }
 };

 struct common_params_vocoder {
    struct common_params_model model;

-    std::string speaker_file = ""; // speaker file path                                      // NOLINT
+    std::string speaker_file; // speaker file path

-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
 };

 struct common_params_diffusion {
@@ -432,8 +453,8 @@ struct common_params {

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
@@ -581,8 +602,6 @@ struct common_params {
    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
-    int reasoning_budget = -1;
-    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@@ -679,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
@@ -747,6 +766,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

+// remove when moving to c++20
+inline bool string_starts_with(std::string_view str, char prefix) {
+    return !str.empty() && str.front() == prefix;
+}
+
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
@@ -842,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -1,9 +1,38 @@
 #include "debug.h"

+#include "common.h"
 #include "log.h"

 #include <cmath>
+#include <regex>
 #include <string>
+#include <vector>
+
+struct common_debug_cb_user_data::impl {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;
+    bool                    abort_on_nan{false};
+};
+
+common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
+common_debug_cb_user_data::~common_debug_cb_user_data() = default;
+
+common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
+    : pimpl(std::make_unique<impl>())
+{
+    for (const auto & pattern : filter_patterns) {
+        try {
+            std::string anchored_pattern = "^" + pattern;
+            pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+        } catch (const std::regex_error & e) {
+            throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+        }
+    }
+    pimpl->abort_on_nan = abort_on_nan;
+
+    params.cb_eval           = common_debug_cb_eval;
+    params.cb_eval_user_data = this;
+}

 static std::string common_ggml_ne_string(const ggml_tensor * t) {
    std::string str;
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,

 #define INDENT "    "

-template <bool abort>
-void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
        LOG(INDENT "sum = %f\n", sum);
    }

-    if constexpr (abort) {
+    if (abort_on_nan) {
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
@@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
 * @param user_data user data to pass at each call back
 * @return true to receive data or continue the graph, false otherwise
 */
-template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (base_callback_data *) user_data;
+bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (common_debug_cb_user_data *) user_data;
+    auto * pimpl = cb_data->pimpl.get();

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
        return true;  // Always retrieve data
    }

-    bool matches_filter = cb_data->tensor_filters.empty();
+    bool matches_filter = pimpl->tensor_filters.empty();

    if (!matches_filter) {
-        for (const auto & filter : cb_data->tensor_filters) {
+        for (const auto & filter : pimpl->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
-        cb_data->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+        pimpl->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
+        uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
+        common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
    }

    return true;
 }
-
-// Explicit template instantiations
-template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
-template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
-template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
-template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- a/common/debug.h
+++ b/common/debug.h
@@ -1,43 +1,31 @@
 #pragma once
-#include "common.h"
+
+#include <memory>
 #include <string>
 #include <vector>
-#include <regex>

 // common debug functions and structs

-// Print a tensor's detailed data
-// data - the tensor's data in byte format
-// type - the tensor's quantization type
-// ne   - the tensor dimensions array
-// nb   - the tensor strides array
-// n    - the number of rows/columns to fully print
-template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
+struct common_params;

 // Intended to use as callback for ggml_backend_sched_eval_callback
 // prints tensors that are processed in the computation graph
-// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
-// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
-// The template parameter determines whether an error should be thrown whenever a NaN is encountered
+// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
+// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
+// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
-template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
-struct base_callback_data {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
+bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);

-    base_callback_data() = default;
+struct common_debug_cb_user_data {
+    struct impl;
+    std::unique_ptr<impl> pimpl;

-    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
-        for (const auto & pattern : filter_patterns) {
-            try {
-                std::string anchored_pattern = "^" + pattern;
-                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-            } catch (const std::regex_error & e) {
-                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-            }
-        }
-        params.cb_eval           = common_debug_cb_eval<false>;
-        params.cb_eval_user_data = this;
-    }
+    common_debug_cb_user_data();
+    ~common_debug_cb_user_data();
+
+    common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
+    common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
+
+    common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
 };
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
    if (!tag.empty()) {
        tags.push_back(tag);
    } else {
-        tags = {"Q4_K_M", "Q4_0"};
+        tags = {"Q4_K_M", "Q8_0"};
    }

    for (const auto & t : tags) {
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -856,7 +856,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
        ggml_backend_dev_memory(dev, &free, &total);

        const size_t self = mb.model + mb.context + mb.compute;
-        const size_t unaccounted = total - self - free;
+        const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);

        table_data.push_back({
            template_gpu,
@@ -867,7 +867,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
            std::to_string(mb.model / MiB),
            std::to_string(mb.context / MiB),
            std::to_string(mb.compute / MiB),
-            std::to_string(unaccounted / MiB)});
+            std::to_string(unaccounted / static_cast<int64_t>(MiB))});
    }

    // print memory breakdown for host:
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@@ -1,4 +1,3 @@
-#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@@ -106,10 +106,16 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
+
    // execute_impl must be overridden by derived classes
-    virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
+    virtual value execute_impl(context &) { throw_exec_error(); }
    // execute is the public method to execute a statement with error handling
    value execute(context &);
+
+private:
+    [[noreturn]] void throw_exec_error() const {
+        throw std::runtime_error("cannot exec " + type());
+    }
 };

 // Type Checking Utilities
@@ -143,7 +149,7 @@ struct program : public statement {
    program() = default;
    explicit program(statements && body) : body(std::move(body)) {}
    std::string type() const override { return "Program"; }
-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
    }
 };
@@ -195,7 +201,7 @@ struct break_statement : public statement {
        }
    };

-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw break_statement::signal();
    }
 };
@@ -209,7 +215,7 @@ struct continue_statement : public statement {
        }
    };

-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw continue_statement::signal();
    }
 };
@@ -509,7 +515,7 @@ struct slice_expression : public expression {
        chk_type<expression>(this->step_expr);
    }
    std::string type() const override { return "SliceExpression"; }
-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
 };
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
    return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
 }

+[[noreturn]] static value string_join_not_implemented(const func_args &) {
+    throw not_implemented_exception("String join builtin not implemented");
+}
+
 const func_builtins & value_string_t::get_builtins() const {
    static const func_builtins builtins = {
        {"default", default_value},
@@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
            res->val_str.mark_input_based_on(val_input->as_string());
            return res;
        }},
-        {"join", [](const func_args &) -> value {
-            throw not_implemented_exception("String join builtin not implemented");
-        }},
+        {"join", string_join_not_implemented},
    };
    return builtins;
 }
@@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
    return builtins;
 }

+[[noreturn]] static value array_unique_not_implemented(const func_args &) {
+    throw not_implemented_exception("Array unique builtin not implemented");
+}

 const func_builtins & value_array_t::get_builtins() const {
    static const func_builtins builtins = {
@@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"unique", [](const func_args &) -> value {
-            throw not_implemented_exception("Array unique builtin not implemented");
-        }},
+        {"unique", array_unique_not_implemented},
    };
    return builtins;
 }

+[[noreturn]] static value object_join_not_implemented(const func_args &) {
+    throw not_implemented_exception("object join not implemented");
+}

 const func_builtins & value_object_t::get_builtins() const {
    if (!has_builtins) {
@@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
            });
            return result;
        }},
-        {"join", [](const func_args &) -> value {
-            throw not_implemented_exception("object join not implemented");
-        }},
+        {"join", object_join_not_implemented},
    };
    return builtins;
 }
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -129,27 +129,25 @@ struct value_t {
    // Note: only for debugging and error reporting purposes
    virtual std::string type() const { return ""; }

-    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
-    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
-    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
-    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
+    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
+    virtual double as_float() const { throw_type_error("is not a float value"); }
+    virtual string as_string() const { throw_type_error("is not a string value"); }
+    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
+    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
    virtual bool is_none() const { return false; }
    virtual bool is_undefined() const { return false; }
-    virtual const func_builtins & get_builtins() const {
-        throw std::runtime_error("No builtins available for type " + type());
-    }
+    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }

-    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
-    virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
+    virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
+    virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
+    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
+    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
+    virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }

    virtual bool is_numeric() const { return false; }
    virtual bool is_hashable() const { return false; }
@@ -163,6 +161,11 @@ struct value_t {
    // Note: only for debugging purposes
    virtual std::string as_repr() const { return as_string().str(); }

+private:
+    [[noreturn]] void throw_type_error(const char* expected) const {
+        throw std::runtime_error(type() + " " + expected);
+    }
+
 protected:
    virtual bool equivalent(const value_t &) const = 0;
    virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };

 // disable colors by default
-static std::vector<const char *> g_col = {
+static const char* g_col[] = {
    "",
    "",
    "",
@@ -247,7 +247,6 @@ public:

            entries = std::move(new_entries);
        }
-
        cv.notify_one();
    }

@@ -265,7 +264,6 @@ public:
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });
-
                    cur = entries[head];

                    head = (head + 1) % entries.size();
@@ -301,7 +299,6 @@ public:

                tail = (tail + 1) % entries.size();
            }
-
            cv.notify_one();
        }

@@ -338,7 +335,7 @@ public:
            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
+            for (size_t i = 0; i < std::size(g_col); i++) {
                g_col[i] = "";
            }
        }
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
 }

 struct common_log * common_log_main() {
-    static struct common_log log;
+    // We intentionally leak (i.e. do not delete) the logger singleton because
+    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
+    // OS will release resources anyway so it should not be a significant issue,
+    // though this design may cause logs to be lost if not flushed before the program exits.
+    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
+    static struct common_log * log;
    static std::once_flag    init_flag;
    std::call_once(init_flag, [&]() {
+        log = new common_log;
        // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log->set_colors(tty_can_use_colors());
    });

-    return &log;
+    return log;
 }

 void common_log_pause(struct common_log * log) {
--- a/common/log.h
+++ b/common/log.h
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;

 struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+
+// Singleton, intentionally leaked to avoid Windows teardown hangs.
+// Call common_log_flush() before exit if you want to ensure all logs are flushed.
+struct common_log * common_log_main();
+
 void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
 void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void                common_log_free  (struct common_log * log);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
    for (const auto & it : key_to_opt) {
        const std::string & key = it.first;
        const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
            allowed_keys.insert(key);
            // also add variant keys (args without leading dashes and env vars)
            for (const auto & arg : opt.get_args()) {
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -122,6 +122,20 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            }
            break;
        case REASONING_BUDGET_DONE:
+            // Re-arm on a new start tag: some models emit multiple <think> blocks
+            // per response, and each should get a fresh budget window.
+            if (ctx->start_matcher.advance(token)) {
+                ctx->state = REASONING_BUDGET_COUNTING;
+                ctx->remaining = ctx->budget;
+                ctx->end_matcher.reset();
+                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
+
+                if (ctx->remaining <= 0) {
+                    ctx->state = REASONING_BUDGET_FORCING;
+                    ctx->force_pos = 0;
+                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
+                }
+            }
            break;
    }
 }
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -61,18 +61,26 @@ static bool common_speculative_are_compatible(
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
-        LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
-        LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
+                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
        return false;
    }

-    if (
-        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
-        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
-        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
-        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
-    ) {
-        LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        (llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
+        LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+                __func__,
+                llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
+                llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
+        return false;
+    }
+
+    if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        (llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
+        LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+                __func__,
+                llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
+                llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
        return false;
    }

@@ -143,6 +151,9 @@ struct common_speculative_state {
            llama_tokens & result) = 0;

    virtual void accept(uint16_t n_accepted) = 0;
+
+    virtual int32_t n_max(const common_params_speculative & params) const = 0;
+    virtual int32_t n_min(const common_params_speculative & params) const = 0;
 };

 struct common_speculative_checkpoint {
@@ -288,6 +299,8 @@ struct common_speculative_state_draft : public common_speculative_state {
            const llama_tokens & prompt_tgt,
            llama_token id_last,
            llama_tokens & result) override {
+        const auto & sparams = params.draft;
+
        auto * spec = this;

        auto & batch      = spec->batch;
@@ -301,7 +314,7 @@ struct common_speculative_state_draft : public common_speculative_state {
        int reuse_i = 0; // index of part to be reused in prompt_dft
        int reuse_n = 0; // length of part to be reused in prompt_dft

-        const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
+        const int n_ctx = llama_n_ctx(ctx_dft) - sparams.n_max;

        llama_tokens prompt_cnv;
        if (!spec->vocab_cmpt) {
@@ -359,7 +372,7 @@ struct common_speculative_state_draft : public common_speculative_state {
        }

        result.clear();
-        result.reserve(params.n_max);
+        result.reserve(sparams.n_max);

        bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
        if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
@@ -372,7 +385,7 @@ struct common_speculative_state_draft : public common_speculative_state {
                for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
                    result.push_back(prompt_dft[i]);

-                    if (params.n_max <= (int) result.size()) {
+                    if (sparams.n_max <= (int) result.size()) {
                        break;
                    }
                }
@@ -465,7 +478,7 @@ struct common_speculative_state_draft : public common_speculative_state {
        common_sampler_reset(smpl);

        // sample n_draft tokens from the draft model
-        for (int i = 0; i < params.n_max; ++i) {
+        for (int i = 0; i < sparams.n_max; ++i) {
            common_batch_clear(batch);

            common_sampler_sample(smpl, ctx_dft, 0, true);
@@ -484,12 +497,12 @@ struct common_speculative_state_draft : public common_speculative_state {

            result.push_back(id);

-            if (params.n_max <= (int) result.size()) {
+            if (sparams.n_max <= (int) result.size()) {
                break;
            }

            // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < params.p_min) {
+            if (cur_p->data[0].p < sparams.p_min) {
                break;
            }

@@ -510,10 +523,14 @@ struct common_speculative_state_draft : public common_speculative_state {
            detokenized = replace_to_tgt(detokenized);
            LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
            result = common_tokenize(ctx_tgt, detokenized, false, true);
-            if (result.size() > (size_t)params.n_max) {
-                result.resize(params.n_max);
+            if (result.size() > (size_t) sparams.n_max) {
+                result.resize(sparams.n_max);
            }
        }
+
+        if (result.size() < (size_t) sparams.n_min) {
+            result.clear();
+        }
    }

    void accept(uint16_t n_accepted) override {
@@ -521,6 +538,14 @@ struct common_speculative_state_draft : public common_speculative_state {
        GGML_UNUSED(n_accepted);
    }

+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.draft.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.draft.n_min;
+    }
+
    std::string replace_to_dft(const std::string & input) const {
        std::string result = input;

@@ -573,6 +598,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
        // noop
        GGML_UNUSED(n_accepted);
    }
+
+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.draft.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.draft.n_min;
+    }
 };

 // state of self-speculation (simple implementation, not ngram-map)
@@ -602,19 +635,27 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
        // noop
        GGML_UNUSED(n_accepted);
    }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return config.size_mgram;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return config.size_mgram;
+    }
 };

 struct common_speculative_state_ngram_map_k : public common_speculative_state {
    // draft ngram map for speculative decoding without draft model
-    common_ngram_map map;
+    common_ngram_map config;

    common_speculative_state_ngram_map_k(
            enum common_speculative_type type,
-            common_ngram_map map)
-        : common_speculative_state(type), map(std::move(map)) {}
+            common_ngram_map config)
+        : common_speculative_state(type), config(std::move(config)) {}

    void begin(const llama_tokens & prompt) override {
-        common_ngram_map_begin(map, prompt);
+        common_ngram_map_begin(config, prompt);
    }

    void draft(
@@ -622,12 +663,20 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
            const llama_tokens & prompt_tgt,
            llama_token id_last,
            llama_tokens & result) override {
-        common_ngram_map_draft(map, prompt_tgt, id_last, result);
+        common_ngram_map_draft(config, prompt_tgt, id_last, result);
        GGML_UNUSED(params);
    }

    void accept(uint16_t n_accepted) override {
-        common_ngram_map_accept(map, n_accepted);
+        common_ngram_map_accept(config, n_accepted);
+    }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return config.size_value;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return config.size_value;
    }
 };

@@ -684,7 +733,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
            const llama_tokens & prompt_tgt,
            llama_token id_last,
            llama_tokens & result) override {
-        GGML_UNUSED(params);
+        const auto & sparams = params.ngram_mod;

        n_draft_last = 0;

@@ -704,16 +753,16 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
            i_last = cur_len - n;
        }

-        result.resize(n + params.n_max);
+        result.resize(n + sparams.n_max);
        for (size_t i = 0; i < n - 1; ++i) {
            result[i] = prompt_tgt[cur_len - n + 1 + i];
        }
        result[n - 1] = id_last;

-        for (int i = 0; i < params.n_max; ++i) {
+        for (int i = 0; i < sparams.n_max; ++i) {
            const llama_token token = mod.get(result.data() + i);
            if (token == common_ngram_mod::EMPTY) {
-                if (i < params.n_min) {
+                if (i < sparams.n_min) {
                    result.clear();
                    return;
                }
@@ -749,12 +798,21 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {

                    mod.reset();
                    n_low = 0;
+                    i_last = 0;
                }
            } else {
                n_low = 0;
            }
        }
    }
+
+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.ngram_mod.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.ngram_mod.n_min;
+    }
 };

 struct common_speculative_state_ngram_cache : public common_speculative_state {
@@ -848,6 +906,14 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
        // TODO: noop
        GGML_UNUSED(n_accepted);
    }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return n_draft;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return 0;
+    }
 };

 struct common_speculative {
@@ -856,11 +922,13 @@ struct common_speculative {
    common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
 };

-static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
-    uint16_t size_key   = config.params.ngram_size_n;
-    uint16_t size_value = config.params.ngram_size_m;
-    bool     key_only   = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
-    uint16_t min_hits   = config.params.ngram_min_hits;
+static common_ngram_map get_common_ngram_map(
+        common_speculative_type type,
+        const common_params_speculative_ngram_map & config) {
+    uint16_t size_key   = config.size_n;
+    uint16_t size_value = config.size_m;
+    bool     key_only   = type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
+    uint16_t min_hits   = config.min_hits;

    return common_ngram_map(size_key, size_value, key_only, min_hits);
 }
@@ -918,8 +986,8 @@ common_speculative * common_speculative_init(
        common_params_speculative & params,
        llama_context             * ctx_tgt) {
    llama_context * ctx_dft = nullptr;
-    if (params.model_dft) {
-        ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
+    if (params.draft.model) {
+        ctx_dft = llama_init_from_model(params.draft.model, params.draft.cparams);
        if (ctx_dft == nullptr) {
            LOG_ERR("%s", "failed to create draft context\n");
            return nullptr;
@@ -929,7 +997,7 @@ common_speculative * common_speculative_init(
    // Compute the implementations to use based on the config and their order of preference
    std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
    {
-        bool has_draft = !params.mparams_dft.path.empty();
+        bool has_draft = !params.draft.mparams.path.empty();
        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3

        bool has_ngram_cache   = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
@@ -952,16 +1020,17 @@ common_speculative * common_speculative_init(
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
        }
        if (has_ngram_mod) {
-            // shared instance for all speculative decoding contexts
-            if (!params.ngram_mod) {
-                params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
+            auto & sparams = params.ngram_mod;

-                LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
-                        params.ngram_size_n, params.ngram_mod->size(),
-                        (float)(params.ngram_mod->size_bytes())/1024/1024);
+            if (!sparams.obj) {
+                sparams.obj = std::make_shared<common_ngram_mod>(sparams.n_match, 4*1024*1024);

-                if (params.ngram_size_n < 16) {
-                    LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
+                LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
+                        sparams.n_match, sparams.obj->size(), (float)(sparams.obj->size_bytes())/1024/1024);
+
+                if (sparams.n_match < 16) {
+                    LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
+                            "see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, sparams.n_match);
                }
            }

@@ -991,7 +1060,7 @@ common_speculative * common_speculative_init(
                impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
                    /* .ctx_tgt      = */ ctx_tgt,
                    /* .ctx_dft      = */ ctx_dft,
-                    /* .replacements = */ params.replacements,
+                    /* .replacements = */ params.draft.replacements,
                    /* .use_ckpt     = */ use_ckpt
                ));
                break;
@@ -1001,18 +1070,18 @@ common_speculative * common_speculative_init(
                break;
            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
-                common_ngram_map ngram_map = get_common_ngram_map(config);
+                common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);

                uint16_t ngram_size_key   = ngram_map.size_key;
                uint16_t mgram_size_value = ngram_map.size_value;

                auto config_simple = common_ngram_simple_config {
-                    /* .size_ngram      = */ ngram_size_key,
-                    /* .size_mgram      = */ mgram_size_value
+                    /* .size_ngram = */ ngram_size_key,
+                    /* .size_mgram = */ mgram_size_value
                };
                auto state = std::make_unique<common_speculative_state_ngram_simple>(
-                    /* .type            = */ config.type,
-                    /* .state           = */ config_simple
+                    /* .type  = */ config.type,
+                    /* .state = */ config_simple
                );
                impls.push_back(std::move(state));
                break;
@@ -1021,18 +1090,17 @@ common_speculative * common_speculative_init(
            case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
                impls.push_back(std::make_unique<common_speculative_state_ngram_map_k>(
                    (config.type),
-                    get_common_ngram_map(config)
+                    get_common_ngram_map(config.type, config.params.ngram_map_k)
                ));
                break;
            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
-                GGML_ASSERT(config.params.ngram_mod);
-                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
+                GGML_ASSERT(config.params.ngram_mod.obj);
+                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod.obj));
                break;
            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
-                auto state = create_state_ngram_cache(
-                        params.lookup_cache_static, params.lookup_cache_dynamic, config);
+                auto state = create_state_ngram_cache(params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic, config);
                impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
                break;
            }
@@ -1090,6 +1158,15 @@ llama_tokens common_speculative_draft(
            impl->n_call_draft++;
        }

+        {
+            const int n_min = impl->n_min(params);
+
+            if (!result.empty() && (int) result.size() < n_min) {
+                LOG_DBG("%s: ignoring small draft: %d < %d\n", __func__, (int) result.size(), n_min);
+                result.clear();
+            }
+        }
+
        if (!result.empty()) {
            LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
                    common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
@@ -1099,7 +1176,7 @@ llama_tokens common_speculative_draft(
            impl->n_gen_drafts++;
            impl->n_gen_tokens += result.size();

-            break; // We have a draft, so break out of the loop and return it.
+            break; // we have a draft, so break out of the loop and return it.
        }
    }

@@ -1127,6 +1204,32 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
    }
 }

+int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params) {
+    if (spec == nullptr) {
+        return 0;
+    }
+
+    int32_t n_max = 0;
+    for (const auto & impl : spec->impls) {
+        n_max = std::max(n_max, impl->n_max(params));
+    }
+
+    return n_max;
+}
+
+int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params) {
+    if (spec == nullptr) {
+        return 0;
+    }
+
+    int32_t n_min = 0;
+    for (const auto & impl : spec->impls) {
+        n_min = std::max(n_min, impl->n_min(params));
+    }
+
+    return n_min;
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
    if (spec == nullptr) {
        return;
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -33,6 +33,9 @@ llama_tokens common_speculative_draft(
 // informs the speculative decoder that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);

+int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
+int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -272,6 +272,22 @@ class ModelBase:

        return tensors

+    @staticmethod
+    def _scale_is_trivial(scale: Tensor) -> bool:
+        return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
+
+    def _write_scale_tensor(self, scale_name: str, scale: Tensor):
+        if not self._scale_is_trivial(scale):
+            scale_f32 = scale.float().numpy().flatten()
+            logger.info(f"  + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
+            self.gguf_writer.add_tensor(scale_name, scale_f32)
+
+    def _write_scales_tensor(self, scale_name: str, scales: list[float]):
+        if not np.allclose(scales, 1.0, atol=1e-6):
+            scale_vals = np.array(scales, dtype=np.float32)
+            logger.info(f"  + {scale_name} (per-expert scale, shape [{len(scales)}])")
+            self.gguf_writer.add_tensor(scale_name, scale_vals)
+
    def dequant_model(self):
        # If all quantized tensors were already handled (e.g. pure NVFP4), skip
        if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
                        tensors_to_remove.append(name)
-                    if name.endswith((".k_scale", ".v_scale")):
+                    if name.endswith((".input_scale", ".k_scale", ".v_scale")):
                        tensors_to_remove.append(name)
            elif quant_method is not None:
                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ class ModelBase:
        raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
        return raw, [out_features, n_super * 64]

-    @staticmethod
-    def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
-        return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
-
    def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
        if "language_model." in name:
            name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ class ModelBase:
        logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
        self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)

-        # Emit per-tensor scale2 as a separate F32 tensor when non-trivial
-        if not self._nvfp4_scale2_is_trivial(scale2):
-            scale2_f32 = scale2.float().numpy().flatten()
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f"  + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
-            self.gguf_writer.add_tensor(scale_name, scale2_f32)
-
-        # Emit per-tensor input_scale as a separate F32 tensor when non-trivial
-        if not self._nvfp4_scale2_is_trivial(input_scale):
-            input_scale_f32 = input_scale.float().numpy().flatten()
-            input_scale_name = new_name.replace(".weight", ".input_scale")
-            logger.info(f"  + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
-            self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
+        self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
+        self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)

    def _generate_nvfp4_tensors(self):
        # Per-layer expert merging to avoid holding all experts in memory
@@ -719,24 +720,17 @@ class ModelBase:
        logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
        self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)

-        # Emit per-expert scale2 tensor if any expert has non-trivial scale2
        scales.sort(key=lambda x: x[0])
-        scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
-        if not np.allclose(scale_vals, 1.0, atol=1e-6):
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f"  + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
-            self.gguf_writer.add_tensor(scale_name, scale_vals)
+        self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])

-        # Emit per-expert input_scale tensor if any expert has non-trivial input_scale
        input_scales.sort(key=lambda x: x[0])
-        input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
-        if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
-            input_scale_name = new_name.replace(".weight", ".input_scale")
-            logger.info(f"  + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
-            self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
+        self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])

        del experts, merged

+    def _needs_nvfp4_processing(self) -> bool:
+        return True
+
    def prepare_tensors(self):
        # detect NVFP4 quantization (ModelOpt format)
        quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
@@ -746,7 +740,12 @@ class ModelBase:

        if (not quant_algo or not quant_layers) and quant_config_file.is_file():
            with open(quant_config_file, "r", encoding="utf-8") as f:
-                quant_config = json.load(f).get("quantization") or {}
+                hf_quant_config = json.load(f)
+                quant_config = hf_quant_config.get("quantization") or {}
+                producer = hf_quant_config.get("producer") or {}
+                producer_name = (producer.get("name") or "").lower()
+                if quant_method is None:
+                    self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
                quant_algo = quant_config.get("quant_algo", quant_algo)
                quant_layers = quant_config.get("quantized_layers", quant_layers) or {}

@@ -762,7 +761,7 @@ class ModelBase:
        # NVFP4 weights are repacked and written directly to gguf_writer.
        # This must run before dequant_model so NVFP4 tensors are removed
        # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
-        if self._is_nvfp4:
+        if self._is_nvfp4 and self._needs_nvfp4_processing():
            self._generate_nvfp4_tensors()

        self.dequant_model()
@@ -2194,6 +2193,10 @@ class MmprojModel(ModelBase):
                # merge configs
                self.preprocessor_config = {**self.preprocessor_config, **cfg}

+    def _needs_nvfp4_processing(self) -> bool:
+        # nvfp4 quantization applies to the text model only.
+        return False
+
    def get_vision_config(self) -> dict[str, Any] | None:
        config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
        return self.global_config.get(config_name)
@@ -4454,6 +4457,12 @@ class NemotronNanoV2VLModel(MmprojModel):
        }
        return vision_config

+    def dequant_model(self):
+        if self._is_nvfp4:
+            # Skip nvfp4 quantization for vision/audio model.
+            return
+        super().dequant_model()
+
    def set_gguf_parameters(self):
        if "image_mean" not in self.preprocessor_config:
            self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
@@ -4477,6 +4486,10 @@ class NemotronNanoV2VLModel(MmprojModel):
        if "input_conditioner" in name:
            return

+        # mtmd does not support video yet so skip tensors related to video.
+        if "radio_model.model.patch_generator.video_embedder" in name:
+            return
+
        # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
        if "patch_generator.pos_embed" in name:
            if not name.endswith(".weight"):
@@ -10824,7 +10837,11 @@ class NemotronHModel(GraniteHybridModel):
        # uses self.model_arch to build the tensor name map, and all MoE-specific
        # mappings would be missed if it were called with the default non-MoE arch.
        hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
-        if "num_experts_per_tok" in hparams:
+        has_moe_params = (
+            "num_experts_per_tok" in hparams
+            or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"])
+        )
+        if has_moe_params:
            self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
            self.is_moe = True

@@ -10971,6 +10988,11 @@ class NemotronHModel(GraniteHybridModel):
        if name.startswith(("vision_model.", "mlp1.")):
            return

+        if name.startswith(("sound_encoder.")):
+            return
+        if name.startswith(("sound_projection.")):
+            return
+
        # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
        if name.startswith("language_model."):
            name = name[len("language_model."):]
@@ -11855,7 +11877,7 @@ class LLaDAMoEModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

@@ -11994,28 +12016,58 @@ class HunYuanModel(TextModel):


@ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanOCRVisionModel(MmprojModel):
+class HunyuanVLVisionModel(MmprojModel):
+    # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
+    # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
+    # Each variant maps to a different projector type in clip.cpp so image
+    # preprocessing follows the correct code path.
+
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
-        # HunyuanOCR uses max_image_size instead of image_size
+        # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
        if "image_size" not in self.hparams_vision:
            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)

+    @staticmethod
+    def is_ocr_variant(hparams: dict) -> bool:
+        """Return True for HunyuanOCR, False for HunyuanVL.
+
+        The projector's output dim must equal the text model's hidden_size by
+        construction (that's what "projector" means). HunyuanOCR pairs a 1B text
+        backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
+        ViT -> LLM projection dim is a hard architectural signature, not a
+        magic number.
+        """
+        vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
+        return vision_out == 1024
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None
-        hparams = self.hparams_vision
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
-        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
-        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
-        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+        vcfg = self.hparams_vision
+
+        if self.is_ocr_variant(self.global_config):
+            # --- HunyuanOCR ---
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+            self.gguf_writer.add_vision_use_gelu(True)
+            self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
+            self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
+            self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+            self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+            return
+
+        # --- HunyuanVL ---
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
+        self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
+        self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
+        self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
+        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
+        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith("vit."):
-            return  # skip text tensors
+            return
        # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
        if "position_embedding" in name:
            data_torch = data_torch[1:]  # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -12023,11 +12075,66 @@ class HunyuanOCRVisionModel(MmprojModel):

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
        if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)


+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLTextModel(HunYuanModel):
+    # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
+    # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
+    # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
+    # the config and pick the matching GGUF architecture.
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+
+    @staticmethod
+    def _is_ocr_config(hparams: dict) -> bool:
+        # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
+        # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
+        # HunyuanVLVisionModel.is_ocr_variant.
+        return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
+        if self._is_ocr_config(raw_hparams):
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+        else:
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+        super().__init__(dir_model, *args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
+        # the HunYuan-Dense arch which already handles standard rope in super().
+        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
+            return
+
+        if self.rope_parameters.get("rope_type") != "xdrope":
+            return
+
+        # defaults for HunyuanVL. The C++ side later computes:
+        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
+        self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
+        self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
+
+        ctx_len = int(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
+        self.gguf_writer.add_context_length(ctx_len)
+
+        self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision tensors — they are written by HunyuanVLVisionModel
+        if name.startswith("vit."):
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.SMOLLM3
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.

 ## Recommended Release

+### Windows
+
 The following releases are verified and recommended:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
@@ -39,9 +41,22 @@ The following releases are verified and recommended:
 |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
 |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

+### Ubuntu 24.04
+
+The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
+
+It is recommended to use them with Intel Docker.
+
+The packages for FP32 and FP16 would have different accuracy and performance on LLMs. Please choose it acording to the test result.

 ## News

+- 2026.04
+
+  - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
+  - Fused MoE.
+  - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
+
 - 2026.03
  - Support Flash-Attention: less memory usage, performance impact depends on LLM.

@@ -229,6 +244,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 |Verified release|
 |-|
+|2025.3.3 |
 |2025.2.1|
 |2025.1|
 |2024.1|
@@ -339,6 +355,12 @@ Choose one of following methods to run.
 ./examples/sycl/test.sh
 ```

+- Run llama-server:
+
+```sh
+./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
+```
+
 2. Command line
 Launch inference

@@ -627,10 +649,18 @@ Choose one of following methods to run.

 1. Script

+- Run test:
+
 ```
 examples\sycl\win-test.bat
 ```

+- Run llama-server:
+
+```
+examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
+```
+
 2. Command line

 Launch inference
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
  ```

 - `GGML_HEXAGON_PROFILE=1`
-  Generates a host-side profile for the ggml-hexagon Ops.
+  Enables Op profiling:

- `GGML_HEXAGON_OPMASK=0x0`
-  Allows enabling specific stages of the processing pipeline:
+  - `1` Basic profile with per-op `usecs` and `cycles` counters
+  - `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
+  - `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
+
+  The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
+  Examples:
+
+      `GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
+
+- `GGML_HEXAGON_OPSTAGE=0x0`
+  Allows enabling specific stages of the Op processing pipeline:

  - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
  - `0x2` Enable Op Compute (MUL_MAT, etc.)

  Examples:

-      `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
-      `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
+      `GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
+      `GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)

 - `GGML_HEXAGON_OPFILTER=regex`
  Allows filtering (disabling) Ops that match the regex pattern:
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -26,7 +26,7 @@ Legend:
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -60,7 +60,7 @@ Legend:
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
@@ -105,7 +105,7 @@ Legend:
 |                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/examples/debug/debug.cpp
+++ b/examples/debug/debug.cpp
@@ -202,10 +202,14 @@ static bool run(llama_context * ctx, const common_params & params) {
    print_tokenized_prompt(ctx, tokens, params.prompt);

    if (params.save_logits) {
-        output_data output {ctx, model, params};
-        std::filesystem::path model_path{params.model.path};
-        std::string model_name{model_path.stem().string()};
-        save_output_data(output, model_name, params.logits_output_dir);
+        try {
+            output_data output {ctx, model, params};
+            std::filesystem::path model_path{params.model.path};
+            std::string model_name{model_path.stem().string()};
+            save_output_data(output, model_name, params.logits_output_dir);
+        } catch (const std::exception & e) {
+            LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
+        }
    }

    return true;
@@ -223,7 +227,7 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    std::optional<base_callback_data> cb_data;
+    std::optional<common_debug_cb_user_data> cb_data;
    if (!params.save_logits) {
        cb_data.emplace(params, params.tensor_filter);
    }
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -3,7 +3,6 @@
 #include "debug.h"
 #include "log.h"
 #include "llama.h"
-#include "llama-cpp.h"

 #include <clocale>
 #include <string>
@@ -38,7 +37,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");

-    base_callback_data cb_data;
+    common_debug_cb_user_data cb_data;

    common_params params;

@@ -53,7 +52,7 @@ int main(int argc, char ** argv) {

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    params.cb_eval = common_debug_cb_eval<false>;
+    params.cb_eval = common_debug_cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -73,12 +73,12 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
    auto ctx_arg = common_params_parser_init(params, md.ex);

    std::vector<common_arg *> common_options;
-    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> sampling_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
-        if (opt.is_sparam) {
-            sparam_options.push_back(&opt);
+        if (opt.is_sampling) {
+            sampling_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
@@ -93,7 +93,7 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
    ss << "### Common params\n\n";
    write_table(ss, common_options);
    ss << "\n\n### Sampling params\n\n";
-    write_table(ss, sparam_options);
+    write_table(ss, sampling_options);
    ss << "\n\n### " << md.specific_section_header << "\n\n";
    write_table(ss, specific_options);

--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -37,9 +37,9 @@ int main(int argc, char ** argv){

    common_ngram_cache ngram_cache;
    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
-    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());
+    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.ngram_cache.lookup_cache_static.c_str());

-    common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);
+    common_ngram_cache_save(ngram_cache, params.speculative.ngram_cache.lookup_cache_static);

    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv){
        return 1;
    }

-    const int n_draft = params.speculative.n_max;
+    const int n_draft = params.speculative.draft.n_max;

    // init llama.cpp
    llama_backend_init();
@@ -49,18 +49,18 @@ int main(int argc, char ** argv){
    {
        const int64_t t_start_draft_us = ggml_time_us();

-        if (!params.speculative.lookup_cache_static.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
            try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
                exit(1);
            }
        }

-        if (!params.speculative.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
            try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
        }

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -25,7 +25,7 @@ int main(int argc, char ** argv){
    }

    // max. number of additional tokens to draft if match is found
-    const int n_draft = params.speculative.n_max;
+    const int n_draft = params.speculative.draft.n_max;

    // init llama.cpp
    llama_backend_init();
@@ -54,18 +54,18 @@ int main(int argc, char ** argv){
        const int64_t t_start_draft_us = ggml_time_us();
        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

-        if (!params.speculative.lookup_cache_static.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
            try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
                exit(1);
            }
        }

-        if (!params.speculative.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
            try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
        }

@@ -213,7 +213,7 @@ int main(int argc, char ** argv){

    // Update dynamic ngram cache with context ngram cache and save it to disk:
    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);
+    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.ngram_cache.lookup_cache_dynamic);

    LOG("\n\n");

--- a/examples/model-conversion/scripts/causal/convert-model.sh
+++ b/examples/model-conversion/scripts/causal/convert-model.sh
@@ -25,7 +25,11 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
 OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
 TYPE="${OUTTYPE:-f16}"
 METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
-CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
+if [[ -n "$MMPROJ" ]]; then
+    CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
+else
+    CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
+fi

 echo "Model path: ${MODEL_PATH}"
 echo "Model name: ${MODEL_NAME}"
@@ -38,6 +42,7 @@ if [[ -n "$DEBUG" ]]; then
 else
    CMD_ARGS=("python")
 fi
+
 CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
 CMD_ARGS+=("${MODEL_PATH}")
 CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -50,7 +55,3 @@ CMD_ARGS+=("--outtype" "${TYPE}")
 echo ""
 echo "The environment variable CONVERTED_MODEL can be set to this path using:"
 echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
-if [[ -n "$MMPROJ" ]]; then
-    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
-    echo "The mmproj model was created in $(realpath "$mmproj_file")"
-fi
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -8,8 +8,24 @@
 #include <clocale>
 #include <cstdio>
 #include <cstring>
+#include <cinttypes>
 #include <string>
 #include <vector>
+#include <utility>
+
+struct spec_checkpoint {
+    int64_t n_tokens = 0;
+
+    std::vector<uint8_t> data;
+
+    size_t size() const {
+        return data.size();
+    }
+
+    bool empty() const {
+        return data.empty();
+    }
+};

 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
@@ -27,7 +43,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.speculative.mparams_dft.path.empty()) {
+    if (params.speculative.draft.mparams.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
@@ -46,6 +62,14 @@ int main(int argc, char ** argv) {
    model_tgt = llama_init_tgt->model();
    ctx_tgt   = llama_init_tgt->context();

+    // check if the context supports partial sequence removal
+    const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
+    const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
+
+    if (use_ckpt) {
+        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
+    }
+
    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);

    // load the draft model
@@ -53,7 +77,7 @@ int main(int argc, char ** argv) {

    // TODO: simplify this logic
    {
-        const auto & params_spec = params.speculative;
+        const auto & params_spec = params.speculative.draft;

        auto params_dft = params;

@@ -61,15 +85,15 @@ int main(int argc, char ** argv) {
        params_dft.n_ctx        = params_spec.n_ctx;
        params_dft.n_batch      = llama_n_ctx_seq(ctx_tgt);
        params_dft.devices      = params_spec.devices;
-        params_dft.model        = params_spec.mparams_dft;
+        params_dft.model        = params_spec.mparams;
        params_dft.n_gpu_layers = params_spec.n_gpu_layers;

        if (params_spec.cpuparams.n_threads > 0) {
-            params_dft.cpuparams.n_threads       = params.speculative.cpuparams.n_threads;
-            params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+            params_dft.cpuparams.n_threads       = params.speculative.draft.cpuparams.n_threads;
+            params_dft.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
        }

-        params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
+        params_dft.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;

        auto mparams_dft = common_model_params_to_llama(params_dft);

@@ -79,8 +103,8 @@ int main(int argc, char ** argv) {
            return 1;
        }

-        params.speculative.model_dft = model_dft.get();
-        params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
+        params.speculative.draft.model = model_dft.get();
+        params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
    }

    // Tokenize the prompt
@@ -119,7 +143,7 @@ int main(int argc, char ** argv) {
    const auto t_enc_start = ggml_time_us();

    // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+    common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));

    // eval the prompt
    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
@@ -142,21 +166,49 @@ int main(int argc, char ** argv) {

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

+    size_t n_draft = 0;
+
+    llama_tokens draft;
+    spec_checkpoint spec_ckpt;
+
    const auto t_enc_end = ggml_time_us();

    const auto t_dec_start = ggml_time_us();

    while (true) {
-        // optionally, generate draft tokens that can be appended to the target batch
+        // generate or reuse draft tokens
        //
        // this is the most important part of the speculation. the more probable tokens that are provided here
        // the better the performance will be. in theory, this computation can be performed asynchronously and even
        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
        // from a cache or lookup tables.
        //
-        llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
+        if (draft.empty()) {
+            // generate a new draft
+            draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);

-        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
+            // save the original draft size
+            n_draft = draft.size();
+
+            // save a checkpoint of the target context before evaluating the draft
+            // this allows us to restore the state if partial draft acceptance occurs
+            if (!draft.empty() && use_ckpt) {
+                const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                spec_ckpt.data.resize(ckpt_size);
+
+                const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                GGML_ASSERT(n == ckpt_size);
+
+                spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
+                LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
+                        spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
+            }
+        } else {
+            // we have a previous (partial) draft to reuse from checkpoint restoration
+            if (use_ckpt) {
+                GGML_ASSERT(!spec_ckpt.empty());
+            }
+        }

        // always have a token to evaluate from before - id_last
        common_batch_clear(batch_tgt);
@@ -164,11 +216,6 @@ int main(int argc, char ** argv) {

        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
        {
-            // do not waste time on small drafts
-            if (draft.size() < (size_t) params_spec.n_min) {
-                draft.clear();
-            }
-
            for (size_t i = 0; i < draft.size(); ++i) {
                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
            }
@@ -178,6 +225,12 @@ int main(int argc, char ** argv) {
            llama_decode(ctx_tgt, batch_tgt);
        }

+        // only save the sampler sampler state if we use checkpoints
+        common_sampler_ptr smpl_save;
+        if (use_ckpt) {
+            smpl_save.reset(common_sampler_clone(smpl.get()));
+        }
+
        // sample from the full target batch and return the accepted tokens based on the target sampler
        //
        // for each token to be accepted, the sampler would have to sample that same token
@@ -185,14 +238,38 @@ int main(int argc, char ** argv) {
        // available logits from the batch and sample the next token until we run out of logits or the sampler
        // disagrees with the draft
        //
-        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
+        auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);

        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());

        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token

+        // check for partial draft acceptance:
+        // if the context doesn't support partial sequence removal, restore the checkpoint
+        // and make the accepted tokens the new partial draft for the next iteration
+        if (use_ckpt && ids.size() - 1 < draft.size()) {
+            LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
+
+            draft = std::move(ids);
+
+            const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+            GGML_ASSERT(n == spec_ckpt.size());
+
+            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
+
+            prompt_tgt.resize(spec_ckpt.n_tokens);
+            smpl = std::move(smpl_save);
+
+            n_past = (int) prompt_tgt.size();
+
+            continue;
+        }
+
+        common_speculative_accept(spec, ids.size() - 1);
+
+        // full acceptance: consume the draft and commit accepted tokens
        n_past    += ids.size() - 1;
-        n_drafted += draft.size(); // note: we ignore the discarded small drafts
+        n_drafted += n_draft; // note: we ignore the discarded small drafts
        n_accept  += ids.size() - 1;
        n_predict += ids.size();

@@ -222,6 +299,9 @@ int main(int argc, char ** argv) {

        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);

+        // clear the draft since it has been consumed
+        draft.clear();
+
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

@@ -243,7 +323,7 @@ int main(int argc, char ** argv) {
    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_INF("\n");
-    LOG_INF("n_draft   = %d\n", params_spec.n_max);
+    LOG_INF("n_draft   = %d\n", params_spec.draft.n_max);
    LOG_INF("n_predict = %d\n", n_predict);
    LOG_INF("n_drafted = %d\n", n_drafted);
    LOG_INF("n_accept  = %d\n", n_accept);
@@ -254,11 +334,10 @@ int main(int argc, char ** argv) {

    LOG_INF("\n");
    LOG_INF("target:\n\n");
-    common_perf_print(ctx_tgt, smpl);
+    common_perf_print(ctx_tgt, smpl.get());

    llama_batch_free(batch_tgt);

-    common_sampler_free(smpl);
    common_speculative_free(spec);

    llama_backend_free();
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.speculative.mparams_dft.path.empty()) {
+    if (params.speculative.draft.mparams.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_draft_split = params.speculative.p_split;
+    const float p_draft_split = params.speculative.draft.p_split;

    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
    std::uniform_real_distribution<> u_dist;
@@ -80,15 +80,15 @@ int main(int argc, char ** argv) {
    ctx_tgt   = llama_init_tgt->context();

    // load the draft model
-    params.devices = params.speculative.devices;
-    params.model = params.speculative.mparams_dft;
-    params.n_gpu_layers = params.speculative.n_gpu_layers;
-    if (params.speculative.cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    params.devices = params.speculative.draft.devices;
+    params.model = params.speculative.draft.mparams;
+    params.n_gpu_layers = params.speculative.draft.n_gpu_layers;
+    if (params.speculative.draft.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
    }

-    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
-    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;
+    params.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
+    params.tensor_buft_overrides     = params.speculative.draft.tensor_buft_overrides;

    auto llama_init_dft = common_init_from_params(params);

@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
    //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));

    // how many tokens to draft each time
-    int n_draft = params.speculative.n_max;
+    int n_draft = params.speculative.draft.n_max;

    int n_predict = 0;
    int n_drafted = 0;
--- a/examples/sycl/start-svr.sh
+++ b/examples/sycl/start-svr.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+Help() {
+  cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+This script processes files with specified options.
+
+Options:
+  -h, --help    Display this help message and exit.
+  -c, --context <value>    Set context length. Bigger need more memory.
+  -p, --promote <value>    Prompt to start generation with.
+  -m, --model   <value>    Full model file path.
+  -mg,--main-gpu <value>   Set main GPU ID (0 - n) for single GPU mode.
+  -sm,--split-mode <value> How to split the model across multiple GPUs, one of:
+                            - none: use one GPU only
+                            - layer (default): split layers and KV across GPUs
+                            - row: split rows across GPUs
+  -ngl,--n-gpu-layers <value>  Max. number of layers to store in VRAM (default: -1)
+  -lv,--log-verbosity <value>  Set the verbosity threshold. Messages with a higher verbosity will be
+                               ignored. Values:
+                                - 0: generic output
+                                - 1: error
+                                - 2: warning
+                                - 3: info
+                                - 4: debug
+
+
+EOF
+}
+
+BIN_FILE=./build/bin/llama-server
+SEED=0
+GPUS_SETTING=""
+
+MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
+NGL=99
+CONTEXT=4096
+GGML_SYCL_DEVICE=-1
+SPLIT_MODE=layer
+LOG_VERBOSE=3
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -c|--context)
+            CONTEXT=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -m|--model)
+            MODEL_FILE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -mg|--main-gpu)
+            GGML_SYCL_DEVICE=$2
+            SPLIT_MODE=none
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -sm|--split-mode)
+            SPLIT_MODE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -ngl|--n-gpu-layers)
+            NGL=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -lv|--log-verbosity)
+            LOG_VERBOSE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -h|--help)
+            Help
+            exit 0
+            ;;
+        *)
+            # Handle unknown options or stop processing options
+            echo "Invalid option: $1"
+            # Optional: exit script or shift to treat remaining as positional args
+            exit 1
+            ;;
+    esac
+done
+
+
+
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#support malloc device memory more than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
+
+if [ $GGML_SYCL_DEVICE -ne -1 ]; then
+    echo "Use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
+    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
+else
+    echo "Use all Intel GPUs, including iGPU & dGPU"
+    GPUS_SETTING="-sm ${SPLIT_MODE}"
+ fi
+
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
+ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
+
+
--- a/examples/sycl/test.sh
+++ b/examples/sycl/test.sh
@@ -38,7 +38,7 @@ SEED=0
 GPUS_SETTING=""

 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
 NGL=99
 CONTEXT=4096
 GGML_SYCL_DEVICE=-1
@@ -122,9 +122,10 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
-   echo "Use all Intel GPUs, including iGPU & dGPU"
+    echo "Use all Intel GPUs, including iGPU & dGPU"
+    GPUS_SETTING="-sm ${SPLIT_MODE}"
 fi

-echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
-ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
+ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap

--- a/examples/sycl/win-start-svr.bat
+++ b/examples/sycl/win-start-svr.bat
@@ -0,0 +1,179 @@
+::  MIT license
+::  Copyright (C) 2024 Intel Corporation
+::  SPDX-License-Identifier: MIT
+
+@echo off
+setlocal EnableExtensions EnableDelayedExpansion
+
+set "BIN_FILE=.\build\bin\llama-server.exe"
+set "SEED=0"
+set "GPUS_SETTING="
+
+set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
+set "NGL=99"
+set "CONTEXT=4096"
+set "GGML_SYCL_DEVICE=-1"
+set "SPLIT_MODE=layer"
+set "LOG_VERBOSE=3"
+
+if "%~1"=="" goto after_args
+
+:parse_args
+if "%~1"=="" goto after_args
+
+if /I "%~1"=="-c" (
+  if "%~2"=="" goto missing_value
+  set "CONTEXT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--context" (
+  if "%~2"=="" goto missing_value
+  set "CONTEXT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-m" (
+  if "%~2"=="" goto missing_value
+  set "MODEL_FILE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--model" (
+  if "%~2"=="" goto missing_value
+  set "MODEL_FILE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-mg" (
+  if "%~2"=="" goto missing_value
+  set "GGML_SYCL_DEVICE=%~2"
+  set "SPLIT_MODE=none"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--main-gpu" (
+  if "%~2"=="" goto missing_value
+  set "GGML_SYCL_DEVICE=%~2"
+  set "SPLIT_MODE=none"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-sm" (
+  if "%~2"=="" goto missing_value
+  set "SPLIT_MODE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--split-mode" (
+  if "%~2"=="" goto missing_value
+  set "SPLIT_MODE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-ngl" (
+  if "%~2"=="" goto missing_value
+  set "NGL=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--n-gpu-layers" (
+  if "%~2"=="" goto missing_value
+  set "NGL=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-lv" (
+  if "%~2"=="" goto missing_value
+  set "LOG_VERBOSE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--log-verbosity" (
+  if "%~2"=="" goto missing_value
+  set "LOG_VERBOSE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-h" goto help
+if /I "%~1"=="--help" goto help
+
+echo Invalid option: %~1
+exit /b 1
+
+:missing_value
+echo Missing value for option: %~1
+exit /b 1
+
+:help
+echo Usage: %~n0 [OPTIONS]
+echo.
+echo This script processes files with specified options.
+echo.
+echo Options:
+echo   -h, --help    Display this help message and exit.
+echo   -c, --context ^<value^>    Set context length. Bigger need more memory.
+echo   -m, --model   ^<value^>    Full model file path.
+echo   -mg,--main-gpu ^<value^>   Set main GPU ID (0 - n) for single GPU mode.
+echo   -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
+echo                             - none: use one GPU only
+echo                             - layer (default): split layers and KV across GPUs
+echo                             - row: split rows across GPUs
+echo   -ngl,--n-gpu-layers ^<value^>  Max. number of layers to store in VRAM (default: -1)
+echo   -lv,--log-verbosity ^<value^>  Set the verbosity threshold. Messages with a higher verbosity will be
+echo                                ignored. Values:
+echo                                 - 0: generic output
+echo                                 - 1: error
+echo                                 - 2: warning
+echo                                 - 3: info
+echo                                 - 4: debug
+exit /b 0
+
+:after_args
+
+REM In Windows CMD, source is not available; call oneAPI setvars if present.
+if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
+  call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
+) else (
+  echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
+)
+
+REM Support malloc device memory more than 4GB.
+set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
+echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
+
+if not "%GGML_SYCL_DEVICE%"=="-1" (
+  echo Use %GGML_SYCL_DEVICE% as main GPU
+  REM Use single GPU only.
+  set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
+  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
+  echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
+) else (
+  echo Use all Intel GPUs, including iGPU ^& dGPU
+  set "GPUS_SETTING=-sm %SPLIT_MODE%"
+)
+
+echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
+set "ZES_ENABLE_SYSMAN=1"
+%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
+
+endlocal
+
--- a/examples/sycl/win-test.bat
+++ b/examples/sycl/win-test.bat
@@ -2,10 +2,200 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

-set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-:: support malloc device memory more than 4GB.
-set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-set LOAD_MODE="--mmap"
-.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
+@echo off
+setlocal EnableExtensions EnableDelayedExpansion
+
+REM MIT license
+REM Copyright (C) 2024 Intel Corporation
+REM SPDX-License-Identifier: MIT
+
+set "BIN_FILE=.\build\bin\llama-completion.exe"
+set "SEED=0"
+set "GPUS_SETTING="
+
+set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
+set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
+set "NGL=99"
+set "CONTEXT=4096"
+set "GGML_SYCL_DEVICE=-1"
+set "SPLIT_MODE=layer"
+set "LOG_VERBOSE=3"
+
+if "%~1"=="" goto after_args
+
+:parse_args
+if "%~1"=="" goto after_args
+
+if /I "%~1"=="-c" (
+  if "%~2"=="" goto missing_value
+  set "CONTEXT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--context" (
+  if "%~2"=="" goto missing_value
+  set "CONTEXT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-p" (
+  if "%~2"=="" goto missing_value
+  set "INPUT_PROMPT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--promote" (
+  if "%~2"=="" goto missing_value
+  set "INPUT_PROMPT=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-m" (
+  if "%~2"=="" goto missing_value
+  set "MODEL_FILE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--model" (
+  if "%~2"=="" goto missing_value
+  set "MODEL_FILE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-mg" (
+  if "%~2"=="" goto missing_value
+  set "GGML_SYCL_DEVICE=%~2"
+  set "SPLIT_MODE=none"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--main-gpu" (
+  if "%~2"=="" goto missing_value
+  set "GGML_SYCL_DEVICE=%~2"
+  set "SPLIT_MODE=none"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-sm" (
+  if "%~2"=="" goto missing_value
+  set "SPLIT_MODE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--split-mode" (
+  if "%~2"=="" goto missing_value
+  set "SPLIT_MODE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-ngl" (
+  if "%~2"=="" goto missing_value
+  set "NGL=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--n-gpu-layers" (
+  if "%~2"=="" goto missing_value
+  set "NGL=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-lv" (
+  if "%~2"=="" goto missing_value
+  set "LOG_VERBOSE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+if /I "%~1"=="--log-verbosity" (
+  if "%~2"=="" goto missing_value
+  set "LOG_VERBOSE=%~2"
+  shift
+  shift
+  goto parse_args
+)
+
+if /I "%~1"=="-h" goto help
+if /I "%~1"=="--help" goto help
+
+echo Invalid option: %~1
+exit /b 1
+
+:missing_value
+echo Missing value for option: %~1
+exit /b 1
+
+:help
+echo Usage: %~n0 [OPTIONS]
+echo.
+echo This script processes files with specified options.
+echo.
+echo Options:
+echo   -h, --help    Display this help message and exit.
+echo   -c, --context ^<value^>    Set context length. Bigger need more memory.
+echo   -p, --promote ^<value^>    Prompt to start generation with.
+echo   -m, --model   ^<value^>    Full model file path.
+echo   -mg,--main-gpu ^<value^>   Set main GPU ID (0 - n) for single GPU mode.
+echo   -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
+echo                             - none: use one GPU only
+echo                             - layer (default): split layers and KV across GPUs
+echo                             - row: split rows across GPUs
+echo   -ngl,--n-gpu-layers ^<value^>  Max. number of layers to store in VRAM (default: -1)
+echo   -lv,--log-verbosity ^<value^>  Set the verbosity threshold. Messages with a higher verbosity will be
+echo                                ignored. Values:
+echo                                 - 0: generic output
+echo                                 - 1: error
+echo                                 - 2: warning
+echo                                 - 3: info
+echo                                 - 4: debug
+exit /b 0
+
+:after_args
+
+REM In Windows CMD, source is not available; call oneAPI setvars if present.
+if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
+  call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
+) else (
+  echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
+)
+
+REM Support malloc device memory more than 4GB.
+set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
+echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
+
+if not "%GGML_SYCL_DEVICE%"=="-1" (
+  echo Use %GGML_SYCL_DEVICE% as main GPU
+  REM Use single GPU only.
+  set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
+  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
+  echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
+) else (
+  echo Use all Intel GPUs, including iGPU ^& dGPU
+  set "GPUS_SETTING=-sm %SPLIT_MODE%"
+)
+
+echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
+set "ZES_ENABLE_SYSMAN=1"
+%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
+
+endlocal
+
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 10)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -213,7 +213,7 @@ set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
 set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")

 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph"                              ON)
 option(GGML_HIP_RCCL                        "ggml: use ROCm Collective Comm. Library"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -470,11 +470,10 @@ endforeach()

 target_link_libraries(ggml-base PRIVATE Threads::Threads)

-find_library(MATH_LIBRARY m)
-if (MATH_LIBRARY)
-    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
-        target_link_libraries(ggml-base PRIVATE m)
-    endif()
+if (DEFINED MATH_LIBRARY)
+    target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
+elseif (NOT WIN32 AND NOT DEFINED ENV{ONEAPI_ROOT})
+    target_link_libraries(ggml-base PRIVATE m)
 endif()

 if (CMAKE_SYSTEM_NAME MATCHES "Android")
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -1205,40 +1205,57 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg

    if (split_state.n_segments != 1) {
        GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
-        GGML_ASSERT(offset == 0);
-        GGML_ASSERT(size == ggml_nbytes(tensor));
        GGML_ASSERT(tensor->ne[3] == 1);
+
        size_t offset_data = 0;
        std::vector<size_t> simple_offsets(n_bufs, 0);
        if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
            GGML_ASSERT(tensor->ne[2] == 1);
+
+            const size_t row_stride = tensor->nb[1];
+            GGML_ASSERT(offset % row_stride == 0);
+            GGML_ASSERT(size   % row_stride == 0);
+            const int64_t r_start = offset / row_stride;
+            const int64_t r_count = size   / row_stride;
+            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
+
            const int64_t blck_size = ggml_blck_size(tensor->type);
            for (size_t s = 0; s < split_state.n_segments; s++) {
                for (size_t j = 0; j < n_bufs; j++) {
                    ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                    GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                    const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
-                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
+                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
+                        r_count, simple_tensor->nb[1], tensor->nb[1]);
                    offset_data       += nbytes;
                    simple_offsets[j] += nbytes;
                }
            }
-            GGML_ASSERT(offset_data*tensor->ne[1] == size);
+            GGML_ASSERT(offset_data*r_count == size);
            return;
        }
        GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
+
+        const size_t row_stride = tensor->nb[2];
+        GGML_ASSERT(offset % row_stride == 0);
+        GGML_ASSERT(size   % row_stride == 0);
+        const int64_t r_start = offset / row_stride;
+        const int64_t r_count = size   / row_stride;
+        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
+
        for (size_t s = 0; s < split_state.n_segments; s++) {
            for (size_t j = 0; j < n_bufs; j++) {
                ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
-                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
+                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
+                    r_count, simple_tensor->nb[2], tensor->nb[2]);
                offset_data       += nbytes;
                simple_offsets[j] += nbytes;
            }
        }
-        GGML_ASSERT(offset_data*tensor->ne[2] == size);
+        GGML_ASSERT(offset_data*r_count == size);
        return;
    }

@@ -1295,40 +1312,57 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co

    if (split_state.n_segments != 1) {
        GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
-        GGML_ASSERT(offset == 0);
-        GGML_ASSERT(size == ggml_nbytes(tensor));
        GGML_ASSERT(tensor->ne[3] == 1);
+
        size_t offset_data = 0;
        std::vector<size_t> simple_offsets(n_bufs, 0);
        if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
            GGML_ASSERT(tensor->ne[2] == 1);
+
+            const size_t row_stride = tensor->nb[1];
+            GGML_ASSERT(offset % row_stride == 0);
+            GGML_ASSERT(size   % row_stride == 0);
+            const int64_t r_start = offset / row_stride;
+            const int64_t r_count = size   / row_stride;
+            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
+
            const int64_t blck_size = ggml_blck_size(tensor->type);
            for (size_t s = 0; s < split_state.n_segments; s++) {
                for (size_t j = 0; j < n_bufs; j++) {
                    const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                    GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                    const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
-                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
+                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
+                        r_count, simple_tensor->nb[1], tensor->nb[1]);
                    offset_data       += nbytes;
                    simple_offsets[j] += nbytes;
                }
            }
-            GGML_ASSERT(offset_data*tensor->ne[1] == size);
+            GGML_ASSERT(offset_data*r_count == size);
            return;
        }
        GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
+
+        const size_t row_stride = tensor->nb[2];
+        GGML_ASSERT(offset % row_stride == 0);
+        GGML_ASSERT(size   % row_stride == 0);
+        const int64_t r_start = offset / row_stride;
+        const int64_t r_count = size   / row_stride;
+        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
+
        for (size_t s = 0; s < split_state.n_segments; s++) {
            for (size_t j = 0; j < n_bufs; j++) {
                const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
-                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
+                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
+                    r_count, simple_tensor->nb[2], tensor->nb[2]);
                offset_data       += nbytes;
                simple_offsets[j] += nbytes;
            }
        }
-        GGML_ASSERT(offset_data*tensor->ne[2] == size);
+        GGML_ASSERT(offset_data*r_count == size);
        return;
    }

@@ -1792,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                    continue;
                }

-                i = get_i_delayed(i);
+                const int i_delayed = get_i_delayed(i);
+
+                // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
+                // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
+                //     its compute flag disabled and thus gets its data zeroed out.
+                // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
+                if (i_delayed > i) {
+                    for (size_t j = 0; j < n_backends; j++) {
+                        auto & bcj = backend_ctx->backend_configs[j];
+                        if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                            for (int ii = i + 1; ii <= i_delayed; ii++) {
+                                bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+                            }
+                        }
+                    }
+                }
+
+                i = i_delayed;

                for (size_t j = 0; j < n_backends; j++) {
                    auto & bcj = backend_ctx->backend_configs[j];
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -181,6 +181,12 @@ struct ggml_backend_registry {
            return;
        }

+        for (auto & entry : backends) {
+            if (entry.reg == reg) {
+                return;
+            }
+        }
+
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
@@ -192,6 +198,12 @@ struct ggml_backend_registry {
    }

    void register_device(ggml_backend_dev_t device) {
+        for (auto & dev : devices) {
+            if (dev == device) {
+                return;
+            }
+        }
+
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -25,6 +25,7 @@
 #include "ggml-impl.h"
 #include "ggml.h"

+
 #include <aclnnop/aclnn_add.h>
 #include <aclnnop/aclnn_add_rms_norm.h>
 #include <aclnnop/aclnn_addcdiv.h>
@@ -45,7 +46,9 @@
 #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
 #include <aclnnop/aclnn_ger.h>
 #include <aclnnop/aclnn_group_norm.h>
+#include <aclnnop/aclnn_gather_v2.h>
 #include <aclnnop/aclnn_grouped_matmul_v3.h>
+#include <aclnnop/aclnn_scatter.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_im2col.h>
 #include <aclnnop/aclnn_index_copy.h>
@@ -62,6 +65,7 @@
 #include <aclnnop/aclnn_permute.h>
 #include <aclnnop/aclnn_pow.h>
 #include <aclnnop/aclnn_pow_tensor_tensor.h>
+#include <aclnnop/aclnn_recurrent_gated_delta_rule.h>
 #include <aclnnop/aclnn_reduce_sum.h>
 #include <aclnnop/aclnn_reflection_pad1d.h>
 #include <aclnnop/aclnn_repeat.h>
@@ -69,11 +73,15 @@
 #include <aclnnop/aclnn_rms_norm.h>
 #include <aclnnop/aclnn_roll.h>
 #include <aclnnop/aclnn_softmax.h>
+#include <aclnnop/aclnn_softmax_cross_entropy_with_logits.h>
 #include <aclnnop/aclnn_sub.h>
 #include <aclnnop/aclnn_sum.h>
 #include <aclnnop/aclnn_threshold.h>
 #include <aclnnop/aclnn_tril.h>
+#include <aclnnop/aclnn_triangular_solve.h>
 #include <aclnnop/aclnn_triu.h>
+#include <aclnnop/aclnn_logical_not.h>
+#include <aclnnop/aclnn_masked_fill_scalar.h>
 #include <aclnnop/aclnn_upsample_nearest_2d.h>
 #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
 #include <aclnnop/aclnn_zero.h>
@@ -151,6 +159,107 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
 }

+// Fused SwiGLU using aclnnSwiGlu: splits input along innermost dim, applies
+// SiLU to left half, multiplies by right half.
+//
+// Falls back to the generic two-kernel path when src[1] != nullptr (two
+// independent halves) or swapped != 0 (reversed activation order), as
+// aclnnSwiGlu only handles the single interleaved tensor in standard order.
+//
+// CANN tiling for SwiGlu requires (storageShapeDim + viewDims) to be even.
+// aclCreateTensor always uses storageShapeDim=1, so viewDims must be odd.
+// We use a 3D view (1+3=4, even) to satisfy this constraint while preserving
+// correct split semantics along the innermost (ne[0]) dimension.
+void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    auto silu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, Silu, acl_src, acl_dst);
+    };
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    if (dst->src[1] != nullptr || swapped != 0) {
+        ggml_cann_op_unary_gated(silu_fn, ctx, dst);
+        return;
+    }
+
+    // aclnnSwiGlu requires the split dim (src->ne[0]) to be even; fall back otherwise.
+    if (dst->src[0]->ne[0] % 2 != 0) {
+        ggml_cann_op_unary_gated(silu_fn, ctx, dst);
+        return;
+    }
+
+    ggml_tensor * src0 = dst->src[0];
+    size_t elem_size = ggml_element_size(src0);
+
+    // src0 GGML: [2*ne0, ne1, ne2, ne3] → 3D view [2*ne0, ne1, ne2*ne3]
+    // CANN reversed: [ne2*ne3, ne1, 2*ne0], split along CANN dim 2 (last).
+    int64_t ne0_x2   = src0->ne[0];
+    int64_t ne1      = src0->ne[1];
+    int64_t ne23     = src0->ne[2] * src0->ne[3];
+    int64_t src3d_ne[] = { ne0_x2, ne1, ne23 };
+    size_t  src3d_nb[] = { (size_t)src0->nb[0], (size_t)src0->nb[1], (size_t)src0->nb[2] };
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
+                                                     elem_size, src3d_ne, src3d_nb, 3);
+
+    // dst GGML: [ne0, ne1, ne2, ne3] → 3D view [ne0, ne1, ne2*ne3]
+    int64_t ne0      = dst->ne[0];
+    int64_t dst3d_ne[] = { ne0, ne1, ne23 };
+    size_t  dst3d_nb[] = { (size_t)dst->nb[0], (size_t)dst->nb[1], (size_t)dst->nb[2] };
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
+                                                     elem_size, dst3d_ne, dst3d_nb, 3);
+
+    // CANN tensor [ne23, ne1, 2*ne0]: split along CANN dim 2 (last) = 2*ne0.
+    GGML_CANN_CALL_ACLNN_OP(ctx, SwiGlu, acl_src.get(), (int64_t)2, acl_dst.get());
+}
+
+// Fused GeGLU using aclnnGeGluV3: splits input along ne[0] (CANN last dim),
+// activates the LEFT half with GELU, multiplies by right half.
+// approximate: 0=tanh, 1=none(erf). activateLeft=true matches GGML convention.
+// outGelu is a required-but-discard output buffer.
+//
+// Falls back to the generic two-kernel path when src[1] != nullptr (two
+// independent halves) or swapped != 0 (reversed activation order), as
+// aclnnGeGluV3 only handles the single interleaved tensor in standard order.
+void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate) {
+    auto gelu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, Gelu, acl_src, acl_dst);
+    };
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    if (dst->src[1] != nullptr || swapped != 0) {
+        ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
+        return;
+    }
+
+    // aclnnGeGluV3 requires the split dim (src->ne[0]) to be even; fall back otherwise.
+    if (dst->src[0]->ne[0] % 2 != 0) {
+        ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
+        return;
+    }
+
+    ggml_tensor * src0 = dst->src[0];
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+    // Allocate a temporary buffer for the required outGelu output (same shape as dst).
+    // Build contiguous strides since the pool allocation is a fresh buffer.
+    size_t  elem_size    = ggml_element_size(dst);
+    int64_t ne[GGML_MAX_DIMS] = { dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3] };
+    size_t  nb[GGML_MAX_DIMS];
+    nb[0] = elem_size;
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        nb[i] = nb[i - 1] * ne[i - 1];
+    }
+    size_t gelu_out_size = nb[GGML_MAX_DIMS - 1] * ne[GGML_MAX_DIMS - 1];
+    ggml_cann_pool_alloc gelu_out_alloc(ctx.pool(), gelu_out_size);
+
+    acl_tensor_ptr acl_gelu_out = ggml_cann_create_tensor(
+        gelu_out_alloc.get(), ggml_cann_type_mapping(dst->type), elem_size, ne, nb, GGML_MAX_DIMS);
+    // V3 adds activateLeft param; true → Gelu(left)*right, matching GGML convention.
+    // GGML dim 0 → CANN last dim (index GGML_MAX_DIMS-1 = 3 for 4D tensor).
+    GGML_CANN_CALL_ACLNN_OP(ctx, GeGluV3, acl_src.get(), (int64_t)(GGML_MAX_DIMS - 1), approximate, true,
+                             acl_dst.get(), acl_gelu_out.get());
+}
+
 /**
 * @brief Repeats elements of a tensor along each dimension according to the
 * specified repeat array.
@@ -445,28 +554,33 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
    void *               buffer = temp_buffer_allocator.get();

-    int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
-    size_t  div_nb[GGML_MAX_DIMS];
-    div_nb[0] = sizeof(float);
+    int64_t norm_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
+    size_t  norm_nb[GGML_MAX_DIMS];
+    norm_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-        div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
+        norm_nb[i] = norm_nb[i - 1] * norm_ne[i - 1];
    }
-    acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
+    acl_tensor_ptr acl_norm = ggml_cann_create_tensor(buffer, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);

    std::vector<int64_t> norm_dims  = { 3 };
    acl_int_array_ptr    dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());

    float          p_value  = 2.0f;
    acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
-    GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
+    GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_norm.get());

-    // Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
-    acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
-    float          flt_max = FLT_MAX;
-    acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
-    GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
+    ggml_cann_pool_alloc clamp_buffer_allocator(ctx.pool());
+    acl_tensor_ptr       acl_clamped;

-    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
+    if (eps > 0.0f) {
+        void *         clamp_buf  = clamp_buffer_allocator.alloc(n_bytes);
+        acl_clamped               = ggml_cann_create_tensor(clamp_buf, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);
+        acl_scalar_ptr eps_scalar = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
+        GGML_CANN_CALL_ACLNN_OP(ctx, ClampMin, acl_norm.get(), eps_scalar.get(), acl_clamped.get());
+    }
+
+    aclTensor * acl_div_input = acl_clamped ? acl_clamped.get() : acl_norm.get();
+    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div_input, acl_dst.get());
 }

 void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -482,56 +596,30 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *
    logits_nb[1]              = logits_nb[0] * logits_ne[0];
    acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);

-    size_t               log_softmax_type_size = sizeof(float);
-    int64_t              log_softmax_n_bytes   = nr * nc * log_softmax_type_size;
-    ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
-    void *               log_softmax_buffer = log_softmax_allocator.get();
-
-    int64_t log_softmax_ne[] = { nc, nr };
-    size_t  log_softmax_nb[2];
-    log_softmax_nb[0]              = log_softmax_type_size;
-    log_softmax_nb[1]              = log_softmax_nb[0] * log_softmax_ne[0];
-    acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
-                                                             log_softmax_ne, log_softmax_nb, 2);
-
-    GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
-
    int64_t labels_ne[] = { nc, nr };
    size_t  labels_nb[2];
    labels_nb[0]              = ggml_type_size(src1->type);
    labels_nb[1]              = labels_nb[0] * labels_ne[0];
    acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);

-    size_t               mul_type_size = sizeof(float);
-    int64_t              mul_n_bytes   = nr * nc * mul_type_size;
-    ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
-    void *               mul_buffer = mul_allocator.get();
+    size_t               loss_per_sample_type_size = sizeof(float);
+    int64_t              loss_per_sample_n_bytes   = nr * loss_per_sample_type_size;
+    ggml_cann_pool_alloc loss_per_sample_allocator(ctx.pool(), loss_per_sample_n_bytes);
+    void *               loss_per_sample_buffer = loss_per_sample_allocator.get();

-    int64_t mul_ne[] = { nc, nr };
-    size_t  mul_nb[2];
-    mul_nb[0]                     = mul_type_size;
-    mul_nb[1]                     = mul_nb[0] * mul_ne[0];
-    acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
+    int64_t loss_per_sample_ne[] = { nr };
+    size_t  loss_per_sample_nb[1];
+    loss_per_sample_nb[0] = loss_per_sample_type_size;
+    acl_tensor_ptr acl_loss_per_sample = ggml_cann_create_tensor(
+        loss_per_sample_buffer, ACL_FLOAT, loss_per_sample_type_size, loss_per_sample_ne, loss_per_sample_nb, 1);

-    GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
+    size_t               backprop_n_bytes = nr * nc * sizeof(float);
+    ggml_cann_pool_alloc backprop_allocator(ctx.pool(), backprop_n_bytes);
+    void *               backprop_buffer = backprop_allocator.get();
+    acl_tensor_ptr acl_backprop = ggml_cann_create_tensor(backprop_buffer, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);

-    size_t               sum_per_sample_type_size = sizeof(float);
-    int64_t              sum_per_sample_n_bytes   = nr * sum_per_sample_type_size;
-    ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
-    void *               sum_per_sample_buffer = sum_per_sample_allocator.get();
-
-    int64_t sum_per_sample_ne[] = { nr };
-    size_t  sum_per_sample_nb[1];
-    sum_per_sample_nb[0]              = sum_per_sample_type_size;
-    acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
-        sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
-
-    std::vector<int64_t> sum_dims   = { 1 };
-    acl_int_array_ptr    dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
-    bool                 keep_dims  = false;
-
-    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
-                            acl_sum_per_sample.get());
+    GGML_CANN_CALL_ACLNN_OP(ctx, SoftmaxCrossEntropyWithLogits, acl_logits.get(), acl_labels.get(),
+                            acl_loss_per_sample.get(), acl_backprop.get());

    size_t               total_sum_type_size = sizeof(float);
    int64_t              total_sum_n_bytes   = 1 * total_sum_type_size;
@@ -547,11 +635,12 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *

    std::vector<int64_t> total_sum_dims    = { 0 };
    acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
+    bool              keep_dims            = false;

-    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
+    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_loss_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
                            acl_total_sum.get());

-    float          value        = -1.0f / static_cast<float>(nr);
+    float          value        = 1.0f / static_cast<float>(nr);
    acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
    acl_tensor_ptr acl_dst =
        ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
@@ -589,6 +678,33 @@ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
                            acl_mean_out.get(), acl_rstd_out.get());
 }

+void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
+
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
+
+    // Create a view of dst at the target offset with src1's dimensions
+    acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+    acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
+
+    if (!inplace) {
+        // First copy src0 to dst entirely
+        size_t cpy_size = ggml_nbytes(dst);
+        ACL_CHECK(
+            aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // Copy src1 into the target region of dst
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst.get(), acl_src1.get());
+}
+
 void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];
@@ -652,6 +768,113 @@ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
 }

+void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+    // GGML cumsum operates along dim 0 (innermost / ne[0]).
+    // ggml_cann_create_tensor reverses dimensions to [ne3,ne2,ne1,ne0],
+    // so GGML dim 0 maps to CANN dim 3 (the last dim of the 4-D tensor).
+    GGML_CANN_CALL_ACLNN_OP(ctx, Cumsum, acl_src.get(), (int64_t)3,
+                            ggml_cann_type_mapping(dst->type), acl_dst.get());
+}
+
+void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];  // A: [N, N, B2, B3] lower triangular
+    ggml_tensor * src1 = dst->src[1];  // B: [K, N, B2, B3]
+
+    acl_tensor_ptr acl_a = ggml_cann_create_tensor(src0);
+    acl_tensor_ptr acl_b = ggml_cann_create_tensor(src1);
+    acl_tensor_ptr acl_x = ggml_cann_create_tensor(dst);
+
+    // mOut: triangular copy of A (required output), same shape as A.
+    const size_t a_bytes = ggml_nbytes(src0);
+    ggml_cann_pool_alloc m_alloc(ctx.pool(), a_bytes);
+    acl_tensor_ptr acl_m = ggml_cann_create_tensor(
+        m_alloc.get(), ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+
+    // Solve AX = B: upper=false (lower tri), transpose=false, unitriangular=false.
+    GGML_CANN_CALL_ACLNN_OP(ctx, TriangularSolve,
+        acl_b.get(), acl_a.get(), false, false, false,
+        acl_x.get(), acl_m.get());
+}
+
+void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
+
+    GGML_ASSERT(src->ne[1] == 1);
+
+    const int64_t N       = src->ne[0];
+    const int64_t n_batch = src->ne[2] * src->ne[3];
+    const size_t  nb_f32  = sizeof(float);
+
+    // Fill dst with zeros.
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+    {
+        float          zero = 0.0f;
+        acl_scalar_ptr acl_zero = ggml_cann_create_scalar(&zero, ACL_FLOAT);
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_zero.get());
+    }
+
+    // Copy src vector onto the diagonal of dst via strided views.
+    // src viewed as [N, n_batch], contiguous strides.
+    int64_t ne_vec[2]      = { N, n_batch };
+    size_t  nb_src_vec[2]  = { nb_f32, N * nb_f32 };
+    // dst diagonal view: stride (N+1)*4 steps along the diagonal.
+    size_t  nb_dst_diag[2] = { (N + 1) * nb_f32, N * N * nb_f32 };
+
+    acl_tensor_ptr acl_src_vec  = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne_vec, nb_src_vec, 2);
+    acl_tensor_ptr acl_dst_diag = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne_vec, nb_dst_diag, 2);
+
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst_diag.get(), acl_src_vec.get());
+}
+
+void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    float c = ggml_get_op_params_f32(dst, 0);
+
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+    acl_scalar_ptr acl_c   = ggml_cann_create_scalar(&c, ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_c.get());
+}
+
+void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
+
+    const int64_t S       = src->ne[0];
+    const int64_t n_batch = src->ne[2] * src->ne[3];
+    const size_t  nb_f32  = sizeof(float);
+
+    int64_t ne3d[3] = { S, S, n_batch };
+    size_t  nb3d[3] = { nb_f32, S * nb_f32, S * S * nb_f32 };
+
+    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+
+    switch (ttype) {
+        case GGML_TRI_TYPE_LOWER:
+            // Tril(-1): preserve row > col (strict lower), zero upper + diagonal.
+            GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)-1, acl_dst.get());
+            break;
+        case GGML_TRI_TYPE_UPPER_DIAG:
+            // Triu(0): preserve row <= col (upper + diagonal), zero strict lower.
+            GGML_CANN_CALL_ACLNN_OP(ctx, Triu, acl_src.get(), (int64_t)0, acl_dst.get());
+            break;
+        case GGML_TRI_TYPE_UPPER:
+            // Triu(1): preserve row < col (strict upper), zero lower + diagonal.
+            GGML_CANN_CALL_ACLNN_OP(ctx, Triu, acl_src.get(), (int64_t)1, acl_dst.get());
+            break;
+        case GGML_TRI_TYPE_LOWER_DIAG:
+            // Tril(0): preserve row >= col (lower + diagonal), zero strict upper.
+            GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)0, acl_dst.get());
+            break;
+        default:
+            GGML_ABORT("unsupported tri type");
+    }
+}
+
 void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor *  src     = dst->src[0];
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
@@ -1695,152 +1918,90 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
 }

-/**
- * @brief Performs index select operation on a 4D tensor using the CANN backend.
- *
- * This function applies the `IndexSelect` operation along a specific dimension
- * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
- * It iterates over the last two dimensions of the source tensor, creates the corresponding
- * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
- * operation for each slice.
- *
- * @param ctx The context for CANN backend operations.
- * @param src_buffer The source buffer containing the 4D input tensor data.
- * @param src_ne The dimensions of the source tensor.
- * @param src_nb The strides (byte offsets) of the source tensor.
- * @param dst_buffer The destination buffer where the output tensor data will be written.
- * @param dst_ne The dimensions of the destination tensor.
- * @param dst_nb The strides (byte offsets) of the destination tensor.
- * @param index The index tensor specifying the indices to select from the source tensor.
- * @param type The data type of the source and destination tensors.
- */
-static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
-                                  void *                      src_buffer,
-                                  int64_t *                   src_ne,
-                                  size_t *                    src_nb,
-                                  void *                      dst_buffer,
-                                  int64_t *                   dst_ne,
-                                  size_t *                    dst_nb,
-                                  ggml_tensor *               index,
-                                  ggml_type                   type) {
-    for (int64_t i = 0; i < src_ne[3]; i++) {
-        for (int64_t j = 0; j < src_ne[2]; j++) {
-            // src
-            acl_tensor_ptr acl_src_tensor =
-                ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
-                                        ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
-
-            // index
-            acl_tensor_ptr acl_index = ggml_cann_create_tensor(
-                (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
-                ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
-
-            // out
-            acl_tensor_ptr acl_out =
-                ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
-                                        ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
-            GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
-        }
-    }
-}
-
-/**
- * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
- *
- * This function applies the `IndexCopy` operation along a specific dimension of the
- * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
- * to positions specified by the index tensor (`index`).
- * It iterates over the last two dimensions of the tensors, creates the corresponding
- * CANN tensors for source, index, and destination slices, and performs the index copy
- * operation for each slice.
- *
- * @param ctx The context for CANN backend operations.
- * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
- * @param src_ne The dimensions of the source tensor.
- * @param src_nb The strides (byte offsets) of the source tensor.
- * @param dst_buffer The destination buffer where values will be copied to.
- * @param dst_ne The dimensions of the destination tensor.
- * @param dst_nb The strides (byte offsets) of the destination tensor.
- * @param index The index tensor specifying target positions in the destination tensor.
- * @param type The data type of the source and destination tensors.
- */
-static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
-                                void *                      src_buffer,
-                                int64_t *                   src_ne,
-                                size_t *                    src_nb,
-                                void *                      dst_buffer,
-                                int64_t *                   dst_ne,
-                                size_t *                    dst_nb,
-                                ggml_tensor *               index,
-                                ggml_type                   type) {
-    for (int64_t i = 0; i < src_ne[3]; i++) {
-        for (int64_t j = 0; j < src_ne[2]; j++) {
-            // src
-            acl_tensor_ptr acl_src_tensor =
-                ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
-                                        ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
-
-            // index
-            acl_tensor_ptr acl_index = ggml_cann_create_tensor(
-                (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
-                ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
-
-            // out
-            acl_tensor_ptr acl_out =
-                ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
-                                        ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
-            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
-        }
-    }
-}

 void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];  // src
+    ggml_tensor * src0 = dst->src[0];  // weight
    ggml_tensor * src1 = dst->src[1];  // index

    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
                || dst->type == GGML_TYPE_BF16);

+    // n_idx: number of row indices per (i2, i3) batch slice.
+    // ggml guarantees: src0->ne[2] == src1->ne[1], src0->ne[3] == src1->ne[2], src1->ne[3] == 1.
+    const int64_t n_idx = src1->ne[0];
+
+    // Gather all (i2, i3) batch slices from src into dst.
+    // ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0].
+    // GatherV2 with dim=0 gathers along ACL dim-0 == ggml ne[1] (the vocabulary / row axis).
+    // nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
+    //     nb[2..3] for computing per-batch-slice base pointer offsets).
+    auto gather_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
+                              const size_t * nb) {
+        int64_t src_ne[2]  = { src0->ne[0], src0->ne[1] };
+        size_t  src_nb_2d[2] = { nb[0], nb[1] };
+        int64_t dst_ne[2]  = { src0->ne[0], n_idx };
+        size_t  dst_nb_2d[2] = { dst->nb[0], dst->nb[1] };
+        int64_t idx_ne[1]  = { n_idx };
+        size_t  idx_nb[1]  = { (size_t)ggml_element_size(src1) };
+
+        for (int64_t i3 = 0; i3 < src0->ne[3]; i3++) {
+            for (int64_t i2 = 0; i2 < src0->ne[2]; i2++) {
+                acl_tensor_ptr acl_src = ggml_cann_create_tensor(
+                    (char *)src_base + i3 * nb[3] + i2 * nb[2],
+                    acl_type, type_size, src_ne, src_nb_2d, 2);
+                acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
+                    (char *)src1->data + i3 * src1->nb[2] + i2 * src1->nb[1],
+                    ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
+                    idx_ne, idx_nb, 1);
+                acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
+                    (char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
+                    acl_type, type_size, dst_ne, dst_nb_2d, 2);
+                GGML_CANN_CALL_ACLNN_OP(ctx, GatherV2, acl_src.get(), 0, acl_idx.get(), acl_dst.get());
+            }
+        }
+    };
+
    switch (src0->type) {
        case GGML_TYPE_BF16:
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
            if (src0->type == dst->type) {
-                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
-                                      dst->type);
+                gather_batched(src0->data,
+                               ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+                               src0->nb);
            } else {
-                acl_tensor_ptr       acl_src0 = ggml_cann_create_tensor(src0);
-                ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
-                void *               src_trans_buffer = src_buffer_allocator.get();
-                size_t               src_trans_nb[GGML_MAX_DIMS];
-                src_trans_nb[0] = dst->nb[0];
+                // Cast src0 to dst type, then gather.
+                ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
+                                                        ggml_nelements(src0) * ggml_element_size(dst));
+                size_t src_cast_nb[GGML_MAX_DIMS];
+                src_cast_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                    src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
                }
-                acl_tensor_ptr src_trans_tensor =
-                    ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
-                                            ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
-                aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
-                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
-                                      dst->type);
+                acl_tensor_ptr acl_src0     = ggml_cann_create_tensor(src0);
+                acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
+                    src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_cast_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
+
+                gather_batched(src_cast_allocator.get(),
+                               ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                               src_cast_nb);
            }
            break;
        case GGML_TYPE_Q8_0:
            {
-                // add 1 dim for bcast mul.
+                // Dequantize Q8_0 to dst type, then gather.
                size_t  weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
                int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
-                int64_t scale_offset = 0;
-                // [3,4,5,64] -> [3,4,5,2,32]
-                weight_ne[0]         = QK8_0;
-                weight_ne[1]         = src0->ne[0] / QK8_0;
-                weight_nb[0]         = sizeof(int8_t);
-                weight_nb[1]         = weight_nb[0] * weight_ne[0];
+                weight_ne[0] = QK8_0;
+                weight_ne[1] = src0->ne[0] / QK8_0;
+                weight_nb[0] = sizeof(int8_t);
+                weight_nb[1] = weight_nb[0] * weight_ne[0];
                for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
                    weight_ne[i] = src0->ne[i - 1];
                    weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
                }
-                // [3,4,5,64] -> [3,4,5,2,1]
                scale_ne[0] = 1;
                scale_ne[1] = src0->ne[0] / QK8_0;
                scale_nb[0] = sizeof(uint16_t);
@@ -1849,31 +2010,33 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
                    scale_ne[i] = src0->ne[i - 1];
                    scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
                }
-                // [3,4,5,64] -> [3,4,5,2,32]
                dequant_ne    = weight_ne;
                dequant_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
                    dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
                }
-                scale_offset = ggml_nelements(src0) * sizeof(int8_t);
-                ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
-                                                              ggml_nelements(src0) * ggml_type_size(dst->type));
-                acl_tensor_ptr       acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
-                                                                                 weight_ne, weight_nb, GGML_MAX_DIMS + 1);
-                acl_tensor_ptr       acl_scale_tensor =
-                    ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
-                                            GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
-                acl_tensor_ptr dequant_tensor =
-                    ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
-                                            ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-                aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
-                dequant_nb[0] = ggml_type_size(dst->type);
+                const int64_t scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+                ggml_cann_pool_alloc dequant_allocator(ctx.pool(),
+                                                       ggml_nelements(src0) * ggml_type_size(dst->type));
+                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
+                                                                     weight_ne, weight_nb, GGML_MAX_DIMS + 1);
+                acl_tensor_ptr acl_scale  = ggml_cann_create_tensor(
+                    src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
+                    GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+                acl_tensor_ptr acl_dequant = ggml_cann_create_tensor(
+                    dequant_allocator.get(), ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+                aclnn_mul(ctx, acl_weight.get(), acl_scale.get(), acl_dequant.get());
+
+                // Reinterpret dequant buffer as 4D [src0->ne] with contiguous strides.
                dequant_ne    = src0->ne;
+                dequant_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
                    dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
                }
-                aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
-                                      dst->nb, src1, dst->type);
+                gather_batched(dequant_allocator.get(),
+                               ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                               dequant_nb);
                break;
            }
        default:
@@ -1883,31 +2046,70 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 }

 void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];  // src
-    ggml_tensor * src1 = dst->src[1];  // index
+    ggml_tensor * src0 = dst->src[0];  // source values
+    ggml_tensor * src1 = dst->src[1];  // row indices
+
+    // n_idx: number of source rows to scatter per batch slice.
+    // ggml guarantees: src0->ne[1] == src1->ne[0].
+    const int64_t n_idx = src1->ne[0];
+
+    // Copy n_idx rows of src [ne0, n_idx] into dst [ne0, ne1] at positions given by a 1D index.
+    // ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0] for dst.
+    // InplaceIndexCopy with dim=0 copies along ACL dim-0 == ggml ne[1] (the row axis).
+    // src_nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
+    //         nb[2..3] for computing per-batch-slice base pointer offsets).
+    auto scatter_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
+                               const size_t * src_nb) {
+        int64_t d_ne[2]    = { dst->ne[0], dst->ne[1] };
+        size_t  d_nb[2]    = { dst->nb[0], dst->nb[1] };
+        int64_t s_ne[2]    = { dst->ne[0], n_idx };
+        size_t  s_nb_2d[2] = { src_nb[0], src_nb[1] };
+        int64_t i_ne[1]    = { n_idx };
+        size_t  i_nb[1]    = { (size_t)ggml_element_size(src1) };
+
+        for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
+            for (int64_t i2 = 0; i2 < dst->ne[2]; i2++) {
+                acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
+                    (char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
+                    acl_type, type_size, d_ne, d_nb, 2);
+                acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
+                    (char *)src1->data + (i3 % src1->ne[2]) * src1->nb[2] + (i2 % src1->ne[1]) * src1->nb[1],
+                    ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
+                    i_ne, i_nb, 1);
+                acl_tensor_ptr acl_src = ggml_cann_create_tensor(
+                    (char *)src_base + i3 * src_nb[3] + i2 * src_nb[2],
+                    acl_type, type_size, s_ne, s_nb_2d, 2);
+                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_dst.get(), 0, acl_idx.get(), acl_src.get());
+            }
+        }
+    };

    switch (dst->type) {
        case GGML_TYPE_F32:
-            {
-                aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
-                break;
-            }
+            scatter_batched(src0->data,
+                            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                            src0->nb);
+            break;
        case GGML_TYPE_F16:
        case GGML_TYPE_BF16:
            {
-                acl_tensor_ptr       acl_src0 = ggml_cann_create_tensor(src0);
-                ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
-                void *               src_trans_buffer = src_buffer_allocator.get();
-                size_t               src_trans_nb[GGML_MAX_DIMS];
-                src_trans_nb[0] = sizeof(uint16_t);
+                // Cast src0 (F32) to dst type first.
+                ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
+                                                        ggml_nelements(src0) * ggml_type_size(dst->type));
+                size_t src_cast_nb[GGML_MAX_DIMS];
+                src_cast_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                    src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
                }
-                acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
-                    src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
-                aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
-                aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
-                                    dst->type);
+                acl_tensor_ptr acl_src0     = ggml_cann_create_tensor(src0);
+                acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
+                    src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_cast_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
+
+                scatter_batched(src_cast_allocator.get(),
+                                ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                                src_cast_nb);
                break;
            }
        default:
@@ -3268,29 +3470,50 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst
    int64_t           paddingsArray[2] = { opts[0], opts[1] };
    acl_int_array_ptr paddings         = ggml_cann_create_int_array(paddingsArray, 2);

-    for (int64_t i = 0; i < src0->ne[3]; i++) {
-        acl_tensor_ptr acl_src =
-            ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
-                                    ggml_element_size(src0), src0->ne, src0->nb, 3);
+    // Collapsing ne[2]*ne[3] into a single batch dimension requires that dim3
+    // is contiguous with respect to dim2 in both src and dst.
+    GGML_ASSERT(src0->nb[3] == src0->nb[2] * src0->ne[2]);
+    GGML_ASSERT(dst->nb[3]  == dst->nb[2]  * dst->ne[2]);

-        acl_tensor_ptr acl_dst =
-            ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
-                                    ggml_element_size(dst), dst->ne, dst->nb, 3);
+    int64_t src_ne_3d[3] = { src0->ne[0], src0->ne[1], src0->ne[2] * src0->ne[3] };
+    int64_t dst_ne_3d[3] = { dst->ne[0],  dst->ne[1],  dst->ne[2]  * dst->ne[3]  };

-        GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
-    }
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
+                                                     ggml_element_size(src0), src_ne_3d, src0->nb, 3);
+
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
+                                                     ggml_element_size(dst), dst_ne_3d, dst->nb, 3);
+
+    GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
 }

 void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

+    // Write element-wise equality (0 or 1) into a temporary buffer to avoid
+    // modifying src0 in-place.  Use the same type as src0 so ReduceSum can
+    // consume it directly without a type cast.
+    ggml_cann_pool_alloc eq_alloc(ctx.pool(), ggml_nelements(src0) * ggml_element_size(src0));
+    size_t eq_nb[GGML_MAX_DIMS];
+    eq_nb[0] = ggml_element_size(src0);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        eq_nb[i] = eq_nb[i - 1] * src0->ne[i - 1];
+    }
+    acl_tensor_ptr acl_eq = ggml_cann_create_tensor(
+        eq_alloc.get(), ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+        src0->ne, eq_nb, GGML_MAX_DIMS);
+
    acl_tensor_ptr acl_self  = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
+    GGML_CANN_CALL_ACLNN_OP(ctx, EqTensor, acl_self.get(), acl_other.get(), acl_eq.get());

-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
-
-    ggml_cann_sum(ctx, dst);
+    // Sum the 0/1 values into dst.
+    acl_tensor_ptr    acl_dst    = ggml_cann_create_tensor(dst);
+    int64_t           dims[4]    = { 0, 1, 2, 3 };
+    acl_int_array_ptr dims_arr   = ggml_cann_create_int_array(dims, 4);
+    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_eq.get(), dims_arr.get(), true,
+                            ggml_cann_type_mapping(dst->type), acl_dst.get());
 }

 void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -3306,6 +3529,27 @@ void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
 }

+void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+    float          beta_val      = 1.0f;
+    float          threshold_val = 20.0f;
+    acl_scalar_ptr beta          = ggml_cann_create_scalar(&beta_val,      ACL_FLOAT);
+    acl_scalar_ptr threshold     = ggml_cann_create_scalar(&threshold_val, ACL_FLOAT);
+
+    GGML_CANN_CALL_ACLNN_OP(ctx, Softplus, acl_src.get(), beta.get(), threshold.get(), acl_dst.get());
+}
+
+void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    auto gelu_quick_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+    };
+    ggml_cann_op_unary_gated(gelu_quick_fn, ctx, dst);
+}
+
 /**
 * @brief Performs expert-specific matrix multiplication (MoE) with
 * floating-point precision using the CANN backend.
@@ -3892,46 +4136,65 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
 }

 static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];  // weight
-    ggml_tensor * src1 = dst->src[1];  // input
+    ggml_tensor * src0 = dst->src[0];  // weight  [ne00=m, ne01=K, ne02, ne03]
+    ggml_tensor * src1 = dst->src[1];  // input   [ne10=n, ne11=K, ne12, ne13]
    GGML_TENSOR_BINARY_OP_LOCALS

-    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
+    // dst[i,j] = sum_k src0[i,k] * src1[j,k]  i.e. dst = src0 @ src1^T.
+    //
+    // ggml_cann_create_tensor reverses dimension order, so ACL sees:
+    //   acl_src0 slice:   ggml[m,K]  ->  ACL[K,m]
+    //   acl_src1 slice:   ggml[n,K]  ->  ACL[K,n]
+    //   acl_dst  slice:   ggml[m,n]  ->  ACL[n,m]
+    //
+    // Build a transposed view of src1 by swapping ne[0]/ne[1]:
+    //   src1_t:  ggml[K,n] (swapped strides)  ->  ACL[n,K]
+    //
+    // Matmul(src1_t [n,K], src0 [K,m]) = [n,m] = acl_dst  ✓
+    //
+    // The outer batch loop is kept because src0 may have fewer batch slices than
+    // dst (ne02 <= ne2, ne03 <= ne3): this is a strided-broadcast not supported
+    // by standard CANN Matmul broadcasting.
+
+    const aclDataType src0_acl_type = ggml_cann_type_mapping(src0->type);
+    const aclDataType src1_acl_type = ggml_cann_type_mapping(src1->type);
+    const aclDataType dst_acl_type  = ggml_cann_type_mapping(dst->type);
+    const size_t      src0_type_sz  = ggml_type_size(src0->type);
+    const size_t      src1_type_sz  = ggml_type_size(src1->type);
+    const size_t      dst_type_sz   = ggml_type_size(dst->type);

    const int64_t dps2 = ne2 / ne02;
    const int64_t dps3 = ne3 / ne03;
+
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t i02 = i2 / dps2;
            const int64_t i03 = i3 / dps3;

-            const int64_t  i12 = i2;
-            const int64_t  i13 = i3;
-            acl_tensor_ptr accumulator =
-                ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
-                                        ggml_type_size(dst->type), dst->ne, dst->nb, 2);
+            // src0 2D slice at [i02, i03]: ggml [m, K] -> ACL [K, m]
+            int64_t src0_ne[2] = { ne00, ne01 };
+            size_t  src0_nb[2] = { nb00, nb01 };
+            acl_tensor_ptr acl_src0_s = ggml_cann_create_tensor(
+                (char *) src0->data + i02 * nb02 + i03 * nb03,
+                src0_acl_type, src0_type_sz, src0_ne, src0_nb, 2);

-            // The outer product needs to be accumulated in this dimension.
-            for (int64_t i1 = 0; i1 < ne11; i1++) {
-                acl_tensor_ptr acl_input = ggml_cann_create_tensor(
-                    (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
-                    ggml_type_size(src0->type), src1->ne, src1->nb, 1);
+            // src1 transposed 2D slice at [i2, i3]: swap ne/nb -> ggml[K,n] -> ACL[n,K]
+            int64_t src1_t_ne[2] = { ne11, ne10 };
+            size_t  src1_t_nb[2] = { nb11, nb10 };
+            acl_tensor_ptr acl_src1_t = ggml_cann_create_tensor(
+                (char *) src1->data + i2 * nb12 + i3 * nb13,
+                src1_acl_type, src1_type_sz, src1_t_ne, src1_t_nb, 2);

-                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
-                    (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
-                    ggml_type_size(src0->type), src0->ne, src0->nb, 1);
+            // dst 2D slice at [i2, i3]: ggml [m, n] -> ACL [n, m]
+            int64_t dst_ne[2] = { ne0, ne1 };
+            size_t  dst_nb[2] = { nb0, nb1 };
+            acl_tensor_ptr acl_dst_s = ggml_cann_create_tensor(
+                (char *) dst->data + i2 * nb2 + i3 * nb3,
+                dst_acl_type, dst_type_sz, dst_ne, dst_nb, 2);

-                ggml_cann_pool_alloc output_allocator(ctx.pool());
-                void *               output_buffer = output_allocator.alloc(ggml_nbytes(dst));
-                acl_tensor_ptr       acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
-                                                                       ggml_type_size(dst->type), dst->ne, dst->nb, 2);
-
-                GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
-                float       alpha_value = 1.0f;
-                aclScalar * alpha       = aclCreateScalar(&alpha_value, ACL_FLOAT);
-                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
-            }
+            // Matmul(src1_t [n,K], src0 [K,m]) = [n,m] = acl_dst_s  ✓
+            GGML_CANN_CALL_ACLNN_OP(ctx, Matmul,
+                acl_src1_t.get(), acl_src0_s.get(), acl_dst_s.get(), (int8_t) 1);
        }
    }
 }
@@ -4170,3 +4433,4 @@ void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor *
        }
    }
 }
+
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -32,6 +32,9 @@
 #include <aclnnop/aclnn_cat.h>
 #include <aclnnop/aclnn_clamp.h>
 #include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_cumsum.h>
+#include <aclnnop/aclnn_tril.h>
+#include <aclnnop/aclnn_triu.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_gelu.h>
 #include <aclnnop/aclnn_gelu_v2.h>
@@ -47,6 +50,9 @@
 #include <aclnnop/aclnn_sign.h>
 #include <aclnnop/aclnn_silu.h>
 #include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_softplus.h>
+#include <aclnnop/aclnn_swi_glu.h>
+#include <aclnnop/aclnn_geglu.h>
 #include <aclnnop/aclnn_slice.h>
 #include <aclnnop/aclnn_sqrt.h>
 #include <aclnnop/aclnn_tanh.h>
@@ -69,6 +75,9 @@
 */
 void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

+void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate);
+
 /**
 * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
 *          backend.
@@ -325,6 +334,48 @@ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);

+/**
+ * @brief   Computes the cumulative sum of a ggml tensor along dim 0 using the
+ *          CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_CUMSUM`.
+ */
+void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Computes a triangular mask (tril/triu) of a square ggml tensor
+ *          using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_TRI`.
+ */
+void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Solves a triangular linear system AX=B using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_SOLVE_TRI`.
+ */
+void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Creates a diagonal matrix from a vector using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_DIAG`.
+ */
+void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Fills a tensor with a constant scalar value using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_FILL`.
+ */
+void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
 /**
 * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
 *          the CANN backend.
@@ -461,6 +512,9 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor *
 // @see ggml_cann_dup.
 void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);

+// @see ggml_cann_acc, but copies src1 into dst instead of adding.
+void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
 /**
 * @brief   Computes the softmax activation with optional masking.
 *
@@ -813,6 +867,8 @@ void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 *            dst->op is expected to be `GGML_OP_STEP`.
 */
 void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Performs the Flash Attention extended operator using the CANN backend.
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1428,6 +1428,22 @@ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
    return false;
 }

+/**
+ * @brief Set a region of a tensor's device memory to a specified value.
+ *
+ * @param buffer The CANN buffer containing the tensor.
+ * @param tensor Pointer to the tensor whose memory will be set.
+ * @param value The value to which each byte in the region will be set.
+ * @param offset Byte offset within the tensor's data to start setting.
+ * @param size Number of bytes to set.
+ */
+static void ggml_backend_cann_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+
+    ggml_cann_set_device(ctx->device);
+    ACL_CHECK(aclrtMemset((char *) tensor->data + offset, size, value, size));
+}
+
 /**
 * @brief Clear a CANN buffer by setting all its memory to a specified value.
 *
@@ -1454,7 +1470,7 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
    /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cann_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_cann_buffer_init_tensor,
-    /* .memset_tensor   = */ NULL,
+    /* .memset_tensor   = */ ggml_backend_cann_buffer_memset_tensor,
    /* .set_tensor      = */ ggml_backend_cann_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cann_buffer_get_tensor,
    /* .set_tensor_2d   = */ NULL,
@@ -1835,6 +1851,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
                case GGML_UNARY_OP_STEP:
                    ggml_cann_step(ctx, dst);
                    break;
+                case GGML_UNARY_OP_SOFTPLUS:
+                    ggml_cann_softplus(ctx, dst);
+                    break;
                default:
                    return false;
            }
@@ -1845,20 +1864,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
                    GGML_CANN_CALL_OP_UNARY_GATED(Relu);
                    break;
                case GGML_GLU_OP_GEGLU:
+                    ggml_cann_geglu(ctx, dst, 0);  // approximate=0 → tanh
+                    break;
                case GGML_GLU_OP_GEGLU_ERF:
-                    // aclnnGelu internally uses the erf-based approximation.
-                    GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
+                    ggml_cann_geglu(ctx, dst, 1);  // approximate=1 → erf
                    break;
                case GGML_GLU_OP_SWIGLU:
-                    GGML_CANN_CALL_OP_UNARY_GATED(Silu);
+                    ggml_cann_swiglu(ctx, dst);
                    break;
                case GGML_GLU_OP_GEGLU_QUICK:
-                    {
-                        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
-                            GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
-                        };
-                        ggml_cann_op_unary_gated(lambda, ctx, dst);
-                    }
+                    ggml_cann_geglu_quick(ctx, dst);
                    break;
                default:
                    return false;
@@ -1920,6 +1935,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
        case GGML_OP_CPY:
            ggml_cann_cpy(ctx, dst);
            break;
+        case GGML_OP_SET:
+            ggml_cann_set(ctx, dst);
+            break;
        case GGML_OP_CONT:
            ggml_cann_dup(ctx, dst);
            break;
@@ -1989,6 +2007,21 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
        case GGML_OP_SSM_CONV:
            ggml_cann_ssm_conv(ctx, dst);
            break;
+        case GGML_OP_CUMSUM:
+            ggml_cann_cumsum(ctx, dst);
+            break;
+        case GGML_OP_TRI:
+            ggml_cann_tri(ctx, dst);
+            break;
+        case GGML_OP_FILL:
+            ggml_cann_fill(ctx, dst);
+            break;
+        case GGML_OP_DIAG:
+            ggml_cann_diag(ctx, dst);
+            break;
+        case GGML_OP_SOLVE_TRI:
+            ggml_cann_solve_tri(ctx, dst);
+            break;
        default:
            return false;
    }
@@ -2324,6 +2357,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
    if (use_cann_graph) {
        // If no matching graph is found, the graph needs to be recaptured.
        graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
+
        if (graph_capture_required) {
            // If no matching graph is found, add a new ACL graph.
            ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
@@ -2382,6 +2416,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_GELU_ERF:
+                case GGML_UNARY_OP_SOFTPLUS:
                    return true;
                default:
                    return false;
@@ -2572,6 +2607,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
+        case GGML_OP_SET:
        case GGML_OP_GROUP_NORM:
            return true;
        case GGML_OP_PAD:
@@ -2649,6 +2685,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
            }
        case GGML_OP_SSM_CONV:
            return true;
+        case GGML_OP_CUMSUM:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_TRI:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_FILL:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_DIAG:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SOLVE_TRI:
+            return op->src[0]->type == GGML_TYPE_F32;
        default:
            return false;
    }
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -485,6 +485,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_RV_ZIHINTPAUSE)
                string(APPEND MARCH_STR "_zihintpause")
            endif()
+            if (GGML_CPU_RISCV64_SPACEMIT)
+                # `xsmtvdotii' is only required for GCC >= 15.
+                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+                    CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+                    string(APPEND MARCH_STR "_xsmtvdotii")
+                endif()
+            endif()

            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
        else()
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
    const int lda = KB * sizeof(TA);
    //const int ldb = KB * sizeof(TB);

-    static thread_local packed_B_t Tile0[TILE_N * TILE_K];
-    static thread_local packed_B_t Tile1[TILE_N * TILE_K];
-    static thread_local int8_t Tile23[TILE_M * TILE_K];
+    alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
+    alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];

-    static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
-    static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
+    alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
+    alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];

    // double buffering C to interleave avx512 and amx
    int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
    const int m1 = std::max(M - TILE_M, 0);
    //const int lda = KB * sizeof(TA);

-    static thread_local int8_t Tile0[TILE_N * TILE_K];
-    static thread_local int8_t Tile1[TILE_N * TILE_K];
-    static thread_local int8_t Tile23[TILE_M * TILE_K];
+    alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];

    // mat mul result for each group
-    static thread_local int32_t Tile4[TILE_M * TILE_N];
-    static thread_local int32_t Tile5[TILE_M * TILE_N];
-    static thread_local int32_t Tile6[TILE_M * TILE_N];
-    static thread_local int32_t Tile7[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];

    // sum of each QK_K block, contains 8 groups, int32
-    static thread_local int32_t Sumi4[TILE_M * TILE_N];
-    static thread_local int32_t Sumi5[TILE_M * TILE_N];
-    static thread_local int32_t Sumi6[TILE_M * TILE_N];
-    static thread_local int32_t Sumi7[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];

    const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
    for (int i = 0; i < KB; ++i) {
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -5023,6 +5023,71 @@ void ggml_gemm_q8_0_4x8_q8_0(int                        n,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (svcntb() * 8 == 256) {
+        const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
+
+        static const uint32_t idx_arr[8] = {0, 1, 4, 5, 2, 3, 6, 7};
+        svuint32_t idx = svld1(svptrue_b32(), idx_arr);
+        static const uint32_t idx_arr1[8] = {0, 1, 2, 3, 1, 2, 3, 0};
+        svuint32_t idx_sc1 = svld1(svptrue_b32(), idx_arr1);
+        static const uint32_t idx_arr2[8] = {0, 1, 2, 3, 0, 1, 2, 3};
+        svuint32_t idx_sc2 = svld1(svptrue_b32(), idx_arr2);
+
+        for (int y = 0; y < nr; y += 4) {
+            const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
+
+            for (int x = 0; x < nc; x += ncols_interleaved) {
+                const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
+                const block_q8_0x4 * a_ptr = a_ptr_base;
+
+                svfloat32_t acc_f32_01 = svdup_f32(0);
+                svfloat32_t acc_f32_23 = svdup_f32(0);
+
+                for (int b = 0; b < nb; b++) {
+
+                    svint32_t acc_01 = svdup_s32(0);
+                    svint32_t acc_23 = svdup_s32(0);
+
+                    // Process 4 chunks of 8 positions each
+                    for (int chunk = 0; chunk < 4; chunk++) {
+                        svint8_t s_a01 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32);
+                        svint8_t s_a23 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32 + 16);
+                        svint8_t s_b0123 = svld1_s8(svptrue_b8(), b_ptr->qs + chunk * 32);
+
+                        acc_01 = svmmla_s32(acc_01, s_a01, s_b0123);
+                        acc_23 = svmmla_s32(acc_23, s_a23, s_b0123);
+                    }
+
+                    // Reorder outputs from 2×2 tiles to row-major
+                    // acc[01] = [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
+                    // acc[23] = [r2c0, r2c1, r3c0, r3c1, r2c2, r2c3, r3c2, r3c3]
+
+                    svint32_t row01 = svtbl_s32(acc_01, idx);
+                    svint32_t row23 = svtbl_s32(acc_23, idx);
+
+                    svfloat16_t temp1 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) a_ptr->d);
+                    svfloat16_t temp2 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) b_ptr->d);
+                    svfloat32_t sv_a_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp1, temp1)), idx_sc1);
+                    svfloat32_t sv_b_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp2, temp2)), idx_sc2);
+
+                    acc_f32_01 = svmla_f32_x(svptrue_b32(), acc_f32_01, svcvt_f32_s32_x(svptrue_b32(), row01), svmul_lane_f32(sv_b_d, sv_a_d, 0));
+                    acc_f32_23 = svmla_f32_x(svptrue_b32(), acc_f32_23, svcvt_f32_s32_x(svptrue_b32(), row23), svmul_lane_f32(sv_b_d, sv_a_d, 2));
+                    a_ptr++;
+                    b_ptr++;
+                }
+
+                svbool_t pg4 = svptrue_pat_b32(SV_VL4);
+                svst1_f32(pg4, s + (y+0) * bs + x, acc_f32_01);
+                svst1_f32(pg4, s + (y+1) * bs + x, svext_f32(acc_f32_01, acc_f32_01, 4));
+                svst1_f32(pg4, s + (y+2) * bs + x, acc_f32_23);
+                svst1_f32(pg4, s + (y+3) * bs + x, svext_f32(acc_f32_23, acc_f32_23, 4));
+            }
+        }
+        return;
+    }
+#endif  // SVE compile-time end
+
 #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
    const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;

--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -2300,9 +2300,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 #if defined __AVX2__

-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m256i m15 = _mm256_set1_epi8(15);

    __m256 acc = _mm256_setzero_ps();

@@ -2314,53 +2313,45 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

+        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m256i scales_16 = _mm256_cvtepi8_epi16(scales);
+        const __m256i q8sclsub = _mm256_slli_epi32(_mm256_madd_epi16(q8sums, scales_16), 5);

        __m256i sumi = _mm256_setzero_si256();

        int is = 0;

        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;

-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
+            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m3), 4);
+            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(12)), 2);
+            const __m256i q4h_2 = _mm256_and_si256(q4bitsH, _mm256_set1_epi8(48));
+            const __m256i q4h_3 = _mm256_srli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(-64)), 2);

-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
-            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
+            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m15), q4h_0);
+            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m15), q4h_1);
+            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m15), q4h_2);
+            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m15), q4h_3);

            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

-            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
-
            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);

-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;

            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
@@ -2372,6 +2363,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

        }

+        sumi = _mm256_sub_epi32(sumi, q8sclsub);
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
    }

--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -2321,6 +2321,9 @@ class tinyBLAS_Q0_PPC {
    }

    void matmul(int64_t m, int64_t n) {
+    #if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+    #else
        const int64_t mc = 64;
        const int64_t kc = 64;
        int64_t nc = 64;
@@ -2334,7 +2337,6 @@ class tinyBLAS_Q0_PPC {
        } else {
            n_aligned = (n / 64) * 64;
        }
-
        if (n_aligned > 0) {
            if (n_aligned % 64 == 0)      nc = 64;
            else if (n_aligned == n)      nc = n;
@@ -2352,6 +2354,7 @@ class tinyBLAS_Q0_PPC {
        } else {
            mnpack(0, m, 0, n);
        }
+    #endif
    }

  private:
@@ -3191,12 +3194,16 @@ class tinyBLAS_PPC {
    }

    void matmul(int64_t m, int64_t n) {
+    #if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+    #else
        int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
        if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
            matmul_tiled(m, n, mc, nc, kc);
        } else {
            mnpack(0, m, 0, n);
        }
+    #endif
    }

  private:
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -1036,12 +1036,12 @@ inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }

-//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
-//    }
-//}
+inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_table_gelu_quick_f16[i16[i]];
+    }
+}

 #ifdef GGML_GELU_QUICK_FP16
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
@@ -1060,13 +1060,6 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 }
 #endif

-inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int i = 0; i < n; ++i) {
-        float v = GGML_CPU_FP16_TO_FP32(x[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
-    }
-}
-
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -830,6 +830,18 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
 }

+static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
+#if defined(BLACKWELL_MMA_AVAILABLE) // This is used for NVFP4 subblock scale quantizations only
+    if (!(x > 0.0f)) {
+        return 0;
+    }
+    const __nv_fp8_e4m3 xf(x);
+    return xf.__x;
+#else
+     NO_DEVICE_CODE; // Used only for NVFP4 Scales for Activations, only for Blackwell
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
    const uint8_t sign_bit = (x < 0.0f) << 3;
    float         ax       = fabsf(x) * e;
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -1,96 +1,79 @@
 #include "concat.cuh"

 // contiguous kernels
-static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
+                                                                                 const float * y,
+                                                                                 float *       dst,
+                                                                                 int64_t       ne00,
+                                                                                 int64_t       ne01,
+                                                                                 int64_t       ne02,
+                                                                                 int64_t       ne0,
+                                                                                 int64_t       ne1,
+                                                                                 int64_t       ne2) {
+    static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");

-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
+    const int64_t n = ne0 * ne1 * ne2;

-    if (nidx < ne00) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            (nidx - ne00) +
-            blockIdx.y * (ne0 - ne00) +
-            blockIdx.z * (ne0 - ne00) * gridDim.y;
-        dst[offset_dst] = y[offset_src];
+    for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
+        if constexpr (dim == 0) {
+            const int64_t row = i / ne0;
+            const int64_t i0  = i - row * ne0;
+
+            if (i0 < ne00) {
+                dst[i] = x[row * ne00 + i0];
+            } else {
+                dst[i] = y[row * (ne0 - ne00) + (i0 - ne00)];
+            }
+        } else if constexpr (dim == 1) {
+            const int64_t dst_plane  = ne0 * ne1;
+            const int64_t src0_plane = ne0 * ne01;
+            const int64_t src1_plane = dst_plane - src0_plane;
+            const int64_t i2         = i / dst_plane;
+            const int64_t i01        = i - i2 * dst_plane;
+
+            if (i01 < src0_plane) {
+                dst[i] = x[i2 * src0_plane + i01];
+            } else {
+                dst[i] = y[i2 * src1_plane + (i01 - src0_plane)];
+            }
+        } else {
+            const int64_t src0_size = ne0 * ne1 * ne02;
+
+            if (i < src0_size) {
+                dst[i] = x[i];
+            } else {
+                dst[i] = y[i - src0_size];
+            }
+        }
    }
 }

-static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
+static void concat_f32_cuda(const float * x,
+                            const float * y,
+                            float *       dst,
+                            int64_t       ne00,
+                            int64_t       ne01,
+                            int64_t       ne02,
+                            int64_t       ne0,
+                            int64_t       ne1,
+                            int64_t       ne2,
+                            int           dim,
+                            cudaStream_t  stream) {
+    const int64_t n          = ne0 * ne1 * ne2;
+    const int     num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;

-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-
-    if (blockIdx.y < (unsigned)ne01) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            (blockIdx.y - ne01) * ne0 +
-            blockIdx.z * ne0 * (gridDim.y - ne01);
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-
-    if (blockIdx.z < (unsigned)ne02) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            (blockIdx.z - ne02) * ne0 *  gridDim.y;
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
    if (dim == 0) {
-        concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
+        concat_f32_cont<0>
+            <<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
    if (dim == 1) {
-        concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
+        concat_f32_cont<1>
+            <<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
-    concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+    concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
 }

 // non-contiguous kernel (slow)
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -66,6 +66,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  32, 128, 128, 128, 2, true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2,  32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1,  32, 128, 128, 128, 1, false);
+
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4,  32, 256, 256, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4,  32, 256, 256, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2,  32, 128, 128, 128, 1, false);
@@ -85,6 +88,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2,  32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1,  32, 128, 128, 128, 1, false);
+
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4,  32,  96,  64, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2,  32, 128, 128, 128, 1, false);
@@ -118,6 +124,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2,  64, 160, 128,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2,  64, 160, 128,  64, 2, false);
+
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4,  32, 128, 128, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2,  32, 128, 128, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1,  32, 128, 128, 128, 1, false);
@@ -1217,7 +1226,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        float KQ_max_scale[cols_per_thread];
 #pragma unroll
        for (int col = 0; col < cols_per_thread; ++col) {
-            const int jc = cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col);
+            const int jc = (threadIdx.y/np)*cols_per_warp + (cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col));
            const float sink = sinks_f[jc % ncols2];

            const float KQ_max_new = fmaxf(KQ_max[col], sink);
@@ -1825,6 +1834,10 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

+// Mistral Small 4 (DKQ=320, DV=256), GQA=32-only build:
+extern DECL_FATTN_MMA_F16_CASE(320, 256,  1, 32);
+extern DECL_FATTN_MMA_F16_CASE(320, 256,  2, 32);
+
 // For GLM 4.7 Flash
 extern DECL_FATTN_MMA_F16_CASE(576, 512,  4,  4);
 extern DECL_FATTN_MMA_F16_CASE(576, 512,  8,  4);
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -38,6 +38,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
        } break;
+        case 320: {
+            GGML_ASSERT(V->ne[0] == 256);
+            ggml_cuda_flash_attn_ext_tile_case<320, 256>(ctx, dst);
+        } break;
        case 512: {
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<512, 512>(ctx, dst);
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2,  64,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -128,6 +130,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2,  32,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  32,  64)
@@ -195,6 +199,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -264,6 +270,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4,  64,  64)
@@ -1144,14 +1152,16 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
        }
    }

-    if (Q->ne[1] > 8/ncols2) {
-        constexpr int cols_per_block = 16;
-        const int nwarps    = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
-        const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
-        fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
-        launch_fattn<DV, cols_per_block/ncols2, ncols2>
-            (ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
-        return;
+    if constexpr (ncols2 <= 16) {
+        if (Q->ne[1] > 8/ncols2) {
+            constexpr int cols_per_block = 16;
+            const int nwarps    = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
+            const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
+            fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
+            launch_fattn<DV, cols_per_block/ncols2, ncols2>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
+            return;
+        }
    }

    if constexpr (ncols2 <= 8) {
@@ -1210,6 +1220,14 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
    const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
    const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;

+    if constexpr (DKQ == 320) { // Mistral Small 4
+        if (use_gqa_opt && gqa_ratio % 32 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
+            return;
+        }
+        GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
+    }
+
    if constexpr (DKQ == 576) {
        if (use_gqa_opt && gqa_ratio % 16 == 0) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
@@ -1221,7 +1239,7 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
        }
    }

-    if constexpr (DKQ <= 512) {
+    if constexpr (DKQ <= 512 && DKQ != 320) {
        if (use_gqa_opt && gqa_ratio % 8 == 0) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
            return;
@@ -1275,5 +1293,6 @@ extern DECL_FATTN_TILE_CASE( 96,  96);
 extern DECL_FATTN_TILE_CASE(112, 112);
 extern DECL_FATTN_TILE_CASE(128, 128);
 extern DECL_FATTN_TILE_CASE(256, 256);
+extern DECL_FATTN_TILE_CASE(320, 256);
 extern DECL_FATTN_TILE_CASE(512, 512);
 extern DECL_FATTN_TILE_CASE(576, 512);
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -143,6 +143,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
            GGML_ASSERT(V->ne[0] == 256);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
            break;
+        case 320:
+            // For Mistral Small 4, go straight to the ncols1 switch (ncols2=32-only build).
+            GGML_ASSERT(V->ne[0] == 256);
+            {
+                float max_bias = 0.0f;
+                memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
+
+                const bool use_gqa_opt = mask && max_bias == 0.0f;
+                GGML_ASSERT(use_gqa_opt);
+                GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
+                const int gqa_ratio = Q->ne[2] / K->ne[2];
+                GGML_ASSERT(gqa_ratio % 32 == 0);
+
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<320, 256, 32>(ctx, dst);
+            }
+            break;
        case 512:
            GGML_ASSERT(V->ne[0] == 512);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<512, 512>(ctx, dst);
@@ -352,6 +368,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                return BEST_FATTN_KERNEL_NONE;
            }
            break;
+        case 320:
+            if (V->ne[0] != 256 || !gqa_opt_applies) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            if (gqa_ratio % 32 != 0) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            break;
        case 512:
            if (V->ne[0] != K->ne[0]) {
                return BEST_FATTN_KERNEL_NONE;
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3592,6 +3592,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
        return true;
    }

+    if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
+     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
+        const ggml_tensor * unary = cgraph->nodes[node_idx];
+        const ggml_tensor * sqr   = cgraph->nodes[node_idx+1];
+
+        if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
+            return false;
+        }
+
+        if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
+            return false;
+        }
+
+        if (unary->type != sqr->type) {
+            return false;
+        }
+
+        if (!ggml_is_contiguous(unary->src[0])) {
+            return false;
+        }
+
+        return true;
+    }
+
    if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
        const ggml_tensor *scale  = cgraph->nodes[node_idx];
@@ -3616,6 +3640,357 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
    return false;
 }

+// try and fuse nodes and return the number of nodes to skip
+static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
+
+    static bool disable_fusion = getenv("GGML_CUDA_DISABLE_FUSION") != nullptr && std::atoi(getenv("GGML_CUDA_DISABLE_FUSION"));
+    if (disable_fusion) {
+        return 0;
+    }
+
+    ggml_tensor * node = cgraph->nodes[i];
+
+    //topk-moe
+    if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
+            cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
+        ggml_cuda_topk_moe_args args;
+        const bool              can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
+        std::vector<ggml_op>    ops;
+
+        if (can_fuse) {
+            const ggml_tensor * logits  = node->src[0];
+            ggml_tensor *       weights = nullptr;
+            ggml_tensor *       ids     = nullptr;
+            const ggml_tensor * bias    = nullptr;
+            const ggml_tensor * clamp   = nullptr;
+            const ggml_tensor * scale   = nullptr;
+
+            if (!args.delayed_softmax) {
+                ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
+                int     out_nodes[2];  // nodes which can't be elided
+
+                if (args.prob_bias) {
+                    bias = cgraph->nodes[i + 2]->src[1];
+                    ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT, GGML_OP_VIEW,
+                                            GGML_OP_GET_ROWS });
+                    out_nodes[0] = i + 4;
+                    ids          = cgraph->nodes[i + 4];
+                } else {
+                    ops.insert(ops.end(),
+                               { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS });
+                    out_nodes[0] = i + 3;
+                    ids          = cgraph->nodes[i + 3];
+                }
+
+                if (args.norm) {
+                    ops.insert(ops.end(),
+                               { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, GGML_OP_RESHAPE });
+                    clamp = cgraph->nodes[i + ops.size() - 3];
+                }
+                if (args.scale) {
+                    ops.insert(ops.end(), { GGML_OP_SCALE });
+                    scale = cgraph->nodes[i + ops.size() - 1];
+                }
+
+                weights      = cgraph->nodes[i + ops.size() - 1];
+                out_nodes[1] = i + ops.size() - 1;
+
+                if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
+                        ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
+                        ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
+                    ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
+                    return ops.size() - 1;
+                }
+            } else if (!args.norm && !args.prob_bias) {
+                //special case gpt-oss, no norm, no bias.
+                ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+                                        GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
+                weights                     = cgraph->nodes[i + 5];
+                ids                         = cgraph->nodes[i + 1];
+                const ggml_tensor * softmax = cgraph->nodes[i + 4];
+
+                int out_nodes[2] = { i + 1, i + 5 };
+                if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
+                        ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
+                        ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
+                    ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
+                    return ops.size() - 1;
+                }
+            }
+        }
+    }
+
+    //RoPE + view + set-rows
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
+        ggml_tensor * rope     = cgraph->nodes[i];
+        ggml_tensor * set_rows = cgraph->nodes[i + 2];
+
+        ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
+        return 2;
+    }
+
+    // multi-(add or mul)
+    if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
+        int     n_fuse = 0;
+        ggml_op ops[8];
+        std::fill(ops, ops + 8, node->op);
+
+        for (; n_fuse <= 6; ++n_fuse) {
+            if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
+                break;
+            }
+            if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
+                break;
+            }
+            if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
+                break;
+            }
+        }
+
+        n_fuse++;
+
+        if (n_fuse > 1) {
+            ggml_tensor fused_node;
+            memcpy(&fused_node, node, sizeof(ggml_tensor));
+            for (int j = 0; j < n_fuse - 1; ++j) {
+                fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
+            }
+            fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
+            if (node->op == GGML_OP_ADD) {
+                ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
+            } else {
+                ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
+            }
+            return n_fuse - 1;
+        }
+    }
+
+    bool fused_mul_mat_vec = false;
+    int  fused_node_count  = 0;
+
+    // gate + glu + up
+    for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
+        const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
+
+        if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
+            ggml_tensor * glu         = cgraph->nodes[i + 4];
+            ggml_tensor * gate_bias_n = glu->src[0];
+            ggml_tensor * up_bias_n   = glu->src[1];
+
+            //we don't assume the order for {gate, up}. Instead infer it from the bias tensor
+            ggml_tensor * gate_n = nullptr;
+            ggml_tensor * up_n   = nullptr;
+
+            if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
+                gate_n = cgraph->nodes[i];
+                up_n   = cgraph->nodes[i + 2];
+            } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
+                gate_n = cgraph->nodes[i + 2];
+                up_n   = cgraph->nodes[i];
+            } else {
+                continue;
+            }
+
+            auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
+                if (op_bias == GGML_OP_ADD) {
+                    if (bias_node->src[0] == mul_node) {
+                        return bias_node->src[1];
+                    }
+                    if (bias_node->src[1] == mul_node) {
+                        return bias_node->src[0];
+                    }
+                    return (ggml_tensor *) nullptr;
+                }
+                GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
+                GGML_ASSERT(bias_node->src[0] == mul_node);
+                return bias_node->src[1];
+            };
+
+            ggml_tensor * up_bias_tensor   = get_bias_tensor(up_bias_n, up_n, bias_op);
+            ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
+
+            if (!up_bias_tensor || !gate_bias_tensor) {
+                continue;
+            }
+
+            // we don't support repeating adds
+            if (bias_op == GGML_OP_ADD && (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                                           !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                continue;
+            }
+
+            const ggml_tensor * src0 = up_n->src[0];
+            const ggml_tensor * src1 = up_n->src[1];
+            const ggml_tensor * ids  = up_n->src[2];
+
+            if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
+                ggml_cuda_mm_fusion_args_host fusion_data{};
+                fusion_data.gate      = gate_n->src[0];
+                fusion_data.x_bias    = up_bias_tensor;
+                fusion_data.gate_bias = gate_bias_tensor;
+                fusion_data.glu_op    = ggml_get_glu_op(glu);
+
+                ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                fused_mul_mat_vec = true;
+                fused_node_count  = 5;
+                break;
+            }
+
+            if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
+                ggml_cuda_mm_fusion_args_host fusion_data{};
+                fusion_data.gate      = gate_n->src[0];
+                fusion_data.x_bias    = up_bias_tensor;
+                fusion_data.gate_bias = gate_bias_tensor;
+                fusion_data.glu_op    = ggml_get_glu_op(glu);
+
+                ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                fused_mul_mat_vec = true;
+                fused_node_count  = 5;
+                break;
+            }
+        } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
+            ggml_tensor * glu  = cgraph->nodes[i + 2];
+            ggml_tensor * gate = glu->src[0];
+            ggml_tensor * up   = glu->src[1];
+
+            bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1]) ||
+                      (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
+
+            if (!ok) {
+                continue;
+            }
+
+            const ggml_tensor * src0 = up->src[0];
+            const ggml_tensor * src1 = up->src[1];
+            const ggml_tensor * ids  = up->src[2];
+
+            if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
+                ggml_cuda_mm_fusion_args_host fusion_data{};
+                fusion_data.gate   = gate->src[0];
+                fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                fused_mul_mat_vec = true;
+                fused_node_count  = 3;
+                break;
+            }
+
+            if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
+                ggml_cuda_mm_fusion_args_host fusion_data{};
+                fusion_data.gate   = gate->src[0];
+                fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                fused_mul_mat_vec = true;
+                fused_node_count  = 3;
+                break;
+            }
+        }
+    }
+
+    if (fused_mul_mat_vec) {
+        return fused_node_count - 1;
+    }
+
+    fused_mul_mat_vec = false;
+    fused_node_count  = 0;
+
+    // gate + add + glu + up + add
+    for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
+        const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
+
+        if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
+            continue;
+        }
+
+        ggml_tensor * mm_node   = cgraph->nodes[i];
+        ggml_tensor * bias_node = cgraph->nodes[i + 1];
+
+        ggml_tensor * bias_tensor = nullptr;
+        if (bias_op == GGML_OP_ADD) {
+            if (bias_node->src[0] == mm_node) {
+                bias_tensor = bias_node->src[1];
+            } else if (bias_node->src[1] == mm_node) {
+                bias_tensor = bias_node->src[0];
+            } else {
+                continue;
+            }
+        } else {
+            if (bias_node->src[0] != mm_node) {
+                continue;
+            }
+            bias_tensor = bias_node->src[1];
+        }
+
+        const ggml_tensor * src0 = mm_node->src[0];
+        const ggml_tensor * src1 = mm_node->src[1];
+        const ggml_tensor * ids  = mm_node->src[2];
+
+        if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
+            continue;
+        }
+
+        if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+            continue;
+        }
+
+        ggml_cuda_mm_fusion_args_host fusion_data{};
+        fusion_data.x_bias = bias_tensor;
+
+        if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
+            ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
+            fused_mul_mat_vec = true;
+            fused_node_count  = 2;
+            break;
+        }
+
+        if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
+            ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
+            fused_mul_mat_vec = true;
+            fused_node_count  = 2;
+            break;
+        }
+    }
+
+    if (fused_mul_mat_vec) {
+        return fused_node_count - 1;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) {
+        ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
+        return 2;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
+        ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i + 1]);
+        return 1;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
+        ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i + 1]);
+        return 1;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
+        ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
+        ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
+        ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i + 1]);
+        return 1;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
+        ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i + 1]);
+        return 1;
+    }
+
+    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
+        ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i + 2], node);
+        return 2;
+    }
+
+    return 0;
+}
+
 static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) {
    bool graph_evaluated_or_captured = false;

@@ -3762,349 +4137,11 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                    continue;
                }

-                // start of fusion operations
-                static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
-                if (!disable_fusion) {
-                    ggml_cuda_topk_moe_args args;
+                int nodes_to_skip = ggml_cuda_try_fuse(cuda_ctx, cgraph, i);

-                    if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
-                        cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
-                        const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
-
-                        std::vector<ggml_op> ops;
-
-                        if (can_fuse) {
-                            const ggml_tensor * logits  = node->src[0];
-                            ggml_tensor *       weights = nullptr;
-                            ggml_tensor *       ids     = nullptr;
-                            const ggml_tensor * bias    = nullptr;
-                            const ggml_tensor * clamp   = nullptr;
-                            const ggml_tensor * scale   = nullptr;
-
-                            if (!args.delayed_softmax) {
-                                ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
-                                int     out_nodes[2];  // nodes which can't be elided
-
-                                if (args.prob_bias) {
-                                    bias = cgraph->nodes[i + 2]->src[1];
-                                    ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT,
-                                                            GGML_OP_VIEW, GGML_OP_GET_ROWS });
-                                    out_nodes[0] = i + 4;
-                                    ids          = cgraph->nodes[i + 4];
-                                } else {
-                                    ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW,
-                                                            GGML_OP_GET_ROWS });
-                                    out_nodes[0] = i + 3;
-                                    ids          = cgraph->nodes[i + 3];
-                                }
-
-                                if (args.norm) {
-                                    ops.insert(ops.end(), { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
-                                                            GGML_OP_DIV, GGML_OP_RESHAPE });
-                                    clamp = cgraph->nodes[i + ops.size() - 3];
-                                }
-                                if (args.scale) {
-                                    ops.insert(ops.end(), { GGML_OP_SCALE });
-                                    scale = cgraph->nodes[i + ops.size() - 1];
-                                }
-
-                                weights      = cgraph->nodes[i + ops.size() - 1];
-                                out_nodes[1] = i + ops.size() - 1;
-
-                                if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
-                                    ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
-                                    ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
-                                    ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
-                                    i += ops.size() - 1;
-                                    continue;
-                                }
-                            } else if (!args.norm && !args.prob_bias) {
-                                //special case gpt-oss, no norm, no bias.
-                                ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
-                                                        GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
-                                weights                     = cgraph->nodes[i + 5];
-                                ids                         = cgraph->nodes[i + 1];
-                                const ggml_tensor * softmax = cgraph->nodes[i + 4];
-
-                                int out_nodes[2] = { i + 1, i + 5 };
-                                if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
-                                    ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
-                                    ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
-                                    ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
-                                    i += ops.size() - 1;
-                                    continue;
-                                }
-                            }
-                        }
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
-                        ggml_tensor * rope = cgraph->nodes[i];
-                        ggml_tensor * set_rows = cgraph->nodes[i + 2];
-
-                        ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
-                        i += 2;
-                        continue;
-                    }
-
-                    if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
-                        int n_fuse = 0;
-                        ggml_op ops[8];
-                        std::fill(ops, ops + 8, node->op);
-
-                        for (; n_fuse <= 6; ++n_fuse){
-                            if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
-                                break;
-                            }
-                            if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
-                                break;
-                            }
-                            if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
-                                break;
-                            }
-                        }
-
-                        n_fuse++;
-
-                        if (n_fuse > 1) {
-                            ggml_tensor fused_node;
-                            memcpy(&fused_node, node, sizeof(ggml_tensor));
-                            for (int j = 0; j < n_fuse - 1; ++j) {
-                                fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
-                            }
-                            fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
-                            if (node->op == GGML_OP_ADD) {
-                                ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
-                            } else {
-                                ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
-                            }
-                            i += n_fuse - 1;
-
-                            continue;
-                        }
-                    }
-
-                    bool fused_mul_mat_vec = false;
-                    int fused_node_count = 0;
-
-                    for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
-                        const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
-
-                        if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
-                            ggml_tensor * glu         = cgraph->nodes[i + 4];
-                            ggml_tensor * gate_bias_n = glu->src[0];
-                            ggml_tensor * up_bias_n   = glu->src[1];
-
-                            //we don't assume the order for {gate, up}. Instead infer it from the bias tensor
-                            ggml_tensor * gate_n      = nullptr;
-                            ggml_tensor * up_n        = nullptr;
-
-                            if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
-                                gate_n = cgraph->nodes[i];
-                                up_n   = cgraph->nodes[i + 2];
-                            } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
-                                gate_n = cgraph->nodes[i + 2];
-                                up_n   = cgraph->nodes[i];
-                            } else {
-                                continue;
-                            }
-
-                            auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
-                                if (op_bias == GGML_OP_ADD) {
-                                    if (bias_node->src[0] == mul_node) {
-                                        return bias_node->src[1];
-                                    }
-                                    if (bias_node->src[1] == mul_node) {
-                                        return bias_node->src[0];
-                                    }
-                                    return (ggml_tensor *) nullptr;
-                                }
-                                GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
-                                GGML_ASSERT(bias_node->src[0] == mul_node);
-                                return bias_node->src[1];
-                            };
-
-                            ggml_tensor * up_bias_tensor   = get_bias_tensor(up_bias_n, up_n, bias_op);
-                            ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
-
-                            if (!up_bias_tensor || !gate_bias_tensor) {
-                                continue;
-                            }
-
-                            // we don't support repeating adds
-                            if (bias_op == GGML_OP_ADD &&
-                                (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
-                                 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
-                                continue;
-                            }
-
-                            const ggml_tensor * src0 = up_n->src[0];
-                            const ggml_tensor * src1 = up_n->src[1];
-                            const ggml_tensor * ids  = up_n->src[2];
-
-                            if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
-                                ggml_cuda_mm_fusion_args_host fusion_data{};
-                                fusion_data.gate      = gate_n->src[0];
-                                fusion_data.x_bias    = up_bias_tensor;
-                                fusion_data.gate_bias = gate_bias_tensor;
-                                fusion_data.glu_op    = ggml_get_glu_op(glu);
-
-                                ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
-                                fused_mul_mat_vec = true;
-                                fused_node_count = 5;
-                                break;
-                            }
-
-                            if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
-                                ggml_cuda_mm_fusion_args_host fusion_data{};
-                                fusion_data.gate      = gate_n->src[0];
-                                fusion_data.x_bias    = up_bias_tensor;
-                                fusion_data.gate_bias = gate_bias_tensor;
-                                fusion_data.glu_op    = ggml_get_glu_op(glu);
-
-                                ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
-                                fused_mul_mat_vec = true;
-                                fused_node_count = 5;
-                                break;
-                            }
-                        } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
-                            ggml_tensor * glu  = cgraph->nodes[i + 2];
-                            ggml_tensor * gate = glu->src[0];
-                            ggml_tensor * up   = glu->src[1];
-
-                            bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
-                                || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
-
-                            if (!ok) continue;
-
-                            const ggml_tensor * src0 = up->src[0];
-                            const ggml_tensor * src1 = up->src[1];
-                            const ggml_tensor * ids  = up->src[2];
-
-                            if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
-                                ggml_cuda_mm_fusion_args_host fusion_data{};
-                                fusion_data.gate   = gate->src[0];
-                                fusion_data.glu_op = ggml_get_glu_op(glu);
-
-                                ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
-                                fused_mul_mat_vec = true;
-                                fused_node_count = 3;
-                                break;
-                            }
-
-                            if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
-                                ggml_cuda_mm_fusion_args_host fusion_data{};
-                                fusion_data.gate   = gate->src[0];
-                                fusion_data.glu_op = ggml_get_glu_op(glu);
-
-                                ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
-                                fused_mul_mat_vec = true;
-                                fused_node_count = 3;
-                                break;
-                            }
-                        }
-                    }
-
-                    if (fused_mul_mat_vec) {
-                        i += fused_node_count - 1;
-                        continue;
-                    }
-
-                    fused_mul_mat_vec = false;
-                    fused_node_count = 0;
-
-                    for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
-                        const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
-
-                        if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
-                            continue;
-                        }
-
-                        ggml_tensor * mm_node   = cgraph->nodes[i];
-                        ggml_tensor * bias_node = cgraph->nodes[i + 1];
-
-                        ggml_tensor * bias_tensor = nullptr;
-                        if (bias_op == GGML_OP_ADD) {
-                            if (bias_node->src[0] == mm_node) {
-                                bias_tensor = bias_node->src[1];
-                            } else if (bias_node->src[1] == mm_node) {
-                                bias_tensor = bias_node->src[0];
-                            } else {
-                                continue;
-                            }
-                        } else {
-                            if (bias_node->src[0] != mm_node) {
-                                continue;
-                            }
-                            bias_tensor = bias_node->src[1];
-                        }
-
-                        const ggml_tensor * src0 = mm_node->src[0];
-                        const ggml_tensor * src1 = mm_node->src[1];
-                        const ggml_tensor * ids  = mm_node->src[2];
-
-                        if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
-                            continue;
-                        }
-
-                        if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
-                            continue;
-                        }
-
-                        ggml_cuda_mm_fusion_args_host fusion_data{};
-                        fusion_data.x_bias = bias_tensor;
-
-                        if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
-                            ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
-                            fused_mul_mat_vec = true;
-                            fused_node_count = 2;
-                            break;
-                        }
-
-                        if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
-                            ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
-                            fused_mul_mat_vec = true;
-                            fused_node_count = 2;
-                            break;
-                        }
-                    }
-
-                    if (fused_mul_mat_vec) {
-                        i += fused_node_count - 1;
-                        continue;
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
-                        ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
-                        i += 2;
-                        continue;
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
-                        ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-                        i++;
-                        continue;
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
-                        ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i+1]);
-                        i++;
-                        continue;
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
-                        ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
-                        ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
-                        ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i+1]);
-                        i++;
-                        continue;
-                    }
-
-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
-                        i += 2;
-                        ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
-                        continue;
-                    }
+                if (nodes_to_skip != 0) {
+                    i += nodes_to_skip;
+                    continue;
                }
 #ifndef NDEBUG
                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -1015,25 +1015,35 @@ namespace ggml_cuda_mma {
 #endif // AMD_MFMA_AVAILABLE
    }

-    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> &     D,
-                                                            const tile<16, 8, int> & A,
-                                                            const tile<8, 8, int> &  B,
-                                                            uint32_t                 a_scale,
-                                                            uint32_t                 b_scale) {
+    template <ggml_type type>
+    static __device__ __forceinline__ void mma_block_scaled_fp4(tile<16, 8, float> &     D,
+                                                                const tile<16, 8, int> & A,
+                                                                const tile<8, 8, int> &  B,
+                                                                uint32_t                 a_scale,
+                                                                uint32_t                 b_scale) {
 #ifdef BLACKWELL_MMA_AVAILABLE
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        float *     Dxi = (float *) D.x;

-        asm volatile(
-            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
-            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
-            "%10, {0, 0}, %11, {0, 0};"
-            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
-            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        if constexpr (type == GGML_TYPE_MXFP4) {
+            asm volatile(
+                "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+                "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+                "%10, {0, 0}, %11, {0, 0};"
+                : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+                : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        } else {
+            asm volatile(
+                "mma.sync.aligned.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue4m3 "
+                "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+                "%10, {0, 0}, %11, {0, 0};"
+                : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+                : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        }
 #else
        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
-#endif  // BLACKWELL_MMA_AVAILABLE
+#endif // BLACKWELL_MMA_AVAILABLE
    }

    static __device__ __forceinline__ void mma(
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -122,7 +122,7 @@ void ggml_cuda_mul_mat_q(
                            || GGML_CUDA_CC_IS_CDNA(cc);

    // TODO: tighter pool buffer size vs q8 path
-    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+    const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);

    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -133,9 +133,9 @@ void ggml_cuda_mul_mat_q(
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
-            if (use_native_mxfp4) {
+            if (use_native_fp4) {
                static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
-                quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                quantize_mmq_fp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
                                        ne11, ne12, ne13, stream);

            } else {
@@ -146,10 +146,8 @@ void ggml_cuda_mul_mat_q(
        }

        // Stride depends on quantization format
-        const int64_t s12 = use_native_mxfp4 ?
-                                ne11 * ne10_padded * sizeof(block_fp4_mmq) /
-                                    (8 * QK_MXFP4 * sizeof(int))  // block_fp4_mmq holds 256 values (8 blocks of 32)
-                                :
+        const int64_t s12 = use_native_fp4 ?
+                                ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) :  // block_fp4_mmq holds 256 values
                                ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
        const int64_t s13 = ne12*s12;

@@ -198,8 +196,8 @@ void ggml_cuda_mul_mat_q(
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[3] / ts_src1;

-        if (use_native_mxfp4) {
-            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+        if (use_native_fp4) {
+            quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
                                    ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
        } else {
            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -208,8 +206,9 @@ void ggml_cuda_mul_mat_q(
        CUDA_CHECK(cudaGetLastError());
    }

-    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
-                                           ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
+    static_assert(QK_K == 8 * QK_MXFP4, "QK_K needs to be 8 * QK_MXFP4");
+    const int64_t s12 = use_native_fp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) :
+                                         ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -10,9 +10,9 @@
 using namespace ggml_cuda_mma;

 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
-#define MMQ_ITER_K 256
-#define MMQ_ITER_K_MXFP4_FP4    512
-#define MMQ_NWARPS 8
+#define MMQ_ITER_K             256
+#define MMQ_ITER_K_FP4         512
+#define MMQ_NWARPS               8

 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
 typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
@@ -46,9 +46,12 @@ struct block_q8_1_mmq {
    int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
 };

+// this struct is used for fp4 data types (currently only used for Blackwell)
+// mxfp4 has block size 32, each int32 of d4 contains 2 e8m0 scales in the lower 16 bits
+// nvfp4 has block size 16, each int32 of d4 contains 4 ue4m3 scales
 struct block_fp4_mmq {
-    uint32_t d4[4];       // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
-    int8_t   qs[4 * 32];  // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
+    uint32_t d4[4];
+    int8_t   qs[4 * 32];  // 256 FP4 values packed as 4-bit pairs (2 per byte)
 };

 static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
@@ -143,10 +146,11 @@ static int get_mmq_y_host(const int cc) {

 static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
 #if defined(BLACKWELL_MMA_AVAILABLE)
-    return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
-#else
-    return MMQ_ITER_K;
+if (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) {
+    return MMQ_ITER_K_FP4;
+}
 #endif // defined(BLACKWELL_MMA_AVAILABLE)
+    return MMQ_ITER_K;
 }

 static constexpr __device__ int get_mmq_y_device() {
@@ -213,8 +217,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
 }

 #define MMQ_MMA_TILE_X_K_Q8_0  (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0                   + 4)
-#define MMQ_MMA_TILE_X_K_FP4   (2*MMQ_TILE_NE_K + 8                                       + 4) // MXFP4
-#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2                         + 4) // NVFP4
+#define MMQ_MMA_TILE_X_K_FP4   (2*MMQ_TILE_NE_K + 8                                       + 4) // MXFP4 and NVFP4 Blackwell
+#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2                         + 4) // NVFP4 Generic
 #define MMQ_MMA_TILE_X_K_Q8_1  (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0                   + 4)
 #define MMQ_MMA_TILE_X_K_Q2_K  (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K                           + 4)
 #define MMQ_MMA_TILE_X_K_Q3_K  (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2                         + 4)
@@ -240,7 +244,11 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
        case GGML_TYPE_Q8_0:    return MMQ_MMA_TILE_X_K_Q8_0;
        // tile sizes are the same for Q8_1 and FP4 for blackwell
        case GGML_TYPE_MXFP4:   return MMQ_MMA_TILE_X_K_Q8_1;
+#if defined(BLACKWELL_MMA_AVAILABLE)
+        case GGML_TYPE_NVFP4:   return MMQ_MMA_TILE_X_K_FP4;
+#else
        case GGML_TYPE_NVFP4:   return MMQ_MMA_TILE_X_K_NVFP4;
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
        case GGML_TYPE_Q2_K:    return MMQ_MMA_TILE_X_K_Q2_K;
        case GGML_TYPE_Q3_K:    return MMQ_MMA_TILE_X_K_Q3_K;
        case GGML_TYPE_Q4_K:    return MMQ_MMA_TILE_X_K_Q8_1;
@@ -934,6 +942,128 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
    }
 }

+#ifdef BLACKWELL_MMA_AVAILABLE
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_nvfp4_nvfp4(const char * __restrict__ x,
+                                                            int * __restrict__ x_tile,
+                                                            const int kbx0,
+                                                            const int i_max,
+                                                            const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+    constexpr int iter_k = get_iter_k(GGML_TYPE_NVFP4);
+    constexpr int threads_per_row = iter_k / QK_NVFP4; // each thread processes 1 block
+    constexpr int rows_per_warp = warp_size / threads_per_row;
+
+    uint32_t * x_u32 = (uint32_t *) x_tile;
+
+    const int txi = threadIdx.x;
+    const int kbx = txi % threads_per_row;
+    const int row_in_warp = txi / threads_per_row;
+
+    const block_nvfp4 * bxi_base = (const block_nvfp4 *) x + kbx0 + kbx;
+    uint32_t * x_u32_scale = x_u32 + 64 + kbx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
+        int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
+
+        if constexpr (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_nvfp4 * bxi = bxi_base + i * stride;
+        const int row_base = i * MMQ_MMA_TILE_X_K_FP4;
+        const int q_base = row_base + 8 * kbx;
+
+        const uint32_t * src_qs = reinterpret_cast<const uint32_t *>(bxi->qs);
+
+#pragma unroll
+        for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
+            x_u32[q_base + 2 * sub + 0] = src_qs[2 * sub + 0];
+            x_u32[q_base + 2 * sub + 1] = src_qs[2 * sub + 1];
+        }
+
+        x_u32_scale[row_base] = get_int_b4(bxi->d, 0);
+    }
+}
+
+// Shared MMA kernel for MXFP4 and NVFP4 on Blackwell.
+// Both quantizations encode values as e2m1 (FP4) and produce one uint32 scale per
+// m16n8k64 MMA call; only the PTX kind (scale_vec::2X ue8m0 vs scale_vec::4X ue4m3)
+// and the per-type stride constant differ.
+template <int mmq_x, int mmq_y, ggml_type type>
+static __device__ __forceinline__ void vec_dot_fp4_fp4_mma(const int * __restrict__ x,
+                                                           const int * __restrict__ y,
+                                                           float * __restrict__ sum,
+                                                           const int k00) {
+    static_assert(type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4,
+                  "vec_dot_fp4_fp4_mma: type must be MXFP4 or NVFP4");
+
+    typedef tile<16, 8, int>   tile_A;
+    typedef tile<8, 8, int>    tile_B;
+    typedef tile<16, 8, float> tile_C;
+
+    constexpr int stride        = MMQ_MMA_TILE_X_K_FP4;
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx           = rows_per_warp / tile_C::I;
+    constexpr int nfrags        = MMQ_TILE_NE_K / tile_A::J;
+
+    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_K);
+
+    const int *      x_qs = (const int *) x;
+    const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
+    const int *      y_qs = (const int *) y + 4;
+    const uint32_t * y_sc = (const uint32_t *) y;
+
+    // 2 threads per quad supply the packed scale register to the block_scale MMA,
+    // see https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
+    const int tidx_A = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
+    const int tidx_B = threadIdx.x / 4;
+    const int i0     = (threadIdx.y / ntx) * rows_per_warp;
+
+    tile_A   A[ntx][nfrags];
+    uint32_t scaleA[ntx][nfrags];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int frag = 0; frag < nfrags; ++frag) {
+            const int k0 = k00 + frag * tile_A::J;
+            load_ldmatrix(A[n][frag], x_qs + (i0 + n * tile_A::I) * stride + k0, stride);
+            scaleA[n][frag] = x_sc[(i0 + n * tile_A::I + tidx_A) * stride + k0 / tile_A::J];
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
+        tile_B   B[nfrags];
+        uint32_t scaleB[nfrags];
+
+#pragma unroll
+        for (int frag = 0; frag < nfrags; ++frag) {
+            const int k0 = frag * tile_B::J;
+            load_generic(B[frag], y_qs + j0 * MMQ_TILE_Y_K + k0, MMQ_TILE_Y_K);
+            scaleB[frag] = y_sc[(j0 + tidx_B) * MMQ_TILE_Y_K + frag];
+        }
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+            for (int frag = 0; frag < nfrags; ++frag) {
+                tile_C C = {};
+                mma_block_scaled_fp4<type>(C, A[n][frag], B[frag], scaleA[n][frag], scaleB[frag]);
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
+                }
+            }
+        }
+    }
+}
+#endif // BLACKWELL_MMA_AVAILABLE
+

 template <int mmq_y, bool need_check>
 static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
@@ -1163,77 +1293,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 }

-template <int mmq_x, int mmq_y>
-static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
-                                                               const int * __restrict__ y,
-                                                               float * __restrict__ sum,
-                                                               const int k00) {
-    typedef tile<16, 8, int>   tile_A;
-    typedef tile<8, 8, int>    tile_B;
-    typedef tile<16, 8, float> tile_C;  // Output is float for native scaled MMA
-
-    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx           = rows_per_warp / tile_C::I;  // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
-
-    // Match layout from load_tiles_mxfp4_fp4
-    const int *      x_qs = (const int *) x;
-    const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
-    const int *      y_qs = (const int *) y + 4;
-    const uint32_t * y_sc = (const uint32_t *) y;
-
-    // tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
-    tile_A   A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-    uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-
-    // Block scale
-    // Each thread has to point to a 4 byte scale value
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
-
-    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            const int k0 = k00 + k01;
-
-            load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
-                          MMQ_MMA_TILE_X_K_FP4);
-
-            // based on block-scaling document, 2 threads in each quad need to supply to the scale value
-            const int tidx         = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
-            scaleA[n][k01 / (2 * QI_MXFP4)] =
-                *(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            tile_B   B;
-            uint32_t scaleB;  // 2xN scales
-
-            load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
-
-            scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                tile_C C;
-
-                mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
-#pragma unroll
-                for (int l = 0; l < tile_C::ne; ++l) {
-                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
-                }
-            }
-        }
-    }
-}

 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
@@ -3259,7 +3318,7 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
    static constexpr int              vdr          = VDR_MXFP4_Q8_1_MMQ;
 #ifdef BLACKWELL_MMA_AVAILABLE
    static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4_fp4<mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_MXFP4>;
 #else
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_mxfp4<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
@@ -3270,8 +3329,13 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
 template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_NVFP4> {
    static constexpr int              vdr          = VDR_NVFP4_Q8_1_MMQ;
+#ifdef BLACKWELL_MMA_AVAILABLE
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4_nvfp4<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_NVFP4>;
+#else
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
+#endif // BLACKWELL_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
 };

@@ -3406,7 +3470,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(

 #if defined(BLACKWELL_MMA_AVAILABLE)
    // FP4 tile stores 8 blocks
-    constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
+    constexpr int ne_block = (type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4) ? QK_K : 4 * QK8_1;
 #else
    constexpr int ne_block = 4 * QK8_1;
 #endif  // defined(BLACKWELL_MMA_AVAILABLE)
@@ -3478,10 +3542,10 @@ template <ggml_type type, int mmq_x, bool need_check>
 static __global__ void mul_mat_q(
        const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
        const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
-        const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
-        const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
-        const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
-        const int ncols_max) {
+        const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
+        const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
+        const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
+        const uint3 ntx) {

    // Skip unused template specializations for faster compilation:
    if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -3495,8 +3559,7 @@ static __global__ void mul_mat_q(
    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
    constexpr int mmq_y = get_mmq_y_device();

-    const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
-    const int nty = (nrows_x   + mmq_y - 1) / mmq_y; // Number of tiles y
+    const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y

    // Initialize the ids for writing back data with just the index.
    // For regular matrix multiplications this is never changed.
@@ -3517,8 +3580,9 @@ static __global__ void mul_mat_q(
    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
 #if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
    {
-        const int wt = blockIdx.z / nchannels_y;
-        const int zt = blockIdx.z - wt*nchannels_y;
+        const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
+        const int wt = tmp2.x;
+        const int zt = tmp2.y;
        const int jt = blockIdx.y;
        const int it = blockIdx.x;

@@ -3561,40 +3625,40 @@ static __global__ void mul_mat_q(
        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

-        const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
+        const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        constexpr bool fixup = false;
        mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
-             tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
+             tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
        return;
    }
 #endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

-    constexpr int ITER_K = get_iter_k(type);
-
-    const     int64_t blocks_per_ne00 = ncols_x / qk;
-    constexpr int     blocks_per_iter = ITER_K / qk;
+    constexpr int ITER_K          = get_iter_k(type);
+    constexpr int blocks_per_iter = ITER_K / qk;

    // kbc == k block continuous, current index in continuous ijk space.
-    int64_t kbc      = (int64_t) blockIdx.x     *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
-    int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
+    int kbc      = int64_t(blockIdx.x)    *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
+    int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;

-    kbc      -= (kbc      % blocks_per_ne00) % blocks_per_iter;
-    kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
+    kbc      -= fastmodulo(kbc,      blocks_per_ne00) % blocks_per_iter;
+    kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;

    // kb0 == k index when doing the matrix multiplication for an output tile.
-    int kb0_start = kbc % blocks_per_ne00;
-    int kb0_stop  = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
-    while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
-        int tmp = kbc;
-        const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-        tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-        const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
-        tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
-        const int zt = tmp / (ntx*blocks_per_ne00);
-        tmp -= zt * (ntx*blocks_per_ne00);
-        const int jt = tmp / blocks_per_ne00;
+    int kb0_start = fastmodulo(kbc, blocks_per_ne00);
+    int kb0_stop  = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
+    while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
+        int tmp = fastdiv(kbc, blocks_per_ne00);
+        uint2 tmp2 = fast_div_modulo(tmp, ntx);
+        const int jt = tmp2.y;
+        tmp = tmp2.x;
+        tmp2 = fast_div_modulo(tmp, nchannels_y);
+        const int zt = tmp2.y;
+        tmp = tmp2.x;
+        tmp2 = fast_div_modulo(tmp, nsamples_y);
+        const int wt = tmp2.y;
+        const int it = tmp2.x;

        // Defaults for regular matrix multiplication:
        int col_low    = 0;
@@ -3612,11 +3676,11 @@ static __global__ void mul_mat_q(
            offset_dst = 0;

            if (jt*mmq_x >= col_diff) {
-                kbc += blocks_per_ne00;
-                kbc -= kbc % blocks_per_ne00;
+                kbc += blocks_per_ne00.z;
+                kbc -= fastmodulo(kbc, blocks_per_ne00);

                kb0_start = 0;
-                kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);
+                kb0_stop  = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));

                continue;
            }
@@ -3641,32 +3705,34 @@ static __global__ void mul_mat_q(
        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
        const int tile_y_max_j = col_diff - jt*mmq_x - 1;

-        const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
+        const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

        constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
        mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
             tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);

-        kbc += blocks_per_ne00;
-        kbc -= kbc % blocks_per_ne00;
+        kbc += blocks_per_ne00.z;
+        kbc -= fastmodulo(kbc, blocks_per_ne00);

        kb0_start = 0;
-        kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);
+        kb0_stop  = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
    }

    if (kbc >= kbc_stop) {
        return;
    }

-    int tmp = kbc;
-    const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-    tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-    const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
-    tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
-    const int zt = tmp / (ntx*blocks_per_ne00);
-    tmp -= zt * (ntx*blocks_per_ne00);
-    const int jt = tmp / blocks_per_ne00;
+    int tmp = fastdiv(kbc, blocks_per_ne00);
+    uint2 tmp2 = fast_div_modulo(tmp, ntx);
+    const int jt = tmp2.y;
+    tmp = tmp2.x;
+    tmp2 = fast_div_modulo(tmp, nchannels_y);
+    const int zt = tmp2.y;
+    tmp = tmp2.x;
+    tmp2 = fast_div_modulo(tmp, nsamples_y);
+    const int wt = tmp2.y;
+    const int it = tmp2.x;

    // Defaults for regular matrix multiplication:
    int col_low    = 0;
@@ -3708,7 +3774,7 @@ static __global__ void mul_mat_q(
    const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
    const int tile_y_max_j = col_diff - jt*mmq_x - 1;

-    const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
+    const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;

    constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
@@ -3717,46 +3783,37 @@ static __global__ void mul_mat_q(
 }

 template <ggml_type type, int mmq_x, bool need_check>
-static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
-                                                const int32_t * expert_bounds,
-                                                float * __restrict__ dst,
-                                                const float * __restrict__ tmp_last_tile,
-                                                const int    ncols_x,
-                                                const int    nrows_x,
-                                                const int    ncols_dst,
-                                                const size_t stride_col_dst,
-                                                const int    nchannels_y,
-                                                const size_t stride_channel_dst,
-                                                const int    nsamples_y,
-                                                const size_t stride_sample_dst,
-                                                const int    ncols_max) {
-    constexpr int     mmq_y           = get_mmq_y_device();
-    constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
-    constexpr int     ITER_K          = get_iter_k(type);
+__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
+static __global__ void mul_mat_q_stream_k_fixup(
+        const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
+        float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
+        const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
+        const int stride_sample_dst, const uint3 ntx) {
+    constexpr int mmq_y           = get_mmq_y_device();
+    constexpr int qk              = ggml_cuda_type_traits<type>::qk;
+    constexpr int ITER_K          = get_iter_k(type);
+    constexpr int blocks_per_iter = ITER_K / qk;

-    constexpr int     blocks_per_iter = ITER_K / qk;
-    const     int64_t blocks_per_ne00 = ncols_x / qk;
-
-    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int nwarps = mmq_get_nwarps_device()/2;
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

-    float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
+    float sum[mmq_x / nwarps] = {0.0f};
+    const int i = blockIdx.y*warp_size + threadIdx.x;

-    const int ntx  = (ncols_max + mmq_x - 1) / mmq_x;
-    const int nty  = (nrows_x   + mmq_y - 1) / mmq_y;
+    const int nty = (nrows_x + mmq_y - 1) / mmq_y;

    const int bidx0 = blockIdx.x;

    // kbc == k block continuous, current index in continuous ijk space.
-    int64_t kbc0      = (int64_t) bidx0     *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
-    int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
+    int kbc0      = int64_t(blockIdx.x)    *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
+    int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;

-    kbc0      -= (kbc0      % blocks_per_ne00) % blocks_per_iter;
-    kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
+    kbc0      -= fastmodulo(kbc0,      blocks_per_ne00) % blocks_per_iter;
+    kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
-    const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
-    const bool did_not_write_last      = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
+    const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
+    const bool did_not_write_last      = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
    if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
        return;
    }
@@ -3765,11 +3822,11 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,

    // Iterate over previous blocks and sum up partial sums written to fixup buffer.
    // All CUDA blocks that get here must have a previous block that needs a fixup.
-    int64_t bidx = bidx0 - 1;
-    int64_t kbc_stop = kbc0;
+    int bidx = bidx0 - 1;
+    int kbc_stop = kbc0;
    while(true) {
-        int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
-        kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
+        int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
+        kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;

        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
@@ -3779,20 +3836,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,

        any_fixup = true;

+
 #pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

-#pragma unroll
-            for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
-                const int i = i0 + threadIdx.x;
-
-                sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
-            }
+            sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
        }

        // If this block started in a previous tile we are done and don't need to combine additional partial results.
-        if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
+        if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
            break;
        }
        bidx--;
@@ -3803,14 +3856,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
        return;
    }

-    int tmp = kbc0;
-    const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-    tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
-    const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
-    tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
-    const int zt = tmp / (ntx*blocks_per_ne00);
-    tmp -= zt * (ntx*blocks_per_ne00);
-    const int jt = tmp / blocks_per_ne00;
+    int tmp = fastdiv(kbc0, blocks_per_ne00);
+    uint2 tmp2 = fast_div_modulo(tmp, ntx);
+    const int jt = tmp2.y;
+    tmp = tmp2.x;
+    tmp2 = fast_div_modulo(tmp, nchannels_y);
+    const int zt = tmp2.y;
+    tmp = tmp2.x;
+    tmp2 = fast_div_modulo(tmp, nsamples_y);
+    const int wt = tmp2.y;
+    const int it = tmp2.x;

    if (!ids_dst) {
        const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
@@ -3818,6 +3873,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,

        const int i_max = nrows_x   - it*mmq_y - 1;
        const int j_max = ncols_dst - jt*mmq_x - 1;
+        if (need_check && i > i_max) {
+            return;
+        }

 #pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3827,16 +3885,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
                return;
            }

-#pragma unroll
-            for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
-                const int i = i0 + threadIdx.x;
-
-                if (need_check && i > i_max) {
-                    continue;
-                }
-
-                dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
-            }
+            dst[j*stride_col_dst + i] += sum[j0/nwarps];
        }
        return;
    }
@@ -3856,6 +3905,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,

    const int i_max = nrows_x  - it*mmq_y - 1;
    const int j_max = col_diff - jt*mmq_x - 1;
+    if (need_check && i > i_max) {
+        return;
+    }

 #pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3865,16 +3917,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
            return;
        }

-#pragma unroll
-        for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
-            const int i = i0 + threadIdx.x;
-
-            if (need_check && i > i_max) {
-                continue;
-            }
-
-            dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
-        }
+        dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
    }
 }

@@ -3922,29 +3965,44 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int channel_ratio = args.nchannels_y / args.nchannels_x;
    const int sample_ratio  = args.nsamples_y  / args.nsamples_x;

+    const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
+    const uint3 ntx_fd             = init_fastdiv_values(ntx);
+    const uint3 nchannels_y_fd     = init_fastdiv_values(args.nchannels_y);
+    const uint3 nsamples_y_fd      = init_fastdiv_values(args.nsamples_y);
+    const uint3 channel_ratio_fd   = init_fastdiv_values(channel_ratio);
+    const uint3 sample_ratio_fd    = init_fastdiv_values(sample_ratio);
+
    if (!args.use_stream_k) {
        if (args.nrows_x % mmq_y == 0) {
            constexpr bool need_check = false;
            mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-                 args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
-                 channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
-                 sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-                 args.ncols_max);
+                 blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
+                 channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
+                 sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
+                 ntx_fd);
        } else {
            constexpr bool need_check = true;
            mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-                 args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
-                 channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
-                 sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-                 args.ncols_max);
+                 blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
+                 channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
+                 sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
+                 ntx_fd);
        }
        return;
    }

-    const dim3 block_nums_stream_k(nsm, 1, 1);
-    const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
+    // For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
+    // This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
+    const int ntiles_dst = ntx * nty * ntzw;
+    const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
+    const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
+    const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
+
+    GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
+
+    const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;

    ggml_cuda_pool & pool = ctx.pool(id);
    ggml_cuda_pool_alloc<float> tmp_fixup(pool);
@@ -3952,40 +4010,45 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
        tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
    }

+    const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
+    const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
+
    if (args.nrows_x % mmq_y == 0) {
        constexpr bool need_check = false;
        mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
            (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
-             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
-             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-             args.ncols_max);
+             blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
+             channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
+             sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
+             ntx_fd);

        if (!fixup_needed) {
            return;
        }

-        mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
-             args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
-             args.ncols_max);
+        CUDA_CHECK(cudaGetLastError());
+        mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
+             args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
+             ntx_fd);
    } else {
        constexpr bool need_check = true;
        mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
            (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
-             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
-             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-             args.ncols_max);
+             blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
+             channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
+             sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
+             ntx_fd);

        if (!fixup_needed) {
            return;
        }

-        mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
-             args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
-             args.ncols_max);
+        CUDA_CHECK(cudaGetLastError());
+        mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
+             args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
+             ntx_fd);
    }
 }

--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -115,6 +115,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(gg
        case GGML_TYPE_IQ4_NL:  return 6;
        case GGML_TYPE_IQ4_XS:  return 5;
        case GGML_TYPE_MXFP4:   return 4;
+        case GGML_TYPE_NVFP4:   return 4;
        case GGML_TYPE_Q2_K:    return 4;
        case GGML_TYPE_Q3_K:    return 4;
        case GGML_TYPE_Q4_0:    return 6;
@@ -135,6 +136,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggm
        case GGML_TYPE_IQ3_S:   return 6;
        case GGML_TYPE_IQ3_XXS: return 7;
        case GGML_TYPE_MXFP4:   return 7;
+        case GGML_TYPE_NVFP4:   return 8;
        case GGML_TYPE_Q2_K:    return 7;
        case GGML_TYPE_Q3_K:    return 5;
        default:                return MMVQ_MAX_BATCH_SIZE;
@@ -221,6 +223,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type
        case GGML_TYPE_IQ4_NL:  return 7;
        case GGML_TYPE_IQ4_XS:  return 5;
        case GGML_TYPE_MXFP4:   return 5;
+        case GGML_TYPE_NVFP4:   return 5;
        case GGML_TYPE_Q3_K:    return 4;
        case GGML_TYPE_Q4_0:    return 7;
        case GGML_TYPE_Q4_1:    return 7;
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -70,6 +70,102 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
    return static_cast<uint8_t>(biased);
 }

+
+static __global__ void quantize_mmq_nvfp4(
+        const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
+        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2) {
+#if defined(BLACKWELL_MMA_AVAILABLE)
+
+    const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
+    if (i0_base >= ne0) {
+        return;
+    }
+
+    const int64_t i1 = blockIdx.x;
+    const int64_t i2 = blockIdx.z % ne2;
+    const int64_t i3 = blockIdx.z / ne2;
+    const int64_t i01 = ids ? ids[i1] : i1;
+    const int64_t k_block = i0_base / QK_K;
+    const int64_t blocks_per_col = (ne0 + QK_K - 1) / QK_K;
+    if (k_block >= blocks_per_col) {
+        return;
+    }
+
+    const int64_t ib = blockIdx.z * ((int64_t) blocks_per_col * ne1) + k_block * ne1 + blockIdx.x;
+    block_fp4_mmq * y = (block_fp4_mmq *) vy;
+    block_fp4_mmq * yb = y + ib;
+
+    const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
+
+    float vals_raw[QK_NVFP4_SUB];
+    float amax_raw = 0.0f;
+    const int64_t base_idx = i3 * s03 + i2 * s02 + i01 * s01;
+#pragma unroll
+    for (int k = 0; k < QK_NVFP4_SUB; k++) {
+        const int64_t i00 = i0_base + k;
+        if (i00 < ne00) {
+            const float v = x[base_idx + i00];
+            vals_raw[k] = v;
+            amax_raw = fmaxf(amax_raw, fabsf(v));
+        } else {
+            vals_raw[k] = 0.0f;
+        }
+    }
+
+    static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2};
+    const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
+
+    float best_err = FLT_MAX;
+    uint8_t fp8_code = 0;
+    float subblock_scale = 0.0f;
+
+#pragma unroll // Check +/- 2 to find best code to reduce NVFP4 activation loss. Negligible overhead on Blackwell.
+    for (int i = 0; i < 5; i++) {
+        const int test_code = first_fp8_code + test_offsets[i];
+        if (test_code < 0 || test_code > 0x7e) {
+            continue;
+        }
+        const uint8_t code = (uint8_t) test_code;
+        const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
+        const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
+        float cur_err = 0.0f;
+#pragma unroll
+        for (int k = 0; k < QK_NVFP4_SUB; ++k) {
+            const float v = vals_raw[k];
+            const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
+            const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
+            cur_err = fmaf(err_diff, err_diff, cur_err);
+        }
+
+        if (cur_err < best_err) {
+            best_err = cur_err;
+            fp8_code = test_code;
+            subblock_scale = test_scale;
+        }
+    }
+
+    const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
+    uint32_t q0 = 0;
+    uint32_t q1 = 0;
+#pragma unroll // this is faster than the previous __nv_fp4x4_e2m1
+    for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  0], inv_scale) << (8 * k);
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  8], inv_scale) << (8 * k + 4);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  4], inv_scale) << (8 * k);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
+    }
+
+    uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
+    yqs[2 * sub + 0] = q0;
+    yqs[2 * sub + 1] = q1;
+    reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
+#else
+    NO_DEVICE_CODE; // This is for Blackwell NVFP4 activations only.
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+
+}
+
 // quantize values in the format mxfp4 is stored which is interleaved nibbles
 // i.e. a block a0-a31 is represented as a0a16,a1a17 ...a15a31
 static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
@@ -316,28 +412,32 @@ void quantize_mmq_q8_1_cuda(
    }
 }

-void quantize_mmq_mxfp4_cuda(const float *                    x,
-                             const int32_t *                  ids,
-                             void *                           vy,
-                             [[maybe_unused]] const ggml_type type_src0,
-                             const int64_t                    ne00,
-                             const int64_t                    s01,
-                             const int64_t                    s02,
-                             const int64_t                    s03,
-                             const int64_t                    ne0,
-                             const int64_t                    ne1,
-                             const int64_t                    ne2,
-                             const int64_t                    ne3,
-                             cudaStream_t                     stream) {
-    GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
+void quantize_mmq_fp4_cuda(
+        const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
+        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+    GGML_ASSERT(type_src0 == GGML_TYPE_MXFP4 || type_src0 == GGML_TYPE_NVFP4);
+    GGML_ASSERT(ne0 > 0);

-    constexpr int nwarps = 8;
-    constexpr int vals_per_warp  = 2 * QK_MXFP4;
-    constexpr int vals_per_block = nwarps * vals_per_warp;
+    if (type_src0 == GGML_TYPE_NVFP4) {
+        GGML_ASSERT(ne00 % QK_NVFP4 == 0);
+        constexpr int nvfp4_block_size = 128;
+        const int64_t block_num_y = (ne0 + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
+        const dim3 block_size(nvfp4_block_size, 1, 1);
+        const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
+        quantize_mmq_nvfp4<<<num_blocks, block_size, 0, stream>>>(
+            x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    } else {
+        GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);

-    const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
-    const dim3    num_blocks(ne1, block_num_y, ne2 * ne3);
-    const dim3    block_size(WARP_SIZE, nwarps, 1);
+        constexpr int nwarps = 8;
+        constexpr int vals_per_warp  = 2 * QK_MXFP4;
+        constexpr int vals_per_block = nwarps * vals_per_warp;

-    quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+        const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
+        const dim3    num_blocks(ne1, block_num_y, ne2 * ne3);
+        const dim3    block_size(WARP_SIZE, nwarps, 1);
+
+        quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    }
 }
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -26,7 +26,7 @@ void quantize_mmq_q8_1_cuda(
        ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
        int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

-void quantize_mmq_mxfp4_cuda(const float *   x,
+void quantize_mmq_fp4_cuda(const float *   x,
                             const int32_t * ids,
                             void *          vy,
                             ggml_type       type_src0,
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
 DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
 DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32);
--- a/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-tile.cuh"
+
+DECL_FATTN_TILE_CASE(320, 256);
--- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -3,7 +3,7 @@
 from glob import glob
 import os

-HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 512, 576]
+HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]

 TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]

@@ -62,7 +62,7 @@ for filename in glob("*.cu"):
    os.remove(filename)

 for head_size_kq in HEAD_SIZES_KQ:
-    head_size_v = head_size_kq if head_size_kq != 576 else 512
+    head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
    with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
        f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))

@@ -84,13 +84,16 @@ for ncols in [8, 16, 32, 64]:
                    continue
                if head_size_kq == 72:
                    continue
-                if head_size_kq == 512 and ncols2 not in (4, 8):
+                # Skip compilation of unused ncols2 values for niche head sizes:
+                if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
                    continue
-                if head_size_kq != 576 and ncols2 in (16, 32):
+                if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
                    continue
-                if head_size_kq == 576 and ncols2 not in (4, 16, 32):
+                if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
                    continue
-                head_size_v = head_size_kq if head_size_kq != 576 else 512
+                if head_size_kq not in (320, 576) and ncols2 in (16, 32):
+                    continue
+                head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
                f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))

 for type in TYPES_MMQ:
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) {
    return x * x;
 }

+static __device__ __forceinline__ float op_relu_sqr(float x) {
+    const float r = fmaxf(x, 0.0f);
+    return r * r;
+}
+
 static __device__ __forceinline__ float op_sqrt(float x) {
    return sqrtf(x);
 }
@@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
            GGML_ABORT("Unsupported unary op for fused unary+mul");
    }
 }
+
+/* fused relu + sqr */
+
+void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
+    const ggml_tensor * src = relu_node->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src));
+    GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
+    GGML_ASSERT(src->type == sqr_node->type);
+
+    const int k = ggml_nelements(src);
+    if (src->type == GGML_TYPE_F16) {
+        unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
+    } else {
+        unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
+    }
+}
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);

+void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
+
 __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    return x / (1.0f + expf(-x));
 }
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -12,9 +12,12 @@
 #include <cstddef>
 #include <stdexcept>
 #include <string>
+#include <sstream>
+#include <iomanip>
 #include <unordered_set>
 #include <unordered_map>
 #include <regex>
+#include <queue>

 #ifdef _WIN32
 #    include <sal.h>
@@ -41,18 +44,26 @@
 #include "htp_iface.h"
 #include "htp-drv.h"

+using intvec  = std::vector<int>;
+using uintvec = std::vector<unsigned int>;
+using u32vec  = std::vector<uint32_t>;
+
 static size_t opt_ndev         = 1;
 static size_t opt_nhvx         = 0; // use all
 static int    opt_arch         = 0; // autodetect
 static int    opt_etm          = 0;
 static int    opt_verbose      = 0;
-static int    opt_profile      = 0;
+static int    opt_profile      = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
 static int    opt_hostbuf      = 1; // hostbuf ON by default
 static int    opt_use_hmx      = 1; // when set, enable HMX; when 0, use HVX only

+// Default PMU events, if profiling with PMU (mode=2) is enabled
+// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
+//     https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
+static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
+
 // Enable all stages by default
-static int opt_opmask   = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
-static int opt_opsync   = 0;  // synchronous ops
+static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
 static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
 }

 static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
-                                      uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
+                                      uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
    if (!opt_profile) return;

    op_desc desc(op);
-    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
-                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
-                op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
+
+    char pmu_str[256] = "";
+    if (opt_profile > 1) {
+        static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
+        sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
+                pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
+    }
+
+    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
+            ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
 }

 // ** backend sessions

 struct ggml_hexagon_opbatch;
-struct ggml_hexagon_opshm;
+struct ggml_hexagon_opqueue;

 struct ggml_hexagon_session {
    std::string      name;
@@ -132,8 +150,8 @@ struct ggml_hexagon_session {
    bool             valid_iface;

    std::atomic<int>      op_pending;
-    ggml_hexagon_opbatch *op_batch;
-    ggml_hexagon_opshm   *op_shm;
+    ggml_hexagon_opbatch* op_batch;
+    ggml_hexagon_opqueue* op_queue;

    ggml_backend_buffer_type buffer_type        = {};
    ggml_backend_buffer_type repack_buffer_type = {};
@@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf

 // Backend session implementation

-struct ggml_hexagon_opshm {
-    ggml_hexagon_shared_buffer *sbuf;
-
-    std::vector<bool> block_mask;
-    size_t            block_size;
-
-    uint8_t * base()     const { return this->sbuf->base; }
-    int       fd()       const { return this->sbuf->fd;   }
-    size_t    n_blocks() const { return this->block_mask.size(); }
-
-    ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
-        size_t n_bufs    = HTP_OP_MAX_BUFS;
-        size_t n_ops     = max_batch;
-        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
-
-        block_mask.resize(max_pending, true);
-
-        block_size = sizeof(htp_buf_desc) * n_bufs    +
-                     sizeof(htp_tensor)   * n_tensors +
-                     sizeof(htp_op_desc)  * n_ops;
-
-        sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
-
-        if (opt_verbose) {
-            GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
-                    sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
-        }
-    }
-
-    ~ggml_hexagon_opshm() {
-        delete sbuf;
-    }
-
-    uint8_t * allocate() {
-        auto it = std::find(block_mask.begin(), block_mask.end(), true);
-        if (it == block_mask.end())
-            return nullptr;
-
-        unsigned int i = std::distance(block_mask.begin(), it);
-        uint8_t*  addr = sbuf->base + (i * block_size);
-        block_mask[i]  = false;
-
-        HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
-        return addr;
-    }
-
-    void release(uint8_t * addr) {
-        int i = (addr - sbuf->base) / block_size;
-        block_mask[i] = true;
-        HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
-    }
-};
-
 struct ggml_hexagon_opbatch {
-    const char* name;
+    ggml_hexagon_session*            sess;

-    std::vector<htp_buf_desc> buffers;
-    std::vector<htp_tensor>   tensors;
-    std::vector<htp_op_desc>  ops;
+    std::vector<const ggml_tensor*>  ops;       // pointers to original ops
+
+    std::vector<htp_buf_desc>        h_bufs;    // htp buffer descriptors
+    std::vector<htp_tensor>          h_tens;    // htp tensor descriptors
+    std::vector<htp_op_desc>         h_ops;     // htp op descriptors

    std::unordered_map<int, int>                b_map; // buffer fd   to index
    std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr  to index
@@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
        d_map.clear();
    }

-    ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
-        name = sess->c_name();
+    ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
+        this->sess = sess;

        n_bufs_max = HTP_OP_MAX_BUFS;
-        n_ops_max  = max_batch;
+        n_ops_max  = batch_size;
        n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;

        b_vmem_max = HTP_OP_MAX_VMEM;

-        buffers.resize(n_bufs_max);
-        tensors.resize(n_tens_max);
        ops.resize(n_ops_max);

+        h_bufs.resize(n_bufs_max);
+        h_tens.resize(n_tens_max);
+        h_ops.resize(n_ops_max);
+
        b_map.reserve(n_bufs_max);
        t_map.reserve(n_tens_max);
        d_map.reserve(n_tens_max);
@@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {

        b_map.insert({sbuf->fd, bi});

-        htp_buf_desc &b = buffers[bi];
+        htp_buf_desc &b = h_bufs[bi];
        b.base = (uint64_t) sbuf->base;
        b.fd   = sbuf->fd;
        b.size = sbuf->size;
@@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
        // First lookup by tensor data
        auto range = d_map.equal_range(t->data);
        for (auto it = range.first; it != range.second; ++it) {
-            htp_tensor * h = &tensors[it->second];
+            htp_tensor * h = &h_tens[it->second];
            if (same_shape(h, t)) { return it->second; }
        }

@@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
        uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
        size_t   t_size   = ggml_nbytes(t);

-        htp_tensor &h = tensors[ti];
+        htp_tensor &h = h_tens[ti];
        h.bi    = add_buffer(sbuf);
        h.data  = t_offset;
        h.size  = t_size;
@@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
    // assumes that fit_op() was called first and returned true
    void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
        // Add new op
-        htp_op_desc &o = ops[n_ops++];
+
+        unsigned int n = n_ops++;
        GGML_ASSERT(n_ops <= n_ops_max);

+        ops[n] = t;
+
+        htp_op_desc &o = h_ops[n];
        memcpy(&o.params, &t->op_params, sizeof(t->op_params));
        o.opcode = opcode;
        o.flags  = 0;

-        if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
+        if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
            o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
        }

-        ggml_hexagon_dump_op_exec(name, t, o.flags);
+        ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);

        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
            o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
        }
        o.dst = add_tensor(t);
    }
+};

-    size_t flush(uint8_t * mem_addr, size_t mem_size) {
-        static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
-        static_assert(sizeof(htp_tensor)   % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
-        static_assert(sizeof(htp_op_desc)  % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
+struct ggml_hexagon_opqueue {
+    // Shared buffer for storing batches
+    ggml_hexagon_shared_buffer *shm_buf;
+    size_t                      shm_blk_size;

-        const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
-        const size_t t_size = sizeof(htp_tensor)   * n_tens;
-        const size_t o_size = sizeof(htp_op_desc)  * n_ops;
+    using opvec = std::vector<const ggml_tensor*>;

-        const size_t m_size = b_size + t_size + o_size;
-        GGML_ASSERT(m_size <= mem_size);
+    std::queue<unsigned int>    done;       // completed batch ids
+    std::vector<opvec>          op_cache;   // per batch op cache
+    std::vector<uint64_t>       start_usec; // per batch start time

-        uint8_t * b_ptr = (uint8_t *) mem_addr;
-        uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
-        uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
+    ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
+        size_t n_bufs    = HTP_OP_MAX_BUFS;
+        size_t n_ops     = batch_size;
+        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;

-        memcpy(b_ptr, (void *) buffers.data(), b_size);
-        memcpy(t_ptr, (void *) tensors.data(), t_size);
-        memcpy(o_ptr, (void *) ops.data(),     o_size);
+        shm_blk_size = sizeof(htp_buf_desc)  * n_bufs    +
+                       sizeof(htp_tensor)    * n_tensors +
+                       sizeof(htp_op_desc)   * n_ops     +
+                       sizeof(htp_prof_desc) * n_ops;

-        HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
-                name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
+        shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
+
+        op_cache.resize(depth);
+        start_usec.resize(depth, 0);
+
+        // init done queue
+        for (unsigned int i = 0; i < depth; i++) { done.push(i); }
+
+        if (opt_verbose) {
+            GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
+                    sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
+        }
+    }
+
+    ~ggml_hexagon_opqueue() {
+        delete shm_buf;
+    }
+
+    // push new batch
+    bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
+        static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
+        static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
+        static_assert(sizeof(htp_buf_desc)    % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
+        static_assert(sizeof(htp_tensor)      % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
+        static_assert(sizeof(htp_op_desc)     % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
+        static_assert(sizeof(htp_prof_desc)   % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
+
+        if (done.empty()) { return false; }
+
+        req.id        = done.front(); done.pop(); // batch id
+        req.n_bufs    = op_batch->n_bufs;
+        req.n_tensors = op_batch->n_tens;
+        req.n_ops     = op_batch->n_ops;
+
+        op_cache[req.id]   = op_batch->ops;
+        start_usec[req.id] = ggml_time_us();
+
+        const size_t b_size = sizeof(htp_buf_desc)  * req.n_bufs;
+        const size_t t_size = sizeof(htp_tensor)    * req.n_tensors;
+        const size_t o_size = sizeof(htp_op_desc)   * req.n_ops;
+        const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
+
+        dbuf.ptr      = shm_buf->base + (req.id * shm_blk_size);
+        dbuf.fd       = shm_buf->fd;
+        dbuf.flags    = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+        dbuf.offset   = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
+        dbuf.size     = b_size + t_size + o_size + p_size;
+
+        GGML_ASSERT(dbuf.size <= shm_blk_size);
+
+        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
+        uint8_t * b_ptr = m_ptr; m_ptr += b_size;
+        uint8_t * t_ptr = m_ptr; m_ptr += t_size;
+        uint8_t * o_ptr = m_ptr;
+
+        memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
+        memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
+        memcpy(o_ptr, (void *) op_batch->h_ops.data(),  o_size);
+
+        HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
+                shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
+                b_size, t_size, o_size, (size_t) dbuf.size);
+
+        op_batch->reset();

        if (opt_verbose > 1) {
            htp_buf_desc *b = (htp_buf_desc*) b_ptr;
-            for (unsigned int i=0; i < n_bufs; i++) {
-                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
+            for (unsigned int i=0; i < req.n_bufs; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
                            b[i].fd, (void *) b[i].base, (size_t) b[i].size);
            }
            htp_tensor *t = (htp_tensor*) t_ptr;
-            for (unsigned int i=0; i < n_tens; i++) {
+            for (unsigned int i=0; i < req.n_tensors; i++) {
                GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
-                            name, i, t[i].bi, t[i].data, t[i].size,
+                            shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
                            (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
            }
        }

-        reset();
+        return true;
+    }

-        return m_size;
+    void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
+        GGML_ASSERT(rsp.id < op_cache.size());
+
+        done.push(rsp.id);
+
+        const size_t b_size = sizeof(htp_buf_desc)  * rsp.n_bufs;
+        const size_t t_size = sizeof(htp_tensor)    * rsp.n_tensors;
+        const size_t o_size = sizeof(htp_op_desc)   * rsp.n_ops;
+        const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
+
+        const size_t m_size = b_size + t_size + o_size + p_size;
+        GGML_ASSERT(m_size <= shm_blk_size);
+
+        HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
+                shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
+                (size_t) dbuf.size, b_size, t_size, o_size);
+
+        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
+        uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
+
+        if (opt_profile && rsp.n_ops > 0) {
+            auto & ops = op_cache[rsp.id];
+
+            uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
+            uint32_t htp_usec   = 0;
+
+            GGML_ASSERT(rsp.n_ops <= ops.size());
+
+            const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
+            for (uint32_t i = 0; i < rsp.n_ops; i++) {
+                htp_usec += pd[i].usecs;
+                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
+            }
+
+            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
+                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
+        }
    }
 };

@@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
            GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
        }

-        op_shm->release((uint8_t*) dbuf.ptr);
-
        if (rsp.status != HTP_STATUS_OK) {
            GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
            // TODO: handle errors
        }

-        // FIXME: profile will be per opreq
-        // this->prof_usecs  = rsp.prof_usecs;
-        // this->prof_cycles = rsp.prof_cycles;
-        // this->prof_pkts   = rsp.prof_pkts;
+        op_queue->pop(rsp, dbuf);

        this->op_pending--;  // atomic dec

@@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
 void ggml_hexagon_session::flush_batch() {
    if (op_batch->empty()) { return; }

-    htp_opbatch_req req;
-    req.n_bufs    = op_batch->n_bufs;
-    req.n_tensors = op_batch->n_tens;
-    req.n_ops     = op_batch->n_ops;
+    htp_opbatch_req req {};
+    dspqueue_buffer dbuf{};

-    dspqueue_buffer dbuf;
-    dbuf.fd     = op_shm->fd();
-    dbuf.flags  = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
-    dbuf.ptr    = op_shm->allocate();
-    if (!dbuf.ptr) {
+    if (!op_queue->push(req, dbuf, op_batch)) {
        flush_pending(false);
-        dbuf.ptr = op_shm->allocate();
+        op_queue->push(req, dbuf, op_batch);
    }

-    dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
-    dbuf.size   = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
-
    // Bump pending flag (cleared in the session::flush once we get the response)
    this->op_pending++;  // atomic inc

-    HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
-
    int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
    if (err != 0) {
        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
    }

    if (opt_etm) {
-        err = htp_iface_enable_etm(this->handle);
+        err = htp_iface_etm(this->handle, 1);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
        }
    }

-    // Start the DSP-side service. We need to pass the queue ID to the
-    // DSP in a FastRPC call; the DSP side will import the queue and start
-    // listening for packets in a callback.
+    if (opt_profile) {
+        htp_iface_pmu_conf pmu_conf{};
+        std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
+
+        err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
+        if (err != 0) {
+            GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
+        }
+    }
+
+    // Allocate buffers and state for op batching
+    this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
+    this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
+
+    // Start processing op batch requests
    err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
    if (err != 0) {
        GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
        throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
    }
    this->valid_iface = true;
-
-    // Allocate buffers and state for op batching
-    this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
-    this->op_shm   = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
 }

 void ggml_hexagon_session::release() noexcept(true) {
@@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
    int err;

    delete this->op_batch;
-    delete this->op_shm;
+    delete this->op_queue;

    // Stop the DSP-side service and close the queue
    if (this->valid_iface) {
@@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
    }

    if (opt_etm) {
-        err = htp_iface_disable_etm(this->handle);
+        err = htp_iface_etm(this->handle, 0);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
        }
    }

+    if (opt_profile) {
+        htp_iface_pmu_conf pmu_conf{};
+        err = htp_iface_profiler(this->handle, 0, &pmu_conf);
+        if (err != 0) {
+            GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
+        }
+    }
+
    if (this->valid_queue) {
        err = dspqueue_close(queue);
        if (err != 0) {
@@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
    repack_buffer_type.device = dev;

    op_batch = nullptr;
-    op_shm   = nullptr;
+    op_queue = nullptr;

    try {
        allocate(dev_id);
@@ -2596,6 +2670,62 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
    return true;
 }

+static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
+
+    // diag only supports F32 currently
+    if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    // Input must have ne[1] == 1 (vector input)
+    if (src0->ne[1] != 1) {
+        return false;
+    }
+
+    // Output must be square in first two dimensions
+    if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
+        return false;
+    }
+
+    GGML_UNUSED(sess);
+    return true;
+}
+
+static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0]; // A
+    const struct ggml_tensor * src1 = op->src[1]; // B
+    const struct ggml_tensor * dst  = op;         // X
+
+    if (!src0 || !src1) {
+        return false;
+    }
+
+    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (src0->ne[0] != src0->ne[1]) {
+        return false;
+    }
+
+    if (src0->ne[1] != src1->ne[1]) {
+        return false;
+    }
+
+    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
+        return false;
+    }
+
+    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
+        return false;
+    }
+
+    GGML_UNUSED(sess);
+    return true;
+}
+
 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
    auto sess = static_cast<ggml_hexagon_session *>(backend->context);
    return sess->c_name();
@@ -2632,7 +2762,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
        case GGML_OP_ROPE:           return HTP_OP_ROPE;
        case GGML_OP_REPEAT:         return HTP_OP_REPEAT;
        case GGML_OP_CUMSUM:         return HTP_OP_CUMSUM;
-
+        case GGML_OP_FILL:           return HTP_OP_FILL;
+        case GGML_OP_DIAG:           return HTP_OP_DIAG;
+        case GGML_OP_SOLVE_TRI:      return HTP_OP_SOLVE_TRI;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(t)) {
                case GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -2673,7 +2805,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg

    for (int i = 0; i < graph->n_nodes; ++i) {
        ggml_tensor * n = graph->nodes[i];
-        if (op_is_compute(n)) {
+        if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
            sess->enqueue_op(op_remap_to_htp(n), n);
        }
    }
@@ -3029,6 +3161,17 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
    return true;
 }

+static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * dst = op;
+
+    if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    GGML_UNUSED(sess);
+    return true;
+}
+
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    auto sess = static_cast<ggml_hexagon_session *>(dev->context);

@@ -3159,6 +3302,18 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_cumsum(sess, op);
            break;

+        case GGML_OP_FILL:
+            supp = ggml_hexagon_supported_fill(sess, op);
+            break;
+
+        case GGML_OP_DIAG:
+            supp = ggml_hexagon_supported_diag(sess, op);
+            break;
+
+        case GGML_OP_SOLVE_TRI:
+            supp = ggml_hexagon_supported_solve_tri(sess, op);
+            break;
+
        default:
            break;
    }
@@ -3294,6 +3449,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
    return NULL;
 }

+template<typename T> std::vector<T> str_to_vec(const char* str) {
+    std::stringstream ss(str);
+    std::vector<T> v;
+    std::string    t;
+
+    while (std::getline(ss, t, ',')) {
+        v.push_back(std::stoul(t, nullptr, 0));
+    }
+
+    return v;
+}
+
+template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
+    std::stringstream ss;
+    ss << std::setbase(BASE) << std::showbase;
+    for (auto i : v) { ss << i << ','; }
+    auto str = ss.str(); str.pop_back(); // drop last comma
+    return str;
+}
+
 static void ggml_hexagon_init(ggml_backend_reg * reg) {
    // Basic sanity checks to make sure definitions match
    static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3307,8 +3482,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {

    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
    const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
-    const char * str_opmask  = getenv("GGML_HEXAGON_OPMASK");
-    const char * str_opsync  = getenv("GGML_HEXAGON_OPSYNC");
+    const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
    const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
    const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
    const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3321,19 +3495,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {

    auto RE_ICASE = std::regex_constants::icase;

-    opt_opfilter     = str_opfilter     ? new std::regex(str_opfilter, RE_ICASE) : NULL;
-    opt_verbose      = str_verbose ? atoi(str_verbose) : 0;
-    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
-    opt_opmask       = str_opmask  ? strtoul(str_opmask, NULL, 0)  : opt_opmask;
-    opt_opsync       = str_opsync  ? atoi(str_opsync)              : opt_opsync;
-    opt_opbatch      = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
-    opt_opqueue      = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
-    opt_profile      = str_profile ? atoi(str_profile) : 0;
-    opt_etm          = str_etm     ? atoi(str_etm)     : 0;
-    opt_nhvx         = str_nhvx    ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
-    opt_use_hmx      = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
-    opt_ndev         = str_ndev    ? strtoul(str_ndev, NULL, 0) : opt_ndev;
-    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
+    opt_opfilter     = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
+    opt_verbose      = str_verbose  ? atoi(str_verbose)             : 0;
+    opt_hostbuf      = str_hostbuf  ? atoi(str_hostbuf)             : opt_hostbuf;
+    opt_opstage      = str_opstage  ? strtoul(str_opstage, NULL, 0) : opt_opstage;
+    opt_opbatch      = str_opbatch  ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
+    opt_opqueue      = str_opqueue  ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
+    opt_etm          = str_etm      ? atoi(str_etm)                 : 0;
+    opt_nhvx         = str_nhvx     ? strtoul(str_nhvx, NULL, 0)    : opt_nhvx;
+    opt_use_hmx      = str_use_hmx  ? atoi(str_use_hmx)             : opt_use_hmx;
+    opt_ndev         = str_ndev     ? strtoul(str_ndev, NULL, 0)    : opt_ndev;
+    opt_hostbuf      = str_hostbuf  ? atoi(str_hostbuf)             : opt_hostbuf;
+
+    if (str_profile) {
+        opt_pmu_evt = [&]() -> std::vector<uint32_t> {
+            auto v  = str_to_vec<uint32_t>(str_profile);
+            switch (v.size()) {
+                case 1:  opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
+                case 8:  opt_profile = 2;    return v;           // mode with custom  pmu events
+                default: opt_profile = 0;    return {};          // garbage input
+            }}();
+        if (opt_profile == 1) opt_pmu_evt = {};
+        GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
+                vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
+    }

    if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
        opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -34,6 +34,9 @@ add_library(${HTP_LIB} SHARED
    argsort-ops.c
    ssm-conv.c
    cumsum-ops.c
+    fill-ops.c
+    diag-ops.c
+    solve-tri-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE
--- a/ggml/src/ggml-hexagon/htp/diag-ops.c
+++ b/ggml/src/ggml-hexagon/htp/diag-ops.c
@@ -0,0 +1,216 @@
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <HAP_farf.h>
+#include <HAP_perf.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-ops.h"
+#include "hvx-types.h"
+#include "hex-utils.h"
+#include "hvx-copy.h"
+#include "hex-dma.h"
+
+#define htp_diag_tensors_preamble                           \
+    const struct htp_tensor * restrict src0 = octx->src[0]; \
+    const struct htp_tensor * restrict dst  = octx->dst;    \
+                                                     \
+    const uint32_t ne02 = src0->ne[2];               \
+                                                     \
+    const uint32_t ne0 = dst->ne[0];                 \
+    const uint32_t ne1 = dst->ne[1];                 \
+                                                     \
+    const uint32_t nb02 = src0->nb[2];               \
+    const uint32_t nb03 = src0->nb[3];               \
+                                                     \
+    const uint32_t nb1 = dst->nb[1];                 \
+    const uint32_t nb2 = dst->nb[2];                 \
+    const uint32_t nb3 = dst->nb[3];
+
+struct htp_diag_context {
+    struct htp_ops_context * octx;
+    size_t          src_batch_size;
+    size_t          dst_row_size;
+    size_t          src_batch_size_aligned;
+    size_t          dst_row_size_aligned;
+    uint32_t        batches_per_thread;
+    uint32_t        total_batches;
+};
+
+#define htp_diag_preamble                                              \
+    struct htp_diag_context * dctx = (struct htp_diag_context *) data; \
+    struct htp_ops_context *  octx = dctx->octx;                       \
+    htp_diag_tensors_preamble;
+
+static inline void hvx_diag_row_f32(const float * restrict src, float * restrict dst,
+                                    uint32_t row_idx, uint32_t n) {
+    hvx_splat_f32_a((uint8_t *) dst, 0.0f, n);
+    dst[row_idx] = src[row_idx];
+}
+
+// ---------------------------------------------------------------------------
+// Per thread worker: DMA src fetch, compute in VTCM, DMA dst writeback
+// ---------------------------------------------------------------------------
+
+static void diag_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
+    htp_diag_preamble;
+    dma_queue * dma_queue = octx->ctx->dma[ith];
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    const uint32_t ib0 = dctx->batches_per_thread * ith;
+    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
+
+    if (ib0 >= ib1) {
+        return;
+    }
+
+    const size_t src_batch_size         = dctx->src_batch_size;
+    const size_t dst_row_size           = dctx->dst_row_size;
+    const size_t src_batch_size_aligned = dctx->src_batch_size_aligned;
+    const size_t dst_row_size_aligned   = dctx->dst_row_size_aligned;
+
+    const uint8_t * src_data = (const uint8_t *) src0->data;
+    uint8_t *       dst_data = (uint8_t *) dst->data;
+
+    // 1 src buffer + 1 dst row buffer per thread in VTCM
+    uint8_t * src_spad = octx->src0_spad.data + (ith * src_batch_size_aligned);
+    uint8_t * dst_spad = octx->dst_spad.data  + (ith * dst_row_size_aligned);
+
+    for (uint32_t ib = ib0; ib < ib1; ib++) {
+        const uint32_t i3 = ib / ne02;
+        const uint32_t i2 = ib % ne02;
+
+        const uint8_t * src_batch = src_data + i3 * nb03 + i2 * nb02;
+
+        // Fetch source vector into VTCM
+        dma_queue_push_ddr_to_vtcm(dma_queue,
+                                   dma_make_ptr(src_spad, src_batch),
+                                   src_batch_size_aligned, src_batch_size, 1);
+        dma_queue_flush(dma_queue);
+
+        const float * src_spad_f32 = (const float *) src_spad;
+        float       * dst_spad_f32 = (float *) dst_spad;
+
+        for (uint32_t i1 = 0; i1 < ne1; i1++) {
+            // Compute row in VTCM
+            hvx_diag_row_f32(src_spad_f32, dst_spad_f32, i1, ne0);
+
+            // Write completed row back to DDR
+            uint8_t * dst_row = dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1;
+            dma_queue_push_vtcm_to_ddr(dma_queue,
+                                       dma_make_ptr(dst_row, dst_spad),
+                                       dst_row_size, dst_row_size_aligned, 1);
+            dma_queue_flush(dma_queue);
+        }
+    }
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "diag-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
+         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+// ---------------------------------------------------------------------------
+// Per thread worker: Direct HVX (no DMA)
+// ---------------------------------------------------------------------------
+
+static void diag_thread_f32(unsigned int nth, unsigned int ith, void * data) {
+    htp_diag_preamble;
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    const uint8_t * src_data = (const uint8_t *) src0->data;
+    uint8_t *       dst_data = (uint8_t *) dst->data;
+
+    const uint32_t ib0 = dctx->batches_per_thread * ith;
+    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
+
+    for (uint32_t ib = ib0; ib < ib1; ib++) {
+        const uint32_t i3 = ib / ne02;
+        const uint32_t i2 = ib % ne02;
+
+        const float * restrict src_batch = (const float *)(src_data + i3 * nb03 + i2 * nb02);
+
+        for (uint32_t i1 = 0; i1 < ne1; i1++) {
+            float * restrict dst_row = (float *)(dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1);
+            hvx_diag_row_f32(src_batch, dst_row, i1, ne0);
+        }
+    }
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "diag-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
+         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+int op_diag_f32(struct htp_ops_context * octx) {
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * dst  = octx->dst;
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
+    const uint32_t n_threads     = MIN(octx->n_threads, total_batches);
+
+    const size_t src_batch_size         = src0->ne[0] * sizeof(float);
+    const size_t dst_row_size           = dst->ne[0] * sizeof(float);
+    const size_t src_batch_size_aligned = hex_round_up(src_batch_size, VLEN);
+    const size_t dst_row_size_aligned   = hex_round_up(dst_row_size, VLEN);
+
+    // 1 src buffer + 1 dst row buffer per thread
+    const size_t spad_per_thread = src_batch_size_aligned + dst_row_size_aligned;
+
+    octx->src0_spad.size_per_thread = src_batch_size_aligned;
+    octx->dst_spad.size_per_thread  = dst_row_size_aligned;
+
+    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+    octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
+
+    octx->src0_spad.data = octx->ctx->vtcm_base;                        octx->src0_spad.src = NULL;
+    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src  = NULL;
+
+    struct htp_diag_context dctx = {
+        .octx                   = octx,
+        .src_batch_size         = src_batch_size,
+        .dst_row_size           = dst_row_size,
+        .src_batch_size_aligned = src_batch_size_aligned,
+        .dst_row_size_aligned   = dst_row_size_aligned,
+        .batches_per_thread     = (total_batches + n_threads - 1) / n_threads,
+        .total_batches          = total_batches,
+    };
+
+    if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
+        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32, &dctx, n_threads);
+    } else {
+        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32_dma, &dctx, n_threads);
+    }
+
+    return HTP_STATUS_OK;
+}
+
+int op_diag(struct htp_ops_context * octx) {
+    const struct htp_tensor * dst = octx->dst;
+
+    int err = HTP_STATUS_OK;
+
+    switch (dst->type) {
+        case HTP_TYPE_F32:
+            err = op_diag_f32(octx);
+            break;
+        default:
+            err = HTP_STATUS_NO_SUPPORT;
+            break;
+    }
+
+    return err;
+}
--- a/ggml/src/ggml-hexagon/htp/fill-ops.c
+++ b/ggml/src/ggml-hexagon/htp/fill-ops.c
@@ -0,0 +1,123 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <HAP_farf.h>
+#include <HAP_perf.h>
+
+#include <string.h>
+
+#include "hvx-copy.h"
+#include "hvx-utils.h"
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-ops.h"
+
+// ggml op_params layout for FILL:
+//   op_params[0] (as float) - the scalar fill value
+
+#define fill_preamble \
+    const struct htp_tensor * dst = octx->dst; \
+    \
+    const uint32_t ne0 = dst->ne[0]; \
+    const uint32_t ne1 = dst->ne[1]; \
+    const uint32_t ne2 = dst->ne[2]; \
+    const uint32_t ne3 = dst->ne[3]; \
+    \
+    const uint32_t nb1 = dst->nb[1]; \
+    const uint32_t nb2 = dst->nb[2]; \
+    const uint32_t nb3 = dst->nb[3]; \
+    \
+    const uint32_t nr = ne1 * ne2 * ne3;
+
+struct htp_fill_context {
+    struct htp_ops_context * octx;
+    uint32_t nrows_per_thread;
+    uint32_t total_rows;  // ne1 * ne2 * ne3
+    bool     opt_path;
+    HVX_Vector splat_vec;
+    uint32_t   elem_size;
+};
+
+static void fill_thread(unsigned int nth, unsigned int ith, void * data) {
+    const struct htp_fill_context * fctx = (const struct htp_fill_context *) data;
+    struct htp_ops_context        * octx = fctx->octx;
+    fill_preamble;
+
+    // Parallelise over the flat row index spanning ne1*ne2*ne3
+    const uint32_t ir0 = fctx->nrows_per_thread * ith;
+    const uint32_t ir1 = MIN(ir0 + fctx->nrows_per_thread, fctx->total_rows);
+
+    uint64_t t1 = HAP_perf_get_qtimer_count();
+
+    if (fctx->opt_path) {
+        // Opt path: tensor is fully contiguous, treat as flat array
+        const uint32_t elem_start = ir0 * ne0;
+        const uint32_t elem_end = ir1 * ne0;
+        uint8_t * dst_ptr = (uint8_t *) dst->data + elem_start * fctx->elem_size;
+        hvx_splat_u(dst_ptr, fctx->splat_vec, elem_end - elem_start, fctx->elem_size);
+    } else {
+        // Non-contiguous path: must respect strides
+        for (uint32_t ir = ir0; ir < ir1; ++ir) {
+            const uint32_t i1 = ir % ne1;
+            const uint32_t i2 = (ir / ne1) % ne2;
+            const uint32_t i3 = ir / (ne1 * ne2);
+            uint8_t * dst_ptr = (uint8_t *) dst->data + i1*nb1 + i2*nb2 + i3*nb3;
+            hvx_splat_u(dst_ptr, fctx->splat_vec, ne0, fctx->elem_size);
+        }
+    }
+
+    uint64_t t2 = HAP_perf_get_qtimer_count();
+    FARF(HIGH, "fill %u/%u: rows %u:%u usec %u\n",
+         ith, nth, ir0, ir1, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+int op_fill(struct htp_ops_context * octx) {
+    fill_preamble;
+
+    if (dst->type != HTP_TYPE_F32 && dst->type != HTP_TYPE_F16) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    // nr = ne1*ne2*ne3 (flat row count across all outer dims); parallelise over it.
+    const uint32_t n_threads = MIN(nr, octx->n_threads);
+
+    // Optimize if fully contiguous: skip stride arithmetic, treat as flat array
+    const bool opt_path = (nb2 == nb1 * ne1) && (nb3 == nb2 * ne2);
+
+    FARF(HIGH, "fill: (%ux%ux%ux%u) type=%u opt=%d\n",
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->type, (int) opt_path);
+
+    float val_f32 = 0.f;
+    memcpy(&val_f32, &octx->op_params[0], sizeof(float));
+
+    struct htp_fill_context fctx = {
+        .octx             = octx,
+        .nrows_per_thread = (nr + n_threads - 1) / n_threads,
+        .total_rows       = nr,
+        .opt_path         = opt_path,
+    };
+
+    switch (dst->type) {
+    case HTP_TYPE_F32:
+        fctx.splat_vec = hvx_vec_splat_f32(val_f32);
+        fctx.elem_size = sizeof(float);
+        break;
+    case HTP_TYPE_F16:
+        fctx.splat_vec = hvx_vec_splat_f16((_Float16) val_f32);
+        fctx.elem_size = sizeof(_Float16);
+        break;
+    default:
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    worker_pool_run_func(octx->ctx->worker_pool, fill_thread, &fctx, n_threads);
+
+    return HTP_STATUS_OK;
+}
--- a/ggml/src/ggml-hexagon/htp/hex-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hex-utils.h
@@ -4,6 +4,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <qurt_memory.h>
+#include <qurt.h>

 #include "hexagon_types.h"
 #include "hexagon_protos.h"
@@ -100,4 +101,31 @@ static inline void hex_pause() {
    asm volatile(" pause(#255)\n");
 }

+#ifndef HEX_NUM_PMU_COUNTERS
+#define HEX_NUM_PMU_COUNTERS 8
+#endif
+
+static inline void hex_get_pmu(uint32_t counters[]) {
+#if __HVX_ARCH__ >= 79
+    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
+    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
+    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
+    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
+    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
+    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
+    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
+    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
+#else
+    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
+    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
+    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
+    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
+    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
+    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
+    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
+    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
+    // qurt_pmu_get_pmucnt(counters);
+#endif
+}
+
 #endif /* HEX_UTILS_H */
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -1683,7 +1683,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
    assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);

-    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type,
+    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    // initialize eye tile (32x32 identity matrix)
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -10,6 +10,7 @@
 #include <dspqueue.h>
 #include <stdatomic.h>
 #include <stdint.h>
+#include <stdbool.h>

 #define HTP_MAX_NTHREADS 10
 #define HTP_MAX_MMAPS    16
@@ -66,7 +67,9 @@ struct htp_context {
    int                    thread_id;
    int                    thread_prio;

-    int                    hmx_enabled;
+    bool                   hmx_enabled;
+    bool                   etm;
+    uint32_t               profiler;

    uint8_t *              vtcm_base;
    size_t                 vtcm_size;
@@ -98,5 +101,8 @@ int op_repeat(struct htp_ops_context * octx);
 int op_argsort(struct htp_ops_context * octx);
 int op_ssm_conv(struct htp_ops_context * octx);
 int op_cumsum(struct htp_ops_context * octx);
+int op_fill(struct htp_ops_context * octx);
+int op_diag(struct htp_ops_context * octx);
+int op_solve_tri(struct htp_ops_context * octx);

 #endif /* HTP_CTX_H */
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -42,9 +42,9 @@ enum htp_data_type {

 // Mask to enable various stages of the Ops.
 // Used for debugging and profiling.
-enum htp_op_mask {
-    HTP_OPMASK_QUEUE    = (1 << 0),  // Enable Queueing (ie calls into the DSP)
-    HTP_OPMASK_COMPUTE  = (1 << 1),  // Enable Compute
+enum htp_op_stage {
+    HTP_OPSTAGE_QUEUE    = (1 << 0),  // Enable Queueing (ie calls into NPU)
+    HTP_OPSTAGE_COMPUTE  = (1 << 1),  // Enable Compute
 };

 // Do not reorder first 4 (used as an index)
@@ -80,7 +80,9 @@ enum htp_op_code {
    HTP_OP_SSM_CONV,
    HTP_OP_REPEAT,
    HTP_OP_CUMSUM,
-
+    HTP_OP_FILL,
+    HTP_OP_DIAG,
+    HTP_OP_SOLVE_TRI,
    HTP_OP_INVALID
 };

@@ -135,27 +137,45 @@ struct htp_op_desc {
    int32_t  params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
    uint16_t src[HTP_OP_MAX_INPUTS];    // Input tensors indices
    uint16_t dst;                       // Output tensor index
+};

-    // the rest is filled in-place by the NPU
-    uint32_t prof_usecs;                // Number of usec per request
-    uint32_t prof_cycles;               // Number of cycles per request
-    uint32_t prof_pkts;                 // Number of instruction packets per request
-    uint32_t unused;
+enum htp_profiler_mode {
+    HTP_PROF_DISABLED = 0,
+    HTP_PROF_BASIC    = 1,
+    HTP_PROF_PMU      = 2,
+};
+
+#define HTP_PROF_PMU_NCNT 8
+
+// Profile descriptor
+struct htp_prof_desc {
+    uint32_t opcode;                 // GGML/HTP Op
+    uint32_t usecs;                  // Number of usec
+    uint32_t cycles;                 // Number of cycles
+    uint32_t pad;                    // Unused
+    uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
 };

 struct htp_opbatch_req {
+    uint32_t id;          // Batch id
    uint32_t n_bufs;      // Number of buffers
    uint32_t n_tensors;   // Number of tensors
    uint32_t n_ops;       // Number of ops
    uint32_t flags;       // unused
+    uint32_t pad;         // unused
    // struct htp_buf_desc  bufs[];    -- dspqueue buf 0
    // struct htp_tensor    tensors[]; -- dspqueue buf 0
    // struct htp_op_desc   ops[];     -- dspqueue buf 0
 };

 struct htp_opbatch_rsp {
+    uint32_t id;         // Batch id
    uint32_t status;     // HTP_STATUS_...
-    // struct htp_op_req ops[];     -- dspqueue buf 0
+    uint32_t n_bufs;     // Number of buffers
+    uint32_t n_tensors;  // Number of tensors
+    uint32_t n_ops;      // Number of op profile descriptors
+    uint32_t pad;        // unused
+    // struct htp_prof_desc profs[];  -- dspqueue buf 0
 };

 #endif /* HTP_OPS_H */
--- a/ggml/src/ggml-hexagon/htp/htp_iface.idl
+++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl
@@ -6,13 +6,17 @@
 #include "AEEStdDef.idl"
 #include "remote.idl"

+struct htp_iface_pmu_conf {
+    uint32 events[8];
+};
+
 interface htp_iface : remote_handle64 {
    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
    AEEResult stop();
    AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
    AEEResult munmap(in uint32 fd);
-    AEEResult enable_etm();
-    AEEResult disable_etm();
+    AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
+    AEEResult etm(in uint32 enable);
 };

 #endif /* HTP_IDL */
--- a/ggml/src/ggml-hexagon/htp/hvx-base.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-base.h
@@ -256,6 +256,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
    return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
 }

+static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
+}
+
+static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
+}
+
+static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
+}
+
 #else

 static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
@@ -273,6 +285,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
    return Q6_Vhf_vmpy_VhfVhf(a, b);
 }

+static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_vadd_VsfVsf(a, b);
+}
+
+static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_vsub_VsfVsf(a, b);
+}
+
+static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
+    return Q6_Vsf_vmpy_VsfVsf(a, b);
+}
+
 #endif // __HVX_ARCH__ < 79

 #endif /* HVX_BASE_H */
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -27,6 +27,7 @@
 #include "htp-ctx.h"
 #include "htp-ops.h"
 #include "htp-ops.h"
+#include "htp_iface.h"
 #include "worker-pool.h"

 AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -100,6 +101,74 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
        }
    }

+#if __HVX_ARCH__ >= 75
+    {
+        // Set HMX clock
+        HAP_power_request_t request;
+        memset(&request, 0, sizeof(HAP_power_request_t));
+        request.type = HAP_power_set_HMX_v2;
+        request.hmx_v2.set_clock = TRUE;
+        request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
+        request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
+        FARF(ALWAYS, "Setting HMX clock\n");
+        err = HAP_power_set((void *) &ctx, &request);
+        if (err != AEE_SUCCESS) {
+            FARF(ERROR, "Error setting HMX clock.");
+            return err;
+        }
+    }
+#endif
+
+    return AEE_SUCCESS;
+}
+
+AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
+    int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
+    if (err) {
+        if (err == AEE_EVERSIONNOTSUPPORT) {
+            FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
+        } else {
+            FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
+        }
+    }
+    return err;
+}
+
+AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
+    struct htp_context * ctx = (struct htp_context *) handle;
+    if (!ctx) {
+        return AEE_EBADPARM;
+    }
+
+    if (mode == HTP_PROF_PMU) {
+        const uint32_t* events = pmu_conf->events;
+
+        // Pack 4 event IDs (low 8 bits) into each 32-bit config register
+        uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
+        for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
+            evtcfg  |= ((events[i + 0] & 0xFF) << (i * 8));
+            evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
+        }
+
+        // For events >255 pack high 2 bits of all 8 event IDs into cfg register
+        // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
+        for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
+            cfg |= (((events[i] >> 8) & 3) << (i * 2));
+        }
+
+        FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
+
+        // Configure PMU registers
+        qurt_pmu_set(QURT_PMUCFG,     cfg);
+        qurt_pmu_set(QURT_PMUEVTCFG,  evtcfg);
+        qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
+        qurt_pmu_enable(1);
+    }
+
+    ctx->profiler = mode;
+
    return AEE_SUCCESS;
 }

@@ -129,35 +198,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
        }
    }

+    if (ctx->profiler) {
+        qurt_pmu_enable(1);
+    }
+
+    if (ctx->etm) {
+        HAP_user_etm_disable();
+    }
+
    free(ctx);
    return AEE_SUCCESS;
 }

-AEEResult htp_iface_enable_etm(remote_handle64 handle) {
-    int err = HAP_user_etm_enable();
-    if (err) {
-        if (err == AEE_EVERSIONNOTSUPPORT) {
-            FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
-        } else {
-            FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
-        }
-    }
-    return err;
-}
-
-AEEResult htp_iface_disable_etm(remote_handle64 handle) {
-    int err = HAP_user_etm_disable();
-    if (err) {
-        if (err == AEE_EVERSIONNOTSUPPORT) {
-            FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
-        } else {
-            FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
-        }
-    }
-    return err;
-}
-
-AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
+AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
@@ -204,7 +257,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
    return AEE_ENOMEMORY;
 }

-AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
+AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
@@ -434,19 +487,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
 struct profile_data {
    uint64_t usecs;
    uint64_t cycles;
-    uint64_t pkts;
+    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
 };

-static inline void profile_start(struct profile_data * d) {
-    d->usecs  = HAP_perf_get_qtimer_count();
-    d->cycles = hex_get_cycles();
-    d->pkts   = hex_get_pktcnt();
+static inline void profile_start(uint32_t mode, struct profile_data * d) {
+    switch (mode) {
+        case HTP_PROF_PMU:
+            hex_get_pmu(d->pmu_counters);
+            // fallthrough
+        case HTP_PROF_BASIC:
+            d->usecs  = HAP_perf_get_qtimer_count();
+            d->cycles = hex_get_cycles();
+            break;
+        default:
+            break;
+    }
 }

-static inline void profile_stop(struct profile_data * d) {
-    d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
-    d->cycles = hex_get_cycles() - d->cycles;
-    d->pkts   = hex_get_pktcnt() - d->pkts;
+static inline void profile_stop(uint32_t mode, struct profile_data * d) {
+    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
+    switch (mode) {
+        case HTP_PROF_PMU:
+            hex_get_pmu(pmu_counters);
+            for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
+                d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
+            }
+            // fallthrough
+        case HTP_PROF_BASIC:
+            d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
+            d->cycles = hex_get_cycles() - d->cycles;
+            break;
+        default:
+            break;
+    }
 }

 static int execute_op(struct htp_ops_context * octx) {
@@ -514,6 +587,15 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_CUMSUM:
            return op_cumsum(octx);

+        case HTP_OP_FILL:
+            return op_fill(octx);
+
+        case HTP_OP_DIAG:
+            return op_diag(octx);
+
+        case HTP_OP_SOLVE_TRI:
+            return op_solve_tri(octx);
+
        case HTP_OP_INVALID:
            break;

@@ -720,30 +802,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            continue;
        }

+        // Reset poll count for valid requests
+        poll_count = DSPQUEUE_POLL_COUNT;
+
        const uint32_t n_bufs = req.n_bufs;
        const uint32_t n_tens = req.n_tensors;
        const uint32_t n_ops  = req.n_ops;

-        const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
-        const uint32_t t_size = sizeof(struct htp_tensor)   * n_tens;
-        const uint32_t o_size = sizeof(struct htp_op_desc)  * n_ops;
+        const uint32_t b_size = sizeof(struct htp_buf_desc)  * n_bufs;
+        const uint32_t t_size = sizeof(struct htp_tensor)    * n_tens;
+        const uint32_t o_size = sizeof(struct htp_op_desc)   * n_ops;
+        const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;

-        if (dbuf.size < b_size + t_size + o_size) {
+        if (dbuf.size < b_size + t_size + o_size + p_size) {
            FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
            break;
        }

-        // Reset poll count for valid requests
-        poll_count = DSPQUEUE_POLL_COUNT;
-
-        uint8_t * m_ptr = dbuf.ptr;
-        struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
-        struct htp_tensor*   tens = (struct htp_tensor*)   m_ptr; m_ptr += t_size;
-        struct htp_op_desc*   ops = (struct htp_op_desc*)  m_ptr;
-
-        FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
+        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);

+        // Setup descriptor pointers
+        uint8_t * m_ptr = dbuf.ptr;
+        struct htp_buf_desc* bufs = (struct htp_buf_desc*)  m_ptr; m_ptr += b_size;
+        struct htp_tensor*   tens = (struct htp_tensor*)    m_ptr; m_ptr += t_size;
+        struct htp_op_desc*   ops = (struct htp_op_desc*)   m_ptr; m_ptr += o_size;
+        struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
+
        prep_op_bufs(ctx, bufs, n_bufs);
        prep_tensors(ctx, bufs, tens, n_tens);

@@ -754,22 +839,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;
-            profile_start(&prof);
+
+            profile_start(ctx->profiler, &prof);

            proc_op_req(octx, tens, i, &ops[i]);

-            profile_stop(&prof);
-            ops[i].prof_usecs  = prof.usecs;
-            ops[i].prof_cycles = prof.cycles;
-            ops[i].prof_pkts   = prof.pkts;
+            profile_stop(ctx->profiler, &prof);
+
+            if (ctx->profiler) {
+                pds[i].opcode = ops[i].opcode;
+                pds[i].usecs  = prof.usecs;
+                pds[i].cycles = prof.cycles;
+                for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
+                    pds[i].pmu[j] = prof.pmu_counters[j];
+                }
+            }
        }

        // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);

        struct htp_opbatch_rsp rsp;
-        rsp.status = HTP_STATUS_OK; // FIXME
+        rsp.id        = req.id;
+        rsp.status    = HTP_STATUS_OK;
+        rsp.n_bufs    = n_bufs;
+        rsp.n_tensors = n_tens;
+        rsp.n_ops     = n_ops;

        dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+
        err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
        if (err != 0) {
            FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) {
    const int act_stride = (int)(src1->nb[1] / sizeof(float));
    const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));

+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
    if (src0->type == HTP_TYPE_F16) {
        if (is_batched) {
            hmx_matmul_w16a32_batched_params_t batch_params = {
--- a/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
+++ b/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
@@ -0,0 +1,267 @@
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <HAP_farf.h>
+#include <HAP_perf.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-ops.h"
+#include "hvx-types.h"
+#include "hvx-utils.h"
+
+struct htp_solve_tri_context {
+    struct htp_ops_context * octx;
+    uint32_t                 jobs_per_thread;
+    uint32_t                 total_jobs;
+    uint32_t                 k_chunks;
+    uint32_t                 col_block;
+};
+
+static inline void solve_tri_row_scalar(const float * A_row,
+                                        const float * B_row,
+                                        float *       X,
+                                        uint32_t      row,
+                                        uint32_t      k,
+                                        uint32_t      col0,
+                                        uint32_t      coln,
+                                        float         inv_diag) {
+    for (uint32_t col = col0; col < col0 + coln; ++col) {
+        float sum = 0.0f;
+        for (uint32_t t = 0; t < row; ++t) {
+            sum += A_row[t] * X[t * k + col];
+        }
+        X[row * k + col] = (B_row[col] - sum) * inv_diag;
+    }
+}
+
+static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
+    HVX_Vector v = *((const HVX_UVector *) src);
+    HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
+    return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
+}
+
+static inline void solve_tri_row_hvx(const float * A_row,
+                                     const float * B_row,
+                                     float *       X,
+                                     uint32_t      row,
+                                     uint32_t      k,
+                                     uint32_t      col0,
+                                     uint32_t      coln,
+                                     float         inv_diag) {
+    const bool full = (coln == VLEN_FP32);
+
+    HVX_Vector sum_v = Q6_V_vzero();
+    for (uint32_t t = 0; t < row; ++t) {
+        const float   a         = A_row[t];
+        const float * x_row_col = X + t * k + col0;
+
+        HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
+        HVX_Vector a_v = hvx_vec_splat_f32(a);
+        sum_v          = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
+    }
+
+    const float * b_row_col = B_row + col0;
+    float *       x_out_col = X + row * k + col0;
+
+    HVX_Vector b_v        = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
+    HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
+
+    HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);
+    hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
+}
+
+// Batch-level thread: each job is one full batch.
+static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
+    struct htp_ops_context *       octx = sctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];  // A
+    const struct htp_tensor * src1 = octx->src[1];  // B
+    const struct htp_tensor * dst  = octx->dst;     // X
+
+    const uint32_t n = src0->ne[0];
+    const uint32_t k = src1->ne[0];
+
+    const uint32_t ne02 = src0->ne[2];
+
+    const uint32_t col_block = VLEN_FP32;
+    const uint32_t k_full    = (k / col_block) * col_block;
+
+    const uint32_t start_batch = sctx->jobs_per_thread * ith;
+    const uint32_t end_batch   = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
+        const uint32_t i03 = batch / ne02;
+        const uint32_t i02 = batch - i03 * ne02;
+
+        const float * A_batch =
+            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
+        const float * B_batch =
+            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
+        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+        for (uint32_t row = 0; row < n; ++row) {
+            const float   diag     = A_batch[row * n + row];
+            const float   inv_diag = 1.0f / diag;
+            const float * A_row    = A_batch + row * n;
+            const float * B_row    = B_batch + row * k;
+
+            uint32_t col0 = 0;
+            for (; col0 < k_full; col0 += col_block) {
+                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag);
+            }
+
+            if (col0 < k) {
+                const uint32_t coln = k - col0;
+                if (coln >= 8) {
+                    solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
+                } else {
+                    solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
+                }
+            }
+        }
+    }
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
+         ith, nth, n, n, k, n, start_batch, end_batch,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+// Chunk-level thread: each job is one (batch, col_chunk) pair.
+static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
+    struct htp_ops_context *       octx = sctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];  // A
+    const struct htp_tensor * src1 = octx->src[1];  // B
+    const struct htp_tensor * dst  = octx->dst;     // X
+
+    const uint32_t n = src0->ne[0];
+    const uint32_t k = src1->ne[0];
+
+    const uint32_t ne02 = src0->ne[2];
+
+    const uint32_t start_job = sctx->jobs_per_thread * ith;
+    const uint32_t end_job   = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    for (uint32_t job = start_job; job < end_job; ++job) {
+        const uint32_t batch = job / sctx->k_chunks;
+        const uint32_t chunk = job - batch * sctx->k_chunks;
+
+        const uint32_t i03 = batch / ne02;
+        const uint32_t i02 = batch - i03 * ne02;
+
+        const uint32_t col0 = chunk * sctx->col_block;
+        const uint32_t coln = MIN(sctx->col_block, k - col0);
+
+        const float * A_batch =
+            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
+        const float * B_batch =
+            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
+        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+        const bool use_hvx = (coln >= 8);
+
+        for (uint32_t row = 0; row < n; ++row) {
+            const float diag     = A_batch[row * n + row];
+            const float inv_diag = 1.0f / diag;
+
+            const float * A_row = A_batch + row * n;
+            const float * B_row = B_batch + row * k;
+
+            if (use_hvx) {
+                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
+            } else {
+                solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
+            }
+        }
+    }
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
+         ith, nth, n, n, k, n, start_job, end_job,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+int op_solve_tri(struct htp_ops_context * octx) {
+    const struct htp_tensor * src0 = octx->src[0];  // A
+    const struct htp_tensor * src1 = octx->src[1];  // B
+    const struct htp_tensor * dst  = octx->dst;     // X
+
+    if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    // left=true, lower=true, uni=false only
+    if (src0->ne[0] != src0->ne[1]) {
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+    if (src0->ne[1] != src1->ne[1]) {
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] ||
+        dst->ne[3] != src1->ne[3]) {
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    const uint32_t k = src1->ne[0];
+
+    const uint32_t col_block     = VLEN_FP32;
+    const uint32_t k_chunks      = (k + col_block - 1) / col_block;
+    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
+    const bool     batched       = total_batches >= (uint32_t) octx->n_threads;
+
+    FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
+         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+         src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);
+
+    if (batched) {
+        // Batch-level parallelism
+        const uint32_t n_threads = MIN((uint32_t) octx->n_threads, total_batches);
+
+        struct htp_solve_tri_context sctx = {
+            .octx            = octx,
+            .jobs_per_thread = (total_batches + n_threads - 1) / n_threads,
+            .total_jobs      = total_batches,
+            .k_chunks        = k_chunks,
+            .col_block       = col_block,
+        };
+
+        worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads);
+    } else {
+        // Chunk-level parallelism
+        const uint32_t total_jobs = total_batches * k_chunks;
+        const uint32_t n_threads  = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
+
+        struct htp_solve_tri_context sctx = {
+            .octx            = octx,
+            .jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
+            .total_jobs      = total_jobs,
+            .k_chunks        = k_chunks,
+            .col_block       = col_block,
+        };
+
+        worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads);
+    }
+
+    return HTP_STATUS_OK;
+}
--- a/ggml/src/ggml-hexagon/libggml-htp.inf
+++ b/ggml/src/ggml-hexagon/libggml-htp.inf
@@ -8,7 +8,7 @@ CatalogFile = libggml-htp.cat
 PnpLockDown = 1

 [DestinationDirs]
-Drivers_Dir = 6
+Drivers_Dir = 13

 [SourceDisksNames]
 1 = %DiskId%
@@ -18,6 +18,7 @@ libggml-htp-v68.so = 1
 libggml-htp-v69.so = 1
 libggml-htp-v73.so = 1
 libggml-htp-v75.so = 1
+libggml-htp-v79.so = 1
 libggml-htp-v81.so = 1

 [ControlFlags]
@@ -31,6 +32,7 @@ libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE

 [Strings]
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -677,7 +677,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
    const ggml_type tsrc1 = op->src[1]->type;

    const bool bc_inp = op->src[0]->ne[0] % 32 != 0;
-    const bool bc_out = op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0;
+
+    constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
+    constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
+
+    const bool has_tensor = ggml_metal_device_get_props(ggml_metal_library_get_device(lib))->has_tensor;
+
+    const bool bc_out = has_tensor
+        ? (op->ne[0] % NRA != 0 || op->ne[1] % NRB != 0)
+        : (op->ne[0] % 64  != 0 || op->ne[1] % 32  != 0);

    snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
    snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
@@ -694,8 +702,20 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
        ggml_metal_cv_free(cv);
    }

-    // when the output size is not multiple of 64x32, we need extra smem to prevent out-of-bounds writes
-    res.smem = bc_out ? 8192 : 4096 + 2048;
+    if (has_tensor) {
+        res.nr0 = NRA;
+        res.nr1 = NRB;
+
+        const size_t smem_a = NRA * N_MM_NK_TOTAL * sizeof(ggml_fp16_t);
+        res.smem = smem_a;
+    } else {
+        res.nr0 = 64;
+        res.nr1 = 32;
+
+        res.smem = bc_out ? 8192 : (4096 + 2048);
+    }
+
+    res.nsg = N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;

    return res;
 }
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -102,6 +102,8 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev

 void ggml_metal_library_free(ggml_metal_library_t lib);

+ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib);
+
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline    (ggml_metal_library_t lib, const char * name);
 struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);

--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -95,8 +95,8 @@ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_wi

 struct ggml_metal_library {
    id<MTLLibrary> obj;
-    id<MTLDevice> device;

+    ggml_metal_device_t dev;
    ggml_metal_pipelines_t pipelines; // cache of compiled pipelines

    NSLock * lock;
@@ -251,7 +251,7 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
    ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));

    res->obj       = library;
-    res->device    = device;
+    res->dev       = dev;
    res->pipelines = ggml_metal_pipelines_init();
    res->lock      = [NSLock new];

@@ -318,7 +318,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
    }

    res->obj       = library;
-    res->device    = device;
+    res->dev       = dev;
    res->pipelines = ggml_metal_pipelines_init();
    res->lock      = [NSLock new];

@@ -341,6 +341,10 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
    free(lib);
 }

+ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib) {
+    return lib->dev;
+}
+
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
    [lib->lock lock];

@@ -405,7 +409,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_
            return res;
        }

-        id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
+        id<MTLDevice> device = ggml_metal_device_get_obj(lib->dev);
+        id<MTLComputePipelineState> obj = [device newComputePipelineStateWithFunction:mtl_function error:&error];

        [mtl_function release];

@@ -699,7 +704,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
                    "    auto sB = tB.slice(0, 0); \n"
                    "    mm.run(sB, sA, cT); \n"
                    " \n"
-                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
+                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
                    " \n"
                    "    cT.store(tC); \n"
                    "}";
@@ -749,7 +754,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
                    "    auto sB = tB.slice(0, 0); \n"
                    "    mm.run(sB, sA, cT); \n"
                    " \n"
-                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
+                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
                    " \n"
                    "    cT.store(tC); \n"
                    "}";
@@ -814,7 +819,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
            }

            // print MTL GPU family:
-            GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, dev->props.name);
+            GGML_LOG_INFO("%s: GPU name:   %s (%s)\n", __func__, dev->props.name, dev->props.desc);

            // determine max supported GPU family
            // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@@ -931,13 +936,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
 }

 struct ggml_metal_event {
-    void * obj; // id<MTLEvent>
+    void * obj; // id<MTLSharedEvent>

    atomic_int value;
 };

 void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@@ -945,7 +950,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
 }

 void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@@ -953,7 +958,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
 }

 ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
-    id<MTLEvent> event = [dev->mtl_device newEvent];
+    id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];

    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));

@@ -964,7 +969,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
 }

 void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    id<MTLEvent> event = ev->obj;
+    id<MTLSharedEvent> event = ev->obj;
    [event release];

    free(ev);
@@ -973,14 +978,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
 }

 void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    @autoreleasepool {
-        id<MTLEvent> event = ev->obj;
-
-        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
-        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
-        [cmd_buf commit];
-        [cmd_buf waitUntilCompleted];
+    id<MTLSharedEvent> event = ev->obj;
+    const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
+    if (!res) {
+        GGML_ABORT("%s: failed to wait for event\n", __func__);
    }
+
+    GGML_UNUSED(dev);
 }

 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -1,6 +1,19 @@
 #ifndef GGML_METAL_IMPL
 #define GGML_METAL_IMPL

+// kernel parameters for mat-mat threadgroups
+//
+// TODO: become function constants
+
+#define SZ_SIMDGROUP 16
+#define N_MM_NK 2
+#define N_MM_NK_TOTAL (SZ_SIMDGROUP * N_MM_NK)
+
+#define N_MM_BLOCK_X 4
+#define N_MM_BLOCK_Y 2
+#define N_MM_SIMD_GROUP_X 2
+#define N_MM_SIMD_GROUP_Y 2
+
 // kernel parameters for mat-vec threadgroups
 //
 // N_R0: number of src0 rows to process per simdgroup
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2195,7 +2195,12 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
        const size_t smem = pipeline.smem;

        ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
-        ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
+
+        const int nr0 = pipeline.nr0;
+        const int nr1 = pipeline.nr1;
+        const int nsg = pipeline.nsg;
+
+        ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
    } else {
        auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);

--- a/Show More
+++ b/Show More