Compare commits

...

110 Commits
b7446 ... b7556

Author SHA1 Message Date
QDelta
4fd59e8427 ggml-cuda: use CMAKE_CUDA_ARCHITECTURES if set when GGML_NATIVE=ON (#18413) 2025-12-28 09:33:14 +08:00
lhez
08566977a7 opencl: allow resizing transpose buffers (#18384)
* opencl: allow resizing transpose buffers instead of using fixed sizes

* opencl: remove commented code
2025-12-27 15:51:14 -08:00
Johannes Gäßler
a4bf35889e llama-fit-params: fix overflow check (#18354) 2025-12-27 20:20:45 +01:00
Johannes Gäßler
026d2ad472 llama: fix magic number of 999 for GPU layers (#18266)
* llama: fix magic number of 999 for GPU layers

* use strings for -ngl, -ngld

* encapsulate n_gpu_layers, split_mode
2025-12-27 20:18:35 +01:00
Aman Gupta
06705fdcb3 ggml-cuda: Use same regex for GGML_NATIVE=OFF (#18407) 2025-12-27 19:56:27 +08:00
Johannes Gäßler
a52dc60ba3 llama_fit_params: return enum for fail vs. error (#18374) 2025-12-27 09:59:19 +01:00
Johannes Gäßler
9045c9afe5 llama-fit-params: fix Gemma 3 calculation (#18372) 2025-12-27 09:56:04 +01:00
Jeff Bolz
c9ced4910b vulkan: preprocess mul_mat_id experts and discard workgroups more quickly (#18352)
Run a preprocess to count how many times each expert is used, and use this to
quickly discard workgroups that aren't needed.
2025-12-26 16:12:58 -06:00
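
For context, the preprocess described above can be sketched on the host side as follows; the names and structure are purely illustrative (the actual change lives in the Vulkan shaders), but the idea is to count expert usage once and then discard workgroups assigned to unused experts.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n_expert = 8;
    // routing ids selected by the top-k gate for each token (illustrative data)
    const std::vector<int32_t> expert_ids = {2, 5, 2, 7, 5, 2};

    // preprocess: count how many rows each expert has to process
    std::vector<int32_t> counts(n_expert, 0);
    for (int32_t id : expert_ids) {
        counts[id]++;
    }

    // dispatch: one "workgroup" per expert; experts with zero rows are discarded immediately
    for (int e = 0; e < n_expert; ++e) {
        if (counts[e] == 0) {
            continue; // cheap early-out instead of running the full matmul tile
        }
        std::printf("expert %d: %d rows\n", e, counts[e]);
    }
    return 0;
}
```
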
Jeff Bolz
7ac8902133 vulkan: optimize decodeFuncB in coopmat2 mul_mat_id shader (#18349)
* vulkan: Use BK=32 for coopmat2 mul_mat_id

* vulkan: optimize decodeFuncB in coopmat2 mul_mat_id shader

Disable robustness, remove the OOB check in decodeFuncB, and initialize the
row_ids to zero to avoid OOB access.

Don't slice/offset the B matrix to ic * BN, only to adjust the coord back down
to the range [0, BN) in decodeFuncB. Instead just slice with a row offset of
zero and remove the '& (BN - 1)'. This allows the compiler to common some of
the shared memory loads.
2025-12-26 18:15:50 +01:00
Jeff Bolz
9bf20d8ac3 vulkan: Use BK=32 for coopmat2 mul_mat_id (#18332) 2025-12-26 18:15:02 +01:00
Eve
cb999704fb vulkan: small dequantization improvements (#18380)
* iq4_xs

* quants
2025-12-26 18:12:11 +01:00
Jeff Bolz
b96b82fc85 vulkan: Support UPSCALE w/antialias (#18327) 2025-12-26 17:00:57 +01:00
Jeff Bolz
10dc500bdb vulkan: handle rope with large number of rows (#18306) 2025-12-26 16:53:46 +01:00
o7si
4893cc07bb server : fix crash when seq_rm fails for hybrid/recurrent models (#18391)
* server : fix crash when seq_rm fails for hybrid/recurrent models

* server : add allow_processing param to clear_slot
2025-12-26 16:35:29 +01:00
Francisco Herrera
af3be131c0 docs: added note for pre SYCL Intel hardware (#18016)
Specify that it's for pre-SYCL hardware
2025-12-26 10:34:30 +08:00
0Marble
b07cda687c CANN: implement the SSM_CONV operator (#17737)
* CANN: implement SSM_CONV operator

Co-authored-by: Aleksei Lobanov, <zeromarblectm@gmail.com>
Co-authored-by: Sujin Kang, <waterjin326@gmail.com>

* CANN: remove custom error limit for SSM_CONV

* CANN: merge SSM_CONV tensor shape/strides into one line

---------

Co-authored-by: Sujin Kang, <waterjin326@gmail.com>
2025-12-26 09:12:04 +08:00
Aman Gupta
85c40c9b02 ggml-cuda: fix regex for arch list (#18371)
* ggml-cuda: fix regex for arch list

* make regex exact
2025-12-26 01:35:14 +08:00
Aman Gupta
83b3b1c271 cuda: optimize cumsum cub path (#18362)
* cuda: optimize cumsum cub path

* remove heavy perf test
2025-12-25 23:55:38 +08:00
Aman Gupta
b0fb0f0aee ggml-cuda: fix blackwell native builds (#18361)
* ggml-cuda: fix blackwell native builds

Replace 12x in native architectures by 12xa

* replace for GGML_NATIVE=OFF too

* only replace for native

* remove 120f-virtual for default compilation

---------

Co-authored-by: Aman Gupta <aman>
2025-12-25 22:12:11 +08:00
Penglin Cai
e68c19b0fd CANN: Add support for CONV_TRANSPOSE_1D when kernel size > 255 (#17934)
* CONV_TRANSPOSE_1D kernel_size>255

* remove condition check

* fix the bug of type conversion

* removing trailing whitespaces

* fix: return true in the switch case
2025-12-25 16:46:09 +08:00
Aadeshveer Singh
c54bba869d ggml : optimize cuda cumsum fallback kernel (#18343) 2025-12-25 12:11:13 +08:00
Xuan-Son Nguyen
f5acfb2ffa server: (router) add stop-timeout option (#18350)
* server: (router) add stop-timeout option

* also allow stop while loading

* add docs

* unload_lru: also wait for unload to complete
2025-12-24 23:47:49 +01:00
Xuan-Son Nguyen
4cbafad4f0 model: support MiMo-V2-Flash (#18328)
* mimov2: convert ok

* rename mimov2 --> mimo2

* fix conversion

* runnable but not yet correct

* use sink

* add_sliding_window_pattern

* add swa and per-layer n_head_kv

* correct params

* somewhat working

* correct gating func

* nits

* mimo2: wire RMS eps + MoE bias + converter guards

* add co-author

Co-authored-by: Aaryan-Kapoor <Aaryan-Kapoor@users.noreply.github.com>

* use add_rope_freq_base_swa

---------

Co-authored-by: Aaryan Kapoor <aaryankapoor2006@gmail.com>
Co-authored-by: Aaryan-Kapoor <Aaryan-Kapoor@users.noreply.github.com>
2025-12-24 23:07:08 +01:00
Aadeshveer Singh
c184284230 fit-params : fix race condition in fit-params output (#18276) 2025-12-24 15:57:38 +01:00
Aman Gupta
c8a2417d7b CUDA: experimental native mxfp4 support for blackwell (#17906)
* CUDA: experimental native mxfp4 support for blackwell

* optimize load_tiles

* optimize quantize_mxfp4

* cleanup

* first pass review: formatting

* use interleaved layout for mma

* mmq: add assert for size

* use __nv_fp4x4_e2m1

* use iter_k as 512, cleanup

* Use 1200 as blackwell instead of 1000

* address review comments

* mmq: fix stride

* quantize.cu: use reference impl of e8m0 scale

* address review comments

* add 120f-virtual + minor fixes

---------

Co-authored-by: Aman Gupta <aman>
2025-12-24 22:28:26 +08:00
Saba Fallah
54132f1b1f model : support for LlamaBidirectionalModel architecture (#18220)
* model: llama-embed-nemotron

* minor: python lint

* changed arch-name

* templated llm_build_llama to be used for both llama and llama-embed arch
2025-12-24 14:02:36 +01:00
Jeff Bolz
2a9ea2020c vulkan: fix command buffer corruption in ggml_backend_vk_event_wait (#18302) 2025-12-24 12:36:34 +01:00
Wang Weixuan
ce7a6dc0fc CANN : refactor ACL graph cache (#17752)
Move the graph property checking code into methods of the LRU cache.

Signed-off-by: Wang Weixuan <wangweixvan@gmail.com>
2025-12-24 17:50:24 +08:00
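
A minimal sketch of the refactoring direction described above, i.e. an LRU cache that owns the property-check logic itself rather than having callers compare graph properties externally; the types and names here are hypothetical, not the CANN implementation.

```cpp
#include <cstddef>
#include <list>
#include <utility>

// hypothetical graph properties used to decide whether a cached graph can be reused
struct graph_props {
    size_t n_nodes = 0;
    size_t hash    = 0;
    bool matches(const graph_props & o) const {
        return n_nodes == o.n_nodes && hash == o.hash;
    }
};

template <typename T>
class lru_cache {
  public:
    explicit lru_cache(size_t capacity) : capacity_(capacity) {}

    // the property check is a method of the cache instead of ad-hoc caller code
    T * find_matching(const graph_props & props) {
        for (auto it = entries_.begin(); it != entries_.end(); ++it) {
            if (it->props.matches(props)) {
                // move to the front, i.e. mark as most recently used
                entries_.splice(entries_.begin(), entries_, it);
                return &entries_.front().value;
            }
        }
        return nullptr;
    }

    void insert(const graph_props & props, T value) {
        entries_.push_front({props, std::move(value)});
        if (entries_.size() > capacity_) {
            entries_.pop_back(); // evict the least recently used entry
        }
    }

  private:
    struct entry { graph_props props; T value; };
    std::list<entry> entries_;
    size_t capacity_;
};
```
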
Jesse Ikonen
1ce0126b18 docs: Fix typos in SYCL documentation (#18269) 2025-12-24 17:19:47 +08:00
Ruben Ortlam
7f459c98e7 vulkan: use fewer FA rows for small cache runs (#18280) 2025-12-24 08:59:14 +01:00
TianHao324
cf2ffc02bc CANN: Uses yarn_ramp cache in ROPE (#17725) 2025-12-24 14:55:33 +08:00
ddh0
10355dc7d0 common: add LLAMA_ARG_OVERRIDE_TENSOR env var for -ot arg (#18267) 2025-12-24 14:19:12 +08:00
Xuan-Son Nguyen
5ee4e43f26 server: return_progress to also report 0% processing state (#18305) 2025-12-23 21:49:05 +01:00
Pascal
5b6c9bc0f3 webui: apply webui_settings on first load (#18223)
* webui: apply webui_settings on first load

The webui_settings from /props were not applied on initial load
when default_generation_settings.params was null

Now syncs whenever serverProps is available, regardless of params,
works for both single-model and router modes

* chore: update webui build output
2025-12-23 15:48:03 +01:00
Xuan-Son Nguyen
849d021104 server: fix crash with model not having BOS/EOS (#18321) 2025-12-23 14:39:36 +01:00
Daniel Bevenius
8e3ead6e4d model-conversion : add device option to run-org-model.py (#18318)
* model-conversion : add device option to run-org-model.py

This commit refactors the `run-org-model.py` script to include a
`--device` argument, to allow users to specify the device on which to
run the model (e.g., cpu, cuda, mps, auto).
It also extracts a few common functions to prepare for future changes
that will remove the code duplication which currently exists in the
embedding scripts.

The Makefile has also been updated to pass the device argument, for
example:
```console
(venv) $ make causal-verify-logits DEVICE=cpu
```

* fix error handling and remove parser reference

This commit fixes the error handling which previously referenced an
undefined 'parser' variable.
2025-12-23 14:07:25 +01:00
Chris Rohlf
12ee1763a6 rpc : add check for rpc buffer type (#18242) 2025-12-23 11:56:49 +02:00
nullname
ed75977717 ggml-hexagon: create generalized functions for cpu side op (#17500)
* refactor: replace ggml_hexagon_mul_mat with template-based binary operation for improved flexibility

* refactor: replace ggml_hexagon_mul_mat_id with template-based binary operation for improved flexibility

* refactor: initialize buffer types and streamline dspqueue_buffers_init calls for clarity

* add comment

* refactor: remove redundant buffer checks in hexagon supported operations

* wip

* add missing include to fix weak symbol warning

* add ggml_hexagon_op_generic

* refactor: simplify tensor operation initialization and buffer management in hexagon implementation

* refactor: streamline hexagon operation initialization and buffer management

* refactor: update function signatures and streamline request handling in hexagon operations

* wip

* ggml-hexagon: clean up code formatting and improve unary operation handling

* wip

* rename

* fix: add support for permuted F16 tensors and enhance quantization checks in matrix operations

* refactor: replace ggml_hexagon_mul_mat with template-based binary operation for improved flexibility

refactor: replace ggml_hexagon_mul_mat_id with template-based binary operation for improved flexibility

refactor: initialize buffer types and streamline dspqueue_buffers_init calls for clarity

refactor: remove redundant buffer checks in hexagon supported operations

add missing include to fix weak symbol warning

add ggml_hexagon_op_generic

refactor: simplify tensor operation initialization and buffer management in hexagon implementation

refactor: streamline hexagon operation initialization and buffer management

refactor: update function signatures and streamline request handling in hexagon operations

ggml-hexagon: clean up code formatting and improve unary operation handling

fix: add support for permuted F16 tensors and enhance quantization checks in matrix operations

# Conflicts:
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp

* hexagon: fix merge conflicts

* hexagon: minor cleanup for buffer support checks

* hexagon: factor out op_desc and the overall op logging

* hexagon: further simplify and cleanup op dispatch logic

* snapdragon: update adb scripts to use llama-cli and llama-completion

* fix pipeline failure

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
2025-12-22 23:13:24 -08:00
Daniel Bevenius
847c35f7d5 model-conversion : add trust_remote_code for embedding scripts (#18288)
This commit adds the trust_remote_code=True parameter when loading
models and configurations in the embedding model conversion scripts.
It also adds a cast to float for models that might use a data type that
is not supported by python, for example bfloat16.

The motivation for this is that some models may require custom code to
be executed during loading, and setting trust_remote_code to True avoids
getting prompted for confirmation.

Future work will consolidate the embedding conversion scripts with the
causal conversion scripts to avoid code duplication. But in the
meantime it would be nice to have this fix in place.
2025-12-23 07:27:37 +01:00
Neo Zhang
a6a552e4ec [SYCL] replace llama-cli by llama-completion to rm the impact to test script (#18290)
* replace llama-cli by llama-completion to rm the impact to test script

* Update examples/sycl/run-llama2.sh

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update examples/sycl/run-llama3.sh

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update examples/sycl/run-llama3.sh

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update examples/sycl/win-run-llama2.bat

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update examples/sycl/win-run-llama3.bat

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-23 12:59:12 +08:00
Alessandro98-git
96e33a814e model : fix div-by-zero for Nemotron V2 (#18309)
* llama-model : fix Nemotron V2 crash by moving MoE parameters calculation

* remove whitespace

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-23 03:04:57 +01:00
Ryan Mangeno
dfc959b886 model : Granite Embedding support (#15641)
ModernBERT but without `head.norm`, so it will currently fail to convert and run other ModernBERT models; PRs adding `head.norm` support are welcome!

* constants and tensor mappings for modern bert support, model not supported yet but working on getting conversion to work for encoder only

* conversion now working, hf -> gguf

* working on support, now working on building graph

* some cleanup

* cleanup

* continuing

* correct tensor shape for qkv

* fixed tensor mappings and working on building the graph

* tensor debugging now works (llama-eval-callback); instead of simulating the gate split with views, GEGLU is now used, which does exactly this

* cleanup

* cleanup

* cleanup

* more cleanup

* ubatch issues: the assert checking for equal seqs in llama-graph.cpp when building attention keeps failing; running llama-embedding with --ubatch-size 1 makes it work, but this needs to be looked into more

* added cls token per previous modern bert attempt, still working on checking out the rest

* fixed pre tokenizer and still working through previous pr

* working through previous attempt, implemented more accurate conversion per previous attempt, added local sliding window attention that alternates every third layer

* fixed pre tokenizer

* working on swa with local and global alternating attention

* some cleanup and now fails on build attn

* starting to work, and some cleanup, currently failing on last layer construction in graph build

* alternating rope implemented and modern bert graph build succeeds

* fixed assert for equal ubatch seq

* cleanup

* added mask check in vocab

* fixed alternating rope: hparams.rope_freq_base_train and hparams.rope_freq_base_train_swa were the same, so I set them to the correct values

* reuse variable

* removed repeat

* the standard SWA method can be used instead of adding a new LLAMA_SWA_TYPE_LOCAL enum

* correct SWA layer indexing; it is supposed to be 0, 3, 6 ... instead of 1, 4, 7 ...

* more modular hparam setting

* replaced attn out norm with ffn_norm, and the cosine similarity between HF embeddings and llama.cpp embeddings went way up, from 0.05 to 0.24; replaced the cacheless kv with swa, a todo from the previous conversion

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf_update.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-vocab.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-graph.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-arch.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* removed redundant hparam set

* enums for model sizes

* conversion for modern-bert model supported rather than just granite-small

* Update src/llama-model.cpp

Co-authored-by: Gabe Goodhart <ghart@us.ibm.com>

* Update src/llama-model.cpp

Co-authored-by: Gabe Goodhart <ghart@us.ibm.com>

* fixed ordering of enum for freq_base_swa

* fixed where I added the residual, now gives much better embeddings

* re-added cacheless logic

* removing whitespace

* conversion now working for swa pattern - dense every n layers

* modern bert put into separate src file

* removing whitespace

* fixed whitespace and newline errors in editorconfig job

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* better naming convention, n_swa_pattern -> swa_period

* reusing sliding_window_pattern key rather than making new dense_every_n_layers key, and adding writing and reading support

* fixing pyright type-check fail

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/gguf_writer.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-hparams.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model-saver.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/models/modern-bert.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/models/modern-bert.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/models/modern-bert.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/gguf_writer.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/models/modern-bert.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/models/modern-bert.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model-loader.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model-loader.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model-loader.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* added descriptions in llama-model

* fixed tensor mappings for conversion

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* mapping name for size

* nits

* unused

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Gabe Goodhart <ghart@us.ibm.com>
2025-12-23 00:28:19 +01:00
compilade
8f48807380 gguf-py : do not align the data start offset (#18291)
The safetensors format doesn't require alignment.
2025-12-22 20:25:16 +01:00
Shouyu
bf6bc3c155 ggml-hexagon: gelu optimization (#18151)
* feat: working gelu with src0 put on vtcm

* feat: gelu ping-pong for both in and out

* fix: fix compile error

* break: distinguish dma ddr->vtcm and vtcm->ddr operation

* fix: fix dma queue size

* break: update dma api to either pop src or dst ptr

* fix: fix activation vtcm allocation issue for src1 when swapped

* refactor: ping-pong gelu logic to avoid unnecessary if else

* dma: improved queue interface and prefetch handling

* gelu: fix N+2 block prefetch

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
2025-12-22 10:56:52 -08:00
Xuan-Son Nguyen
179fd82a72 gen-docs: automatically update markdown file (#18294)
* gen-docs: automatically update markdown file

* also strip whitespace

* do not add extra newline

* update TOC
2025-12-22 19:30:19 +01:00
Taimur Ahmad
d34d5ca1e9 llamafile: add rvv support for sgemm kernels (#18199)
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
2025-12-22 20:20:23 +02:00
lhez
eb492bf43f opencl: unpack q4_0 for adreno in get_tensor (#18278) 2025-12-22 10:19:01 -08:00
Jeff Bolz
e3b35ddf1c vulkan: Extend rope fusions to allow mrope (#18264)
Extend the test-backend-ops tests as well.
2025-12-22 11:03:13 -06:00
Xuan-Son Nguyen
6ce863c803 server: prevent data race from HTTP threads (#18263)
* server: prevent data race from HTTP threads

* fix params

* fix default_generation_settings

* nits: make handle_completions_impl looks less strange

* stricter const

* fix GGML_ASSERT(idx < states.size())

* move index to be managed by server_response_reader

* http: make sure req & res lifecycle are tied together

* fix compile

* fix index handling buggy

* fix data race for lora endpoint

* nits: fix shadow variable

* nits: revert redundant changes

* nits: correct naming for json_webui_settings
2025-12-22 14:23:34 +01:00
Xuan-Son Nguyen
3997c78e33 server: fix data race in to_json_anthropic (#18283) 2025-12-22 13:21:43 +01:00
Mattt
ee74642982 release: update release workflow to store XCFramework as Zip file (#18284)
* Update release workflow to store XCFramework as Zip file

* Add comments to document Zip file requirement for XCFramework

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-22 20:11:46 +08:00
Aaron Teo
a28310488c convert: rework ftype heuristics (#18214)
* convert: rework ftype heuristics

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

convert: fix type-check

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

convert: bring back heuristics comment

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* convert: revert to using first tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* convert: rework heuristics logic

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* convert: rm redundant float32 check

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-22 20:03:49 +08:00
Xuan-Son Nguyen
86af848153 server: (docs) remove mention about extra_args (#18262) 2025-12-22 12:22:01 +01:00
Johannes Gäßler
147a521636 tool/ex/tests: consistently free ctx, then model (#18168) 2025-12-22 11:00:37 +01:00
Jeff Bolz
e1f15b454f vulkan: Implement set_tensor_async and the event interfaces (#18047)
The goal is to enable the async loading code paths in
llama_model_loader::load_all_data, originally from #7896. This works and the
loads themselves are faster, but with host visible vidmem I think the cost of
allocating/mapping vidmem moves and becomes more expensive, and I don't see a
benefit by default. But with GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM=1 I do see a
significant improvement in model loading time.
2025-12-21 21:52:09 +01:00
Johannes Gäßler
0e1ccf15c7 llama: fix RPC for -fit on (#18233) 2025-12-21 19:33:08 +01:00
Xuan-Son Nguyen
5e25ddebff move copilot instructions to AGENTS.md (#18259)
* move copilot --> agents.md

* agents: add disclose AI usage

* refine
2025-12-21 19:09:21 +01:00
Jeff Bolz
fd05c51cec vulkan: fix im2col overflowing maxworkgroupcount (#18180) 2025-12-21 10:32:58 +01:00
Jeff Bolz
b365c3ff01 vulkan/cuda: fix topk_moe with exp_probs_b (#18071)
I updated test_topk_moe to more closely match llm_graph_context::build_moe_ffn
and added coverage for exp_probs_b and some other missing combinations. This
exposed a bug in both CUDA and Vulkan backends where they were assuming the
input to argsort and the input to get_rows are the same. I'd like to optimize
this graph in another change, but for now just get it functional.

CUDA also had a bug where it got n_experts from the wrong place, leading to
GGML_ASSERT failures in some of the new tests.
2025-12-21 10:27:34 +01:00
Jeff Bolz
cb64222b0c vulkan: support GGML_UNARY_OP_XIELU (#18062) 2025-12-21 10:17:58 +01:00
Jeff Bolz
6eb7081860 vulkan: in graph_optimize, try to group ADD operations (#18060)
I saw the adds not staying together in the new nemotron 3 nano model.
2025-12-21 10:05:08 +01:00
lovedheart
4117ae5557 Vulkan: some improvement on mul_mat_iq2_xs (#18031)
* Some improvement on mul_mat_iq2_xs

Refactor calculations for db values and grid data to optimize performance and reduce redundancy.

* Fix trailing whitespace
2025-12-21 09:59:52 +01:00
Daniel Bevenius
65e96a2464 docs : fix links in parsing.md (#18245)
This commit corrects the links in parsing.md, which currently result
in 404 errors.
2025-12-21 09:35:40 +01:00
Aldehir Rojas
9496bbb808 common : reorganize includes to prioritize vendored deps (#18222) 2025-12-20 21:43:21 -06:00
Xuan-Son Nguyen
ddcb75dd8a server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level

* implement server-context suspend

* add test

* add docs

* optimization: add fast path

* make sure to free llama_init

* nits

* fix use-after-free

* allow /models to be accessed during sleeping, fix use-after-free

* don't allow accessing /models during sleep, it is not thread-safe

* fix data race on accessing props and model_meta

* small clean up

* trailing whitespace

* rm outdated comments
2025-12-21 02:24:42 +01:00
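
A rough sketch of the idle-based auto-sleep idea described above, with hypothetical names; the real server wires this into its task queue and frees/reinitializes the llama context rather than just flipping a flag.

```cpp
#include <chrono>
#include <cstdio>

using steady = std::chrono::steady_clock;

struct server_state {
    steady::time_point last_activity = steady::now();
    bool sleeping = false;
};

// called whenever a request arrives
void touch(server_state & st) {
    st.last_activity = steady::now();
    st.sleeping = false; // waking up would reload the suspended context in the real server
}

// called periodically from the main loop
void maybe_sleep(server_state & st, std::chrono::seconds idle_timeout) {
    if (!st.sleeping && steady::now() - st.last_activity > idle_timeout) {
        st.sleeping = true; // the real server releases the model/context memory here
        std::printf("server idle, going to sleep\n");
    }
}

int main() {
    server_state st;
    touch(st);
    maybe_sleep(st, std::chrono::seconds(600)); // not idle long enough yet, stays awake
    return 0;
}
```
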
Jeff Bolz
52ab19df63 tests: Avoid floating point precision false positives in SUM (#17471)
* tests: Avoid floating point precision false positives in SUM

* also apply to test_mean
2025-12-20 13:46:46 -06:00
Jeff Bolz
5182dd64cd test-backend-ops: improve msvc build time (#18209) 2025-12-20 13:45:45 -06:00
Aadeshveer Singh
10b4f82d44 Added comments explaining thread block size selection logic based on row count and column size, derived from historical commit context (#18212) 2025-12-20 19:28:57 +08:00
Oleksandr Kuvshynov
408616adbd server : [easy] fix per round speculative decode logging (#18211)
Currently we always log 0, because we clear slot.drafted before logging.

To reproduce:
Run llama-server with devstral-2 as the main model and devstral-2-small
as the draft model (-md), with verbose logging:

```
% ./build/bin/llama-server -v  \
  -m ~/llms/Devstral-2-123B-Instruct-2512-UD-Q6_K_XL-00001-of-00003.gguf \
  -md ~/llms/Devstral-Small-2-24B-Instruct-2512-UD-Q2_K_XL.gguf \
  -c 8192 2> /tmp/llama.cpp.debug
```

Check the log:

```
slot update_slots: id  3 | task 0 | accepted 11/0 draft tokens, new n_tokens = 741
slot update_slots: id  3 | task 0 | accepted 4/0 draft tokens, new n_tokens = 746
slot update_slots: id  3 | task 0 | accepted 16/0 draft tokens, new n_tokens = 763
slot update_slots: id  3 | task 0 | accepted 11/0 draft tokens, new n_tokens = 775
slot update_slots: id  3 | task 0 | accepted 2/0 draft tokens, new n_tokens = 778
slot update_slots: id  3 | task 0 | accepted 4/0 draft tokens, new n_tokens = 783
slot update_slots: id  3 | task 0 | accepted 8/0 draft tokens, new n_tokens = 792
slot update_slots: id  3 | task 0 | accepted 2/0 draft tokens, new n_tokens = 795
slot update_slots: id  3 | task 0 | accepted 1/0 draft tokens, new n_tokens = 797
slot update_slots: id  3 | task 0 | accepted 1/0 draft tokens, new n_tokens = 799
slot update_slots: id  3 | task 0 | accepted 0/0 draft tokens, new n_tokens = 800
slot update_slots: id  3 | task 0 | accepted 2/0 draft tokens, new n_tokens = 803
slot update_slots: id  3 | task 0 | accepted 1/0 draft tokens, new n_tokens = 805
slot update_slots: id  3 | task 0 | accepted 6/0 draft tokens, new n_tokens = 812
slot update_slots: id  3 | task 0 | accepted 3/0 draft tokens, new n_tokens = 816
```

After the fix, we get correct per-round logging:

```
slot update_slots: id  3 | task 0 | accepted 7/8 draft tokens, new n_tokens = 654
slot update_slots: id  3 | task 0 | accepted 1/2 draft tokens, new n_tokens = 656
slot update_slots: id  3 | task 0 | accepted 2/16 draft tokens, new n_tokens = 659
slot update_slots: id  3 | task 0 | accepted 1/16 draft tokens, new n_tokens = 661
slot update_slots: id  3 | task 0 | accepted 2/16 draft tokens, new n_tokens = 664
slot update_slots: id  3 | task 0 | accepted 16/16 draft tokens, new n_tokens = 681
slot update_slots: id  3 | task 0 | accepted 16/16 draft tokens, new n_tokens = 698
slot update_slots: id  3 | task 0 | accepted 3/4 draft tokens, new n_tokens = 702
slot update_slots: id  3 | task 0 | accepted 5/12 draft tokens, new n_tokens = 708
slot update_slots: id  3 | task 0 | accepted 16/16 draft tokens, new n_tokens = 725
slot update_slots: id  3 | task 0 | accepted 1/1 draft tokens, new n_tokens = 727
slot update_slots: id  3 | task 0 | accepted 8/16 draft tokens, new n_tokens = 736
```
2025-12-20 10:57:40 +01:00
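
The shape of the fix, as a tiny hedged sketch (illustrative names, not the actual server code): capture the draft count before the drafted tokens are cleared, so the "accepted X/Y" ratio no longer always reports Y = 0.

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> drafted = {11, 42, 7, 99}; // tokens proposed by the draft model
    const int n_accepted = 3;                   // how many of them the main model accepted

    // capture the count *before* clearing, otherwise the log always shows ".../0"
    const size_t n_drafted = drafted.size();
    drafted.clear();

    std::printf("accepted %d/%zu draft tokens\n", n_accepted, n_drafted);
    return 0;
}
```
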
Xuan-Son Nguyen
9e39a1e6a9 server: support load model on startup, support preset-only options (#18206)
* server: support autoload model, support preset-only options

* add docs

* load-on-startup

* fix

* Update common/arg.cpp

Co-authored-by: Pascal <admin@serveurperso.com>

---------

Co-authored-by: Pascal <admin@serveurperso.com>
2025-12-20 09:25:27 +01:00
Sigbjørn Skjæret
74e05131e9 ci : remove non-windows zip artifacts (#18201)
* remove non-windows zip artifacts

* add cuda dll links
2025-12-19 22:29:46 +01:00
Sigbjørn Skjæret
f74747d886 ci : only save ccache on master (#18207) 2025-12-19 22:29:37 +01:00
Alfred
ce734a8a2f ggml-hexagon: Implement true Q8_0 quantization on Hexagon NPU for more accurate mixed-precision matmul operations (#17977)
* feat: implement real Q8_0

* feat: adding cmake option for configuring FP32 quantize group size

* typo: set() shall be used

---------

Co-authored-by: ngdxzy <zhenyu_xu@uri.edu>
2025-12-19 09:42:28 -08:00
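
For reference, Q8_0 stores blocks of 32 values with a single shared scale. A scalar sketch of the reference math follows; the NPU kernel is a vectorized HVX implementation of the same idea, so details may differ.

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// reference (scalar) Q8_0 quantization of one block of 32 floats:
// one shared scale d = max|x| / 127, values stored as int8 multiples of d
struct block_q8_0 {
    float  d;       // scale (stored as fp16 in the real format)
    int8_t qs[32];  // quantized values
};

static block_q8_0 quantize_q8_0(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    block_q8_0 b;
    b.d = amax / 127.0f;
    const float id = b.d != 0.0f ? 1.0f / b.d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        b.qs[i] = (int8_t) std::lround(x[i] * id);
    }
    return b;
}

int main() {
    float x[32];
    for (int i = 0; i < 32; ++i) {
        x[i] = 0.01f * i - 0.15f;
    }
    const block_q8_0 b = quantize_q8_0(x);
    std::printf("scale = %f, q[31] = %d\n", b.d, b.qs[31]);
    return 0;
}
```
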
Pascal
14931a826e arg: fix order to use short form before long form (#18196)
* arg: fix order to use short form before long form

* arg: update doc

* arg: update test-arg-parser

* arg: address review feedback from ngxson

simplified to check first.length() <= last.length() only
fixed: --sampler-seq, --rerank, --draft ordering
note: middle positions in 3+ arg sets are not verified

* arg: update doc
2025-12-19 18:01:56 +01:00
Julius Tischbein
f99ef53d2a llama : Changing off_t to size_t for Windows (#18204) 2025-12-19 16:42:46 +02:00
Aman Gupta
cc0a04343e server: friendlier error msg when ctx < input (#18174)
* llama-server: friendlier error msg when ctx < input

This PR adds formatted strings to the server's send_error function

* llama-server: use string_format inline

* fix test
2025-12-19 12:10:00 +01:00
Xuan-Son Nguyen
98c1c7a7bf presets: refactor, allow cascade presets from different sources, add global section (#18169)
* presets: refactor, allow cascade presets from different sources

* update docs

* fix neg arg handling

* fix empty mmproj

* also filter out server-controlled args before to_ini()

* skip loading custom_models if not specified

* fix unset_reserved_args

* fix crash on windows
2025-12-19 12:08:20 +01:00
Aleksander Grygier
acb73d8340 webui: Add editing attachments in user messages (#18147)
* feat: Enable editing attachments in user messages

* feat: Improvements for data handling & UI

* docs: Update Architecture diagrams

* chore: update webui build output

* refactor: Exports

* chore: update webui build output

* feat: Add handling paste for Chat Message Edit Form

* chore: update webui build output

* refactor: Cleanup

* chore: update webui build output
2025-12-19 11:14:07 +01:00
Daniel Bevenius
0a271d82b4 model-conversion : add verbose flag in run-org-model.py (#18194)
This commit adds a --verbose flag to the run-org-model.py script to
enable or disable detailed debug output, such as input and output
tensors for each layer. Debug utilities (summarize, debug_hook,
setup_rope_debug) have been moved to utils/common.py.

The motivation for this is that the detailed debug output can be useful
for diagnosing issues with model conversion or execution, but it can
also produce a large amount of output that may not always be needed.

The script will also be further cleaned/refactored in follow-up commits.
2025-12-19 08:43:16 +01:00
Naco Siren
52fc7fee8a android: fix missing screenshots for Android.md (#18156)
* Android basic sample app layout polish

* Add missing screenshots and polish android README doc

* Replace file blobs with URLs served by GitHub pages service.
2025-12-19 09:32:04 +02:00
Jeff Bolz
cdbada8d10 vulkan: Add perf logger mode with concurrency (#17944)
This implements a variation of the perf logger where rather than timing each
operation individually with effectively a barrier in between, we put the
timing boundaries where we already synchronize and time the groups of work
that normally overlap. This can be useful to help understand whether
individual operations need to be optimized, or if the group is already running
efficiently.

GGML_VK_PERF_LOGGER_CONCURRENT=1 enables the new mode (when
GGML_VK_PERF_LOGGER is also set).

GGML_VK_SYNC_LOGGER=1 replaces the ENABLE_SYNC_LOGGING compile time switch.
2025-12-19 06:36:46 +01:00
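
Conceptually, the difference between the two modes is where the timing boundaries sit: around every single op (which serializes them), or only at the synchronization points that already exist, timing the whole overlapping group. A rough CPU-side sketch with hypothetical names:

```cpp
#include <chrono>
#include <cstdio>
#include <functional>
#include <vector>

using clk = std::chrono::steady_clock;

static long long us(clk::time_point a, clk::time_point b) {
    return std::chrono::duration_cast<std::chrono::microseconds>(b - a).count();
}

void run_ops(const std::vector<std::function<void()>> & ops, bool per_op_timing) {
    if (per_op_timing) {
        // classic perf logger: a timing boundary (effectively a barrier) around every op
        for (size_t i = 0; i < ops.size(); ++i) {
            const auto t0 = clk::now();
            ops[i]();
            const auto t1 = clk::now();
            std::printf("op %zu: %lld us\n", i, us(t0, t1));
        }
    } else {
        // "concurrent" mode: time the whole group that normally overlaps,
        // bounded only by the synchronization points that already exist
        const auto t0 = clk::now();
        for (const auto & op : ops) {
            op();
        }
        const auto t1 = clk::now();
        std::printf("group of %zu ops: %lld us\n", ops.size(), us(t0, t1));
    }
}

int main() {
    run_ops({ [] {}, [] {} }, false);
    return 0;
}
```
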
Xuan-Son Nguyen
8ea958d4d9 model : add ASR support for LFM2-Audio-1.5B (conformer) (#18106)
* ASR with LFM2-Audio-1.5B

* Set rope_theta

* Fix comment

* Remove rope_theta setting

* Address PR feedback

* rename functions to conformer

* remove some redundant ggml_cont

* fix missing tensor

* add prefix "a." for conv tensors

* remove redundant reshape

* clean up

* add test model

---------

Co-authored-by: Tarek Dakhran <tarek@liquid.ai>
2025-12-19 00:18:01 +01:00
Pascal
f9ec8858ed webui: display prompt processing stats (#18146)
* webui: display prompt processing stats

* feat: Improve UI of Chat Message Statistics

* chore: update webui build output

* refactor: Post-review improvements

* chore: update webui build output

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2025-12-18 17:55:03 +01:00
Taimur Ahmad
f716588e63 ggml-cpu: extend support for RVV floating-point kernels (#17318)
* cmake: add BF16 RVV flag for ggml-cpu

* ggml-cpu: add floating-point conversion kernels

* ggml: add floating-point kernels

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: fix lmul in vec_dot_bf16

* ggml-cpu: change redsum to lmul 4, fix leftover

---------

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
2025-12-18 16:02:09 +02:00
Xuan-Son Nguyen
4d1316c440 arg: fix ASAN error on sampler_type_names empty (#18167) 2025-12-18 14:30:32 +01:00
Sigbjørn Skjæret
ec7b9329ae gguf-py : use copy-on-write mode for localtensor (#18162) 2025-12-18 13:45:38 +01:00
yulo
54189c0d39 remove i_major_dual (#18157)
Co-authored-by: zhang hui <you@example.com>
2025-12-18 12:50:56 +01:00
Aleksander Grygier
9ce64aed7d webui: Fix selecting generated output issues during active streaming (#18091)
* draft: incremental markdown rendering with stable blocks

* refactor: Logic improvements

* refactor: DRY Markdown post-processing logic

* refactor: ID generation improvements

* fix: Remove runes

* refactor: Clean up & add JSDocs

* chore: update webui static output

* fix: Add tick to prevent race conditions for rendering Markdown blocks

Suggestion from @ServeurpersoCom

Co-authored-by: Pascal <admin@serveurperso.com>

* chore: Run `npm audit fix`

* chore: update webui static output

* feat: Improve performance using global counter & id instead of UUID

* refactor: Enhance Markdown rendering with link and code features

* chore: update webui static output

* fix: Code block content extraction

* chore: update webui static output

* chore: update webui static output

---------

Co-authored-by: Pascal <admin@serveurperso.com>
2025-12-18 11:13:52 +01:00
Kim S.
900316da4e webui: fix chat screen shadow width (#18010)
* webui: fix chat screen shadow width

* chore: add index.html.gz
2025-12-18 11:08:42 +01:00
Johannes Gäßler
57c1e05643 llama: offload output layer to GPU first (#18148) 2025-12-18 08:12:18 +01:00
Sigbjørn Skjæret
9cff4cc554 convert : sort and use file parts from model index if present (#18043)
* keep file part order from model index

* treat index as authoritative

* sort index parts
2025-12-18 07:54:54 +01:00
Julius Tischbein
4d4f4cacd1 llama : Async DirectIO model loading on Linux (#18012)
* Uncached model read

* Removing additional --mmap arg

* Removing trailing whitespaces

* Adding fallback when O_DIRECT is not supported

* Remove branching in llama-model-loader.cpp and reduce code duplications in llama-mmap.cpp

* Adding maybe unused keyword for Mac and Windows.

* File seek aligned

* Removing all branches for direct_io in llama-model-loader.cpp

* Always use alignment from llama_file

* use_mmap=true
2025-12-18 08:27:19 +02:00
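
A minimal Linux-only sketch of the O_DIRECT-with-fallback pattern mentioned above (this is not the llama-mmap.cpp code; the alignment handling for offsets, sizes, and buffers that O_DIRECT requires is omitted for brevity):

```cpp
#define _GNU_SOURCE  // for O_DIRECT on glibc
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

// try direct (page-cache bypassing) I/O first, fall back to buffered I/O
static int open_for_reading(const char * path) {
    int fd = open(path, O_RDONLY | O_DIRECT);
    if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) {
        // filesystem does not support O_DIRECT -> plain buffered reads
        fd = open(path, O_RDONLY);
    }
    return fd;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    const int fd = open_for_reading(argv[1]);
    if (fd == -1) {
        std::perror("open");
        return 1;
    }
    // with O_DIRECT, subsequent reads must use block-aligned offsets, sizes and buffers
    close(fd);
    return 0;
}
```
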
Shouyu
0a0bba05e8 ggml-hexagon: swiglu_oai operation (#18114)
* snapshot: debug ggml-hexagon swiglu-oai

* fix: fix hvx_min_scalar_f32

* feat: working swiglu-oai

* chore: fix formatting issue
2025-12-17 13:38:21 -08:00
Sigbjørn Skjæret
5166aaf868 convert : force patch_merger tensors to f16/f32 (#18124) 2025-12-17 22:15:53 +01:00
Pascal
6ce3d85796 server: (webui) add --webui-config (#18028)
* server/webui: add server-side WebUI config support

Add CLI arguments --webui-config (inline JSON) and --webui-config-file
(file path) to configure WebUI default settings from server side.

Backend changes:
- Parse JSON once in server_context::load_model() for performance
- Cache parsed config in webui_settings member (zero overhead on /props)
- Add proper error handling in router mode with try/catch
- Expose webui_settings in /props endpoint for both router and child modes

Frontend changes:
- Add 14 configurable WebUI settings via parameter sync
- Add tests for webui settings extraction
- Fix subpath support with base path in API calls

Addresses feedback from @ngxson and @ggerganov

* server: address review feedback from ngxson

* server: regenerate README with llama-gen-docs
2025-12-17 21:45:45 +01:00
Xuan-Son Nguyen
e85e9d7637 server: (router) disable SSL on child process (#18141) 2025-12-17 21:39:08 +01:00
Johannes Gäßler
8dcc3662a2 llama-fit-params: fix memory print (#18136) 2025-12-17 21:10:03 +01:00
Kim S.
d37fc93505 webui: fix chat header width when sidebar is closed (#17981)
* webui: fix chat header width when sidebar is closed

* chore: add index.html.gz
2025-12-17 20:05:45 +01:00
Shouyu
4470a0764a ggml-hexagon: gelu operation (#17921)
* feat: initial support for gelu using sigmoid approximation

* snapshot: faster gelu using polynomial approximation

* test: disable l2-block prefetch in polynomial approximation

* Revert "test: disable l2-block prefetch in polynomial approximation"

This reverts commit 72339994d4.

* Revert "snapshot: faster gelu using polynomial approximation"

This reverts commit 2a787a61d1.

* debug: temporarily disable unnecessary log message for debug purpose

* Feat: optimized unaligned sigmoid_f32

* Feat: larger l2prefetch block

* feat: apply unaligned-load optimization on mul and mul_scalar

* Revert "debug: temporarily disable unnecessary log message for debug purpose"

This reverts commit 84f2f23aa9.

* refactor: cleanup commented unused code

* chore: reformat code with clang-formatter to pass cli test

* Revert "chore: reformat code with clang-formatter to pass cli test"

This reverts commit 952877ec24.

* fix: fix loop overflow

* chore: fix formatting CI error
2025-12-17 10:39:32 -08:00
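
For reference, the sigmoid-based approximation the first commit above refers to is the well-known gelu(x) ≈ x · σ(1.702·x); a scalar sketch follows (the Hexagon kernel is a vectorized HVX implementation and may evaluate σ differently):

```cpp
#include <cmath>
#include <cstdio>

// GELU via the sigmoid (logistic) approximation: gelu(x) ≈ x * sigmoid(1.702 * x)
static float gelu_sigmoid_approx(float x) {
    return x / (1.0f + std::exp(-1.702f * x));
}

int main() {
    for (float x : { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f }) {
        std::printf("gelu(%.2f) ~= %.4f\n", x, gelu_sigmoid_approx(x));
    }
    return 0;
}
```
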
Georgi Gerganov
4301e27319 common : restore grammar-based rejection sampling (#18137)
* common : restore grammar-based rejection sampling

* sampling : allow null samplers
2025-12-17 19:46:00 +02:00
Johannes Gäßler
a2c199e479 common: clarify instructions for bug reports (#18134) 2025-12-17 18:44:13 +01:00
HonestQiao
15dd67d869 model: fix GLM-ASR-Nano-2512 load error (#18130) (#18142) 2025-12-17 16:34:35 +01:00
Xuan-Son Nguyen
bde461de8c server: (router) allow child process to report status via stdout (#18110)
* server: (router) allow child process to report status via stdout

* apply suggestions
2025-12-17 14:54:11 +01:00
Piotr Wilkin (ilintar)
8faa87db02 Extend run-org-model.py, add (a) batching (b) loading prompt from file (c) multimodal capacity (#18034) 2025-12-17 14:21:51 +01:00
Johannes Gäßler
6f1f6a961a Github: ask for -v logs for params_fit [no ci] (#18128) 2025-12-17 13:46:48 +01:00
Alberto Cabrera Pérez
669696e00d ggml-cpu: ARM64: repack version of q8_0 (dotprod and i8mm) (#18096)
* wip: skeleton for q8_0 repack

* q8_0 repack GEMV implementations

* GEMM implementations

* Formatting

* Fixed format consistency of repack gemm and gemv declarations

* gemv and gemm generic location consistent with declarations

* Removed incorrect unused-variable statements

* Cleanup, consistent style

* Missing generic fallbacks for x86 and powerpc
2025-12-17 13:39:13 +02:00
Tarek Dakhran
982060fadc model: fix LFM2_MOE missing tensors (#18132) 2025-12-17 12:17:11 +01:00
Sigbjørn Skjæret
6853bee680 ci : clean up webui jobs (#18116)
* clean up webui jobs

* refined step control

* forgot dependencies

* apparently always() is needed
2025-12-17 10:45:40 +01:00
Pascal
487674fbb3 common: fix --override-kv to support comma-separated values (#18056)
* common: fix --override-kv to support comma-separated values

* Update common/arg.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* common: deprecate repeated arguments, suggest comma-separated values

* common: add comma escape support for --override-kv

* common: optimize duplicate detection with insert().second

Co-authored-by: personalmountains <46615898+personalmountains@users.noreply.github.com>

* common: migrate all repeated args to comma-separated syntax

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Co-authored-by: personalmountains <46615898+personalmountains@users.noreply.github.com>
2025-12-17 11:36:23 +02:00
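
A small sketch of splitting a comma-separated argument value while honoring an escape for literal commas, as the change above describes. The backslash escape and the key=type:value payload shown here are illustrative assumptions, not necessarily the exact syntax adopted.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// split "a,b\,c,d" into {"a", "b,c", "d"}: a backslash escapes a literal comma
static std::vector<std::string> split_escaped(const std::string & s) {
    std::vector<std::string> out;
    std::string cur;
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] == '\\' && i + 1 < s.size() && s[i + 1] == ',') {
            cur += ',';
            ++i; // skip the escaped comma
        } else if (s[i] == ',') {
            out.push_back(cur);
            cur.clear();
        } else {
            cur += s[i];
        }
    }
    out.push_back(cur);
    return out;
}

int main() {
    for (const auto & v : split_escaped("key1=int:1,key2=str:a\\,b")) {
        std::printf("%s\n", v.c_str());
    }
    return 0;
}
```
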
yulo
acec774ef6 HIP: Refactor mma for RDNA and CDNA (#17990)
* mma.cuh for rdna4

* mma for rdna3

* mmq for rdna4

* mmq for rdna3

* align i-major and j-major

* cdna

* fix cuda error

* add missing tile of mfma

* fix j-major wrong ne on CDNA

* fix grammar and empty spaces

---------

Co-authored-by: zhang hui <you@example.com>
2025-12-17 09:34:54 +01:00
204 changed files with 10861 additions and 3775 deletions

View File

@@ -86,6 +86,7 @@ body:
description: >
If applicable, please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
render: shell
validations:
required: false

View File

@@ -70,6 +70,7 @@ jobs:
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -106,6 +107,7 @@ jobs:
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -142,6 +144,7 @@ jobs:
with:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dawn Dependency
id: dawn-depends
@@ -195,6 +198,7 @@ jobs:
with:
key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build Dependencies
id: build_depends
@@ -276,6 +280,7 @@ jobs:
with:
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -396,6 +401,7 @@ jobs:
with:
key: ubuntu-24-cmake-vulkan-deb
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -431,6 +437,7 @@ jobs:
with:
key: ubuntu-24-cmake-vulkan
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -490,6 +497,7 @@ jobs:
with:
key: ubuntu-24-cmake-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -562,6 +570,7 @@ jobs:
with:
key: ubuntu-latest-wasm-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Emscripten
run: |
@@ -609,6 +618,7 @@ jobs:
with:
key: ubuntu-22-cmake-hip
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake HIP support
id: cmake_build
@@ -641,6 +651,7 @@ jobs:
with:
key: ubuntu-22-cmake-musa
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake MUSA support
id: cmake_build
@@ -688,6 +699,7 @@ jobs:
with:
key: ubuntu-22-cmake-sycl
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -738,6 +750,7 @@ jobs:
with:
key: ubuntu-22-cmake-sycl-fp16
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -771,6 +784,7 @@ jobs:
with:
key: macOS-latest-cmake-ios
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -802,6 +816,7 @@ jobs:
with:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -863,6 +878,7 @@ jobs:
with:
key: macOS-latest-swift
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download xcframework artifact
uses: actions/download-artifact@v4
@@ -905,6 +921,7 @@ jobs:
key: windows-msys2
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
@@ -973,6 +990,7 @@ jobs:
key: windows-latest-cmake-${{ matrix.build }}
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download OpenBLAS
id: get_openblas
@@ -1077,6 +1095,7 @@ jobs:
with:
key: ubuntu-latest-cmake-cuda
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with CMake
run: |
@@ -1109,6 +1128,7 @@ jobs:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
@@ -1160,6 +1180,7 @@ jobs:
key: windows-latest-cmake-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
@@ -1221,6 +1242,7 @@ jobs:
with:
key: ${{ github.job }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -1466,6 +1488,7 @@ jobs:
with:
key: ggml-ci-x64-cpu-low-perf
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1491,6 +1514,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-low-perf
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1516,6 +1540,7 @@ jobs:
with:
key: ggml-ci-x64-cpu-high-perf
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1541,6 +1566,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-high-perf
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1566,6 +1592,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-high-perf-sve
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1701,6 +1728,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-kleidiai
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -2084,6 +2112,7 @@ jobs:
with:
key: ggml-ci-arm64-graviton4-kleidiai
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Test
id: ggml-ci

View File

@@ -66,16 +66,9 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
- name: Upload artifacts (tar)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
@@ -127,16 +120,9 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
- name: Upload artifacts (tar)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
@@ -196,16 +182,9 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
- name: Upload artifacts (tar)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
@@ -256,16 +235,9 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
- name: Upload artifacts (tar)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
@@ -716,21 +688,16 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
# Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
# For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts (zip)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
openEuler-cann:
strategy:
@@ -797,7 +764,7 @@ jobs:
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (tar)
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
@@ -889,9 +856,6 @@ jobs:
with:
tag_name: ${{ steps.tag.outputs.name }}
body: |
> [!WARNING]
> **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
<details open>
${{ github.event.head_commit.message }}
@@ -901,7 +865,7 @@ jobs:
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
@@ -911,8 +875,8 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

View File

@@ -31,9 +31,10 @@ concurrency:
cancel-in-progress: true
jobs:
webui-setup:
name: WebUI Setup
webui-check:
name: WebUI Checks
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -42,137 +43,66 @@ jobs:
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
id: node
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
id: setup
if: ${{ steps.node.conclusion == 'success' }}
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run check
working-directory: tools/server/webui
- name: Run linting
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
id: playwright
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
- name: Run Unit tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:unit
working-directory: tools/server/webui
- name: Run UI tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:

View File

@@ -1,4 +1,4 @@
# Copilot Instructions for llama.cpp
# Instructions for llama.cpp
## Repository Overview
@@ -12,6 +12,24 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model
- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
- **License**: MIT
## Disclose AI Usage
It is crucial to remind contributors that the project mandates disclosure of any AI usage in pull requests: AI-generated code can contain suboptimal optimizations and hidden bugs, and AI outputs tend to be presented with unwarranted confidence.
When generating significant portions of code, address this by:
- Informing the user that AI-generated content may be rejected by maintainers.
- Clearly marking AI-generated code in commit messages and comments.
- Example of commit message: `[AI] Fix a race condition in ...`
- Example of code comment: `// [AI] spawn a new thread ...`
These measures apply to:
- Changes resulting in large portions of code or complex logic.
- Modifications or additions to public APIs in `llama.h`, `ggml.h`, or `mtmd.h`.
- Backend-related changes, such as those involving CPU, CUDA, Metal, Vulkan, etc.
- Modifications to `tools/server`.
Note: These measures can be omitted for small fixes or trivial changes.
## Build Instructions
### Prerequisites
@@ -251,6 +269,7 @@ Primary tools:
- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
- **Performance focus**: This is a performance-critical inference library
- **API stability**: Changes to `include/llama.h` require careful consideration
- **Disclose AI Usage**: Refer to the "Disclose AI Usage" earlier in this document
### Git Workflow
- Always create feature branches from `master`

View File

@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
unicode.h
)
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
@@ -151,9 +154,7 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#

View File

@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
return *this;
}
common_arg & common_arg::set_preset_only() {
is_preset_only = true;
return *this;
}
bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
@@ -420,6 +425,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
};
std::set<std::string> seen_args;
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
@@ -430,6 +437,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
if (!seen_args.insert(arg).second) {
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
}
auto & tmp = arg_to_options[arg];
auto opt = *tmp.first;
bool is_positive = tmp.second;
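For context, the deprecation warning added above means invocations that previously repeated an argument should switch to a single comma-separated value, for example --in-file chunk1.txt,chunk2.txt instead of --in-file chunk1.txt --in-file chunk2.txt (file names here are hypothetical). The same migration applies to the other multi-value options updated further down in this file, such as --image, --lora, --control-vector, --context-file and --override-kv.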
@@ -750,6 +760,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
}
};
std::set<std::string> seen_args;
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
@@ -760,8 +772,16 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
if (!seen_args.insert(arg).second) {
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
}
auto opt = *arg_to_options[arg];
std::string val;
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// bool arg (need to reverse the meaning for negative args)
bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
val = is_neg ? "0" : "1";
}
if (opt.value_hint != nullptr) {
// arg with single value
check_arg(i);
@@ -863,7 +883,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
sampler_type_chars += common_sampler_type_to_chr(sampler);
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
}
sampler_type_names.pop_back();
if (!sampler_type_names.empty()) {
sampler_type_names.pop_back(); // remove last semicolon
}
/**
@@ -1127,7 +1149,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
{"-cram", "--cache-ram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
@@ -1135,7 +1157,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
{"-kvu", "--kv-unified"},
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
@@ -1184,7 +1206,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
add_opt(common_arg(
{"--perf"},
{"--no-perf"},
@@ -1226,13 +1248,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
"an input file (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
for (const auto & item : string_split<std::string>(value, ',')) {
std::ifstream file(item);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
}
params.in_files.push_back(item);
}
params.in_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
@@ -1401,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
{"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) {
params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -1969,9 +1993,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
for (const auto & item : string_split<std::string>(value, ',')) {
params.image.emplace_back(item);
}
}
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
@@ -2057,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
{"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
}
));
).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
add_opt(common_arg(
{"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
{"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
{"-cmoe", "--cpu-moe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
{"--n-cpu-moe", "-ncmoe"}, "N",
{"-ncmoe", "--n-cpu-moe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
@@ -2091,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
{"--cpu-moe-draft", "-cmoed"},
{"-cmoed", "--cpu-moe-draft"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
{"-ncmoed", "--n-cpu-moe-draft"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
[](common_params & params, int value) {
if (value < 0) {
@@ -2111,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
[](common_params & params, int value) {
params.n_gpu_layers = value;
string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
[](common_params & params, const std::string & value) {
if (value == "auto") {
params.n_gpu_layers = -1;
} else if (value == "all") {
params.n_gpu_layers = -2;
} else {
params.n_gpu_layers = std::stoi(value);
}
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -2218,12 +2251,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
{"--override-kv"}, "KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
{"--override-kv"}, "KEY=TYPE:VALUE,...",
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
[](common_params & params, const std::string & value) {
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
std::vector<std::string> kv_overrides;
std::string current;
bool escaping = false;
for (const char c : value) {
if (escaping) {
current.push_back(c);
escaping = false;
} else if (c == '\\') {
escaping = true;
} else if (c == ',') {
kv_overrides.push_back(current);
current.clear();
} else {
current.push_back(c);
}
}
if (escaping) {
current.push_back('\\');
}
kv_overrides.push_back(current);
for (const auto & kv_override : kv_overrides) {
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
}
}
}
));
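A standalone sketch of the escape-aware splitting used by the new --override-kv handler above, for readers who want to try it in isolation; the function name and the sample input are hypothetical.

#include <iostream>
#include <string>
#include <vector>

// Commas separate overrides, "\," escapes a literal comma inside a value,
// and a trailing backslash is kept as-is (mirrors the handler above).
static std::vector<std::string> split_kv_overrides(const std::string & value) {
    std::vector<std::string> out;
    std::string current;
    bool escaping = false;
    for (const char c : value) {
        if (escaping) {
            current.push_back(c);
            escaping = false;
        } else if (c == '\\') {
            escaping = true;
        } else if (c == ',') {
            out.push_back(current);
            current.clear();
        } else {
            current.push_back(c);
        }
    }
    if (escaping) {
        current.push_back('\\');
    }
    out.push_back(current);
    return out;
}

int main() {
    // prints two overrides: "tokenizer.ggml.add_bos_token=bool:false" and "general.name=str:a,b"
    for (const auto & s : split_kv_overrides("tokenizer.ggml.add_bos_token=bool:false,general.name=str:a\\,b")) {
        std::cout << s << "\n";
    }
}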
@@ -2237,33 +2297,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)",
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
for (const auto & item : string_split<std::string>(value, ',')) {
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--lora-scaled"}, "FNAME", "SCALE",
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & fname, const std::string & scale) {
params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
{"--lora-scaled"}, "FNAME:SCALE,...",
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
"note: use comma-separated values",
[](common_params & params, const std::string & value) {
for (const auto & item : string_split<std::string>(value, ',')) {
auto parts = string_split<std::string>(item, ':');
if (parts.size() != 2) {
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
}
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--control-vector"}, "FNAME",
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
[](common_params & params, const std::string & value) {
params.control_vectors.push_back({ 1.0f, value, });
for (const auto & item : string_split<std::string>(value, ',')) {
params.control_vectors.push_back({ 1.0f, item, });
}
}
));
add_opt(common_arg(
{"--control-vector-scaled"}, "FNAME", "SCALE",
{"--control-vector-scaled"}, "FNAME:SCALE,...",
"add a control vector with user defined scaling SCALE\n"
"note: this argument can be repeated to add multiple scaled control vectors",
[](common_params & params, const std::string & fname, const std::string & scale) {
params.control_vectors.push_back({ std::stof(scale), fname });
"note: use comma-separated values (format: FNAME:SCALE,...)",
[](common_params & params, const std::string & value) {
for (const auto & item : string_split<std::string>(value, ',')) {
auto parts = string_split<std::string>(item, ':');
if (parts.size() != 2) {
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
}
params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
}
}
));
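For the scaled variants above, the new format packs the path and the scale into one item and separates items with commas, for example --lora-scaled style-a.gguf:0.5,style-b.gguf:1.0 or --control-vector-scaled calm.gguf:0.8 (file names and scales are hypothetical). Items that do not match the FNAME:SCALE shape raise the invalid_argument errors shown in the handlers.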
add_opt(common_arg(
@@ -2353,13 +2430,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("HF_TOKEN"));
add_opt(common_arg(
{"--context-file"}, "FNAME",
"file to load context from (repeat to specify multiple files)",
"file to load context from (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
std::ifstream file(value, std::ios::binary);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
for (const auto & item : string_split<std::string>(value, ',')) {
std::ifstream file(item, std::ios::binary);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
}
params.context_files.push_back(item);
}
params.context_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
@@ -2550,6 +2629,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--webui-config"}, "JSON",
"JSON that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.webui_config_json = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
add_opt(common_arg(
{"--webui-config-file"}, "PATH",
"JSON file that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.webui_config_json = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
@@ -2566,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
{"--reranking", "--rerank"},
{"--rerank", "--reranking"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
[](common_params & params) {
params.embedding = true;
@@ -2801,6 +2894,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.lora_init_without_apply = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--sleep-idle-seconds"}, "SECONDS",
string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
[](common_params & params, int value) {
if (value == 0 || value < -1) {
throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
}
params.sleep_idle_seconds = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--simple-io"},
"use basic IO for better compatibility in subprocesses and limited consoles",
@@ -3037,7 +3140,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg(
{"--draft-max", "--draft", "--draft-n"}, "N",
{"--draft", "--draft-n", "--draft-max"}, "N",
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
[](common_params & params, int value) {
params.speculative.n_max = value;
@@ -3079,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.devices = parse_device_list(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
[](common_params & params, int value) {
params.speculative.n_gpu_layers = value;
string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
[](common_params & params, const std::string & value) {
if (value == "auto") {
params.speculative.n_gpu_layers = -1;
} else if (value == "all") {
params.speculative.n_gpu_layers = -2;
} else {
params.speculative.n_gpu_layers = std::stoi(value);
}
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3413,3 +3524,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
return ctx_arg;
}
void common_params_add_preset_options(std::vector<common_arg> & args) {
// arguments below won't be treated as CLI args, only preset options
args.push_back(common_arg(
{"load-on-startup"}, "NAME",
"in server router mode, autoload this model on startup",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
args.push_back(common_arg(
{"stop-timeout"}, "SECONDS",
"in server router mode, force-kill model instance after this many seconds of graceful shutdown",
[](common_params &, int) { /* unused */ }
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
// args.push_back(common_arg(
// {"pin"},
// "in server router mode, do not unload this model if models_max is exceeded",
// [](common_params &) { /* unused */ }
// ).set_preset_only());
}

View File

@@ -8,6 +8,10 @@
#include <vector>
#include <cstring>
// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
//
// CLI argument parsing
//
@@ -22,6 +26,7 @@ struct common_arg {
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +75,7 @@ struct common_arg {
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
common_arg & set_preset_only();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output) const;
@@ -114,9 +120,13 @@ struct common_params_context {
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// parse input arguments from CLI into a map
// TODO: support repeated args in the future
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
// populate preset-only arguments
// these arguments are not treated as command line arguments
// see: https://github.com/ggml-org/llama.cpp/issues/18163
void common_params_add_preset_options(std::vector<common_arg> & args);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

View File

@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
impl() = default;
~impl() = default;
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
llama_model_ptr model;
llama_context_ptr context;
@@ -1092,7 +1094,7 @@ common_init_result::common_init_result(common_params & params) :
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
@@ -1339,10 +1341,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.devices = params.devices.data();
}
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.n_gpu_layers = params.n_gpu_layers;
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;

View File

@@ -329,7 +329,7 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
@@ -475,7 +475,8 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
std::vector<std::string> api_keys;
@@ -484,8 +485,11 @@ struct common_params {
std::map<std::string, std::string> default_template_kwargs;
// webui configs
bool webui = true;
std::string webui_config_json;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;

View File

@@ -2,6 +2,7 @@
#include "preset.h"
#include "peg-parser.h"
#include "log.h"
#include "download.h"
#include <fstream>
#include <sstream>
@@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) {
return str.substr(pos);
}
std::vector<std::string> common_preset::to_args() const {
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
std::vector<std::string> args;
if (!bin_path.empty()) {
args.push_back(bin_path);
}
for (const auto & [opt, value] : options) {
args.push_back(opt.args.back()); // use the last arg as the main arg
if (opt.is_preset_only) {
continue; // skip preset-only options (they are not CLI args)
}
// use the last arg as the main arg (i.e. --long-form)
args.push_back(opt.args.back());
// handle value(s)
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value
if (common_arg_utils::is_falsey(value)) {
@@ -63,6 +75,52 @@ std::string common_preset::to_ini() const {
return ss.str();
}
void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
// if the option already exists, update it

for (auto & [opt, val] : options) {
if (opt.env && env == opt.env) {
val = value;
return;
}
}
// if option does not exist, we need to add it
if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
throw std::runtime_error(string_format(
"%s: option with env '%s' not found in ctx_params",
__func__, env.c_str()
));
}
options[ctx.key_to_opt.at(env)] = value;
}
void common_preset::unset_option(const std::string & env) {
for (auto it = options.begin(); it != options.end(); ) {
const common_arg & opt = it->first;
if (opt.env && env == opt.env) {
it = options.erase(it);
return;
} else {
++it;
}
}
}
bool common_preset::get_option(const std::string & env, std::string & value) const {
for (const auto & [opt, val] : options) {
if (opt.env && env == opt.env) {
value = val;
return true;
}
}
return false;
}
void common_preset::merge(const common_preset & other) {
for (const auto & [opt, val] : other.options) {
options[opt] = val; // overwrite existing options
}
}
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
std::map<std::string, std::map<std::string, std::string>> parsed;
@@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
return value;
}
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
common_preset_context::common_preset_context(llama_example ex)
: ctx_params(common_params_parser_init(default_params, ex)) {
common_params_add_preset_options(ctx_params.options);
key_to_opt = get_map_key_opt(ctx_params);
}
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
common_presets out;
auto key_to_opt = get_map_key_opt(ctx_params);
auto ini_data = parse_ini_from_file(path);
for (auto section : ini_data) {
@@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
for (const auto & [key, value] : section.second) {
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (key_to_opt.find(key) != key_to_opt.end()) {
auto & opt = key_to_opt[key];
const auto & opt = key_to_opt.at(key);
if (is_bool_arg(opt)) {
preset.options[opt] = parse_bool_arg(opt, key, value);
} else {
@@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
// TODO: maybe warn about unknown key?
}
}
if (preset.name == "*") {
// handle global preset
global = preset;
} else {
out[preset.name] = preset;
}
}
return out;
}
common_presets common_preset_context::load_from_cache() const {
common_presets out;
auto cached_models = common_list_cached_models();
for (const auto & model : cached_models) {
common_preset preset;
preset.name = model.to_string();
preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string());
out[preset.name] = preset;
}
return out;
}
struct local_model {
std::string name;
std::string path;
std::string path_mmproj;
};
common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
}
std::vector<local_model> models;
auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
auto files = fs_list(subdir_path, false);
common_file_info model_file;
common_file_info first_shard_file;
common_file_info mmproj_file;
for (const auto & file : files) {
if (string_ends_with(file.name, ".gguf")) {
if (file.name.find("mmproj") != std::string::npos) {
mmproj_file = file;
} else if (file.name.find("-00001-of-") != std::string::npos) {
first_shard_file = file;
} else {
model_file = file;
}
}
}
// single file model
local_model model{
/* name */ name,
/* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
/* path_mmproj */ mmproj_file.path // can be empty
};
if (!model.path.empty()) {
models.push_back(model);
}
};
auto files = fs_list(models_dir, true);
for (const auto & file : files) {
if (file.is_dir) {
scan_subdir(file.path, file.name);
} else if (string_ends_with(file.name, ".gguf")) {
// single file model
std::string name = file.name;
string_replace_all(name, ".gguf", "");
local_model model{
/* name */ name,
/* path */ file.path,
/* path_mmproj */ ""
};
models.push_back(model);
}
}
// convert local models to presets
common_presets out;
for (const auto & model : models) {
common_preset preset;
preset.name = model.name;
preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
if (!model.path_mmproj.empty()) {
preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
}
out[preset.name] = preset;
}
return out;
}
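For reference, a hedged illustration of the directory layout that load_from_models_dir above scans (all names are hypothetical; see "Using multiple models" in server/README.md for the authoritative description):

models/
  gemma-vision/                       -> preset "gemma-vision"
    gemma-vision-Q4_K_M.gguf          -> LLAMA_ARG_MODEL
    mmproj-gemma-vision.gguf          -> LLAMA_ARG_MMPROJ (optional)
  big-model/                          -> preset "big-model"
    big-model-00001-of-00003.gguf     -> first shard becomes LLAMA_ARG_MODEL
    big-model-00002-of-00003.gguf
    big-model-00003-of-00003.gguf
  tiny.gguf                           -> preset "tiny" (single-file model at the top level)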
common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
common_preset preset;
preset.name = COMMON_PRESET_DEFAULT_NAME;
bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options);
if (!ok) {
throw std::runtime_error("failed to parse CLI arguments into preset");
}
return preset;
}
common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
common_presets out = base; // copy
for (const auto & [name, preset_added] : added) {
if (out.find(name) != out.end()) {
// if exists, merge
common_preset & target = out[name];
target.merge(preset_added);
} else {
// otherwise, add directly
out[name] = preset_added;
}
}
return out;
}
common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
common_presets out;
for (const auto & [name, preset] : presets) {
common_preset tmp = base; // copy
tmp.name = name;
tmp.merge(preset);
out[name] = std::move(tmp);
}
return out;
}

View File

@@ -13,20 +13,62 @@
constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
struct common_preset_context;
struct common_preset {
std::string name;
// TODO: support repeated args in the future
// options are stored as common_arg to string mapping, representing CLI arg and its value
std::map<common_arg, std::string> options;
// convert preset to CLI argument list
std::vector<std::string> to_args() const;
std::vector<std::string> to_args(const std::string & bin_path = "") const;
// convert preset to INI format string
std::string to_ini() const;
// TODO: maybe implement to_env() if needed
// set or update a preset option, where the argument is identified by its env variable
void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
// unset option by its env variable
void unset_option(const std::string & env);
// get option value by its env variable, return false if not found
bool get_option(const std::string & env, std::string & value) const;
// merge another preset into this one, overwriting existing options
void merge(const common_preset & other);
};
// interface for multiple presets in one file
using common_presets = std::map<std::string, common_preset>;
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
// context for loading and editing presets
struct common_preset_context {
common_params default_params; // unused for now
common_params_context ctx_params;
std::map<std::string, common_arg> key_to_opt;
common_preset_context(llama_example ex);
// load presets from INI file
common_presets load_from_ini(const std::string & path, common_preset & global) const;
// generate presets from cached models
common_presets load_from_cache() const;
// generate presets from local models directory
// for the directory structure, see "Using multiple models" in server/README.md
common_presets load_from_models_dir(const std::string & models_dir) const;
// generate one preset from CLI arguments
common_preset load_from_args(int argc, char ** argv) const;
// cascade multiple presets if exist on both: base < added
// if preset does not exist in base, it will be added without modification
common_presets cascade(const common_presets & base, const common_presets & added) const;
// apply presets over a base preset (same idea as CSS cascading)
common_presets cascade(const common_preset & base, const common_presets & presets) const;
};
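A hedged usage sketch of the common_preset_context API declared above; it assumes the include path and linkage of the common library, and the composition order shown is illustrative rather than prescribed by the patch.

#include "preset.h" // common/preset.h, as declared above

static std::vector<std::string> build_args_for(const std::string & name) {
    common_preset_context ctx(LLAMA_EXAMPLE_SERVER);

    common_preset global; // receives the "*" section of the INI file, if present
    common_presets from_ini   = ctx.load_from_ini("llama-server.ini", global); // hypothetical file name
    common_presets from_cache = ctx.load_from_cache(); // one preset per cached model

    // INI presets override cached-model presets; everything sits on top of the "*" globals
    common_presets merged = ctx.cascade(global, ctx.cascade(from_cache, from_ini));

    // prepend the binary path so the result can be fed straight to an argv-style parser
    return merged.at(name).to_args("llama-server");
}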

View File

@@ -104,10 +104,9 @@ struct ring_buffer {
struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
bool grammar;
ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur;
@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
if (!params.grammar.empty()) {
if (params.grammar_lazy) {
samplers.push_back(
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size()));
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size());
} else {
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
}
grammar = true;
}
}
@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .chain = */ chain,
/* .grammar = */ grammar,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();
if (gsmpl->grammar) {
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
for (int i = 0; i < n_smpl; i++) {
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
// the grammar sampler is always the first one
if (i == 0) {
if (accept_grammar) {
llama_sampler_accept(smpl, token);
}
} else {
llama_sampler_accept(smpl, token);
}
}
} else {
llama_sampler_accept(gsmpl->chain, token);
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
llama_sampler_accept(gsmpl->chain, token);
gsmpl->prev.push_back(token);
}
@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .grammar = */ gsmpl->grammar,
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
return gsmpl->chain;
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
gsmpl->set_logits(ctx, idx);
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if the sampled token fits the grammar (grammar-based rejection sampling)
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
return id;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result;
@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
common_sampler_accept(gsmpl, id, true);
@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
}
if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
common_sampler_accept(gsmpl, id, true);
@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
return result;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
}
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {

View File

@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
// generalized version of common_sampler_sample
//
@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
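A brief usage sketch of the two sampling modes exposed by the grammar_first parameter above; gsmpl and ctx are assumed to be an already-initialized common_sampler and llama_context.

// Default (grammar_first = false): sample from the full chain, then lazily check the
// single sampled token against the grammar and resample only if it is rejected.
llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ 0);

// grammar_first = true: constrain all candidates with the grammar before the sampling
// chain runs. Slower, but needed when every candidate must be grammar-valid, e.g. when
// generating draft tokens (see the common/speculative.cpp change below).
llama_token id_strict = common_sampler_sample(gsmpl, ctx, /*idx =*/ 0, /*grammar_first =*/ true);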

View File

@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
for (int i = 0; i < params.n_draft; ++i) {
common_batch_clear(batch);
common_sampler_sample(smpl, ctx_dft, 0);
common_sampler_sample(smpl, ctx_dft, 0, true);
const auto * cur_p = common_sampler_get_candidates(smpl, true);

View File

@@ -141,16 +141,24 @@ class ModelBase:
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16
for _, tensor in self.get_tensors():
if tensor.dim() < 2:
continue
if tensor.dtype == torch.bfloat16:
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
break
elif tensor.dtype == torch.float16:
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
break
else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
self.dequant_model()
@@ -189,10 +197,10 @@ class ModelBase:
return tensors
prefix = "model" if not self.is_mistral_format else "consolidated"
part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors"))
part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
is_safetensors: bool = len(part_names) > 0
if not is_safetensors:
part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin"))
part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
tensor_names_from_index: set[str] = set()
@@ -209,7 +217,8 @@ class ModelBase:
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
tensor_names_from_index.update(weight_map.keys())
part_names |= set(weight_map.values())
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
part_names = sorted(part_dict.keys())
else:
weight_map = {}
else:
@@ -711,6 +720,9 @@ class ModelBase:
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
if "lfm" in config:
# rename for LFM2-Audio
config["text_config"] = config["lfm"]
return config
@classmethod
@@ -1200,6 +1212,9 @@ class TextModel(ModelBase):
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum"
if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
# ref: https://huggingface.co/answerdotai/ModernBERT-base
res = "modern-bert"
if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
# ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
res = "afmoe"
@@ -1838,7 +1853,7 @@ class MmprojModel(ModelBase):
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, name, n_dims # unused
if ".patch_embd.weight" in new_name:
if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return False
@@ -7347,6 +7362,90 @@ class MiniMaxM2Model(TextModel):
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("MiMoV2FlashForCausalLM")
class MimoV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MIMO2
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
assert self.hparams["topk_method"] == "noaux_tc"
n_head_kv = self.hparams["num_key_value_heads"]
n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
self.gguf_writer.add_head_count_kv(n_head_kv_arr)
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"])
self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch, name, bid):
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
if "attention_sink" in name and not name.endswith(".weight"):
name += ".weight"
# TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot handle it the same way as GLM4_MOE
if "model.mtp." in name:
return []
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["gate_proj", "up_proj", "down_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename_to_retrieve])
del self._experts[bid][ename_to_retrieve]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("PanguEmbeddedForCausalLM")
class PanguEmbeddedModel(TextModel):
model_arch = gguf.MODEL_ARCH.PANGU_EMBED
@@ -8680,6 +8779,11 @@ class NemotronHModel(GraniteHybridModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("LlamaBidirectionalModel")
class LlamaEmbedNemotronModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
@ModelBase.register("BailingMoeForCausalLM")
class BailingMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -9712,12 +9816,12 @@ class LFM2Model(TextModel):
self._add_feed_forward_length()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
if is_vision_tensor:
# skip vision tensors
if self._is_vision_tensor(name) or self._is_audio_tensor(name):
# skip multimodal tensors
return []
name = name.replace("language_model.", "")
name = name.replace("language_model.", "") # vision
name = name.replace("lfm.", "model.") # audio
# conv op requires 2d tensor
if 'conv.conv' in name:
@@ -9725,6 +9829,12 @@ class LFM2Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
def _is_vision_tensor(self, name: str) -> bool:
return "vision_tower" in name or "multi_modal_projector" in name
def _is_audio_tensor(self, name: str) -> bool:
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
@ModelBase.register("Lfm2MoeForCausalLM")
class LFM2MoeModel(TextModel):
@@ -9830,6 +9940,81 @@ class LFM2VLModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Lfm2AudioForConditionalGeneration")
class LFM2AudioModel(MmprojModel):
has_vision_encoder = False
has_audio_encoder = True
model_name = "Lfm2AudioEncoder"
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("encoder")
def set_gguf_parameters(self):
assert self.hparams_audio is not None
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip language model tensors
if name.startswith("lfm."):
return []
# for training only
if any(p in name for p in ["audio_loss_weight"]):
return []
# for audio output
if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
return []
# fold running_mean, running_var and eps into weight and bias for batch_norm
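# BatchNorm in eval mode computes: y = weight * (x - running_mean) / sqrt(running_var + eps) + bias
# which folds to: y = a * x + b, with a = weight / sqrt(running_var + eps) and b = bias - running_mean * a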
if "batch_norm" in name:
if self._batch_norm_tensors is None:
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
assert bid is not None
self._batch_norm_tensors[bid][name] = data_torch
if len(self._batch_norm_tensors[bid]) < 5:
return []
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
eps = 1e-5 # default value
a = weight / torch.sqrt(running_var + eps)
b = bias - running_mean * a
return [
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
]
# reshape conv weights
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
data_torch = data_torch[:, None, None]
if "conv.depthwise_conv" in name and name.endswith(".weight"):
assert data_torch.shape[1] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
if "conv.pointwise_conv" in name and name.endswith(".weight"):
assert data_torch.shape[2] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("SmallThinkerForCausalLM")
class SmallThinkerModel(TextModel):
model_arch = gguf.MODEL_ARCH.SMALLTHINKER
@@ -9906,6 +10091,36 @@ class SmallThinkerModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
class ModernBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.MODERN_BERT
def set_vocab(self):
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(True)
self.gguf_writer.add_add_sep_token(True)
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
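# SWA rope theta: prefer rope_parameters["sliding_attention"], falling back to hparams["local_rope_theta"]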
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# these layers act as MLM head, so we don't need them
if name.startswith("decoder."):
return []
if name.startswith("model."):
name = name[6:]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("ApertusForCausalLM")
class ApertusModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.APERTUS
@@ -10472,8 +10687,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
)
parser.add_argument(
"--bigendian", action="store_true",


@@ -139,6 +139,7 @@ models = [
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
{"name": "afmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
{"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },


@@ -1,27 +1,27 @@
# Android
## Build with Android Studio
## Build GUI binding using Android Studio
Import the `examples/llama.android` directory into Android Studio, then perform a Gradle sync and build the project.
![Project imported into Android Studio](./android/imported-into-android-studio.png)
![Project imported into Android Studio](./android/imported-into-android-studio.jpg)
This Android binding supports hardware acceleration up to `SME2` for **Arm** and `AMX` for **x86-64** CPUs on Android and ChromeOS devices.
It automatically detects the host's hardware to load compatible kernels. As a result, it runs seamlessly on both the latest premium devices and older devices that may lack modern CPU features or have limited RAM, without requiring any manual configuration.
A minimal Android app frontend is included to showcase the binding's core functionalities:
1. **Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver` provided `Uri` or a local `File`.
2. **Obtain a `TierDetection` or `InferenceEngine`** instance through the high-level facade APIs.
3. **Send a raw user prompt** for automatic template formatting, prefill, and decoding. Then collect the generated tokens in a Kotlin `Flow`.
1. **Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver` provided `Uri` from shared storage, or a local `File` from your app's private storage.
2. **Obtain an `InferenceEngine`** instance through the `AiChat` facade and load your selected model via its app-private file path.
3. **Send a raw user prompt** for automatic template formatting, prefill, and batch decoding. Then collect the generated tokens in a Kotlin `Flow`.
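To make step 3 concrete, here is a minimal Kotlin sketch of collecting streamed tokens from a `Flow`. The `InferenceEngine` interface below and its `loadModel`/`generate` methods are assumed shapes for illustration only; the actual binding API may differ.

```kotlin
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.runBlocking

// Hypothetical facade shape for illustration; the real binding's method names may differ.
interface InferenceEngine {
    suspend fun loadModel(path: String)
    fun generate(prompt: String): Flow<String> // tokens are emitted as they are decoded
}

fun runPrompt(engine: InferenceEngine) = runBlocking {
    engine.loadModel("/data/data/com.example.app/files/model.gguf") // app-private file path (example)
    val reply = StringBuilder()
    engine.generate("Write a haiku about surfing").collect { token ->
        reply.append(token) // tokens stream in as the model decodes
    }
    println(reply)
}
```

In a real app the collection would typically happen in a coroutine scope tied to the UI (e.g. a `viewModelScope`) rather than `runBlocking`.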
For a production-ready experience that leverages advanced features such as system prompts and benchmarks, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play.
For a production-ready experience that leverages advanced features such as system prompts and benchmarks, plus friendly UI features such as model management and Arm feature visualizer, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play.
This project is made possible through a collaborative effort by Arm's **CT-ML**, **CE-ML** and **STE** groups:
| ![Home screen](./android/arm-ai-chat-home-screen.png) | ![System prompt](./android/system-prompt-setup.png) | !["Haiku"](./android/chat-with-system-prompt-haiku.png) |
| ![Home screen](https://naco-siren.github.io/ai-chat/policy/index/1-llm-starter-pack.png) | ![System prompt](https://naco-siren.github.io/ai-chat/policy/index/5-system-prompt.png) | !["Haiku"](https://naco-siren.github.io/ai-chat/policy/index/4-metrics.png) |
|:------------------------------------------------------:|:----------------------------------------------------:|:--------------------------------------------------------:|
| Home screen | System prompt | "Haiku" |
## Build on Android using Termux
## Build CLI on Android using Termux
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
@@ -52,7 +52,7 @@ To see what it might look like visually, here's an old demo of an interactive se
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
## Cross-compile using Android NDK
## Cross-compile CLI using Android NDK
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:


@@ -17,7 +17,7 @@ OpenCL (Open Computing Language) is an open, royalty-free standard for cross-pla
### Llama.cpp + OpenCL
The llama.cpp OpenCL backend is designed primarily to enable llama.cpp on **Qualcomm Adreno GPUs** via OpenCL. Thanks to the portability of OpenCL, the OpenCL backend can also run on certain Intel GPUs, although the performance is not optimal.
The llama.cpp OpenCL backend is designed primarily to enable llama.cpp on **Qualcomm Adreno GPUs** via OpenCL. Thanks to the portability of OpenCL, the OpenCL backend can also run on certain Intel GPUs, such as those without [SYCL](/docs/backend/SYCL.md) support, although the performance is not optimal.
## OS


@@ -829,7 +829,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
Suggest reproducing on llama.cpp and report similar issue to llama.cpp. We will support it.
It's same for other projects including llama.cpp SYCL backend.


@@ -22,6 +22,7 @@
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_CURL": "OFF"
}
},
@@ -36,6 +37,7 @@
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_CURL": "OFF"
}
},


@@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB.
Simple question for Llama-3.2-1B
```
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79
@@ -136,7 +136,7 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 504 =
Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
```
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v81
@@ -234,6 +234,6 @@ build: 6a8cf8914 (6733)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
`GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
`GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)


@@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers
Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
```
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
...
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib


@@ -55,7 +55,7 @@ auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder &
```
For a more complete example, see `test_example_native()` in
[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
[tests/test-chat-peg-parser.cpp](/tests/test-chat-peg-parser.cpp).
## Parsers/Combinators
@@ -175,7 +175,7 @@ Most model output can be placed in one of the following categories:
(Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
To provide broad coverage,
[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
[`common/chat-peg-parser.h`](/common/chat-peg-parser.h) contains builders and
mappers that help create parsers and visitors/extractors for these types. They
require parsers to tag nodes to conform to an AST "shape". This normalization
makes it easy to extract information and generalize parsing.


@@ -2,57 +2,74 @@
#include "common.h"
#include <fstream>
#include <sstream>
#include <string>
// Export usage message (-h) to markdown format
// Automatically update the markdown docs
static void write_table_header(std::ofstream & file) {
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
#define HELP_START_MARKER "<!-- HELP_START -->"
#define HELP_END_MARKER "<!-- HELP_END -->"
#define NOTE_MESSAGE "<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->"
struct md_file {
llama_example ex;
std::string fname;
std::string specific_section_header;
};
std::vector<md_file> md_files = {
{LLAMA_EXAMPLE_CLI, "tools/cli/README.md", "CLI-specific params"},
{LLAMA_EXAMPLE_COMPLETION, "tools/completion/README.md", "Completion-specific params"},
{LLAMA_EXAMPLE_SERVER, "tools/server/README.md", "Server-specific params"},
};
static void write_table_header(std::ostringstream & ss) {
ss << "| Argument | Explanation |\n";
ss << "| -------- | ----------- |\n";
}
static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `";
static void write_table_entry(std::ostringstream & ss, const common_arg & opt) {
ss << "| `";
// args
auto all_args = opt.get_args();
for (const auto & arg : all_args) {
if (arg == all_args.front()) {
file << arg;
if (all_args.size() > 1) file << ", ";
ss << arg;
if (all_args.size() > 1) ss << ", ";
} else {
file << arg << (arg != all_args.back() ? ", " : "");
ss << arg << (arg != all_args.back() ? ", " : "");
}
}
// value hint
if (opt.value_hint) {
std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint;
ss << " " << md_value_hint;
}
if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2;
ss << " " << md_value_hint_2;
}
// help text
std::string md_help(opt.help);
md_help = string_strip(md_help);
string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n";
ss << "` | " << md_help << " |\n";
}
static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
write_table_header(file);
static void write_table(std::ostringstream & ss, std::vector<common_arg *> & opts) {
write_table_header(ss);
for (const auto & opt : opts) {
write_table_entry(file, *opt);
write_table_entry(ss, *opt);
}
}
static void export_md(std::string fname, llama_example ex, std::string name) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
static void write_help(std::ostringstream & ss, const md_file & md) {
common_params params;
auto ctx_arg = common_params_parser_init(params, ex);
auto ctx_arg = common_params_parser_init(params, md.ex);
std::vector<common_arg *> common_options;
std::vector<common_arg *> sparam_options;
@@ -68,18 +85,58 @@ static void export_md(std::string fname, llama_example ex, std::string name) {
}
}
file << "**Common params**\n\n";
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**" << name << "-specific params**\n\n";
write_table(file, specific_options);
ss << HELP_START_MARKER << "\n\n";
ss << NOTE_MESSAGE << "\n\n";
ss << "### Common params\n\n";
write_table(ss, common_options);
ss << "\n\n### Sampling params\n\n";
write_table(ss, sparam_options);
ss << "\n\n### " << md.specific_section_header << "\n\n";
write_table(ss, specific_options);
ss << "\n" << HELP_END_MARKER;
}
int main(int, char **) {
// TODO: add CLI
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
for (const auto & md : md_files) {
std::ifstream infile(md.fname);
if (!infile.is_open()) {
fprintf(stderr, "failed to open file '%s' for reading\n", md.fname.c_str());
return 1;
}
std::ostringstream ss;
ss << infile.rdbuf();
infile.close();
std::string content = ss.str();
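// locate the auto-generated help section delimited by HELP_START_MARKER / HELP_END_MARKER; only that span is replaced with the regenerated tables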
size_t help_start = content.find(HELP_START_MARKER);
size_t help_end = content.find(HELP_END_MARKER);
if (help_start == std::string::npos || help_end == std::string::npos || help_end <= help_start) {
fprintf(stderr, "failed to find help markers in file '%s'\n", md.fname.c_str());
return 1;
}
std::ostringstream new_help_ss;
write_help(new_help_ss, md);
std::string new_help = new_help_ss.str();
content = content.substr(0, help_start) + new_help + content.substr(help_end + strlen(HELP_END_MARKER));
std::ofstream outfile(md.fname);
if (!outfile.is_open()) {
fprintf(stderr, "failed to open file '%s' for writing\n", md.fname.c_str());
return 1;
}
outfile << content;
outfile.close();
printf("Updated help in '%s'\n", md.fname.c_str());
}
return 0;
}


@@ -1,55 +1,57 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/main"
android:layout_height="match_parent"
android:layout_width="match_parent">
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/main"
android:layout_height="match_parent"
android:layout_width="match_parent">
<LinearLayout
android:fitsSystemWindows="true"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:orientation="vertical"
android:layout_marginEnd="4dp"
tools:context=".MainActivity">
<FrameLayout
<ScrollView
android:layout_width="match_parent"
android:layout_height="0dp"
android:layout_weight="1">
android:layout_weight="1"
android:fadeScrollbars="false">
<ScrollView
<TextView
android:id="@+id/gguf"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:fadeScrollbars="false">
android:layout_margin="16dp"
android:text="Selected GGUF model's metadata will show here."
style="@style/TextAppearance.MaterialComponents.Body2" />
<TextView
android:id="@+id/gguf"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:layout_margin="16dp"
android:text="Selected GGUF model's metadata will show here."
style="@style/TextAppearance.MaterialComponents.Body2"
android:maxLines="100" />
</ScrollView>
</ScrollView>
</FrameLayout>
<com.google.android.material.divider.MaterialDivider
android:layout_width="match_parent"
android:layout_height="2dp"
android:layout_marginHorizontal="16dp"
android:layout_marginVertical="8dp" />
<androidx.recyclerview.widget.RecyclerView
android:id="@+id/messages"
android:layout_width="match_parent"
android:layout_height="0dp"
android:layout_weight="4"
android:padding="16dp"
android:fadeScrollbars="false"
android:scrollbars="vertical"
app:reverseLayout="true"
tools:listitem="@layout/item_message_assistant"/>
<LinearLayout
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:orientation="horizontal">
android:orientation="horizontal"
android:paddingStart="16dp"
android:paddingEnd="4dp">
<EditText
android:id="@+id/user_input"
@@ -67,7 +69,7 @@
style="@style/Widget.Material3.FloatingActionButton.Primary"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:layout_margin="8dp"
android:layout_margin="12dp"
android:src="@drawable/outline_folder_open_24" />
</LinearLayout>


@@ -2,7 +2,8 @@
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:padding="8dp"
android:layout_marginHorizontal="16dp"
android:layout_marginVertical="8dp"
android:gravity="start">
<TextView


@@ -2,7 +2,8 @@
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:padding="8dp"
android:layout_marginHorizontal="16dp"
android:layout_marginVertical="8dp"
android:gravity="end">
<TextView


@@ -25,6 +25,8 @@ define quantize_model
@echo "Export the quantized model path to $(2) variable in your environment"
endef
DEVICE ?= auto
###
### Causal Model targets/recipes
###
@@ -53,7 +55,7 @@ causal-convert-mm-model:
causal-run-original-model:
$(call validate_model_path,causal-run-original-model)
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)"
causal-run-converted-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh


@@ -2,233 +2,181 @@
import argparse
import os
import sys
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
### If you want to dump RoPE activations, apply this monkey patch to the model
### class from Transformers that you are running (replace apertus.modeling_apertus
### with the proper package and class for your model
### === START ROPE DEBUG ===
# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
# orig_rope = apply_rotary_pos_emb
# torch.set_printoptions(threshold=float('inf'))
# torch.set_printoptions(precision=6, sci_mode=False)
# Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.common import debug_hook
# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
# # log inputs
# summarize(q, "RoPE.q_in")
# summarize(k, "RoPE.k_in")
def parse_arguments():
parser = argparse.ArgumentParser(description="Process model with specified path")
parser.add_argument("--model-path", "-m", help="Path to the model")
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
return parser.parse_args()
# # call original
# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
def load_model_and_tokenizer(model_path, device="auto"):
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
multimodal = False
full_config = config
# # log outputs
# summarize(q_out, "RoPE.q_out")
# summarize(k_out, "RoPE.k_out")
# return q_out, k_out
# # Patch it
# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402
# apertus_mod.apply_rotary_pos_emb = debug_rope
### == END ROPE DEBUG ===
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
"""
Print a tensor in llama.cpp debug style.
Supports:
- 2D tensors (seq, hidden)
- 3D tensors (batch, seq, hidden)
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
Shows first and last max_vals of each vector per sequence position.
"""
t = tensor.detach().to(torch.float32).cpu()
# Determine dimensions
if t.ndim == 3:
_, s, _ = t.shape
elif t.ndim == 2:
_, s = 1, t.shape[0]
t = t.unsqueeze(0)
elif t.ndim == 4:
_, s, _, _ = t.shape
# Determine device_map based on device argument
if device == "cpu":
device_map = {"": "cpu"}
print("Forcing CPU usage")
elif device == "auto":
device_map = "auto"
else:
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
return
device_map = {"": device}
ten_shape = t.shape
print("Model type: ", config.model_type)
if "vocab_size" not in config and "text_config" in config:
config = config.text_config
multimodal = True
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
print(" [")
print(" [")
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
# Determine indices for first and last sequences
first_indices = list(range(min(s, max_seq)))
last_indices = list(range(max(0, s - max_seq), s))
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
# Combine indices
if has_overlap:
# If there's overlap, just use the combined unique indices
indices = sorted(list(set(first_indices + last_indices)))
separator_index = None
else:
# If no overlap, we'll add a separator between first and last sequences
indices = first_indices + last_indices
separator_index = len(first_indices)
for i, si in enumerate(indices):
# Add separator if needed
if separator_index is not None and i == separator_index:
print(" ...")
# Extract appropriate slice
vec = t[0, si]
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
flat = vec.flatten().tolist()
else: # 2D or 3D case
flat = vec.tolist()
# First and last slices
first = flat[:max_vals]
last = flat[-max_vals:] if len(flat) >= max_vals else flat
first_str = ", ".join(f"{v:12.4f}" for v in first)
last_str = ", ".join(f"{v:12.4f}" for v in last)
print(f" [{first_str}, ..., {last_str}]")
print(" ],")
print(" ]")
print(f" sum = {t.sum().item():.6f}\n")
def debug_hook(name):
def fn(_m, input, output):
if isinstance(input, torch.Tensor):
summarize(input, name + "_in")
elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor):
summarize(input[0], name + "_in")
if isinstance(output, torch.Tensor):
summarize(output, name + "_out")
elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor):
summarize(output[0], name + "_out")
return fn
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
parser = argparse.ArgumentParser(description="Process model with specified path")
parser.add_argument("--model-path", "-m", help="Path to the model")
args = parser.parse_args()
model_path = os.environ.get("MODEL_PATH", args.model_path)
if model_path is None:
parser.error(
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
)
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(
importlib.import_module(unreleased_module_path), class_name
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
)
model = model_class.from_pretrained(
model_path
) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
)
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
for name, module in model.named_modules():
if len(list(module.children())) == 0: # only leaf modules
module.register_forward_hook(debug_hook(name))
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if multimodal:
model = AutoModelForImageTextToText.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=full_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
model_name = os.path.basename(model_path)
# Printing the Model class to allow for easier debugging. This can be useful
# when working with models that have not been publicly released yet and this
# might require that the concrete class is imported and used directly instead
# of using AutoModelForCausalLM.
print(f"Model class: {model.__class__.__name__}")
print(f"Model class: {model.__class__.__name__}")
device = next(model.parameters()).device
if os.getenv("MODEL_TESTING_PROMPT"):
prompt = os.getenv("MODEL_TESTING_PROMPT")
else:
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
return model, tokenizer, config
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
def enable_torch_debugging(model):
for name, module in model.named_modules():
if len(list(module.children())) == 0: # only leaf modules
module.register_forward_hook(debug_hook(name))
with torch.no_grad():
outputs = model(input_ids.to(model.device))
logits = outputs.logits
def get_prompt(args):
if args.prompt_file:
with open(args.prompt_file, encoding='utf-8') as f:
return f.read()
elif os.getenv("MODEL_TESTING_PROMPT"):
return os.getenv("MODEL_TESTING_PROMPT")
else:
return "Hello, my name is"
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].float().cpu().numpy()
def main():
args = parse_arguments()
model_path = os.environ.get("MODEL_PATH", args.model_path)
if model_path is None:
print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
sys.exit(1)
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
if args.verbose:
enable_torch_debugging(model)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
model_name = os.path.basename(model_path)
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Iterate over the model parameters (the tensors) and get the first one
# and use it to get the device the model is on.
device = next(model.parameters()).device
prompt = get_prompt(args)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Show top 5 predicted tokens
top_indices = np.argsort(last_logits)[-5:][::-1]
print("Top 5 predictions:")
for idx in top_indices:
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")
batch_size = 512
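# prefill the prompt in chunks of batch_size tokens, passing past_key_values between chunks so the KV cache is reused across the whole prompt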
with torch.no_grad():
past = None
outputs = None
for i in range(0, input_ids.size(1), batch_size):
print(f"Processing chunk with tokens {i} to {i + batch_size}")
chunk = input_ids[:, i:i + batch_size]
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
past = outputs.past_key_values
logits = outputs.logits # type: ignore
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].float().cpu().numpy()
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Show top 5 predicted tokens
top_indices = np.argsort(last_logits)[-5:][::-1]
print("Top 5 predictions:")
for idx in top_indices:
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")
if __name__ == "__main__":
main()


@@ -45,7 +45,7 @@ if use_sentence_transformers:
else:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
# This can be used to override the sliding window size for manual testing. This
# can be useful to verify the sliding window attention mask in the original model
@@ -64,12 +64,12 @@ else:
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path, config=config)
model = model_class.from_pretrained(model_path, config=config, trust_remote_code=True)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModel.from_pretrained(model_path, config=config)
model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True)
print(f"Model class: {type(model)}")
print(f"Model file: {type(model).__module__}")
@@ -123,7 +123,7 @@ with torch.no_grad():
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size]
print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}")


@@ -2,6 +2,8 @@
import os
import sys
import torch
def get_model_name_from_env_path(env_path_name):
model_path = os.getenv(env_path_name)
@@ -18,3 +20,131 @@ def get_model_name_from_env_path(env_path_name):
name = name[:-5]
return name
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
"""
Print a tensor in llama.cpp debug style.
Supports:
- 2D tensors (seq, hidden)
- 3D tensors (batch, seq, hidden)
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
Shows first and last max_vals of each vector per sequence position.
"""
t = tensor.detach().to(torch.float32).cpu()
# Determine dimensions
if t.ndim == 3:
_, s, _ = t.shape
elif t.ndim == 2:
_, s = 1, t.shape[0]
t = t.unsqueeze(0)
elif t.ndim == 4:
_, s, _, _ = t.shape
else:
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
return
ten_shape = t.shape
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
print(" [")
print(" [")
# Determine indices for first and last sequences
first_indices = list(range(min(s, max_seq)))
last_indices = list(range(max(0, s - max_seq), s))
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
# Combine indices
if has_overlap:
# If there's overlap, just use the combined unique indices
indices = sorted(list(set(first_indices + last_indices)))
separator_index = None
else:
# If no overlap, we'll add a separator between first and last sequences
indices = first_indices + last_indices
separator_index = len(first_indices)
for i, si in enumerate(indices):
# Add separator if needed
if separator_index is not None and i == separator_index:
print(" ...")
# Extract appropriate slice
vec = t[0, si]
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
flat = vec.flatten().tolist()
else: # 2D or 3D case
flat = vec.tolist()
# First and last slices
first = flat[:max_vals]
last = flat[-max_vals:] if len(flat) >= max_vals else flat
first_str = ", ".join(f"{v:12.4f}" for v in first)
last_str = ", ".join(f"{v:12.4f}" for v in last)
print(f" [{first_str}, ..., {last_str}]")
print(" ],")
print(" ]")
print(f" sum = {t.sum().item():.6f}\n")
def debug_hook(name):
def fn(_m, input, output):
if isinstance(input, torch.Tensor):
summarize(input, name + "_in")
elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor):
summarize(input[0], name + "_in")
if isinstance(output, torch.Tensor):
summarize(output, name + "_out")
elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
summarize(output[0], name + "_out")
return fn
def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"):
"""
Apply monkey patch to dump RoPE activations for debugging.
Args:
model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus")
function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb")
Example:
from utils.common import setup_rope_debug
setup_rope_debug("transformers.models.apertus.modeling_apertus")
"""
import importlib
# Import the module and get the original function
module = importlib.import_module(model_module_path)
orig_rope = getattr(module, function_name)
# Set torch print options for better debugging
torch.set_printoptions(threshold=float('inf'))
torch.set_printoptions(precision=6, sci_mode=False)
def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
# log inputs
summarize(q, "RoPE.q_in")
summarize(k, "RoPE.k_in")
# call original
q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
# log outputs
summarize(q_out, "RoPE.q_out")
summarize(k_out, "RoPE.k_out")
return q_out, k_out
# Patch it
setattr(module, function_name, debug_rope)
print(f"RoPE debug patching applied to {model_module_path}.{function_name}")


@@ -166,7 +166,7 @@ def main():
# Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
@@ -186,9 +186,9 @@ def main():
exit(1)
else:
if args.causal:
model = AutoModelForCausalLM.from_pretrained(args.model_path)
model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
else:
model = AutoModel.from_pretrained(args.model_path)
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])


@@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
bool accept = false;
if (params.sampling.temp > 0) {
// stochastic verification
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
@@ -491,7 +491,7 @@ int main(int argc, char ** argv) {
continue;
}
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);


@@ -22,9 +22,9 @@ if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi


@@ -24,8 +24,8 @@ export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "Using $GGML_SYCL_DEVICE as the main GPU"
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi


@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0


@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99
.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99


@@ -254,6 +254,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"gmml: OpenCL API version to target")
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")


@@ -2338,19 +2338,19 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
// Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
// TODO: acl_yarn_ramp_tensor use rope cache.
bool yarn_ramp_tensor_updated = false;
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
acl_tensor_ptr acl_yarn_ramp_tensor;
if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
ctx.rope_cache.freq_scale != freq_scale)) {
yarn_ramp_tensor_updated = true;
if (ctx.rope_cache.yarn_ramp_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.yarn_ramp_cache));
}
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.yarn_ramp_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
// -rope_yarn_ramp
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor =
ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
@@ -2380,8 +2380,10 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
} else {
acl_yarn_ramp_tensor =
ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
}
// Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
if (ext_factor != 0) {
if (theta_scale_updated || yarn_ramp_tensor_updated) {
@@ -2988,32 +2990,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
}
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
// stride
int64_t s0 = ((const int32_t *) (dst->op_params))[0];
int64_t s0 = ((const int32_t*)(dst->op_params))[0];
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
// get base information of input and kernel
int64_t input_len = *(src1->ne);
int64_t dst_len = *(dst->ne);
int64_t kernel_size = *(src0->ne);
// set the max kernel size for each conv
int64_t max_kernel_size = 255;
// compute the partition of kernel
int64_t part_num = 1;
part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
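// each kernel chunk is convolved on its own; the partial result is zero-padded to the full output length and accumulated into acl_dst below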
int64_t strideVal[1];
strideVal[0] = s0;
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
int64_t paddingVal[] = { 0 };
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
int64_t dilationVal[] = { 1 };
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
int8_t cubeMathType = 0;
strideVal[0] = s0;
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
int64_t paddingVal[] = {0};
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
int64_t dilationVal[] = {1};
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
bool transposed = true;
int64_t groups = 1;
int8_t cubeMathType = 0;
#ifdef ASCEND_310P
cubeMathType = 1;
#endif
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
auto weight_type = ggml_cann_type_mapping(src0->type);
auto dst_type = ggml_cann_type_mapping(dst->type);
// slice the kernel to make each conv available
int64_t slice_dim = -1;
int64_t slice_start = 0;
int64_t slice_end = max_kernel_size;
int64_t slice_step = 1;
int64_t interval = max_kernel_size;
int64_t left_pad_len = dilationVal[0] * (max_kernel_size - 1) + 1 - 2 * paddingVal[0];
int64_t right_pad_len = 0;
acl_scalar_ptr alpha = nullptr;
float alphaValue = 1.0;
alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
// set zero to destination
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
for(int k = 0; k < part_num; k++){
// create part kernel tensor and slice from big kernel
slice_start = max_kernel_size * k;
if(k == part_num - 1){
slice_end = kernel_size;
interval = kernel_size - max_kernel_size * k;
}else{
slice_end = max_kernel_size * (k+1);
}
int64_t part_ne[4];
for(int i = 0; i < 4; i++) {
part_ne[i] = *(src0->ne + i);
}
part_ne[0] = interval;
size_t part_nb[4];
part_nb[0] = sizeof(weight_type);
for (int i = 1; i < 4; i++) {
part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
}
ggml_cann_pool_alloc part_kernel_allocator;
part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
void* part_kernel_buf = part_kernel_allocator.get();
acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type,
ggml_element_size(src0), part_ne, part_nb, 3, ACL_FORMAT_NCL);
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step, part_kernel.get());
// create the part conv result tensor
int64_t part_dst_ne[4];
for(int i = 0; i < 4; i++){
part_dst_ne[i] = *(dst->ne + i);
}
part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
size_t part_dst_nb[4];
part_dst_nb[0] = sizeof(weight_type);
for (int i = 1; i < 4; i++) {
part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
}
ggml_cann_pool_alloc part_dst_allocator;
part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
void* part_dst_buf = part_dst_allocator.get();
acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
// compute part conv transpose 1d
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(), cubeMathType);
// compute the position of part result in final result
int64_t global_start = slice_start;
int64_t global_end = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
left_pad_len = global_start;
right_pad_len = dst_len - global_end;
std::vector<int64_t> padDataVal = {left_pad_len,right_pad_len};
acl_int_array_ptr padData = ggml_cann_create_int_array(padDataVal.data(), 2);
acl_scalar_ptr pad_value = nullptr;
float pad_valueVal = 0.0;
pad_value = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
int64_t conv_result_ne[4];
for(int i = 0; i < 4; i++){
conv_result_ne[i] = *(dst->ne + i);
}
size_t conv_result_nb[4];
conv_result_nb[0] = sizeof(weight_type);
for (int i = 1; i < 4; i++) {
conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
}
ggml_cann_pool_alloc conv_result_allocator;
conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
void* conv_result_buf = conv_result_allocator.get();
acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(), conv_result.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
}
}
void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -3576,3 +3702,106 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
break;
}
}
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // conv_x
ggml_tensor * src1 = dst->src[1]; // conv1d.weight
// This op is currently defined only for F32 in ggml_cpu
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
// Shapes follow ggml_compute_forward_ssm_conv_f32
const int64_t nc = src1->ne[0]; // d_conv
const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t
const int64_t nr = src0->ne[1]; // d_inner
const int64_t n_s = src0->ne[2]; // n_seqs
const int64_t n_t = dst->ne[1]; // tokens per sequence
GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s}
GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner}
GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
// --- Build CANN tensors ---
// 1) Input: conv_x as NCL
//
// src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N}
// Passing ACL_FORMAT_NCL here means:
// reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
// 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
//
// src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1]
// we want a view: ne_w = { nc, 1, nr } // [K, 1, C]
// so that reversed dims -> [C, 1, K] which matches
// [out_channels, in_channels/groups, kernel_size]
int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 }; // [K, 1 input ch. per group, C groups]
// Layout: src1 data is [K, C] with
// offset(k, c) = k*nb0 + c*nb1
// We want offset_w(k, 0, c) = k*nb0 + c*nb1,
// so we can reuse nb0 and nb1, and set nb2 = nb1.
size_t w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] }; // same as src1
acl_tensor_ptr acl_w = ggml_cann_create_tensor(
src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
// 3) Output: dst is { d_inner, n_t, n_s } (CLN)
//
// We need an NCL view of the same buffer:
// desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
//
// Original CLN layout:
// dst->ne = { nr, n_t, n_s }
// dst->nb[0] = sizeof(float)
// dst->nb[1] = nr * sizeof(float)
// dst->nb[2] = nr * n_t * sizeof(float)
//
// We want offset_new(L, C, N) = offset_orig(C, L, N).
// Choose:
// nb_y[0] = nr * sizeof(float); // step in L
// nb_y[1] = sizeof(float); // step in C
// nb_y[2] = nr * n_t * sizeof(float); // step in N
int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 }; // [L_out, C, N]
size_t y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float), dst->nb[3] }; // [nr, 1, nr * n_t]
acl_tensor_ptr acl_y = ggml_cann_create_tensor(
dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
// --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
int64_t strideVal[1] = { 1 };
int64_t paddingVal[1] = { 0 };
int64_t dilationVal[1] = { 1 };
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
const bool transposed = false;
const int64_t groups = nr; // depthwise: one group per inner dim
int8_t cubeMathType = 0;
#ifdef ASCEND_310P
cubeMathType = 1;
#endif
GGML_CANN_CALL_ACLNN_OP(ctx,
Convolution,
acl_x.get(), // input: N, C, L_in = ncs
acl_w.get(), // weight: [C, 1, K] with groups=nr
nullptr, // bias
stride.get(),
padding.get(),
dilation.get(),
transposed,
padding.get(), // output padding (unused for non-transposed)
groups,
acl_y.get(),
cubeMathType);
}
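For reference, the depthwise Convolution above is expected to match the CPU path (ggml_compute_forward_ssm_conv_f32). A minimal scalar sketch of that computation, assuming contiguous layouts and using the shape names from the asserts above; this is an illustration, not the code that runs:

// y[s][t][r] = sum_{k} w[r][k] * x[s][r][t + k]
static void ssm_conv_ref(const float * x,   // conv_x: {ncs, nr, n_s}
                         const float * w,   // weight: {nc, nr}
                         float       * y,   // dst:    {nr, n_t, n_s}
                         int64_t nc, int64_t ncs, int64_t nr, int64_t n_t, int64_t n_s) {
    for (int64_t s = 0; s < n_s; s++) {
        for (int64_t t = 0; t < n_t; t++) {
            for (int64_t r = 0; r < nr; r++) {
                float sum = 0.0f;
                for (int64_t k = 0; k < nc; k++) {
                    sum += w[r * nc + k] * x[(s * nr + r) * ncs + t + k];
                }
                y[(s * n_t + t) * nr + r] = sum;
            }
        }
    }
}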

View File

@@ -47,6 +47,7 @@
#include <aclnnop/aclnn_sign.h>
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_sin.h>
#include <aclnnop/aclnn_slice.h>
#include <aclnnop/aclnn_sqrt.h>
#include <aclnnop/aclnn_tanh.h>
@@ -1032,6 +1033,8 @@ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTenso
ggml_backend_cann_context & ctx,
ggml_tensor * dst);
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Applies a gated (GLU-style) unary operation using the CANN backend.
*

View File

@@ -229,6 +229,60 @@ struct ggml_graph_node_properties {
// op
ggml_op node_op;
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
/**
* @brief Check if a ggml tensor node matches this property set.
*
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
* to determine whether the current node matches these previously recorded properties.
*
* @param node The current ggml tensor node.
* @return true if all fields match; false otherwise (data addresses are not compared for GGML_OP_VIEW nodes).
*/
bool has_matching_properties(ggml_tensor * node) {
if (node->data != this->node_address && node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != this->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != this->ne[i]) {
return false;
}
if (node->nb[i] != this->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i]) {
if (node->src[i]->data != this->src_address[i] && node->op != GGML_OP_VIEW) {
return false;
}
for (int d = 0; d < GGML_MAX_DIMS; d++) {
if (node->src[i]->ne[d] != this->src_ne[i][d]) {
return false;
}
if (node->src[i]->nb[d] != this->src_nb[i][d]) {
return false;
}
}
} else {
if (this->src_address[i] != nullptr) {
return false;
}
}
}
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
}
return true;
}
};
struct ggml_cann_graph {
@@ -241,6 +295,79 @@ struct ggml_cann_graph {
aclmdlRI graph = nullptr;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
/**
* @brief Create a new CANN graph from a ggml computation graph.
*
* This function creates a new ggml_cann_graph object and fills its node properties
* (operation type, dimensions, strides, input sources, and operation parameters)
* based on the current ggml computation graph.
*
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
* - node address
* - operation type
* - shape (ne) and strides (nb)
* - source tensor addresses
* - operation parameters
*
* @param cgraph The current ggml computation graph.
* @return Pointer to the newly created ggml_cann_graph object.
*/
static ggml_cann_graph * create_from_cgraph(ggml_cgraph * cgraph) {
ggml_cann_graph * new_graph = new ggml_cann_graph();
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
ggml_tensor * node = cgraph->nodes[node_idx];
auto & prop = new_graph->ggml_graph_properties[node_idx];
prop.node_address = node->data;
prop.node_op = node->op;
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
for (int src = 0; src < GGML_MAX_SRC; ++src) {
if (node->src[src]) {
prop.src_address[src] = node->src[src]->data;
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
} else {
prop.src_address[src] = nullptr;
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
}
}
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
return new_graph;
}
/**
* @brief Check whether this CANN graph matches the given ggml computation graph.
*
* This function compares the number of nodes and each node's properties
* (operation type, dimensions, strides, inputs, and operation parameters)
* to determine whether this CANN graph matches the given ggml graph.
*
* @param cgraph The current ggml computation graph.
* @return true if this CANN graph matches the ggml graph; false otherwise.
*/
bool matches_cgraph(ggml_cgraph * cgraph) {
if (this->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
return false;
}
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (!this->ggml_graph_properties[i].has_matching_properties(cgraph->nodes[i])) {
return false;
}
}
return true;
}
};
/**
@@ -272,15 +399,6 @@ struct ggml_cann_graph_lru_cache {
cache_list.push_front(new_node);
}
/**
* @brief Move an existing graph to the front of the cache.
* @param node Pointer to the ggml_cann_graph to move.
*/
void move_to_front(ggml_cann_graph * node) {
cache_list.remove(node);
cache_list.push_front(node);
}
/**
* @brief Clear all graphs from the cache (also frees memory).
*/
@@ -295,6 +413,28 @@ struct ggml_cann_graph_lru_cache {
* @brief Destructor that clears the cache and frees all cached graphs.
*/
~ggml_cann_graph_lru_cache() { clear(); }
/**
* @brief Find a cached CANN graph that matches the given ggml graph and move it to front.
*
* This function iterates through the cached CANN graphs stored in the LRU cache and
* compares them against the given ggml computation graph. If a matching graph is found,
* it is promoted to the front of the LRU cache and the function returns true. Otherwise,
* it returns false, indicating that a new CANN graph needs to be captured.
*
* @param cgraph The current ggml computation graph.
* @return true if a matching graph was found and promoted; false otherwise.
*/
bool find_and_move_to_front(ggml_cgraph * cgraph) {
for (auto & graph_ptr : this->cache_list) {
if (graph_ptr->matches_cgraph(cgraph)) {
cache_list.remove(graph_ptr);
cache_list.push_front(graph_ptr);
return true;
}
}
return false;
}
};
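Taken together, create_from_cgraph, matches_cgraph and find_and_move_to_front reduce the per-compute bookkeeping to a single lookup-or-capture step. A minimal usage sketch, mirroring the updated ggml_backend_cann_graph_compute flow further below:

// cann_ctx->graph_lru_cache is the ggml_cann_graph_lru_cache owned by the backend context
bool capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
if (capture_required) {
    // no cached graph matches: record a new one (the cache takes ownership)
    ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
    cann_ctx->graph_lru_cache.push(new_graph);
}
// the matching (or newly pushed) graph now sits at cache_list.front(),
// ready to be captured or replayed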
#endif // USE_ACL_GRAPH
@@ -318,6 +458,9 @@ struct ggml_cann_rope_cache {
if (position_select_index_host) {
free(position_select_index_host);
}
if (yarn_ramp_cache) {
ACL_CHECK(aclrtFree(yarn_ramp_cache));
}
}
bool equal(int64_t theta_scale_length,
@@ -370,6 +513,7 @@ struct ggml_cann_rope_cache {
float * theta_scale_exp_host = nullptr;
int * position_select_index_host = nullptr;
void * position_select_index = nullptr;
void * yarn_ramp_cache = nullptr;
// sin/cos cache, used only to accelerate first layer on each device
void * sin_cache = nullptr;
void * cos_cache = nullptr;

View File

@@ -1888,6 +1888,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
break;
case GGML_OP_OUT_PROD:
ggml_cann_out_prod(ctx, dst);
break;
case GGML_OP_SSM_CONV:
ggml_cann_ssm_conv(ctx, dst);
break;
default:
return false;
@@ -2075,162 +2077,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
}
#ifdef USE_ACL_GRAPH
/**
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
*
* This function creates a new ggml_cann_graph object and fills its node properties
* (operation type, dimensions, strides, input sources, and operation parameters)
* based on the current ggml computation graph.
*
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
* - node address
* - operation type
* - shape (ne) and strides (nb)
* - source tensor addresses
* - operation parameters
*
* After initialization, the new graph is pushed into the LRU cache owned by the
* CANN backend context. The cache takes ownership of the graph and manages its
* lifetime (including deletion upon eviction).
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
*/
static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
ggml_cann_graph * new_graph = new ggml_cann_graph();
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
ggml_tensor * node = cgraph->nodes[node_idx];
auto & prop = new_graph->ggml_graph_properties[node_idx];
prop.node_address = node->data;
prop.node_op = node->op;
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
for (int src = 0; src < GGML_MAX_SRC; ++src) {
if (node->src[src]) {
prop.src_address[src] = node->src[src]->data;
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
} else {
prop.src_address[src] = nullptr;
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
}
}
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
cann_ctx->graph_lru_cache.push(new_graph);
}
/**
* @brief Check if a ggml tensor node matches a previously captured CANN graph node.
*
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
* to determine whether the current node matches a previously recorded version.
*
* @param node The current ggml tensor node.
* @param graph_node_properties The stored properties of a CANN graph node.
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
*/
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node,
ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i]) {
if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) {
return false;
}
for (int d = 0; d < GGML_MAX_DIMS; d++) {
if (node->src[i]->ne[d] != graph_node_properties->src_ne[i][d]) {
return false;
}
if (node->src[i]->nb[d] != graph_node_properties->src_nb[i][d]) {
return false;
}
}
} else {
if (graph_node_properties->src_address[i] != nullptr) {
return false;
}
}
}
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
return memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
}
return true;
}
/**
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
*
* This function iterates through the cached CANN graphs stored in the LRU cache and
* compares them against the given ggml computation graph. A match requires that the
* number of nodes is the same and that each node's properties (operation type,
* dimensions, strides, inputs, and operation parameters) are identical.
*
* If a matching graph is found, it is promoted to the front of the LRU cache and the
* function returns true. Otherwise, the function returns false, indicating that a new
* CANN graph needs to be captured.
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
* @return true if a matching cached graph exists; false otherwise.
*/
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache;
for (auto & graph_ptr : lru_cache.cache_list) {
// Skip graphs with a different number of nodes.
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
continue;
}
// Check if all nodes match.
bool all_match = true;
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
all_match = false;
break;
}
}
if (all_match) {
// update cache_list and return true
lru_cache.move_to_front(graph_ptr);
return true;
}
}
return false;
}
#endif // USE_ACL_GRAPH
/**
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
*
@@ -2239,23 +2085,23 @@ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph *
*
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
*
* @param cann_ctx The CANN backend context.
* @param cgraph The ggml computation graph.
* @param use_cann_graph Whether to use CANN graph execution.
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
* @param cann_ctx The CANN backend context.
* @param cgraph The ggml computation graph.
* @param use_cann_graph Whether to use CANN graph execution.
* @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
*/
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
ggml_cgraph * cgraph,
bool & use_cann_graph,
bool & cann_graph_update_required) {
bool use_cann_graph,
bool cann_graph_capture_required) {
#ifdef USE_ACL_GRAPH
if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
}
#endif // USE_ACL_GRAPH
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
// With the use of CANN graphs, the execution will be performed by the graph launch.
if (!use_cann_graph || cann_graph_update_required) {
if (!use_cann_graph || cann_graph_capture_required) {
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2274,9 +2120,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
#ifdef USE_ACL_GRAPH
if (use_cann_graph) {
GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (cann_graph_update_required) { // End CANN graph capture
if (cann_graph_capture_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
}
@@ -2306,7 +2153,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
// calculate rope cache for the first layer on the current device.
cann_ctx->rope_cache.cached = false;
bool cann_graph_update_required = false;
bool graph_capture_required = false;
#ifdef USE_ACL_GRAPH
bool use_cann_graph = true;
@@ -2331,16 +2178,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
if (use_cann_graph) {
// If no matching graph is found, the graph needs to be recaptured.
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
if (cann_graph_update_required) {
graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
if (graph_capture_required) {
// If no matching graph is found, add a new ACL graph.
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
cann_ctx->graph_lru_cache.push(new_graph);
}
}
#else
bool use_cann_graph = false;
#endif // USE_ACL_GRAPH
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
return GGML_STATUS_SUCCESS;
}
@@ -2578,8 +2426,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
}
}
case GGML_OP_CONV_TRANSPOSE_1D:
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
return (op->src[0]->ne[0] - 1) <= 255;
return true;
case GGML_OP_SCALE:
float bias;
memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
@@ -2626,6 +2473,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
}
return true;
}
case GGML_OP_SSM_CONV:
return true;
default:
return false;
}

View File

@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh")
endif()
if (GGML_XTHEADVECTOR)
string(APPEND MARCH_STR "_xtheadvector")
elseif (GGML_RVV)
@@ -465,6 +466,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RV_ZVFH)
string(APPEND MARCH_STR "_zvfh")
endif()
if (GGML_RV_ZVFBFWMA)
string(APPEND MARCH_STR "_zvfbfwma")
endif()
endif()
if (GGML_RV_ZICBOP)
string(APPEND MARCH_STR "_zicbop")

View File

@@ -43,6 +43,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -51,6 +53,8 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -67,10 +71,14 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
@@ -91,6 +99,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -99,6 +109,8 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -119,6 +131,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -127,6 +141,8 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -154,6 +170,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
@@ -161,6 +179,8 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -187,6 +207,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -195,6 +217,8 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -223,6 +247,8 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -231,4 +257,6 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#endif

View File

@@ -786,6 +786,133 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q8_0_4x4_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
for (int c = 0; c < nc; c += ncols_interleaved) {
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float32x4_t acc = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) {
int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
int8x16x2_t a = vld1q_s8_x2(a_ptr->qs);
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
int32x4_t ret = vdupq_n_s32(0);
ret = vdotq_laneq_s32(ret, b_low.val[0], a.val[0], 0);
ret = vdotq_laneq_s32(ret, b_low.val[1], a.val[0], 1);
ret = vdotq_laneq_s32(ret, b_low.val[2], a.val[0], 2);
ret = vdotq_laneq_s32(ret, b_low.val[3], a.val[0], 3);
ret = vdotq_laneq_s32(ret, b_high.val[0], a.val[1], 0);
ret = vdotq_laneq_s32(ret, b_high.val[1], a.val[1], 1);
ret = vdotq_laneq_s32(ret, b_high.val[2], a.val[1], 2);
ret = vdotq_laneq_s32(ret, b_high.val[3], a.val[1], 3);
acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
a_ptr++;
b_ptr++;
}
vst1q_f32(s, acc);
s += ncols_interleaved;
}
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q8_0_4x8_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 8;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
for (int c = 0; c < nc; c += ncols_interleaved) {
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float32x4_t acc = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) {
int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
int8x8x4_t a_chunks = vld1_s8_x4(a_ptr->qs);
int8x16_t a0 = vcombine_s8(a_chunks.val[0], a_chunks.val[0]);
int8x16_t a1 = vcombine_s8(a_chunks.val[1], a_chunks.val[1]);
int8x16_t a2 = vcombine_s8(a_chunks.val[2], a_chunks.val[2]);
int8x16_t a3 = vcombine_s8(a_chunks.val[3], a_chunks.val[3]);
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
int32x4_t ret0 = vdupq_n_s32(0);
int32x4_t ret1 = vdupq_n_s32(0);
// 0..7
ret0 = vdotq_s32(ret0, b_low.val[0], a0);
ret1 = vdotq_s32(ret1, b_low.val[1], a0);
// 8..15
ret0 = vdotq_s32(ret0, b_low.val[2], a1);
ret1 = vdotq_s32(ret1, b_low.val[3], a1);
// 16..23
ret0 = vdotq_s32(ret0, b_high.val[0], a2);
ret1 = vdotq_s32(ret1, b_high.val[1], a2);
// 24..31
ret0 = vdotq_s32(ret0, b_high.val[2], a3);
ret1 = vdotq_s32(ret1, b_high.val[3], a3);
int32x4_t ret = vpaddq_s32(ret0, ret1);
acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
a_ptr++;
b_ptr++;
}
vst1q_f32(s, acc);
s += ncols_interleaved;
}
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
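For reference, a scalar sketch of what both GEMV kernels above compute. The qs indexing assumes the groups-of-blocklen column interleaving implied by the NEON lane usage (blocklen = 4 for the 4x4 variant, 8 for the 4x8 variant); it is an illustration, not the generic fallback that actually runs:

static void gemv_q8_0x4_ref(int n, float * s, const block_q8_0x4 * vx,
                            const block_q8_0 * vy, int nc, int blocklen) {
    const int nb = n / QK8_0;
    for (int x = 0; x < nc / 4; x++) {
        float sumf[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        for (int b = 0; b < nb; b++) {
            const block_q8_0x4 * bq = vx + x * nb + b; // interleaved weight block (4 columns)
            const block_q8_0   * aq = vy + b;          // activation block
            for (int j = 0; j < 4; j++) {
                int32_t sumi = 0;
                for (int k = 0; k < QK8_0; k++) {
                    // assumed position of column j, element k inside the interleaved qs
                    const int idx = (k / blocklen) * (4 * blocklen) + j * blocklen + (k % blocklen);
                    sumi += (int32_t) bq->qs[idx] * (int32_t) aq->qs[k];
                }
                sumf[j] += GGML_CPU_FP16_TO_FP32(aq->d) * GGML_CPU_FP16_TO_FP32(bq->d[j]) * (float) sumi;
            }
        }
        for (int j = 0; j < 4; j++) {
            s[x * 4 + j] = sumf[j];
        }
    }
}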
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -2610,3 +2737,159 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q8_0_4x4_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
float32x4_t sumf[4];
for (int m = 0; m < 4; m++) {
sumf[m] = vdupq_n_f32(0);
}
for (int l = 0; l < nb; l++) {
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *) a_ptr[l].d));
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *) b_ptr[l].d));
int32x4_t sumi_0 = vdupq_n_s32(0);
int32x4_t sumi_1 = vdupq_n_s32(0);
int32x4_t sumi_2 = vdupq_n_s32(0);
int32x4_t sumi_3 = vdupq_n_s32(0);
for (int k_group = 0; k_group < 8; k_group += 4) {
int8x16x4_t a = vld1q_s8_x4(a_ptr[l].qs + 16 * k_group);
int8x16x4_t b = vld1q_s8_x4(b_ptr[l].qs + 16 * k_group);
for (int k = 0; k < 4; k++) {
sumi_0 = vdotq_laneq_s32(sumi_0, b.val[k], a.val[k], 0);
sumi_1 = vdotq_laneq_s32(sumi_1, b.val[k], a.val[k], 1);
sumi_2 = vdotq_laneq_s32(sumi_2, b.val[k], a.val[k], 2);
sumi_3 = vdotq_laneq_s32(sumi_3, b.val[k], a.val[k], 3);
}
}
sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
}
for (int m = 0; m < 4; m++) {
vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
}
}
}
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q8_0_4x8_q8_0(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
for (int y = 0; y < nr; y += 4) {
const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
for (int x = 0; x < nc; x += ncols_interleaved) {
const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
const block_q8_0x4 * a_ptr = a_ptr_base;
float32x4_t acc_f32[4];
for (int i = 0; i < 4; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
int32x4_t acc[4];
for (int i = 0; i < 4; i++) {
acc[i] = vdupq_n_s32(0);
}
// Process 4 chunks of 8 positions each
for (int chunk = 0; chunk < 4; chunk++) {
int8x16_t a01 = vld1q_s8(a_ptr->qs + chunk * 32);
int8x16_t a23 = vld1q_s8(a_ptr->qs + chunk * 32 + 16);
int8x16_t b01 = vld1q_s8(b_ptr->qs + chunk * 32);
int8x16_t b23 = vld1q_s8(b_ptr->qs + chunk * 32 + 16);
acc[0] = vmmlaq_s32(acc[0], a01, b01);
acc[1] = vmmlaq_s32(acc[1], a01, b23);
acc[2] = vmmlaq_s32(acc[2], a23, b01);
acc[3] = vmmlaq_s32(acc[3], a23, b23);
}
// Reorder outputs from 2×2 tiles to row-major
// acc[0] = [r0c0, r0c1, r1c0, r1c1]
// acc[1] = [r0c2, r0c3, r1c2, r1c3]
// acc[2] = [r2c0, r2c1, r3c0, r3c1]
// acc[3] = [r2c2, r2c3, r3c2, r3c3]
int32x4_t row0 = vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1]));
int32x4_t row1 = vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1]));
int32x4_t row2 = vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3]));
int32x4_t row3 = vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3]));
// Scales
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const __fp16 *) a_ptr->d));
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const __fp16 *) b_ptr->d));
acc_f32[0] = vfmaq_f32(acc_f32[0], vcvtq_f32_s32(row0), vmulq_laneq_f32(b_d, a_d, 0));
acc_f32[1] = vfmaq_f32(acc_f32[1], vcvtq_f32_s32(row1), vmulq_laneq_f32(b_d, a_d, 1));
acc_f32[2] = vfmaq_f32(acc_f32[2], vcvtq_f32_s32(row2), vmulq_laneq_f32(b_d, a_d, 2));
acc_f32[3] = vfmaq_f32(acc_f32[3], vcvtq_f32_s32(row3), vmulq_laneq_f32(b_d, a_d, 3));
a_ptr++;
b_ptr++;
}
for (int row = 0; row < 4; row++) {
vst1q_f32(s + (y + row) * bs + x, acc_f32[row]);
}
}
}
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
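The tile reordering above follows directly from the SMMLA semantics; a scalar sketch of what each vmmlaq_s32 call accumulates, per the Arm intrinsic definition:

// acc[2*i + j] += dot(a_row_i, b_row_j) over 8 int8 values,
// i.e. a 2x2 int32 tile from two 2x8 int8 operands
static inline void smmla_ref(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            int32_t sum = 0;
            for (int k = 0; k < 8; k++) {
                sum += (int32_t) a[i * 8 + k] * (int32_t) b[j * 8 + k];
            }
            acc[i * 2 + j] += sum;
        }
    }
}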

View File

@@ -3320,13 +3320,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#elif defined(__riscv_zvfh)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e16m1(n - i);
vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
__riscv_vse32_v_f32m2(&y[i], vy, vl);
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
// calculate step size
const int epr = __riscv_vsetvlmax_e16m2();
const int step = epr * 2;
const int np = (n & ~(step - 1));
// unroll by 2
for (; i < np; i += step) {
vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
__riscv_vse32_v_f32m4(y + i, ay0, epr);
vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
__riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
}
// leftovers
int vl;
for (i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m2(n - i);
vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
__riscv_vse32_v_f32m4(y + i, ay0, vl);
}
#endif
for (; i < n; ++i) {
@@ -3371,6 +3391,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
(const __m128i *)(x + i))),
16)));
}
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
// calculate step size
const int epr = __riscv_vsetvlmax_e16m2();
const int step = epr * 2;
const int np = (n & ~(step - 1));
// unroll by 2
for (; i < np; i += step) {
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
__riscv_vse32_v_f32m4(y + i, ay0, epr);
vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
__riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
}
// leftovers
int vl;
for (i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m2(n - i);
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
__riscv_vse32_v_f32m4(y + i, ay0, vl);
}
#endif
for (; i < n; i++) {
y[i] = GGML_BF16_TO_FP32(x[i]);

View File

@@ -69,6 +69,10 @@
#define VECTOR_REGISTERS 16
#endif
#if defined(__riscv_v_intrinsic)
#define LMUL 4
#endif
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
namespace {
@@ -175,6 +179,46 @@ inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
}
#endif
#if defined(__riscv_zvfh)
template <>
inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
return __riscv_vfwmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmaccbf16_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmaccbf16_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmaccbf16_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM
@@ -227,6 +271,25 @@ inline float hsum(__m512 x) {
}
#endif // __AVX512F__
#if defined(__riscv_zvfh)
inline float hsum(vfloat32m1_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m1_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m1()));
}
inline float hsum(vfloat32m2_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m2_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m2()));
}
inline float hsum(vfloat32m4_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m4_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m4()));
}
inline float hsum(vfloat32m8_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m8_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m8()));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING
@@ -315,6 +378,88 @@ template <> inline __m256bh load(const float *p) {
}
#endif
#if defined(__riscv_zvfh)
template <> inline vfloat16mf2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16mf2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m4(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t load(const float *p) {
return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t load(const float *p) {
return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t load(const float *p) {
return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t load(const float *p) {
return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
template <> inline vbfloat16mf2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16mf2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vbfloat16m1_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vbfloat16m2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m2());
}
#endif
#if defined(__riscv_zvfh)
template <typename T> T set_zero();
template <> inline vfloat16mf2_t set_zero() {
return __riscv_vfmv_v_f_f16mf2(0, __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t set_zero() {
return __riscv_vfmv_v_f_f16m1(0, __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t set_zero() {
return __riscv_vfmv_v_f_f16m2(0, __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t set_zero() {
return __riscv_vfmv_v_f_f16m4(0, __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t set_zero() {
return __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t set_zero() {
return __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t set_zero() {
return __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t set_zero() {
return __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_v_intrinsic)
template <typename T> size_t vlmax() {
if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
else if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
else if constexpr (std::is_same_v<T, vfloat32m2_t>) { return __riscv_vsetvlmax_e32m2(); }
else if constexpr (std::is_same_v<T, vfloat32m4_t>) { return __riscv_vsetvlmax_e32m4(); }
else if constexpr (std::is_same_v<T, vfloat32m8_t>) { return __riscv_vsetvlmax_e32m8(); }
return 0;
}
#endif
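The load/madd/hsum/set_zero/vlmax helpers above compose into inner loops that are agnostic to the register-group length. A minimal sketch of the dot-product pattern the gemm blocks below are built from, assuming k is a multiple of the register-group length (the matmul entry point below enforces exactly this with its k % vlmax<V>() check); shown here for the m4 group width:

// fp32 dot product, one m4 register group width at a time
static float dot_f32_m4(const float * a, const float * b, int64_t k) {
    const size_t vl = vlmax<vfloat32m4_t>();
    vfloat32m4_t acc = set_zero<vfloat32m4_t>();
    for (int64_t l = 0; l < k; l += vl) {
        acc = madd(load<vfloat32m4_t>(a + l), load<vfloat32m4_t>(b + l), acc);
    }
    return hsum(acc);
}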
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
@@ -488,6 +633,573 @@ class tinyBLAS {
const int64_t ldc;
};
#if defined(__riscv_v_intrinsic)
template <typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS_RVV {
public:
tinyBLAS_RVV(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
bool matmul(int64_t m, int64_t n) {
if (k % vlmax<V>() != 0) {
return false;
}
#if LMUL == 1
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#elif LMUL == 2
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#else // LMUL = 4
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 8>(m, n, SIZE_N, 36);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 4>(m, n, SIZE_N, 36);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 2>(m, n, SIZE_N, 36);
return true;
}
#endif
return false;
}
private:
template<int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have missed something.
}
}
inline void gemm_bloc_4x6(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
D Cv50 = set_zero<D>();
D Cv51 = set_zero<D>();
D Cv52 = set_zero<D>();
D Cv53 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Bv5 = load<V>(B + ldb * (jj + 5) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
Cv50 = madd(Av0, Bv5, Cv50);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
Cv51 = madd(Av1, Bv5, Cv51);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
Cv52 = madd(Av2, Bv5, Cv52);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
Cv53 = madd(Av3, Bv5, Cv53);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
C[ldc * (jj + 5) + (ii + 0)] = hsum(Cv50);
C[ldc * (jj + 5) + (ii + 1)] = hsum(Cv51);
C[ldc * (jj + 5) + (ii + 2)] = hsum(Cv52);
C[ldc * (jj + 5) + (ii + 3)] = hsum(Cv53);
}
inline void gemm_bloc_4x5(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
}
inline void gemm_bloc_4x4(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
Cv30 = madd(Av0, Bv3, Cv30);
Cv31 = madd(Av1, Bv3, Cv31);
Cv32 = madd(Av2, Bv3, Cv32);
Cv33 = madd(Av3, Bv3, Cv33);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
}
inline void gemm_bloc_4x3(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
}
inline void gemm_bloc_4x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
}
inline void gemm_bloc_4x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
}
inline void gemm_bloc_2x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
}
inline void gemm_bloc_2x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
}
template <int RM, int RN>
inline void gemm_bloc(int64_t ii, int64_t jj) {
if constexpr (RM == 4) {
if constexpr (RN == 6) { return gemm_bloc_4x6(ii, jj); }
if constexpr (RN == 5) { return gemm_bloc_4x5(ii, jj); }
if constexpr (RN == 4) { return gemm_bloc_4x4(ii, jj); }
if constexpr (RN == 3) { return gemm_bloc_4x3(ii, jj); }
if constexpr (RN == 2) { return gemm_bloc_4x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_4x1(ii, jj); }
} else if constexpr (RM == 2) {
if constexpr (RN == 2) { return gemm_bloc_2x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_2x1(ii, jj); }
}
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN - 1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, params->nth);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
job = ggml_threadpool_chunk_add(params->threadpool, 1);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
};
#endif
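For reference, a minimal standalone sketch of the column-block partitioning arithmetic used in gemm() above (values chosen purely for illustration; NB_BN, SIZE_BN and jj_BN follow the same formulas as the member function):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // illustrative values: RN = 4 columns per tile, target block size BN = 8 tiles
    const int64_t n = 100, RN = 4, BN = 8;
    const int64_t xtiles  = (n + RN - 1) / RN;                                          // 25 column tiles
    const int64_t NB_BN   = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;                   // 3 blocks ("nearest" rounding)
    const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;  // 9 tiles in a full block
    const int64_t jj_BN   = NB_BN - (NB_BN * SIZE_BN - xtiles);                         // 1 block keeps SIZE_BN tiles,
                                                                                        // the rest get SIZE_BN - 1
    assert(jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
    printf("xtiles=%lld NB_BN=%lld SIZE_BN=%lld jj_BN=%lld\n",
           (long long) xtiles, (long long) NB_BN, (long long) SIZE_BN, (long long) jj_BN);
    return 0;
}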
//////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION
@@ -2657,6 +3369,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__riscv_zvfh)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat32m1_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat32m2_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat32m4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#else
return false;
#endif
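The LMUL setting selected above trades the number of available vector registers for the width of each vector operation. A small probe like the following (an illustrative sketch, assuming an RVV 1.0 toolchain with the standard intrinsics) prints how many f32 lanes each LMUL processes per operation, which is what vlmax<V>() yields inside tinyBLAS_RVV:

#include <riscv_vector.h>
#include <cstdio>

int main() {
    printf("LMUL=1: %zu f32 lanes per op\n", __riscv_vsetvlmax_e32m1());
    printf("LMUL=2: %zu f32 lanes per op\n", __riscv_vsetvlmax_e32m2());
    printf("LMUL=4: %zu f32 lanes per op\n", __riscv_vsetvlmax_e32m4());
    return 0;
}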
@@ -2699,6 +3429,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
tb.matmul(m, n);
return true;
}
#elif defined(__riscv_zvfbfwma)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#endif
return false;
}
@@ -2748,6 +3496,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__riscv_zvfh)
if (Btype == GGML_TYPE_F16) {
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat16mf2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat16m1_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat16m2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
}
#endif
return false;
}

View File

@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(nr == 1);
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[4];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / blocklen); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j];
}
}
}
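Restated in closed form (a hedged summary of the loop nest above, writing d_A/d_B for the FP16 block scales and q_A/q_B for the int8 values):

// s[x*4 + j] = sum_l d_B(l, j) * d_A(l) * ( sum_i q_B(l, j, i) * q_A(l, i) )
// i.e. one integer dot product per QK8_0 block of interleaved column j,
// scaled by the product of the two FP16 block scales.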
void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 8;
assert(nr == 1);
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[4];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / blocklen); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j];
}
}
}
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][4];
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / blocklen); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
}
sumf[m][j] +=
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
}
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][4];
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / blocklen); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
}
sumf[m][j] +=
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
}
} // extern "C"
static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
block_q8_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
const int end = QK8_0 * 4 / blck_size_interleave;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
}
return out;
}
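A small standalone sketch of the resulting byte layout (illustrative only; it simply replays the index arithmetic of make_block_q8_0x4 for blck_size_interleave = 4):

#include <cstdio>

int main() {
    const int QK8_0 = 32, blck = 4;
    // chunk i of the interleaved stream comes from source row i % 4:
    // row0[0..3] row1[0..3] row2[0..3] row3[0..3] row0[4..7] ...
    for (int i = 0; i < QK8_0 * 4 / blck; ++i) {
        const int src_row    = i % 4;
        const int src_offset = (i / 4) * blck;
        const int dst_offset = i * blck;
        printf("dst qs[%3d..%3d] <- row %d qs[%2d..%2d]\n",
               dst_offset, dst_offset + blck - 1, src_row, src_offset, src_offset + blck - 1);
    }
    return 0;
}

The generic gemv/gemm loops earlier in this diff then read element i of column j in chunk k at offset k * ncols_interleaved * blocklen + j * blocklen + i of this interleaved stream.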
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
GGML_UNUSED(data_size);
}
static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
int interleave_block,
const void * GGML_RESTRICT data,
size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
constexpr int nrows_interleaved = 4;
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
const block_q8_0 * src = (const block_q8_0 *) data;
block_q8_0 dst_tmp[4];
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK8_0;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
}
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
block_iq4_nlx4 out;
@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
}
template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
}
// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2168,6 +2439,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
// instance for Q8_0
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
if (cur->type == GGML_TYPE_Q4_0) {
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
@@ -2218,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &iq4_nl_4x4_q8_0;
}
}
} else if (cur->type == GGML_TYPE_Q8_0) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 4 == 0) {
return &q8_0_4x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 4 == 0) {
return &q8_0_4x4_q8_0;
}
}
}
return nullptr;

View File

@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#if defined(__cplusplus)
} // extern "C"

View File

@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
sumf += (ggml_float)_mm_cvtss_f32(g);
#undef LOAD
#endif
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
size_t vl = __riscv_vsetvlmax_e32m4();
// initialize accumulators to all zeroes
vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
// calculate step size
const size_t epr = __riscv_vsetvlmax_e16m2();
const size_t step = epr * 2;
const int np = (n & ~(step - 1));
// unroll by 2
for (; i < np; i += step) {
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
__asm__ __volatile__ ("" ::: "memory");
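// note: the empty asm with a "memory" clobber emits no instruction; it only acts as a
// compiler barrier, presumably to keep the two unrolled load/FMA chains from being
// merged or reordered by the compiler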
vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
__asm__ __volatile__ ("" ::: "memory");
}
// accumulate in 1 register
vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
// leftovers
for (i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m2(n - i);
vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
}
// reduce
vl = __riscv_vsetvlmax_e32m4();
vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
#endif
for (; i < n; ++i) {
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
GGML_BF16_TO_FP32(y[i]));

View File

@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
}
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
#elif defined(__riscv_v_intrinsic)
// todo: RVV impl
for (int i = 0; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
size_t vl = __riscv_vsetvlmax_e32m4();
// initialize accumulators to all zeroes
vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
// calculate step size
const size_t epr = __riscv_vsetvlmax_e16m2();
const size_t step = epr * 2;
const int np = (n & ~(step - 1));
// unroll by 2 along the row dimension
for (int i = 0; i < np; i += step) {
vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
}
vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
// leftovers
for (int i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m2(n - i);
vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
}
// reduce
vl = __riscv_vsetvlmax_e32m2();
vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
__riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
vl = __riscv_vsetvlmax_e32m1();
vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
__riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
vl = __riscv_vsetvlmax_e32m2();
vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
__riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
vl = __riscv_vsetvlmax_e32m1();
vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
__riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
#else
const int np = (n & ~(GGML_F16_STEP - 1));
@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
}
np = n;
#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
const int np = n;
_Float16 hv = (_Float16)v;
for (int i = 0, avl; i < n; i += avl) {
avl = __riscv_vsetvl_e16m8(n - i);
vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
__riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
const _Float16 scale = *(const _Float16*)(&s);
// calculate step size
const int epr = __riscv_vsetvlmax_e16m4();
const int step = epr * 2;
int np = (n & ~(step - 1));
// unroll by 2
for (int i = 0; i < np; i += step) {
vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
__asm__ __volatile__ ("" ::: "memory");
vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
__riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
__asm__ __volatile__ ("" ::: "memory");
}
// leftovers
int vl;
for (int i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m4(n - i);
vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
}
np = n;
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
svst1_f16(pg, (__fp16 *)(y + np), out);
}
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
for (int i = 0, vl; i < n; i += vl) {
vl = __riscv_vsetvl_e16m2(n - i);
vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
__riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
const _Float16 scale = *(const _Float16*)(&s);
// calculate step size
const int epr = __riscv_vsetvlmax_e16m4();
const int step = epr * 2;
const int np = (n & ~(step - 1));
// unroll by 2
for (int i = 0; i < np; i += step) {
vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
__asm__ __volatile__ ("" ::: "memory");
vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
__riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
__asm__ __volatile__ ("" ::: "memory");
}
// leftovers
int vl;
for (int i = np; i < n; i += vl) {
vl = __riscv_vsetvl_e16m4(n - i);
vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
}
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));

View File

@@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND)
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
# 86 == RTX 3000, needs CUDA v11.1
# 89 == RTX 4000, needs CUDA v11.8
# 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores
#
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
# XX-real == compile CUDA code as device code for this specific architecture
@@ -36,10 +37,36 @@ if (CUDAToolkit_FOUND)
endif()
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
# Replace any 12x-real architectures with 12x{a}-real. FP4 PTX instructions are not available in just 12x
if (GGML_NATIVE)
set(PROCESSED_ARCHITECTURES "")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND CMAKE_CUDA_ARCHITECTURES_NATIVE)
set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
else()
set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
endif()
foreach(ARCH ${ARCH_LIST})
if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
else()
list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
endif()
endforeach()
set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
else()
foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell specific optimizations")
endif()
endforeach()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
file(GLOB GGML_HEADERS_CUDA "*.cuh")
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

View File

@@ -50,6 +50,10 @@
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
// While Blackwell (BW) spans CC 1000, 1100 & 1200, we are integrating the Tensor Core instructions available to the 1200 family, see
// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
#define GGML_CUDA_CC_BLACKWELL 1200
#define GGML_CUDA_CC_RUBIN 1300
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
@@ -246,6 +250,10 @@ static const char * cu_get_error_str(CUresult err) {
#define AMPERE_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL && __CUDA_ARCH__ < GGML_CUDA_CC_RUBIN
# define BLACKWELL_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -316,6 +324,11 @@ static bool cp_async_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}
static bool blackwell_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL &&
ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_RUBIN;
}
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
return 64;
@@ -701,6 +714,28 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#endif // CUDART_VERSION >= 12050
}
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
const uint8_t sign_bit = (x < 0.0f) << 3;
float ax = fabsf(x) * e;
// Positive LUT
static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
int best_i = 0;
float best_err = fabsf(ax - pos_lut[0]);
#pragma unroll
for (int i = 1; i < 8; ++i) {
const float err = fabsf(ax - pos_lut[i]);
if (err < best_err) {
best_err = err;
best_i = i;
}
}
return static_cast<uint8_t>(best_i | sign_bit);
}
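A host-side decode helper, as a hedged reference sketch (it simply mirrors the positive LUT and sign-bit convention of the device function above; it is not part of this change):

#include <cstdint>

static float fp4_e2m1_to_float(uint8_t v) {
    static const float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
    const float mag = pos_lut[v & 7];      // low 3 bits index the magnitude LUT
    return (v & 8) ? -mag : mag;           // bit 3 is the sign bit
}

// Example: with e = 1.0f, an input of -2.7f is closest to magnitude 3.0f (LUT index 5),
// so the encoder returns 0b1101 and fp4_e2m1_to_float(0b1101) == -3.0f.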
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)

View File

@@ -5,7 +5,7 @@
#include "ggml.h"
#ifdef GGML_CUDA_USE_CUB
# include <cub/device/device_scan.cuh>
# include <cub/block/block_scan.cuh>
#endif // GGML_CUDA_USE_CUB
template<typename T, int BLOCK_SIZE>
@@ -16,12 +16,14 @@ static __global__ void cumsum_cub_kernel(
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s1, const int64_t s2, const int64_t s3) {
#ifdef GGML_CUDA_USE_CUB
using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;
using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_storage;
__shared__ T block_carry; // carry from previous tile
__shared__ typename BlockScanT::TempStorage temp_storage;
__shared__ T block_carry;
const int tid = threadIdx.x;
constexpr int UNROLL_FACTOR = 4;
constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;
const int64_t i1 = blockIdx.x;
const int64_t i2 = blockIdx.y;
@@ -39,29 +41,38 @@ static __global__ void cumsum_cub_kernel(
}
__syncthreads();
for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
int64_t idx = start + tid;
T x = (idx < ne00) ? src_row[idx] : T(0);
for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
T items[UNROLL_FACTOR];
T thread_sum = T(0);
T inclusive;
T block_total;
BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);
__syncthreads();
T final_val = inclusive + block_carry;
// store result
if (idx < ne00) {
dst_row[idx] = final_val;
#pragma unroll
for (int i = 0; i < UNROLL_FACTOR; i++) {
int64_t idx = start + tid * UNROLL_FACTOR + i;
T val = (idx < ne00) ? src_row[idx] : T(0);
thread_sum += val;
items[i] = thread_sum;
}
// Block-wide scan on thread sums
T thread_prefix;
T block_total;
BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
__syncthreads();
// Add offset to each item and store
T thread_offset = thread_prefix - thread_sum + block_carry;
#pragma unroll
for (int i = 0; i < UNROLL_FACTOR; i++) {
int64_t idx = start + tid * UNROLL_FACTOR + i;
if (idx < ne00) {
dst_row[idx] = items[i] + thread_offset;
}
}
// Update carry for next tile
if (tid == 0) {
block_carry += block_total;
}
__syncthreads();
}
#else
@@ -69,7 +80,7 @@ static __global__ void cumsum_cub_kernel(
#endif // GGML_CUDA_USE_CUB
}
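For intuition, a CPU reference of the same tile decomposition (a sketch under the assumption of threads*unroll items per tile; the kernel's thread_offset = thread_prefix - thread_sum + block_carry is exactly the exclusive prefix of the per-thread totals plus the running carry):

#include <vector>

// CPU reference sketch of one tile (illustrative only)
static void tile_inclusive_scan(const float * in, float * out, int threads, int unroll, float carry) {
    std::vector<float> totals(threads);
    for (int t = 0; t < threads; ++t) {      // 1. per-thread sequential inclusive scan
        float s = 0.0f;
        for (int i = 0; i < unroll; ++i) {
            s += in[t * unroll + i];
            out[t * unroll + i] = s;
        }
        totals[t] = s;
    }
    float prefix = carry;                    // 2. exclusive scan of the per-thread totals,
    for (int t = 0; t < threads; ++t) {      //    seeded with the carry from previous tiles
        for (int i = 0; i < unroll; ++i) {
            out[t * unroll + i] += prefix;   // 3. offset each thread's local results
        }
        prefix += totals[t];
    }
}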
// Fallback kernel implementation (original)
// Fallback kernel implementation
template<typename T>
static __global__ void cumsum_kernel(
const T * src, T * dst,
@@ -86,10 +97,10 @@ static __global__ void cumsum_kernel(
const int warps_per_block = blockDim.x / warp_size;
extern __shared__ float smem[];
float * s_vals = smem;
float * s_warp_sums = smem + blockDim.x;
float * s_carry = smem + blockDim.x + warps_per_block;
float * s_chunk_total = s_carry + 1;
float * s_vals = smem;
float * s_warp_sums = smem + blockDim.x;
float * s_carry = smem + blockDim.x + warps_per_block;
float * s_chunk_total = s_carry + 1;
// Initialize carry
if (tid == 0) {
@@ -107,21 +118,39 @@ static __global__ void cumsum_kernel(
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
for (int64_t start = 0; start < ne00; start += blockDim.x) {
int64_t idx = start + tid;
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;
// register blocking: process 4 elements per thread to hide latency
// and reduce synchronization overhead
constexpr int num_unroll = 4;
T temp[num_unroll];
// 1. Warp inclusive scan
for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
int64_t idx = i + tid * num_unroll;
// thread local sequential scan
temp[0] = (idx < ne00 ? src_row[idx] : T(0));
#pragma unroll
for (int64_t j = 1; j < num_unroll; j++) {
temp[j] = temp[j - 1];
if (idx + j < ne00) {
temp[j] += src_row[idx + j];
} else {
temp[j] += 0;
}
}
// the last element is the sum of all values assigned to this thread
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
// Warp inclusive scan
val = warp_prefix_inclusive_sum<T, warp_size>(val);
s_vals[tid] = val;
// Store warp total
if (lane == warp_size - 1) {
s_warp_sums[warp] = val;
}
__syncthreads();
// 2. Exclusive scan of warp sums (warp 0 only)
// Exclusive scan of warp sums (warp 0 only)
if (warp == 0) {
float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
@@ -134,12 +163,17 @@ static __global__ void cumsum_kernel(
}
__syncthreads();
// write back results
float carry = *s_carry;
float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
if (idx < ne00) {
dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
// calculate sum offset for this thread
float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
#pragma unroll
for (int32_t j = 0; j < num_unroll; j++) {
if (idx + j < ne00) {
dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
}
}
__syncthreads();
// Update carry for next chunk
if (tid == 0) {
@@ -177,7 +211,7 @@ static void cumsum_cuda(
const int warps_per_block = block_size / warp_size;
const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
if (use_cub) {
if (use_cub && ne00 >= 1024) {
cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,

View File

@@ -3076,8 +3076,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}
@@ -3085,7 +3088,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}
@@ -3094,8 +3101,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
return true;
}
}

View File

@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int id = ggml_cuda_get_device();
const int nsm = ggml_cuda_info().devices[id].nsm;
// Heuristic for block size selection to optimize occupancy.
// See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
if ((nrows / nsm) < 2) {
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);

View File

@@ -76,15 +76,29 @@ namespace ggml_cuda_mma {
// For the A/C matrices this means I major == row major, J major == column major.
// For the B matrix this means I major == column major, J major == row major.
// MIRRORED == Each data value is held exactly once per thread subgroup.
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell.
DATA_LAYOUT_I_MAJOR_MIRRORED = 10,
DATA_LAYOUT_J_MAJOR_MIRRORED = 20,
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA.
DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3.
DATA_LAYOUT_J_MAJOR_MIRRORED = 30,
};
// Implemented mma combinations are:
// - (I_MAJOR, I_MAJOR) -> I_MAJOR
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
static constexpr bool is_i_major(const data_layout dl) {
return dl == DATA_LAYOUT_I_MAJOR ||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
}
static constexpr __device__ data_layout get_input_data_layout() {
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
return DATA_LAYOUT_I_MAJOR_MIRRORED;
#else
return DATA_LAYOUT_I_MAJOR;
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
}
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
struct tile {};
@@ -115,9 +129,9 @@ namespace ggml_cuda_mma {
} else if constexpr (I == 32 && J == 4) {
return threadIdx.x % 32;
} else if constexpr (I == 16 && J == 16) {
return 4 * (threadIdx.x / 16) + l;
return threadIdx.x % 16;
} else if constexpr (I == 32 && J == 32) {
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
return threadIdx.x % 32;
} else {
NO_DEVICE_CODE;
return -1;
@@ -132,9 +146,9 @@ namespace ggml_cuda_mma {
} else if constexpr (I == 32 && J == 4) {
return 2 * (threadIdx.x / 32) + l;
} else if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
return 4 * (threadIdx.x / 16) + l;
} else if constexpr (I == 32 && J == 32) {
return threadIdx.x % 32;
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
} else {
NO_DEVICE_CODE;
return -1;
@@ -171,28 +185,19 @@ namespace ggml_cuda_mma {
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
#elif defined(RDNA3)
static constexpr int ne = (I == 16 && J == 16) ? I * J / 32 : I * J / 16;
#endif // defined(RDNA4)
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
if (I == 16 && J == 8) return true;
if (I == 16 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
#if defined(RDNA4)
return 8 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return 2 * l + (threadIdx.x / 16);
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
if constexpr (supported()) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
@@ -201,7 +206,17 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
// matrix C
#if defined(RDNA3)
return 2 * l + (threadIdx.x / 16);
#else
return ne * (threadIdx.x / 16) + l;
#endif // defined(RDNA3)
} else if constexpr (I == 16 && J == 8) {
// mmq input for RDNA4
return ne * (threadIdx.x / 16) + l;
} else if constexpr (I == 16 && J == 4) {
return ne * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
@@ -293,12 +308,7 @@ namespace ggml_cuda_mma {
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
@@ -317,14 +327,7 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
return -1;
@@ -382,42 +385,19 @@ namespace ggml_cuda_mma {
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
#if defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true;
return false;
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::supported();
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 8) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_i(l);
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
return -1;
}
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_j(l);
}
#else
static constexpr int ne = I * J / WARP_SIZE;
@@ -458,11 +438,87 @@ namespace ggml_cuda_mma {
#endif // defined(AMD_WMMA_AVAILABLE)
};
template <int I_, int J_, typename T>
struct tile<I_, J_, T, DATA_LAYOUT_J_MAJOR> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR;
static constexpr int ne = tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::ne;
T x[ne] = {0};
static constexpr __device__ bool supported() {
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::supported();
}
static __device__ __forceinline__ int get_i(const int l) {
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::get_j(l);
}
static __device__ __forceinline__ int get_j(const int l) {
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::get_i(l);
}
};
template <int I_, int J_, typename T>
struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_MIRRORED> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
// RDNA3
static constexpr int ne = I * J / 32 * 2;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
if (I == 16 && J == 8) return true;
if (I == 16 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int /*l*/) {
if constexpr (supported()) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (supported()) {
return l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
};
template <int I_, int J_>
struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
#if defined(RDNA3)
static constexpr int ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
}
static __device__ __forceinline__ int get_i(const int l) {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
}
static __device__ __forceinline__ int get_j(const int l) {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
}
#else // Volta
static constexpr int ne = I * J / (WARP_SIZE/4);
half2 x[ne] = {{0.0f, 0.0f}};
@@ -489,6 +545,29 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // defined(RDNA3)
};
template <int I_, int J_>
struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_MIRRORED> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
static constexpr int ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
}
static __device__ __forceinline__ int get_i(const int l) {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
}
static __device__ __forceinline__ int get_j(const int l) {
return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
}
};
template <int I_, int J_>
@@ -569,55 +648,28 @@ namespace ggml_cuda_mma {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
} else {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
#elif defined(AMD_WMMA_AVAILABLE)
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
#if defined(RDNA4)
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#elif defined(RDNA3)
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
#else
NO_DEVICE_CODE;
#endif // defined(RDNA4)
} else if constexpr (std::is_same_v<T, int>) {
if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
#if defined(RDNA4)
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
#elif defined(RDNA3)
static_assert(tile<I,J,T>::ne >= 4, "fragment too small");
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
xi[0] = xs[0];
xi[1] = xs[1];
#endif // defined(RDNA4)
} else if constexpr (I == 16 && J == 8) {
int64_t * xi = (int64_t *) t.x;
#if defined(RDNA4)
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
xi[0] = xs[0];
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
xi[1] = xs1[0];
#elif defined(RDNA3)
static_assert(tile<I,J,T>::ne >= 8, "fragment too small");
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
// contiguous four 64-bit chunks per lane for the wider RDNA3 fragment
xi[0] = xs[0];
xi[1] = xs[1];
const int64_t * xs1 = xs + 2;
xi[2] = xs1[0];
xi[3] = xs1[1];
#endif // defined(RDNA4)
// All wmma layouts have contiguous data when i-major.
if constexpr (is_i_major(dl)) {
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
#pragma unroll
for (int i = 0; i < aligned_copy_count; ++i) {
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
}
} else {
NO_DEVICE_CODE;
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
} else {
NO_DEVICE_CODE;
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
}
#else
#pragma unroll
@@ -660,9 +712,9 @@ namespace ggml_cuda_mma {
#endif // TURING_MMA_AVAILABLE
}
template <typename T>
template <typename T, data_layout dl>
static __device__ __forceinline__ void load_ldmatrix(
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
tile<16, 8, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#if defined(TURING_MMA_AVAILABLE)
int * xi = (int * ) t.x;
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
@@ -832,8 +884,9 @@ namespace ggml_cuda_mma {
#endif // TURING_MMA_AVAILABLE
}
template <data_layout dl_ab, data_layout dl_d>
static __device__ __forceinline__ void mma(
tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) {
tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) {
#ifdef AMPERE_MMA_AVAILABLE
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
@@ -847,6 +900,27 @@ namespace ggml_cuda_mma {
#endif // AMPERE_MMA_AVAILABLE
}
static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
const tile<16, 8, int> & A,
const tile<8, 8, int> & B,
uint32_t a_scale,
uint32_t b_scale) {
#ifdef BLACKWELL_MMA_AVAILABLE
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
float * Dxi = (float *) D.x;
asm volatile(
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
"%10, {0, 0}, %11, {0, 0};"
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
#else
GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
#endif // BLACKWELL_MMA_AVAILABLE
}
static __device__ __forceinline__ void mma(
tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
#ifdef TURING_MMA_AVAILABLE
@@ -887,8 +961,9 @@ namespace ggml_cuda_mma {
#endif // AMPERE_MMA_AVAILABLE
}
template <data_layout dl_ab, data_layout dl_d>
static __device__ __forceinline__ void mma(
tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) {
#ifdef TURING_MMA_AVAILABLE
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
@@ -940,8 +1015,9 @@ namespace ggml_cuda_mma {
#endif // TURING_MMA_AVAILABLE
}
template <data_layout dl_ab, data_layout dl_d>
static __device__ __forceinline__ void mma(
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
@@ -967,8 +1043,9 @@ namespace ggml_cuda_mma {
#endif // AMPERE_MMA_AVAILABLE
}
template <data_layout dl_d, data_layout dl_ab>
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
tile<16, 16, int, dl_d> & D, const tile<16, 8, int, dl_ab> & A, const tile<16, 8, int, dl_ab> & B) {
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
@@ -1122,8 +1199,9 @@ namespace ggml_cuda_mma {
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
template <data_layout dl_d, data_layout dl_ab>
static __device__ __forceinline__ void mma(
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;

View File

@@ -32,11 +32,13 @@ static __global__ void mul_mat_f(
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE)
// Special case for tf32: use a dummy mma layout since wmma doesn't support it.
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool is_tf32 = std::is_same_v<T, float>;
constexpr int tile_B_I = is_tf32 ? 8 : 16;
constexpr int tile_C_J = is_tf32 ? 8 : 16;
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
typedef tile<16, 8, T, ab_layout> tile_A;
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
#else
#ifdef VOLTA_MMA_AVAILABLE
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
@@ -272,11 +274,13 @@ static __global__ void mul_mat_f_ids(
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE)
// Special case for tf32: use a dummy mma layout since wmma doesn't support it.
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool is_tf32 = std::is_same_v<T, float>;
constexpr int tile_B_I = is_tf32 ? 8 : 16;
constexpr int tile_C_J = is_tf32 ? 8 : 16;
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
typedef tile<16, 8, T, ab_layout> tile_A;
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
#else
#ifdef VOLTA_MMA_AVAILABLE
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {

View File

@@ -1,3 +1,4 @@
#include "common.cuh"
#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"
@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|| GGML_CUDA_CC_IS_CDNA(cc);
// TODO: tighter pool buffer size vs q8 path
const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
if (!ids) {
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
@@ -123,12 +127,24 @@ void ggml_cuda_mul_mat_q(
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
if (use_native_mxfp4) {
static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
ne11, ne12, ne13, stream);
} else {
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
ne11, ne12, ne13, stream);
}
CUDA_CHECK(cudaGetLastError());
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
// Stride depends on quantization format
const int64_t s12 = use_native_mxfp4 ?
ne11 * ne10_padded * sizeof(block_fp4_mmq) /
(8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
:
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
const int64_t s13 = ne12*s12;
const mmq_args args = {
@@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q(
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[2] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
if (use_native_mxfp4) {
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
} else {
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
}
CUDA_CHECK(cudaGetLastError());
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
const int64_t s13 = ne12*s12;
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
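A quick host-side check of the two s12 stride formulas above, as a sketch: it assumes sizeof(block_q8_1) == 36 and sizeof(block_fp4_mmq) == 144 bytes (the values implied by the static_asserts later in this diff) and QK8_1 == QK_MXFP4 == 32, and it shows that the FP4 y layout halves the per-row footprint of the q8_1 layout because two 4-bit values share a byte.

// Sketch: compare the two s12 strides above, in units of int.
// Assumed here: QK8_1 == QK_MXFP4 == 32, sizeof(block_q8_1) == 36 bytes (32 int8 + half2 scale),
// sizeof(block_fp4_mmq) == 144 bytes (16 scale bytes + 128 packed FP4 bytes for 256 values).
#include <cstdint>
#include <cstdio>

int main() {
    constexpr int64_t QK8_1      = 32;
    constexpr int64_t QK_MXFP4   = 32;
    constexpr int64_t sizeof_q8  = 36;   // assumed sizeof(block_q8_1)
    constexpr int64_t sizeof_fp4 = 144;  // assumed sizeof(block_fp4_mmq)
    const int64_t ne11 = 64, ne10_padded = 4096;

    const int64_t s12_q8  = ne11 * ne10_padded * sizeof_q8  / (QK8_1 * sizeof(int));
    const int64_t s12_fp4 = ne11 * ne10_padded * sizeof_fp4 / (8 * QK_MXFP4 * sizeof(int));

    // The FP4 layout packs two values per byte, so its per-row footprint is half of q8_1's.
    std::printf("s12 q8_1 = %lld ints, s12 fp4 = %lld ints\n", (long long) s12_q8, (long long) s12_fp4);
    return 0;
}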

View File

@@ -11,6 +11,7 @@ using namespace ggml_cuda_mma;
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
#define MMQ_ITER_K 256
#define MMQ_ITER_K_MXFP4_FP4 512
#define MMQ_NWARPS 8
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
@@ -44,8 +45,15 @@ struct block_q8_1_mmq {
};
int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
};
struct block_fp4_mmq {
uint32_t d4[4]; // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
};
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size");
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
switch (type_x) {
@@ -129,6 +137,14 @@ static int get_mmq_y_host(const int cc) {
((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
}
static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
#if defined(BLACKWELL_MMA_AVAILABLE)
return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
#else
return MMQ_ITER_K;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
}
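In block terms, the doubled K iteration means one native-FP4 iteration walks twice as many 32-value blocks as the q8 path; a trivial sketch, assuming QK_MXFP4 == 32:

// Blocks walked per K iteration on each path, assuming QK_MXFP4 == 32 (one E8M0 scale per 32 values).
#include <cstdio>

int main() {
    constexpr int MMQ_ITER_K           = 256;
    constexpr int MMQ_ITER_K_MXFP4_FP4 = 512;
    constexpr int QK_MXFP4             = 32;

    std::printf("q8 path : %d blocks per iteration\n", MMQ_ITER_K / QK_MXFP4);            // 8
    std::printf("fp4 path: %d blocks per iteration\n", MMQ_ITER_K_MXFP4_FP4 / QK_MXFP4);  // 16
    return 0;
}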
static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIP)
#if defined(RDNA1)
@@ -191,6 +207,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
}
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4)
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
@@ -201,6 +218,8 @@ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4");
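The equality asserted here can be checked by hand; the following standalone sketch mirrors the two macros, assuming MMQ_TILE_NE_K == 32 and QI8_0 == 8 (hypothetical stand-ins chosen to match the surrounding mmq code). Both tiles come out to 76 ints and keep the % 8 == 4 padding.

// Standalone check of the tile-size identity, under the assumptions stated above.
constexpr int MMQ_TILE_NE_K = 32;  // assumed
constexpr int QI8_0         = 8;   // assumed

constexpr int TILE_X_K_Q8_1 = 2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4;  // 64 + 8 + 4 = 76
constexpr int TILE_X_K_FP4  = 2*MMQ_TILE_NE_K + 8 + 4;                      // 64 + 8 + 4 = 76

static_assert(TILE_X_K_FP4 == TILE_X_K_Q8_1, "FP4 reuses the Q8_1 x-tile stride");
static_assert(TILE_X_K_FP4 % 8 == 4,         "same padding rule as the asserts above");

int main() { return 0; }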
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
switch (type) {
@@ -209,6 +228,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
// tile sizes are the same for Q8_1 and FP4 for blackwell
case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
@@ -228,7 +248,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
}
// block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales)
#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1)
#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1)
#define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K
static int mmq_get_granularity_host(const int mmq_x, const int cc) {
if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
@@ -761,6 +782,50 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
}
}
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x,
int * __restrict__ x_tile,
const int kbx0,
const int i_max,
const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
int * x_qs = (int *) x_tile;
uint32_t * x_sc = (uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
const int txi = threadIdx.x;
constexpr int iter_k = get_iter_k(GGML_TYPE_MXFP4);
constexpr int threads_per_row = iter_k / QK_MXFP4; // each thread processes 1 block
constexpr int rows_per_warp = warp_size / threads_per_row;
const int kbx = txi % threads_per_row;
const int row_in_warp = txi / threads_per_row;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
if constexpr (need_check) {
i = min(i, i_max);
}
const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx;
// quantize_mmq_mxfp4 permutes nibbles to match the quantized format
const int k0 = kbx * 4;
memcpy(x_qs + i * MMQ_MMA_TILE_X_K_FP4 + k0, bxi->qs, 16);
// Load E8M0 scales: pack 2 consecutive scales into one uint32
if (kbx % 2 == 0) {
uint32_t e = bxi->e;
e |= ((bxi + 1)->e << 8);
x_sc[i * MMQ_MMA_TILE_X_K_FP4 + kbx / 2] = e;
}
}
}
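A minimal sketch of the scale packing done in the loop above: two consecutive E8M0 block scales land in one uint32, even block in the low byte, odd block in the next byte (the remaining bytes stay zero here; only the byte layout is the point).

// Two consecutive E8M0 block scales packed into one uint32, as in the loop above.
#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t e_even = 127;  // scale of block 2*k
    const uint8_t e_odd  = 130;  // scale of block 2*k + 1

    const uint32_t packed = (uint32_t) e_even | ((uint32_t) e_odd << 8);

    std::printf("packed = 0x%08x, even = %u, odd = %u\n",
                packed, packed & 0xff, (packed >> 8) & 0xff);
    return 0;
}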
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -797,9 +862,10 @@ template <int mmq_x, int mmq_y, mmq_q8_1_ds_layout ds_layout>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -930,6 +996,78 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
const int * __restrict__ y,
float * __restrict__ sum,
const int k00) {
typedef tile<16, 8, int> tile_A;
typedef tile<8, 8, int> tile_B;
typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = 2 * granularity;
constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
// Match layout from load_tiles_mxfp4_fp4
const int * x_qs = (const int *) x;
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
const int * y_qs = (const int *) y + 4;
const uint32_t * y_sc = (const uint32_t *) y;
// tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
// Block scale
// Each thread has to point to a 4 byte scale value
// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
#pragma unroll
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
const int k0 = k00 + k01;
load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
MMQ_MMA_TILE_X_K_FP4);
// per the block-scaling documentation, 2 threads in each quad need to supply the scale value
const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
scaleA[n][k01 / (2 * QI_MXFP4)] =
*(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
}
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
#pragma unroll
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
tile_B B;
uint32_t scaleB; // 2xN scales
load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
}
}
}
}
}
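To see which tile row each lane's scale lookup above touches, this host loop just evaluates the tidx expression for all 32 lanes; the interpretation that two threads per quad supply a scale follows the linked block-scaling PTX documentation, so treat the mapping as a sketch rather than a hardware description.

// Evaluate the scale-row expression from vec_dot_mxfp4_mxfp4_mma for every lane of a warp.
#include <cstdio>

int main() {
    for (int lane = 0; lane < 32; ++lane) {
        const int tidx = lane / 4 + (lane % 2) * 8;  // same formula as above
        std::printf("lane %2d reads the scale of tile row %2d\n", lane, tidx);
    }
    return 0;
}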
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -966,9 +1104,10 @@ template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -1130,10 +1269,11 @@ template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
typedef tile<64, 2, int> tile_load;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -1179,9 +1319,10 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -1435,10 +1576,11 @@ template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
typedef tile<64, 2, int> tile_load;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -1501,10 +1643,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -2265,10 +2407,11 @@ template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
typedef tile<64, 2, int> tile_load;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -2316,9 +2459,10 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
@@ -3015,7 +3159,7 @@ static __device__ __forceinline__ void mmq_write_back_mma(
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr int tileC_IJ = mmq_get_granularity_device(0);
typedef tile<tileC_IJ, tileC_IJ, int> tile_C;
typedef tile<tileC_IJ, tileC_IJ, int, DATA_LAYOUT_J_MAJOR> tile_C;
constexpr int rows_per_warp = granularity;
#else
typedef tile<16, 8, int> tile_C;
@@ -3102,8 +3246,13 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q8_0> {
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
#ifdef BLACKWELL_MMA_AVAILABLE
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
#else
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
#endif // BLACKWELL_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
@@ -3236,17 +3385,26 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, need_check>;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
#if defined(BLACKWELL_MMA_AVAILABLE)
// FP4 tile stores 8 blocks
constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
#else
constexpr int ne_block = 4 * QK8_1;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int);
for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
{
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz;
#pragma unroll
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
int l = l0 + threadIdx.y*warp_size + threadIdx.x;
tile_y[l] = by0[l];
@@ -3260,9 +3418,9 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
__syncthreads();
{
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz);
#pragma unroll
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
int l = l0 + threadIdx.y*warp_size + threadIdx.x;
tile_y[l] = by0[l];
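The rewritten by0 offset is intended as a pure refactor for the q8 path (ne_block == 4*QK8_1) while generalizing to the FP4 y layout; a quick host check of the old and new integer expressions, assuming sizeof(block_q8_1_mmq) == 144 and QK8_1 == 32 as the static_asserts earlier in this diff imply, and stepping kb0 by MMQ_ITER_K / qk as the kernel does:

// Old vs. new by0 offset for the q8 path, under the assumptions stated above.
#include <cstdint>
#include <cstdio>

int main() {
    constexpr int64_t QK8_1    = 32;
    constexpr int64_t sz_mmq   = 144;                 // assumed sizeof(block_q8_1_mmq)
    constexpr int64_t ne_block = 4 * QK8_1;           // q8 path
    constexpr int64_t sz       = sz_mmq / sizeof(int);

    for (int64_t qk : {32, 256}) {                    // e.g. Q8_0 and the K-quants
        const int64_t blocks_per_iter = 256 / qk;     // MMQ_ITER_K / qk
        for (int64_t kb0 = 0; kb0 < 4 * blocks_per_iter; kb0 += blocks_per_iter) {
            const int64_t off_old = kb0 * (qk * sz_mmq / (4 * QK8_1 * sizeof(int)));
            const int64_t off_new = (kb0 * qk / ne_block) * sz;
            std::printf("qk=%3lld kb0=%3lld old=%5lld new=%5lld\n",
                        (long long) qk, (long long) kb0, (long long) off_old, (long long) off_new);
        }
    }
    return 0;
}

The two columns print identical offsets, and the second tile load's extra term (1*sizeof(block_q8_1_mmq)/sizeof(int) before, + sz now) is likewise unchanged.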
@@ -3394,8 +3552,10 @@ static __global__ void mul_mat_q(
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
constexpr int blocks_per_iter = ITER_K / qk;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
@@ -3456,7 +3616,7 @@ static __global__ void mul_mat_q(
__syncthreads();
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
offset_dst += it*mmq_y;
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
@@ -3523,7 +3683,7 @@ static __global__ void mul_mat_q(
__syncthreads();
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
offset_dst += it*mmq_y;
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
@@ -3546,7 +3706,9 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int ncols_max) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int nwarps = mmq_get_nwarps_device();
@@ -3704,7 +3866,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
const size_t nbs_ids = mmq_x*sizeof(int);
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq));
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
}

View File

@@ -47,6 +47,131 @@ static __global__ void quantize_q8_1(
y[ib].ds = make_half2(d, sum);
}
__device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
if (!(amax > 0.0f)) {
return 0;
}
// FP4 E2M1: max exponent (unbiased) is 2.
constexpr int FP4_E2M1_EMAX = 2;
const float e = log2f(amax);
// "even" -> round-to-nearest integer, ties-to-even
const int e_int = __float2int_rn(e);
const int shared_exp = e_int - FP4_E2M1_EMAX;
int biased = shared_exp + 127;
biased = max(biased, 0);
biased = min(biased, 254);
return static_cast<uint8_t>(biased);
}
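A host mirror of compute_e8m0_scale, useful for eyeballing what exponent a few amax values map to; lrintf replaces __float2int_rn here, and both round to nearest with ties to even under the default rounding mode, so this is a sketch rather than a bit-exact reproduction.

// Host mirror of compute_e8m0_scale above (C++17 for std::clamp).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint8_t e8m0_scale(float amax) {
    if (!(amax > 0.0f)) {
        return 0;
    }
    constexpr int FP4_E2M1_EMAX = 2;  // max unbiased exponent representable in FP4 E2M1
    const int e_int  = (int) std::lrintf(std::log2f(amax));
    const int biased = std::clamp(e_int - FP4_E2M1_EMAX + 127, 0, 254);
    return (uint8_t) biased;
}

int main() {
    // e.g. amax = 3.0 -> log2 ~ 1.585 -> 2 -> biased 127; amax = 0.75 -> 0 -> biased 125
    for (float amax : {0.0f, 0.75f, 3.0f, 48.0f}) {
        std::printf("amax = %6.2f -> e8m0 = %3d\n", amax, (int) e8m0_scale(amax));
    }
    return 0;
}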
// quantize values into the interleaved-nibble layout used for mxfp4 storage,
// i.e. a block a0-a31 is stored as the byte pairs a0a16, a1a17, ..., a15a31
static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
const int32_t * __restrict__ ids,
void * __restrict__ vy,
const int64_t ne00,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t ne0,
const int ne1,
const int ne2) {
constexpr int vals_per_scale = 32;
constexpr int vals_per_warp = 2 * vals_per_scale; // Each warp processes 2 blocks of 32 = 64 values
const int warp_id = threadIdx.y;
const int lane_id_32 = threadIdx.x;
const int nwarps = blockDim.y;
const int64_t warp_start_offset = (blockIdx.y * nwarps + warp_id) * vals_per_warp;
if (warp_start_offset >= ne0) {
return;
}
const int64_t i1 = blockIdx.x;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t i01 = ids ? ids[i1] : i1;
const int64_t i02 = i2;
const int64_t i03 = i3;
block_fp4_mmq * y = (block_fp4_mmq *) vy;
const int64_t block_fp4_mmq_size = 8 * QK_MXFP4; // 256 values
const int64_t ib0 = blockIdx.z * ((int64_t) ne1 * (ne0 / block_fp4_mmq_size));
const int64_t ib = ib0 + (warp_start_offset / block_fp4_mmq_size) * ne1 + blockIdx.x;
const int64_t quad_idx_in_block = (warp_start_offset % block_fp4_mmq_size) / vals_per_warp;
const int group_id = lane_id_32 / 4;
const int lane_in_group = lane_id_32 % 4;
const int base = group_id * 2;
char2 * yqs2 = (char2 *) y[ib].qs;
const int64_t base_pos = i03 * s03 + i02 * s02 + i01 * s01;
uint8_t scales[2];
#pragma unroll
for (int b = 0; b < 2; ++b) {
const int64_t i0 = warp_start_offset + b * vals_per_scale + lane_id_32;
const float xi = (i0 < ne00) ? x[base_pos + i0] : 0.0f;
float amax = fabsf(xi);
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
}
const uint8_t e = compute_e8m0_scale(amax);
scales[b] = e;
const float inv_s = (amax == 0.0f) ? 0.0f : __frcp_rn(ggml_cuda_e8m0_to_fp32(e));
#if CUDART_VERSION >= 12080
const float scaled_val = xi * inv_s;
const float val0 = __shfl_sync(0xFFFFFFFF, scaled_val, base, WARP_SIZE);
const float val1 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 16, WARP_SIZE);
const float val2 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 1, WARP_SIZE);
const float val3 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 17, WARP_SIZE);
if (lane_in_group == 0) {
__nv_fp4x4_e2m1 fp4_packed(make_float4(val0, val1, val2, val3));
yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = *(char2 *) &fp4_packed;
}
#else
// Fallback: manual FP4 conversion using LUT
const uint8_t q_val = ggml_cuda_float_to_fp4_e2m1(xi, inv_s);
const uint8_t q_lo_0 = __shfl_sync(0xFFFFFFFF, q_val, base, WARP_SIZE);
const uint8_t q_lo_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 1, WARP_SIZE);
const uint8_t q_hi_0 = __shfl_sync(0xFFFFFFFF, q_val, base + 16, WARP_SIZE);
const uint8_t q_hi_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 17, WARP_SIZE);
if (lane_in_group == 0) {
char2 q;
q.x = (q_hi_0 << 4) | q_lo_0;
q.y = (q_hi_1 << 4) | q_lo_1;
yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = q;
}
#endif // CUDART_VERSION >= 12080
}
if (lane_id_32 == 0) {
// Store 2 scales packed into 1 uint32
y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0];
}
}
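A host sketch of the interleaved-nibble layout the kernel produces: within each 32-value block, byte i holds value i in the low nibble and value i+16 in the high nibble. This mirrors the fallback path above; the fp4x4 fast path is intended to produce the same bytes.

// Interleaved-nibble packing for one 32-value block: byte i = (value[i+16] << 4) | value[i].
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t q[32];                   // 32 already-quantized FP4 codes (0..15)
    for (int i = 0; i < 32; ++i) {
        q[i] = (i < 16) ? 0x1 : 0x2; // distinct values so the packing is visible
    }

    uint8_t qs[16];                  // packed block: a0a16, a1a17, ..., a15a31
    for (int i = 0; i < 16; ++i) {
        qs[i] = (uint8_t) ((q[i + 16] << 4) | q[i]);
    }

    std::printf("byte 0 = 0x%02x (low nibble = a0, high nibble = a16)\n", (unsigned) qs[0]);
    return 0;
}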
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
@@ -190,3 +315,29 @@ void quantize_mmq_q8_1_cuda(
break;
}
}
void quantize_mmq_mxfp4_cuda(const float * x,
const int32_t * ids,
void * vy,
[[maybe_unused]] const ggml_type type_src0,
const int64_t ne00,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t ne0,
const int64_t ne1,
const int64_t ne2,
const int64_t ne3,
cudaStream_t stream) {
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
constexpr int nwarps = 8;
constexpr int vals_per_warp = 2 * QK_MXFP4;
constexpr int vals_per_block = nwarps * vals_per_warp;
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
const dim3 block_size(WARP_SIZE, nwarps, 1);
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
}

View File

@@ -25,3 +25,17 @@ void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
void quantize_mmq_mxfp4_cuda(const float * x,
const int32_t * ids,
void * vy,
ggml_type type_src0,
int64_t ne00,
int64_t s01,
int64_t s02,
int64_t s03,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
cudaStream_t stream);

View File

@@ -102,31 +102,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
const int threads = 128;
GGML_ASSERT(nr % threads == 0);
if (n_t <= 32) {
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
if (nc == 4) {
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
auto launch_kernel = [&](auto NC) {
constexpr int kNC = decltype(NC)::value;
if (n_t <= 32) {
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
ssm_conv_f32<threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
} else {
if (nc == 4) {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
ssm_conv_long_token_f32<threads, kNC, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
};
switch (nc) {
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
}
}
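The launch path above uses a small dispatch idiom: a runtime kernel size is turned into a compile-time template argument by passing std::integral_constant tags through a generic lambda. A host-only sketch, where run_conv is a hypothetical stand-in for the templated kernel launches:

// Host-only sketch of the integral_constant dispatch pattern used above.
#include <cstdio>
#include <type_traits>

template <int NC>
static void run_conv() {
    std::printf("launch kernel instantiated with NC = %d\n", NC);
}

static void dispatch(int nc) {
    auto launch = [&](auto NC) {
        constexpr int kNC = decltype(NC)::value;  // compile-time value recovered from the tag type
        run_conv<kNC>();
    };
    switch (nc) {
        case 3: launch(std::integral_constant<int, 3>{}); break;
        case 4: launch(std::integral_constant<int, 4>{}); break;
        case 9: launch(std::integral_constant<int, 9>{}); break;
        default: std::printf("unsupported kernel size %d\n", nc); break;
    }
}

int main() {
    dispatch(4);
    return 0;
}

This keeps a single launch body while still instantiating one kernel per supported size.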

View File

@@ -268,7 +268,23 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
}
}
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
const ggml_tensor * weights,
const ggml_tensor * get_rows,
const ggml_tensor * argsort,
const ggml_tensor * clamp,
int n_expert) {
ggml_tensor * probs = get_rows->src[0];
if (probs->op != GGML_OP_RESHAPE) {
return false;
}
probs = probs->src[0];
ggml_tensor * selection_probs = argsort->src[0];
if (probs != selection_probs) {
return false;
}
float scale = 1.0f;
float max_bias = 0.0f;
@@ -288,7 +304,6 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
return false;
}
const int n_expert = softmax->ne[0];
// n_expert must be a power of 2
if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
return false;

View File

@@ -11,6 +11,11 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const bool delayed_softmax = false,
ggml_tensor * weight_clamp = nullptr);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
const ggml_tensor * weights,
const ggml_tensor * get_rows,
const ggml_tensor * argsort,
const ggml_tensor * clamp,
int n_expert);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);

View File

@@ -10,6 +10,10 @@
#include <cuda_fp8.h>
#endif // CUDART_VERSION >= 12050
#if CUDART_VERSION >= 12080
#include <cuda_fp4.h>
#endif // CUDART_VERSION >= 12080
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH

View File

@@ -2,6 +2,7 @@ include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
add_library(htp_iface OBJECT
${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
@@ -41,7 +42,8 @@ set(HTP_CMAKE_ARGS
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG})
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
ExternalProject_Add(htp-v68
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON

File diff suppressed because it is too large.

View File

@@ -8,6 +8,7 @@ extern "C" {
#include <AEEStdErr.h>
#include <inttypes.h>
#include <remote.h>
#include <rpcmem.h>
#include <stdbool.h>
/* Offset to differentiate HLOS and Hexagon error codes.

View File

@@ -31,7 +31,8 @@ add_library(${HTP_LIB} SHARED
)
target_compile_definitions(${HTP_LIB} PRIVATE
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>)
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
build_idl(htp_iface.idl ${HTP_LIB})

View File

@@ -231,7 +231,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
// x (src0_spad_data) = std::min(src0_p[k], limit);
hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
// y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
hvx_clamp_scalar_f32((const uint8_t *) src1, -limit, limit, src1_spad_data, nc);
// y (src1_spad_data) = y1 + 1.f
hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
// x1 (dst_spad_data) = alpha * (x)
@@ -255,6 +255,115 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
const uint32_t src0_nrows = ne01 * ne02 * ne03;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
const uint8_t * data_src0 = (const uint8_t *) src0->data;
uint8_t * data_dst = (uint8_t *) dst->data;
uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
// Given src0_spad->size_per_thread, split it into two ping-pong buffers for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
// gelu is approximated as x * sigmoid(1.702 * x)
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
if (BLOCK == 0) {
FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
return;
}
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
// Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
}
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
for (uint32_t ib = 0; ib < block_size; ib++) {
const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
// gelu = x * sigmoid(1.702 * x) // current implementation
hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
}
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
dst_row_size, dst_row_size_aligned, block_size);
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
}
}
dma_queue_flush(dma_queue);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
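For reference, the per-element approximation the HVX code computes is gelu(x) ~= x * sigmoid(1.702 * x); a scalar sketch follows (the device path additionally clamps the exponent range before the sigmoid):

// Scalar reference for the approximation: gelu(x) ~= x * sigmoid(1.702 * x).
#include <cmath>
#include <cstdio>

static float gelu_sigmoid_approx(float x) {
    return x / (1.0f + std::exp(-1.702f * x));  // equivalent to x * sigmoid(1.702 * x)
}

int main() {
    for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
        std::printf("x = % .2f  gelu ~= % .5f\n", x, gelu_sigmoid_approx(x));
    }
    return 0;
}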
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
@@ -371,7 +480,10 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
act_op_func = glu_swiglu_oai_fp32;
op_type = "swiglu-oai-f32";
break;
case HTP_OP_UNARY_GELU:
act_op_func = unary_gelu_fp32;
op_type = "gelu-f32";
break;
default:
FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
@@ -380,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
const uint32_t n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
const size_t dst_row_size = dst->nb[1];
size_t src0_row_size = src0->nb[1];
size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
size_t dst_row_size = dst->nb[1];
const bool src1_valid = src1->ne[0];
if (!src1_valid) {
src1_row_size = src0_row_size;
}
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
// Make sure the reserved vtcm size is sufficient
if(vtcm_row_per_thread ==0){
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (src1->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
@@ -404,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
}
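A worked example of the per-thread VTCM budgeting above, with made-up row sizes: the reservation is divided by n_threads times the summed, VLEN-aligned row sizes of src0, src1 and dst to get the number of rows each thread can stage.

// Per-thread VTCM budgeting with hypothetical sizes: rows_per_thread = vtcm_size / (n_threads * per_row).
#include <cstddef>
#include <cstdio>

static size_t round_up(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    const size_t VLEN      = 128;            // HVX vector length in bytes
    const size_t vtcm_size = 512 * 1024;     // hypothetical reservation
    const size_t n_threads = 4;

    const size_t src0_row = round_up(3072 * sizeof(float), VLEN);  // 12288 bytes
    const size_t src1_row = src0_row;                              // src1 absent: mirrors src0
    const size_t dst_row  = src0_row;

    const size_t per_row         = src0_row + src1_row + dst_row;  // 36864 bytes
    const size_t rows_per_thread = vtcm_size / (n_threads * per_row);

    std::printf("rows per thread = %zu, src0 spad per thread = %zu bytes\n",
                rows_per_thread, rows_per_thread * src0_row);
    return 0;
}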

View File

@@ -34,12 +34,12 @@ dma_queue * dma_queue_create(size_t capacity) {
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
q->dst = (void **) memalign(4, capacity * sizeof(void *));
memset(q->dst, 0, capacity * sizeof(void *));
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
memset(q->dptr, 0, capacity * sizeof(dma_ptr));
q->tail = &q->desc[capacity - 1];
if (!q->desc && !q->dst) {
if (!q->desc && !q->dptr) {
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
return NULL;
}
@@ -54,16 +54,10 @@ void dma_queue_delete(dma_queue * q) {
return;
}
free(q->desc);
free(q->dst);
free(q->dptr);
free(q);
}
void dma_queue_flush(dma_queue * q) {
while (1) {
uint32_t s = dmwait() & 0x3;
if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
break;
}
}
q->tail = NULL;
while (dma_queue_pop(q).dst != NULL) ;
}

View File

@@ -11,10 +11,15 @@
extern "C" {
#endif
typedef struct {
void *dst;
const void *src;
} dma_ptr;
typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer
void ** dst; // dst pointers
dma_ptr * dptr; // dst/src pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
@@ -49,13 +54,20 @@ static inline unsigned int dmwait(void) {
return ret;
}
static inline bool dma_queue_push(dma_queue * q,
void * dst,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
static inline dma_ptr dma_make_ptr(void *dst, const void *src)
{
dma_ptr p = { dst, src };
return p;
}
static inline bool dma_queue_push(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t width, // width in bytes. number of bytes to transfer per row
size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(ERROR, "dma-push: queue full\n");
return false;
}
@@ -75,18 +87,18 @@ static inline bool dma_queue_push(dma_queue * q,
#endif
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src;
desc->dst = (void *) dst;
desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst;
desc->allocation = 0;
desc->padding = 0;
desc->roiwidth = src_row_size;
desc->roiwidth = width;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0;
q->dst[q->push_idx] = dst;
q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = desc;
@@ -96,9 +108,28 @@ static inline bool dma_queue_push(dma_queue * q,
return true;
}
static inline uint8_t * dma_queue_pop(dma_queue * q) {
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
}
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
}
static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_ptr dptr = { NULL };
if (q->push_idx == q->pop_idx) {
return NULL;
return dptr;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
@@ -112,11 +143,11 @@ static inline uint8_t * dma_queue_pop(dma_queue * q) {
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
}
uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dst;
return dptr;
}
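The two push helpers differ only in which side supplies the transfer width: DDR to VTCM copies src_row_size bytes per row into padded VTCM rows, VTCM to DDR copies dst_row_size bytes back out. A plain host analogue of that strided 2D copy, as a sketch:

// Host analogue of the strided 2D copies above: per row, "width" bytes are copied; the
// DDR side uses the true row size while the VTCM side keeps a padded stride.
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

static void copy_2d(void * dst, const void * src, size_t dst_stride, size_t src_stride,
                    size_t width, size_t nrows) {
    for (size_t r = 0; r < nrows; ++r) {
        std::memcpy((char *) dst + r * dst_stride, (const char *) src + r * src_stride, width);
    }
}

int main() {
    const size_t ddr_row = 100, vtcm_row = 128, nrows = 4;  // vtcm_row padded to VLEN
    std::vector<char> ddr(ddr_row * nrows, 1), vtcm(vtcm_row * nrows, 0);

    // DDR to VTCM: width is the DDR (source) row size, as in dma_queue_push_ddr_to_vtcm.
    copy_2d(vtcm.data(), ddr.data(), vtcm_row, ddr_row, ddr_row, nrows);
    // VTCM to DDR: width is the DDR (destination) row size, as in dma_queue_push_vtcm_to_ddr.
    copy_2d(ddr.data(), vtcm.data(), ddr_row, vtcm_row, ddr_row, nrows);

    std::printf("copied %zu rows each way\n", nrows);
    return 0;
}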
#ifdef __cplusplus

View File

@@ -51,11 +51,12 @@ enum htp_op {
HTP_OP_MUL_MAT_ID = 5,
HTP_OP_RMS_NORM = 6,
HTP_OP_UNARY_SILU = 7,
HTP_OP_GLU_SWIGLU = 8,
HTP_OP_GLU_SWIGLU_OAI = 9,
HTP_OP_SOFTMAX = 10,
HTP_OP_ADD_ID = 11,
HTP_OP_ROPE = 12,
HTP_OP_UNARY_GELU = 8,
HTP_OP_GLU_SWIGLU = 9,
HTP_OP_GLU_SWIGLU_OAI = 10,
HTP_OP_SOFTMAX = 11,
HTP_OP_ADD_ID = 12,
HTP_OP_ROPE = 13,
INVALID
};

View File

@@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0,
FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
}
bool handled_leftover = false;
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
@@ -60,18 +62,59 @@ void hvx_mul_f32(const uint8_t * restrict src0,
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover_size = left_over * sizeof(float);
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
HVX_Vector slinep;
HVX_Vector slinec;
HVX_Vector sline;
HVX_Vector sline2p;
HVX_Vector sline2c;
HVX_Vector sline2;
slinep = *vec_in1++;
sline2p = *vec_in2++;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
for (int i = step_of_1 - 1; i > 0; i--) {
slinec = *vec_in1++;
sline2c = *vec_in2++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
}
if (step_of_1 > 1) {
slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
}
if (left_over > 0) {
slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));
handled_leftover = true;
}
}
if (left_over > 0) {
if (left_over > 0 && !handled_leftover) {
const float * src0f = (const float *) src0 + num_elems_whole;
const float * src1f = (const float *) src1 + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
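The loop above implements the usual HVX software-alignment pattern: keep the previous and current aligned vector loads and splice the unaligned element out of the pair with Q6_V_valign_VVR. A scalar model of the same idea, using 8-byte chunks instead of 128-byte vectors:

// Scalar model of the valign pattern: reconstruct unaligned chunks from pairs of aligned loads.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    constexpr size_t CHUNK = 8;                    // stand-in for the 128-byte HVX vector
    std::vector<uint8_t> buf(64);
    for (size_t i = 0; i < buf.size(); ++i) {
        buf[i] = (uint8_t) i;
    }

    const size_t offset = 3;                       // misalignment of the logical stream
    const uint8_t * base = buf.data();             // aligned base below the unaligned pointer

    uint8_t prev[CHUNK], cur[CHUNK], out[CHUNK];
    std::memcpy(prev, base, CHUNK);                // first aligned load (slinep = *vec_in++)
    for (size_t k = 1; k <= 2; ++k) {
        std::memcpy(cur, base + k * CHUNK, CHUNK);         // next aligned load (slinec)
        std::memcpy(out, prev + offset, CHUNK - offset);   // valign: tail of prev ...
        std::memcpy(out + CHUNK - offset, cur, offset);    // ... plus head of cur
        std::printf("chunk %zu starts with byte %d (expected %zu)\n",
                    k, (int) out[0], (k - 1) * CHUNK + offset);
        std::memcpy(prev, cur, CHUNK);
    }
    return 0;
}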
@@ -464,7 +507,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
bool handled_leftover = false;
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
@@ -475,17 +518,47 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover_size = left_over * sizeof(float);
HVX_Vector * input_v_ptr = (HVX_Vector *) src;
HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
HVX_Vector slinep;
HVX_Vector slinec;
HVX_Vector sline;
slinep = *input_v_ptr++;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
for (int i = step_of_1 - 1; i > 0; i--) {
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
/* Prepare slinep for next iteration */
slinep = slinec;
}
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
if (step_of_1 > 0) {
slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
slinep = slinec;
}
if (leftover_size > 0) {
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
handled_leftover = true;
}
}
if (left_over > 0) {
if (left_over > 0 && !handled_leftover) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
@@ -875,35 +948,45 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unalign_address = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
unalign_address = 1;
}
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
const float * src_f = (const float *) src;
HVX_Vector vec_min = Q6_V_vsplat_R(val);
HVX_Vector vec_min = hvx_vec_splat_fp32(val);
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
if(unalign_address == 0){
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = (min_clamp);
}
}else{
HVX_UVector * restrict vec_in = (HVX_Vector *) src;
HVX_UVector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = (min_clamp);
}
}
if (left_over > 0) {
if (left_over > 0 ) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_UVector in = *(HVX_UVector *) srcf;
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, in);
HVX_UVector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(vec_min));
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, (min_clamp));
}
}
@@ -915,46 +998,70 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unalign_address = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
unalign_address = 1;
}
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
HVX_Vector range_left = hvx_vec_splat_fp32(limit_left);
HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in_vec = *vec_in++;
HVX_Vector temp_v = in_vec;
if(unalign_address == 0){
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
*vec_out++ = Q6_Vsf_equals_Vqf32(in_vec);
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in_vec = *vec_in++;
HVX_Vector temp_v = in_vec;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
*vec_out++ = in_vec;
}
}else{
HVX_UVector * restrict vec_in = (HVX_UVector *) src;
HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in_vec = *vec_in++;
HVX_Vector temp_v = in_vec;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
*vec_out++ = in_vec;
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector in_vec = *(HVX_UVector *) srcf;
HVX_Vector temp_v = in;
HVX_Vector temp_v = in_vec;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in);
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
in = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
in = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(in));
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
}
}

View File

@@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
}
}
/* Return whether 'n' elements starting at 'addr' fall within a single chunk of 'chunk_size'. */
static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
uint32_t right_off = left_off + n;
return right_off <= chunk_size;
}
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
HVX_VectorAlias u = { .v = v };
@@ -976,8 +980,6 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;
assert(remaining == 0);
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
@@ -992,8 +994,69 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}
if (remaining > 0) {
const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32;
float * dstf = (float *) dst + step_of_1*VLEN_FP32;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
}
}
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover = num_elems - (step_of_1 * VLEN_FP32);
int32_t leftover_size = leftover * sizeof(float);
static const float kMinExp = -87.f; // 0
static const float kMaxExp = 87.f; // 1
const HVX_Vector one = hvx_vec_splat_fp32(1.f);
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
const float *input = (float *)src;
float *output = (float *)dst;
HVX_Vector * input_v_ptr = (HVX_Vector *) input;
HVX_UVector * output_v_ptr = (HVX_UVector *) output;
HVX_Vector slinep;
HVX_Vector slinec;
HVX_Vector sline;
slinep = *input_v_ptr++;
#pragma unroll(4)
for (int i = step_of_1 - 1; i > 0; i--) {
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
/* Prepare slinep for next iteration */
slinep = slinec;
}
if (step_of_1 > 0) {
slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
;
slinep = slinec;
}
if (leftover > 0) {
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
}
}
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
void hvx_mul_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,

View File

@@ -299,7 +299,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
ctx->dma[i] = dma_queue_create(64);
}
// init worker pool
@@ -798,6 +799,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
break;
case HTP_OP_UNARY_SILU:
case HTP_OP_UNARY_GELU:
if (n_bufs != 2) {
FARF(ERROR, "Bad act-req buffer list");
continue;
@@ -806,6 +808,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
break;
case HTP_OP_GLU_SWIGLU:
case HTP_OP_GLU_SWIGLU_OAI:
case HTP_OP_SOFTMAX:
if ((n_bufs != 2) && (n_bufs != 3)) {
FARF(ERROR, "Bad act-req buffer list");

View File

@@ -92,6 +92,18 @@ static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = {
0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
};
// vdelta control to replicate first fp16 value across all elements
static const uint8_t __attribute__((aligned(128))) repl_2x_fp16[128] = {
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
};
// vdelta control to expand first 32 e8m0 values into 32 uint32 elements
static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00,
@@ -1115,13 +1127,13 @@ static void matmul(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@@ -1134,7 +1146,7 @@ static void matmul(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@@ -1143,9 +1155,9 @@ static void matmul(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const int is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@@ -1217,20 +1229,20 @@ static void matvec(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@@ -1239,9 +1251,9 @@ static void matvec(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
const uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
}
@@ -1331,13 +1343,13 @@ static void matmul_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@@ -1356,7 +1368,7 @@ static void matmul_id(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@@ -1365,9 +1377,9 @@ static void matmul_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@@ -1455,20 +1467,20 @@ static void matvec_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@@ -1477,9 +1489,9 @@ static void matvec_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
}
}
@@ -1594,6 +1606,118 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
// *** dynamic quant
static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
assert((unsigned long) x % 128 == 0);
assert((unsigned long) y_q % 128 == 0);
HVX_Vector * vx = (HVX_Vector *) x;
HVX_Vector zero = Q6_V_vsplat_R(0);
// Use reduce max fp32 to find max(abs(e)) first
HVX_Vector vmax0_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[0]));
HVX_Vector vmax1_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[1]));
HVX_Vector vmax2_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[2]));
HVX_Vector vmax3_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[3]));
// Load and convert into QF32
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
// Convert to QF32
HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero);
HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero);
HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero);
HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero);
// Combine and convert to fp16
HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf)));
HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf)));
// Convert into fp16
HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
// Replicate first fp16 scale across all lanes
HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_fp16;
vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl);
vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl);
HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16);
HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16);
hvx_vec_store_u(y_d + 0, 2, vd01_hf);
HVX_Vector rotated_vd_hf = Q6_V_vror_VR(vd01_hf, 64);
hvx_vec_store_u(y_d + 2, 2, rotated_vd_hf);
hvx_vec_store_u(y_d + 4, 2, vd23_hf);
rotated_vd_hf = Q6_V_vror_VR(vd23_hf, 64);
hvx_vec_store_u(y_d + 6, 2, rotated_vd_hf);
// Divide input by the scale
HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf);
HVX_Vector vd23_inv_hf = hvx_vec_inverse_fp16(vd23_hf);
vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf));
vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf));
// Convert to int8
HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
*(HVX_Vector *) y_q = vx_i8;
}
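A hedged scalar C++ sketch of the per-group dynamic Q8 quantization these HVX routines implement: for each group of 32 floats, the scale d = max|x| / 127 is stored (as fp16 in the vector code, where 0x2008 is the fp16 encoding of 1/127), and each element becomes round(x / d) saturated to int8. The function name and loop are illustrative only:
#include <algorithm>
#include <cmath>
#include <cstdint>
// Hedged reference for one quantization group (n = 32 in the HVX code).
static void quantize_group_q8_ref(const float * x, int8_t * q, float * d_out, int n) {
    float amax = 0.0f;
    for (int i = 0; i < n; i++) {
        amax = std::max(amax, std::fabs(x[i]));
    }
    const float d   = amax / 127.0f;
    const float inv = d != 0.0f ? 1.0f / d : 0.0f;
    for (int i = 0; i < n; i++) {
        const int v = (int) std::lround(x[i] * inv);
        q[i] = (int8_t) std::min(127, std::max(-128, v));
    }
    *d_out = d; // stored as an fp16 scale by the vector code
}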
static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
assert((unsigned long) x % 128 == 0);
assert((unsigned long) y_q % 128 == 0);
HVX_Vector * vx = (HVX_Vector *) x;
// Load and convert into QF32
HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
// Convert into fp16
HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
// Compute max and scale
HVX_Vector vmax01_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf));
HVX_Vector vmax23_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx23_hf));
// Replicate first fp16 scale across all lanes
HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16;
vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl);
vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl);
HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16);
HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16);
hvx_vec_store_u(y_d + 0, 4, vd01_hf);
hvx_vec_store_u(y_d + 4, 4, vd23_hf);
// Divide input by the scale
HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf);
HVX_Vector vd23_inv_hf = hvx_vec_inverse_fp16(vd23_hf);
vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf));
vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf));
// Convert to int8
HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
*(HVX_Vector *) y_q = vx_i8;
}
static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
assert((unsigned long) x % 128 == 0);
assert((unsigned long) y_q % 128 == 0);
@@ -1655,10 +1779,24 @@ static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, u
uint8_t * restrict t_d = (uint8_t *) x;
for (uint32_t i = 0; i < nb; i++) {
#if FP32_QUANTIZE_GROUP_SIZE == 32
quantize_block_fp32_q8x1(x + (i * 2 + 0) * qk / 2, y_q + (i * 2 + 0) * qblk_size / 2,
t_d + (i * 2 + 0) * dblk_size / 2);
quantize_block_fp32_q8x1(x + (i * 2 + 1) * qk / 2, y_q + (i * 2 + 1) * qblk_size / 2,
t_d + (i * 2 + 1) * dblk_size / 2);
#elif FP32_QUANTIZE_GROUP_SIZE == 64
quantize_block_fp32_q8x2(x + (i * 2 + 0) * qk / 2, y_q + (i * 2 + 0) * qblk_size / 2,
t_d + (i * 2 + 0) * dblk_size / 2);
quantize_block_fp32_q8x2(x + (i * 2 + 1) * qk / 2, y_q + (i * 2 + 1) * qblk_size / 2,
t_d + (i * 2 + 1) * dblk_size / 2);
#elif FP32_QUANTIZE_GROUP_SIZE == 128
quantize_block_fp32_q8x4(x + (i * 2 + 0) * qk / 2, y_q + (i * 2 + 0) * qblk_size / 2,
t_d + (i * 2 + 0) * dblk_size / 2);
quantize_block_fp32_q8x4(x + (i * 2 + 1) * qk / 2, y_q + (i * 2 + 1) * qblk_size / 2,
t_d + (i * 2 + 1) * dblk_size / 2);
#else
#error "FP32_QUANTIZE_GROUP_SIZE must be 32, 64, or 128"
#endif
}
// now copy the scales into final location
@@ -1671,6 +1809,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
uint32_t nth,
uint32_t ith,
uint32_t nrows_per_thread) {
uint64_t t1 = HAP_perf_get_qtimer_count();
const uint32_t ne0 = src->ne[0];

View File

@@ -0,0 +1,153 @@
#ifndef OP_DESC_H
#define OP_DESC_H
#define GGML_COMMON_IMPL_CPP
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include <string>
#include <stdio.h>
struct op_desc {
char strides[64 * GGML_MAX_SRC];
char dims[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
if (t->ne[2] == 1 && t->ne[3] == 1) {
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
} else {
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
}
}
void format_op_dims(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += format_tensor_dims(p, t->src[0]);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += format_tensor_dims(p, t->src[i]);
}
p += sprintf(p, " -> ");
}
// format self dims separately for better visual alignment
char self[64];
format_tensor_dims(self, t);
p += sprintf(p, "%s", self);
}
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
const char * c = ggml_is_contiguous(t) ? "" : "!";
if (t->ne[2] == 1 && t->ne[3] == 1) {
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
} else {
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
}
}
void format_op_strides(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += format_tensor_strides(p, t->src[0]);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += format_tensor_strides(p, t->src[i]);
}
p += sprintf(p, " -> ");
}
// format self dims separately for better visual alignment
char self[64];
format_tensor_strides(self, t);
p += sprintf(p, "%s", self);
}
void format_op_types(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", ggml_type_name(t->type));
}
const char * tensor_buff_name(const struct ggml_tensor * t) {
if (t->buffer) {
return ggml_backend_buffer_name(t->buffer);
}
return "NONE";
}
void format_op_buffs(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", tensor_buff_name(t));
}
void format_op_names(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", t->src[0]->name);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", t->src[i]->name);
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", t->name);
}
void format(const ggml_tensor * op) {
format_op_dims(dims, op);
format_op_strides(strides, op);
format_op_types(types, op);
format_op_buffs(buffs, op);
format_op_names(names, op);
}
op_desc() {}
op_desc(const ggml_tensor * op) { format(op); }
};
#endif // OP_DESC_H
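A minimal usage sketch for this debug helper (assumed, not part of the header): construct an op_desc from a graph node and print its preformatted fields. It relies only on the members shown above plus ggml_op_name() from ggml.
// Hypothetical usage of op_desc for logging an op (assumes op_desc.h and ggml.h are included).
static void log_op(const struct ggml_tensor * op) {
    op_desc d(op);
    printf("%-12s %s | dims %s | strides %s | types %s | buffs %s\n",
           ggml_op_name(op->op), d.names, d.dims, d.strides, d.types, d.buffs);
}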

View File

@@ -263,6 +263,32 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
return { type, major, minor, patch };
}
// cl buffer wrapper
struct ggml_cl_buffer {
cl_mem buffer;
size_t size;
ggml_cl_buffer()
: buffer(nullptr), size(0) {}
~ggml_cl_buffer() {
if (buffer) {
CL_CHECK(clReleaseMemObject(buffer));
}
}
void allocate(cl_context context, size_t new_size) {
if (new_size > size) {
size = new_size;
if (buffer) {
CL_CHECK(clReleaseMemObject(buffer));
}
cl_int err;
CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
}
}
};
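The wrapper grows its allocation only when a larger size is requested, so repeated calls with smaller sizes reuse the existing cl_mem. A hedged usage sketch (context and sizes are placeholders):
// Grow-on-demand, reuse otherwise.
ggml_cl_buffer scratch;
scratch.allocate(context, 1u << 20); // allocates 1 MiB
scratch.allocate(context, 4096);     // no-op: existing buffer is already large enough
scratch.allocate(context, 4u << 20); // releases the old cl_mem, allocates 4 MiB
cl_mem mem = scratch.buffer;         // use like any other cl_mem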
// Profiling
struct ProfilingInfo {
std::string op_name;
@@ -376,6 +402,11 @@ struct ggml_backend_opencl_context {
cl_context context;
cl_command_queue queue;
// prealloc buffers for transposing weights and activations
ggml_cl_buffer prealloc_quant_trans;
ggml_cl_buffer prealloc_scales_trans;
ggml_cl_buffer prealloc_act_trans;
cl_program program_add;
cl_program program_add_id;
cl_program program_clamp;
@@ -494,6 +525,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@@ -634,12 +666,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_transpose_32;
cl_kernel kernel_transpose_32_16;
cl_kernel kernel_transpose_16;
cl_kernel kernel_transpose_16_buf;
cl_kernel kernel_transpose_16_4x1;
cl_mem A_s_d_max; // max scale buffer size for transpose
cl_mem A_q_d_max; // max weight buffer size for transpose
cl_mem B_d_max; // max activation buffer size for transpose
// Gemm and Gemv related programs, kernels, etc
cl_program program_CL_gemm;
cl_program program_CL_gemv_general;
@@ -806,6 +835,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@@ -2004,7 +2034,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
GGML_LOG_CONT(".");
}
@@ -2596,9 +2627,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
required_B_d_bytes, max_B_d_bytes);
}
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
@@ -3603,32 +3634,35 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
// use sub_buffer of max buffer size instead
size_t q_size_bytes = K * M / 8 * sizeof(float);
backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
cl_buffer_region region;
region.origin = 0;
region.size = q_size_bytes;
cl_mem qT_d = clCreateSubBuffer(
backend_ctx->A_q_d_max,
backend_ctx->prealloc_quant_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,
&err);
// cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
CL_CHECK(err);
bool K_tile_trans = true;
if ((K / 32) % 4 != 0) {
K_tile_trans = false;
}
size_t d_size_bytes = M * (K / 32) * 2;
backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
region.origin = 0;
region.size = d_size_bytes;
cl_mem dT_d = clCreateSubBuffer(
backend_ctx->A_s_d_max,
backend_ctx->prealloc_scales_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,
&err);
// cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
CL_CHECK(err);
// <----------------------------------------------------------------------------------> //
@@ -3933,6 +3967,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
if (tensor->type == GGML_TYPE_Q4_0) {
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
cl_int err;
cl_kernel kernel;
cl_int M = tensor->ne[1]; // ne01
cl_int K = tensor->ne[0]; // ne00
GGML_ASSERT(K % 32 == 0);
GGML_ASSERT(M % 4 == 0);
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
cl_mem buf_trans_q;
cl_mem buf_trans_d;
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_q, NULL, &err), err));
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_d, NULL, &err), err));
kernel = backend_ctx->kernel_transpose_16_buf;
// transpose q back
cl_int stride_k_q = K/4;
size_t local_size_q[3] = {64, 1, 1};
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_q, local_size_q, 0, NULL, NULL));
// transpose scales back
cl_int stride_k_d = K/32;
size_t local_size_d[3] = {64, 1, 1};
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_d, local_size_d, 0, NULL, NULL));
// unpack
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, NULL));
// read back to host
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
CL_CHECK(clReleaseMemObject(buf_trans_q));
CL_CHECK(clReleaseMemObject(buf_trans_d));
return;
}
#endif
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
@@ -7306,8 +7425,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
region.origin = 0;
// Specify the size of the sub-buffer (divide by 2 for FP16)
region.size = K * (N + padding) * sizeof(float)/2;
backend_ctx->prealloc_act_trans.allocate(context, region.size);
B_d = clCreateSubBuffer(
backend_ctx->B_d_max,
backend_ctx->prealloc_act_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,

View File

@@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
}
}
kernel void kernel_restore_block_q4_0_noshuffle(
global uchar * src_q,
global half * src_d,
global struct block_q4_0 * dst,
uchar mask_0F,
uchar mask_F0
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
b->d = *d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = q[i + 0 ] ;
uchar x1 = q[i + QK4_0/4];
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
}
}
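A hedged host-side C++ sketch of the same nibble re-interleaving for one block (QK4_0 = 32; the fp16 scale b->d is copied separately, as in the kernel):
#include <cstdint>
constexpr int QK4_0 = 32;
// Scalar mirror of kernel_restore_block_q4_0_noshuffle for one block:
// re-interleave the de-shuffled nibbles back into the packed block_q4_0 layout.
static void restore_block_q4_0_noshuffle_ref(const uint8_t * src_q, uint8_t * dst_qs) {
    for (int i = 0; i < QK4_0 / 4; ++i) {
        const uint8_t x0 = src_q[i];
        const uint8_t x1 = src_q[i + QK4_0 / 4];
        dst_qs[2 * i + 0] = (uint8_t)(( x0 & 0x0F)       | ((x1 & 0x0F) << 4));
        dst_qs[2 * i + 1] = (uint8_t)(((x0 & 0xF0) >> 4) |  (x1 & 0xF0));
    }
}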
//------------------------------------------------------------------------------
// block_mxfp4
//------------------------------------------------------------------------------

View File

@@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
}
// Transpose treating each element as 16-bit using buffer
kernel void kernel_transpose_16_buf(
global const ushort * input,
global ushort * output,
const int ldi,
const int ldo
) {
const int x = get_global_id(0);
const int y = get_global_id(1);
output[x*ldo + y] = input[y*ldi + x];
}
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,

View File

@@ -571,6 +571,10 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
return ctx->base_ptr;
}
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
}
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
rpc_tensor result;
if (!tensor) {
@@ -580,10 +584,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
result.id = reinterpret_cast<uint64_t>(tensor);
result.type = tensor->type;
if (tensor->buffer) {
if (tensor->buffer && ggml_backend_buffer_is_rpc(tensor->buffer)) {
ggml_backend_buffer_t buffer = tensor->buffer;
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
result.buffer = ctx->remote_ptr;
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
} else {
result.buffer = 0;
}
@@ -664,10 +668,6 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
RPC_STATUS_ASSERT(status);
}
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
}
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_rpc(src->buffer)) {
// check if src and dst are on the same server

File diff suppressed because it is too large

View File

@@ -0,0 +1,51 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#include "types.glsl"
layout (push_constant) uniform parameter
{
uint32_t ne00;
uint32_t ne01;
uint32_t nb00;
uint32_t nb01;
uint32_t a_offset;
} p;
#define BLOCK_SIZE 256
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {uint data_a[];};
layout (binding = 1) writeonly buffer D {uint data_d[];};
shared uint vals[BLOCK_SIZE];
void main() {
const uint expert_id = gl_WorkGroupID.x;
const uint num_elements = p.ne00 * p.ne01;
const uint tid = gl_LocalInvocationID.x;
uint count = 0;
for (uint idx = tid; idx < num_elements; idx += BLOCK_SIZE) {
const uint i01 = idx / p.ne00;
const uint i00 = idx % p.ne00;
const uint a = data_a[p.a_offset + i01 * p.nb01 + i00 * p.nb00];
count += uint(a == expert_id);
}
vals[tid] = count;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] += vals[tid + s];
}
barrier();
}
if (tid == 0) {
data_d[expert_id] = vals[0];
}
}
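A hedged scalar C++ sketch of what this preprocessing pass computes, and of how the mul_mat_id shaders later use the per-expert counts to discard workgroups early (names such as n_experts and BN are illustrative):
#include <cstdint>
#include <vector>
// Count how many routed rows each expert receives (the shader above does this in parallel).
std::vector<uint32_t> count_expert_rows(const std::vector<uint32_t> & expert_ids, uint32_t n_experts) {
    std::vector<uint32_t> counts(n_experts, 0);
    for (uint32_t id : expert_ids) {
        counts[id]++;
    }
    return counts;
}
// A workgroup covering output columns [ic*BN, (ic+1)*BN) of expert e can exit
// immediately when that expert has no rows in the range, mirroring the
// "if (ic * BN >= data_expert_count[expert_idx]) return;" checks below.
bool workgroup_has_work(uint32_t ic, uint32_t BN, uint32_t expert_count) {
    return ic * BN < expert_count;
}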

View File

@@ -401,13 +401,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
const uint qshift = (iqs & 16) >> 2;
u8vec4 qs = u8vec4(
data_a[a_offset + ib].qs[iq + 0],
data_a[a_offset + ib].qs[iq + 1],
data_a[a_offset + ib].qs[iq + 2],
data_a[a_offset + ib].qs[iq + 3]
);
qs = (qs >> qshift) & uint8_t(0xF);
const u8vec4 qs = unpack8((data_a_packed32[a_offset + ib].qs[iq/4] >> qshift) & 0x0F0F0F0F);
const float dl = float(int(sl | (sh << 4)) - 32);
return dl * vec4(

View File

@@ -6,4 +6,6 @@ layout (push_constant) uniform parameter
uint KY;
float param1;
float param2;
float param3;
float param4;
} p;

View File

@@ -19,6 +19,7 @@ layout (push_constant) uniform parameter
int s0; int s1;
int p0; int p1;
int d0; int d1;
uint batch_IC;
} p;
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
@@ -34,12 +35,12 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
#endif
void main() {
void im2col(const uint y, const uint z) {
const uint gidx = gl_GlobalInvocationID.x;
const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
const uint oh = y;
const uint batch = z / p.IC;
const uint ic = z % p.IC;
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
@@ -101,3 +102,15 @@ void main() {
#endif
}
}
void main() {
uint y = gl_GlobalInvocationID.y;
while (y < p.OH) {
uint z = gl_GlobalInvocationID.z;
while (z < p.batch_IC) {
im2col(y, z);
z += gl_NumWorkGroups.z;
}
y += gl_NumWorkGroups.y;
}
}
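The new main() drives im2col() with grid-stride loops over y and z, so output heights and batch*IC counts larger than the dispatched grid are still covered. A hedged C++ sketch of the same pattern (names are illustrative):
// Each invocation starts at its own (y0, z0) and strides by the grid size,
// so every (y, z) pair is visited even when OH or batch_IC exceed the dispatch.
void im2col_grid_stride(unsigned y0, unsigned z0, unsigned num_groups_y, unsigned num_groups_z,
                        unsigned OH, unsigned batch_IC) {
    for (unsigned y = y0; y < OH; y += num_groups_y) {
        for (unsigned z = z0; z < batch_IC; z += num_groups_z) {
            // im2col(y, z);  // per-element work, as in the shader
        }
    }
}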

View File

@@ -11,36 +11,54 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const uint y_idx = i * QUANT_K + 16 * itid;
const uint nibble_shift = 4 * (itid & 1);
const uint ib32 = itid / 2; // 0..7
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
// Precompute db multiplication factors
float db_vals[NUM_ROWS];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const float d = float(data_a[ibi].d);
const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
const float db = d * (0.5 + scale) * 0.25;
const uint scale_raw = data_a[ibi].scales[ib32];
const uint scale = (scale_raw >> nibble_shift) & 0xF;
// Merge constant calculations d * (0.5 + scale) * 0.25 = d*0.125 + d*scale*0.25
db_vals[n] = d * (0.125f + float(scale) * 0.25f);
ibi += num_blocks_per_row;
}
ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
// Preload grid and sign data for all l values
vec4 grid0_vals[2], grid1_vals[2];
uint sign_vals[2], sign7_vals[2];
[[unroll]] for (uint l = 0; l < 2; ++l) {
const uint qs = data_a[ibi].qs[2 * itid + l];
const uint sign = qs >> 9;
const uint sign7 = bitCount(sign);
const vec4 grid0 = vec4(unpack8(iq2xs_grid[qs & 511].x));
const vec4 grid1 = vec4(unpack8(iq2xs_grid[qs & 511].y));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
FLOAT_TYPE sum =
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
FLOAT_TYPE(0.0)))))))));
temp[j][n] = fma(db, sum, temp[j][n]);
sign_vals[l] = qs >> 9;
sign7_vals[l] = bitCount(sign_vals[l]);
const uvec2 grid_data = iq2xs_grid[qs & 511];
grid0_vals[l] = vec4(unpack8(grid_data.x));
grid1_vals[l] = vec4(unpack8(grid_data.y));
}
// Preload B data for all j columns (reduce repeated index calculations)
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (uint l = 0; l < 2; ++l) {
const uint sign = sign_vals[l];
const uint sign7 = sign7_vals[l];
const vec4 grid0 = grid0_vals[l];
const vec4 grid1 = grid1_vals[l];
// Precompute indices
const uint b_idx = (j * p.batch_stride_b + b_offset + y_idx) / 4 + 2 * l;
const vec4 b0 = vec4(data_b_v4[b_idx + 0]);
const vec4 b4 = vec4(data_b_v4[b_idx + 1]);
sum +=
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
FLOAT_TYPE(0.0)))))))));
}
temp[j][n] = fma(FLOAT_TYPE(db_vals[n]), sum, temp[j][n]);
}
ibi += num_blocks_per_row;
}

View File

@@ -68,6 +68,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
#endif
layout (push_constant) uniform parameter
@@ -135,13 +136,19 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
#include "mul_mm_funcs.glsl"
void main() {
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
if (ic * BN >= data_expert_count[expert_idx]) {
return;
}
#endif
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
#endif
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
#else
#ifndef MUL_MAT_ID
const uint batch_idx = gl_GlobalInvocationID.z;
const uint i13 = batch_idx / p.ne12;
@@ -156,7 +163,6 @@ void main() {
const uint blocks_m = (p.M + BM - 1) / BM;
const uint ir = gl_WorkGroupID.x % blocks_m;
const uint ik = gl_WorkGroupID.x / blocks_m;
const uint ic = gl_WorkGroupID.y;
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
const uint WSUBM = WM / WMITER;

View File

@@ -92,6 +92,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
shared u16vec4 row_ids[BN];
@@ -107,11 +108,7 @@ B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const i
{
const uint row_i = blockCoords[0];
if (row_i >= _ne1) {
return B_TYPE(0.0);
}
const u16vec4 row_idx = row_ids[row_i & (BN - 1)];
const u16vec4 row_idx = row_ids[row_i];
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
return ret;
@@ -138,6 +135,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
uint ids[16];
uint iter = 0;
uint expert_count = data_expert_count[expert_idx];
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
@@ -185,7 +184,7 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
}
_ne1 += total;
iter &= 15;
if (_ne1 >= (ic + 1) * BN) {
if (_ne1 >= (ic + 1) * BN || _ne1 == expert_count) {
break;
}
}
@@ -194,15 +193,28 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
#endif
void main() {
const uint tid = gl_LocalInvocationIndex;
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
if (ic * BN >= data_expert_count[expert_idx]) {
return;
}
// initialize to row 0 so we don't need to bounds check
if (tid < BN) {
row_ids[tid] = u16vec4(0);
}
#if !defined(NEEDS_INIT_IQ_SHMEM)
barrier();
#endif
#endif
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
#endif
const uint tid = gl_LocalInvocationIndex;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
#else
#ifndef MUL_MAT_ID
const uint batch_idx = gl_GlobalInvocationID.z;
const uint i13 = batch_idx / p.ne12;
@@ -217,7 +229,6 @@ void main() {
const uint blocks_m = (p.M + BM - 1) / BM;
const uint ir = gl_WorkGroupID.x % blocks_m;
const uint ik = gl_WorkGroupID.x / blocks_m;
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
if (bitCount(p.nei0) == 1) {
@@ -482,7 +493,7 @@ void main() {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
sum = coopMatMulAdd(mat_a, mat_b, sum);
} else {
@@ -490,7 +501,7 @@ void main() {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
sum = coopMatMulAdd(mat_a, mat_b, sum);
}
@@ -526,7 +537,7 @@ void main() {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
sum = coopMatMulAdd(mat_a, mat_b, sum);
} else {
@@ -534,7 +545,7 @@ void main() {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
sum = coopMatMulAdd(mat_a, mat_b, sum);
}
@@ -571,7 +582,7 @@ void main() {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
#endif
@@ -583,7 +594,7 @@ void main() {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
#endif

View File

@@ -159,14 +159,16 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint is = iqs / 8; // 0..15
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
const uint qsshift = halfsplit * 2; // 0,2,4,6
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
| (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
const float dl = float(data_a[ib].d) * float(us - 32);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)),
dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
const vec2 qs = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> qsshift) & 0x0303).xy);
const vec2 hm = vec2(unpack8(((uint(data_a_packed16[ib].hmask[hmi / 2]) >> (4 * n + halfsplit)) & 0x0101 ^ 0x0101) << 2).xy);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * (qs.x - hm.x),
dl * (qs.y - hm.y));
#elif defined(DATA_A_Q4_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@@ -198,8 +200,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const float d = loadd.x * sc;
const float m = -loadd.y * mbyte;
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m),
fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
const vec2 q = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F).xy);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
fma(d, q.y, m));
#elif defined(DATA_A_Q5_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@@ -213,8 +217,6 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
const uint8_t hm = uint8_t(1 << (iqs / 16));
const vec2 loadd = vec2(data_a[ib].dm);
const uint scidx0 = (is < 4) ? is : (is + 4);
@@ -234,8 +236,12 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const float d = loadd.x * sc;
const float m = -loadd.y * mbyte;
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m),
fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
const uint qs = (uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F;
const uint qh = ((uint(data_a_packed16[ib].qh[qhi / 2]) >> (iqs / 16)) & 0x0101) << 4;
const vec2 q = vec2(unpack8(qs | qh).xy);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
fma(d, q.y, m));
#elif defined(DATA_A_Q6_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@@ -394,11 +400,9 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const float d = float(data_a[ib].d);
const uint qs = data_a[ib].qs[iqs];
const uint signs = pack32(u8vec4(
data_a[ib].qs[is+0],
data_a[ib].qs[is+1],
data_a[ib].qs[is+2],
data_a[ib].qs[is+3]
const uint signs = pack32(u16vec2(
data_a_packed16[ib].qs[is/2],
data_a_packed16[ib].qs[is/2+1]
));
const float db = d * 0.5 * (0.5 + (signs >> 28));
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
@@ -443,8 +447,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
const uint qshift = (idx & 8) >> 1;
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
qs = (qs >> qshift) & uint8_t(0xF);
u8vec2 qs = unpack8((uint(data_a_packed16[ib].qs[iq/2]) >> qshift) & 0x0F0F).xy;
const float d = float(data_a[ib].d);
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);

View File

@@ -13,6 +13,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
uint ids[16];
uint iter = 0;
uint expert_count = data_expert_count[expert_idx];
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
@@ -60,7 +62,7 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
}
_ne1 += total;
iter &= 15;
if (_ne1 >= (ic + 1) * BN) {
if (_ne1 >= (ic + 1) * BN || _ne1 == expert_count) {
break;
}
}

View File

@@ -35,6 +35,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
#endif
layout (push_constant) uniform parameter
@@ -104,13 +105,19 @@ block_b_cache cache_b;
#include "mul_mmq_funcs.glsl"
void main() {
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
if (ic * BN >= data_expert_count[expert_idx]) {
return;
}
#endif
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
#endif
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
#else
#ifndef MUL_MAT_ID
const uint batch_idx = gl_GlobalInvocationID.z;
const uint i13 = batch_idx / p.ne12;
@@ -125,7 +132,6 @@ void main() {
const uint blocks_m = (p.M + BM - 1) / BM;
const uint ir = gl_WorkGroupID.x % blocks_m;
const uint ik = gl_WorkGroupID.x / blocks_m;
const uint ic = gl_WorkGroupID.y;
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
const uint WSUBM = WM / WMITER;

View File

@@ -49,8 +49,8 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride;
@@ -91,7 +91,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
@@ -132,9 +132,16 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);

View File

@@ -6,6 +6,9 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x;
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
return;
}
rope_multi(i0, i1, pc);
}

View File

@@ -6,6 +6,9 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x;
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
return;
}
rope_neox(i0, i1, pc);
}
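Both rope entry points above now derive the row index from two dispatch dimensions, which lets a very large row count be split across z instead of relying on a single dimension; invocations past nrows simply return. A hedged C++ sketch of the index math (the 32768 factor mirrors the shader; the function name is illustrative):
#include <cstdint>
// Returns false when this invocation has no row to process.
bool rope_row_index(uint32_t gid_x, uint32_t gid_z, uint32_t nrows, uint32_t * i1_out) {
    const uint32_t i1 = gid_x + 32768u * gid_z; // i1 = x + 32768 * z, as in the shaders
    if (i1 >= nrows) {
        return false;
    }
    *i1_out = i1;
    return true;
}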

Some files were not shown because too many files have changed in this diff