mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-08 10:04:10 +00:00
Compare commits
91 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d1c84a662d | ||
|
|
364a7a6d4a | ||
|
|
2df5bcf357 | ||
|
|
075c01567b | ||
|
|
a014310374 | ||
|
|
35fb82497e | ||
|
|
3c62aed89f | ||
|
|
f1eb1cb1eb | ||
|
|
de41f2b7bf | ||
|
|
a74a0d69f3 | ||
|
|
5f7e166cbf | ||
|
|
d72f5f7ba2 | ||
|
|
b77e6c18e1 | ||
|
|
2ddd3f2356 | ||
|
|
4d3d455d3c | ||
|
|
c9b1c06467 | ||
|
|
b6ae75afb4 | ||
|
|
b6dff20e2f | ||
|
|
2db78c75e4 | ||
|
|
02463ab27b | ||
|
|
adc76347d7 | ||
|
|
3a2bdcda0b | ||
|
|
66bb7985c3 | ||
|
|
2f61c0f5bf | ||
|
|
3ffd0fae47 | ||
|
|
a4a0aa5ea2 | ||
|
|
92cd103f62 | ||
|
|
b887d2f341 | ||
|
|
bd0af02fc9 | ||
|
|
d9e0e7c819 | ||
|
|
0124ac989f | ||
|
|
2811c65286 | ||
|
|
d8359f5fde | ||
|
|
6a2c6145a0 | ||
|
|
3b53634fe3 | ||
|
|
1384abf8b8 | ||
|
|
e6d65fb02d | ||
|
|
8656f5de68 | ||
|
|
4807e8f96a | ||
|
|
c0bfc57af4 | ||
|
|
75a3a6c2cd | ||
|
|
0499b29c6f | ||
|
|
234e2ff8ed | ||
|
|
3f81b4e91c | ||
|
|
ace6a54565 | ||
|
|
72b24d96c6 | ||
|
|
624207e676 | ||
|
|
807e8c6d31 | ||
|
|
1a18927894 | ||
|
|
e0539eb6ae | ||
|
|
5d0a40f390 | ||
|
|
d12a983659 | ||
|
|
cc1cfa277b | ||
|
|
54dbc37053 | ||
|
|
b995a10760 | ||
|
|
4710dd31bb | ||
|
|
9b26511857 | ||
|
|
00217cd413 | ||
|
|
3b337b01a1 | ||
|
|
a86a580a66 | ||
|
|
0f7c69689f | ||
|
|
835b2b915c | ||
|
|
b05a9d650f | ||
|
|
27052978e4 | ||
|
|
077c94d0ca | ||
|
|
aa3ee0eb0b | ||
|
|
d0991da39d | ||
|
|
aa719c2f88 | ||
|
|
4cdd0bb453 | ||
|
|
b5bd037832 | ||
|
|
dfcd53f7ec | ||
|
|
4ea00794b8 | ||
|
|
02a6a82ae7 | ||
|
|
c498fc82fe | ||
|
|
e7a5130a20 | ||
|
|
bee378e098 | ||
|
|
5fb557653b | ||
|
|
4ae88d07d0 | ||
|
|
e789095502 | ||
|
|
f2a789e334 | ||
|
|
3a59971967 | ||
|
|
63b54c81a6 | ||
|
|
152729f884 | ||
|
|
c0c59c1157 | ||
|
|
7735706b93 | ||
|
|
4d9ea03d17 | ||
|
|
8ba548dae2 | ||
|
|
f505bd83ca | ||
|
|
0889589dbe | ||
|
|
4e29084ba4 | ||
|
|
f6b4af3d04 |
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.2.0
|
||||
ARG MUSA_VERSION=rc4.3.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
|
||||
@@ -2,10 +2,10 @@ ARG GCC_VERSION=15.2.0
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
### Build Llama.cpp stage
|
||||
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
|
||||
FROM gcc:${GCC_VERSION} AS build
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
--mount=type=cache,target=/var/lib/apt/lists \
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
||||
apt update -y && \
|
||||
apt upgrade -y && \
|
||||
apt install -y --no-install-recommends \
|
||||
@@ -40,7 +40,7 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
|
||||
|
||||
|
||||
### Collect all llama.cpp binaries, libraries and distro libraries
|
||||
FROM --platform=linux/s390x scratch AS collector
|
||||
FROM scratch AS collector
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
@@ -49,13 +49,14 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
|
||||
|
||||
### Base image
|
||||
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
--mount=type=cache,target=/var/lib/apt/lists \
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
||||
apt update -y && \
|
||||
apt install -y --no-install-recommends \
|
||||
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
|
||||
# See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
|
||||
curl libgomp1 libopenblas-dev && \
|
||||
apt autoremove -y && \
|
||||
apt clean -y && \
|
||||
@@ -68,13 +69,13 @@ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
|
||||
|
||||
|
||||
### Full
|
||||
FROM --platform=linux/s390x base AS full
|
||||
FROM base AS full
|
||||
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
WORKDIR /app
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
--mount=type=cache,target=/var/lib/apt/lists \
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
||||
apt update -y && \
|
||||
apt install -y \
|
||||
git cmake libjpeg-dev \
|
||||
@@ -97,7 +98,7 @@ ENTRYPOINT [ "/app/tools.sh" ]
|
||||
|
||||
|
||||
### CLI Only
|
||||
FROM --platform=linux/s390x base AS light
|
||||
FROM base AS light
|
||||
|
||||
WORKDIR /llama.cpp/bin
|
||||
|
||||
@@ -108,7 +109,7 @@ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
|
||||
|
||||
|
||||
### Server
|
||||
FROM --platform=linux/s390x base AS server
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
|
||||
52
.github/workflows/build-amd.yml
vendored
Normal file
52
.github/workflows/build-amd.yml
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
name: CI (AMD)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-amd.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.comp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ggml-ci-x64-amd-vulkan:
|
||||
runs-on: [self-hosted, Linux, X64, AMD]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-amd-rocm:
|
||||
runs-on: [self-hosted, Linux, X64, AMD]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
amd-smi static
|
||||
GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
135
.github/workflows/build-linux-cross.yml
vendored
135
.github/workflows/build-linux-cross.yml
vendored
@@ -141,97 +141,6 @@ jobs:
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-ppc64el-cpu-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup PowerPC64le
|
||||
run: |
|
||||
sudo dpkg --add-architecture ppc64el
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-powerpc64le-linux-gnu \
|
||||
g++-14-powerpc64le-linux-gnu
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
|
||||
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
# ubuntu-24-ppc64el-vulkan-cross:
|
||||
# runs-on: ubuntu-24.04
|
||||
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup PowerPC64le
|
||||
# run: |
|
||||
# sudo dpkg --add-architecture ppc64el
|
||||
|
||||
# # Add arch-specific repositories for non-amd64 architectures
|
||||
# cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
|
||||
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
# EOF
|
||||
|
||||
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# build-essential \
|
||||
# glslc \
|
||||
# gcc-14-powerpc64le-linux-gnu \
|
||||
# g++-14-powerpc64le-linux-gnu \
|
||||
# libvulkan-dev:ppc64el
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_VULKAN=ON \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
# -DLLAMA_BUILD_TOOLS=ON \
|
||||
# -DLLAMA_BUILD_TESTS=OFF \
|
||||
# -DCMAKE_SYSTEM_NAME=Linux \
|
||||
# -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
|
||||
# -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
|
||||
# -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
|
||||
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
|
||||
debian-13-loongarch64-cpu-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
||||
@@ -344,3 +253,47 @@ jobs:
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-riscv64-cpu-spacemit-ime-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
SPACEMIT_IME_TOOLCHAIN_PATH: "spacemit-toolchain-linux-glibc-x86_64"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Cache Toolchain
|
||||
uses: actions/cache@v4
|
||||
id: cache-spacemit-ime-cross-toolchain
|
||||
with:
|
||||
path: ./${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
|
||||
key: ${{ runner.os }}-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||
|
||||
- name: Setup Toolchain
|
||||
if: steps.cache-spacemit-ime-cross-toolchain.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
wget --quiet --no-check-certificate https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}.tar.xz -O ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
|
||||
rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
|
||||
mkdir -p ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
|
||||
tar xf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz -C ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }} --strip-components=1
|
||||
rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
export RISCV_ROOT_PATH=${PWD}/${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
60
.github/workflows/build-riscv-native.yml
vendored
60
.github/workflows/build-riscv-native.yml
vendored
@@ -58,3 +58,63 @@ jobs:
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
# debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
|
||||
# runs-on: [self-hosted, RISCV64]
|
||||
|
||||
# steps:
|
||||
# - name: Install prerequisites
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y libatomic1
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup Riscv
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# build-essential \
|
||||
# gcc-14-riscv64-linux-gnu \
|
||||
# g++-14-riscv64-linux-gnu \
|
||||
# ccache \
|
||||
# cmake
|
||||
# sudo apt-get upgrade binutils -y
|
||||
|
||||
# - name: Setup ccache
|
||||
# run: |
|
||||
# mkdir -p $HOME/.ccache
|
||||
# ccache -M 5G -d $HOME/.ccache
|
||||
# export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||
# export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||
# echo "$GITHUB_WORKSPACE"
|
||||
# echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||
# echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build \
|
||||
# -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
# -DLLAMA_BUILD_TOOLS=ON \
|
||||
# -DLLAMA_BUILD_TESTS=OFF \
|
||||
# -DCMAKE_SYSTEM_NAME=Linux \
|
||||
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
# -DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
|
||||
# -DGGML_RVV=ON \
|
||||
# -DGGML_RV_ZFH=ON \
|
||||
# -DGGML_RV_ZICBOP=ON \
|
||||
# -DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
# -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
|
||||
257
.github/workflows/build.yml
vendored
257
.github/workflows/build.yml
vendored
@@ -192,6 +192,10 @@ jobs:
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-22.04-arm
|
||||
- build: 's390x'
|
||||
os: ubuntu-24.04-s390x
|
||||
- build: 'ppc64le'
|
||||
os: ubuntu-24.04-ppc64le
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
@@ -206,11 +210,28 @@ jobs:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev \
|
||||
libjpeg-dev build-essential libcurl4-openssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Swap Endianness
|
||||
id: endianness
|
||||
if: ${{ matrix.build == 's390x' }}
|
||||
run: |
|
||||
for f in models/*.gguf; do
|
||||
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
|
||||
done
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -228,6 +249,7 @@ jobs:
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
if: ${{ matrix.build != 's390x' }}
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
@@ -237,6 +259,15 @@ jobs:
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Test llama2c (s390x)
|
||||
id: llama2c_test_s390x
|
||||
if: ${{ matrix.build == 's390x' }}
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch llama2c big-endian model"
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -475,7 +506,7 @@ jobs:
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -1191,11 +1222,12 @@ jobs:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: android-build
|
||||
evict-old-files: 1d
|
||||
# Disabled due to size (400MB) and always 0 cache hits
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: android-build
|
||||
# evict-old-files: 1d
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v3
|
||||
@@ -1251,59 +1283,132 @@ jobs:
|
||||
# TODO: simplify the following workflows using a matrix
|
||||
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
|
||||
ggml-ci-x64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, X64, CPU, low-perf]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ggml-ci-x64-cpu-low-perf
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU, low-perf]
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-low-perf
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, X64, CPU, high-perf]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ggml-ci-x64-cpu-high-perf
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU, high-perf]
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-high-perf
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-nvidia-v100-cuda:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
|
||||
ggml-ci-arm64-cpu-high-perf-sve:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-high-perf-sve
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-nvidia-cuda:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -1316,8 +1421,8 @@ jobs:
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-nvidia-v100-vulkan:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
|
||||
ggml-ci-x64-nvidia-vulkan-cm:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -1327,51 +1432,23 @@ jobs:
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-nvidia-t4-cuda:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-nvidia-t4-vulkan:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-nvidia-t4-vulkan-coopmat1:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-nvidia-vulkan-cm2:
|
||||
runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-cpu-amx:
|
||||
runs-on: [self-hosted, Linux, X64, CPU, AMX]
|
||||
|
||||
@@ -1385,32 +1462,6 @@ jobs:
|
||||
run: |
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-amd-v710-vulkan:
|
||||
runs-on: [self-hosted, Linux, X64, AMD, V710]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-amd-v710-rocm:
|
||||
runs-on: [self-hosted, Linux, X64, AMD, V710]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
@@ -1424,16 +1475,16 @@ jobs:
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: install vulkan drivers
|
||||
# ggml-ci-mac-vulkan:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
ggml-ci-mac-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
65
.github/workflows/docker.yml
vendored
65
.github/workflows/docker.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ${{ matrix.config.runs_on }}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
@@ -39,12 +39,12 @@ jobs:
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
steps:
|
||||
@@ -54,6 +54,7 @@ jobs:
|
||||
fetch-depth: 0 # preserve git history, so we can determine the build number
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ matrix.config.tag != 's390x' }}
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
@@ -68,22 +69,19 @@ jobs:
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine tag name
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Determine image tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
|
||||
# determine tag name postfix (build number, commit hash)
|
||||
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
||||
TAG_POSTFIX="-b${BUILD_NUMBER}"
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
|
||||
fi
|
||||
# list all tags possible
|
||||
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
@@ -91,9 +89,9 @@ jobs:
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
fi
|
||||
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||
@@ -101,7 +99,6 @@ jobs:
|
||||
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
|
||||
echo "server_output_tags=$SERVERTAGS" # print out for debugging
|
||||
env:
|
||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
@@ -177,3 +174,29 @@ jobs:
|
||||
# return to this if the experimental github cache is having issues
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
|
||||
create_tag:
|
||||
name: Create and push git tag
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Create and push git tag
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
git tag ${{ steps.srctag.outputs.name }} || exit 0
|
||||
git push origin ${{ steps.srctag.outputs.name }} || exit 0
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -149,6 +149,6 @@ poetry.toml
|
||||
/run-chat.sh
|
||||
.ccache/
|
||||
|
||||
# Code Workspace
|
||||
# IDE
|
||||
*.code-workspace
|
||||
|
||||
.windsurf/
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
#### Tailwind & CSS
|
||||
|
||||
- We are using Tailwind v4 which uses oklch colors so we now want to refer to the CSS vars directly, without wrapping it with any color function like `hsla/hsl`, `rgba` etc.
|
||||
@@ -1,48 +0,0 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
# Coding rules
|
||||
|
||||
## Svelte & SvelteKit
|
||||
|
||||
### Services vs Stores Separation Pattern
|
||||
|
||||
#### `lib/services/` - Pure Business Logic
|
||||
|
||||
- **Purpose**: Stateless business logic and external communication
|
||||
- **Contains**:
|
||||
- API calls to external services (ApiService)
|
||||
- Pure business logic functions (ChatService, etc.)
|
||||
- **Rules**:
|
||||
- NO Svelte runes ($state, $derived, $effect)
|
||||
- NO reactive state management
|
||||
- Pure functions and classes only
|
||||
- Can import types but not stores
|
||||
- Focus on "how" - implementation details
|
||||
|
||||
#### `lib/stores/` - Reactive State Management
|
||||
|
||||
- **Purpose**: Svelte-specific reactive state with runes
|
||||
- **Contains**:
|
||||
- Reactive state classes with $state, $derived, $effect
|
||||
- Database operations (DatabaseStore)
|
||||
- UI-focused state management
|
||||
- Store orchestration logic
|
||||
- **Rules**:
|
||||
- USE Svelte runes for reactivity
|
||||
- Import and use services for business logic
|
||||
- NO direct database operations
|
||||
- NO direct API calls (use services)
|
||||
- Focus on "what" - reactive state for UI
|
||||
|
||||
#### Enforcement
|
||||
|
||||
- Services should be testable without Svelte
|
||||
- Stores should leverage Svelte's reactivity system
|
||||
- Clear separation: services handle data, stores handle state
|
||||
- Services can be reused across multiple stores
|
||||
|
||||
#### Misc
|
||||
|
||||
- Always use `let` for $derived state variables
|
||||
@@ -1,9 +0,0 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
# Automated Tests
|
||||
|
||||
## General rules
|
||||
|
||||
- NEVER include any test code in the production code - we should always have it in a separate dedicated files
|
||||
@@ -1,7 +0,0 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
## TypeScript
|
||||
|
||||
- Add JSDocs for functions
|
||||
@@ -92,6 +92,7 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
|
||||
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
|
||||
23
CODEOWNERS
23
CODEOWNERS
@@ -50,6 +50,7 @@
|
||||
/ggml/src/ggml-blas/ @slaren
|
||||
/ggml/src/ggml-common.h @ggerganov @slaren
|
||||
/ggml/src/ggml-cpu/ @ggerganov @slaren
|
||||
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
||||
/ggml/src/ggml-cuda/common.cuh @slaren
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
|
||||
@@ -59,11 +60,13 @@
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-impl.h @ggerganov @slaren
|
||||
/ggml/src/ggml-metal/ @ggerganov
|
||||
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-quants.* @ggerganov
|
||||
/ggml/src/ggml-rpc/ @rgerganov
|
||||
/ggml/src/ggml-threading.* @ggerganov @slaren
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
/ggml/src/ggml-zdnn/ @taronaeo
|
||||
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml.c @ggerganov @slaren
|
||||
/ggml/src/ggml.cpp @ggerganov @slaren
|
||||
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
||||
@@ -89,18 +92,20 @@
|
||||
/tools/mtmd/ @ngxson
|
||||
/tools/perplexity/ @ggerganov
|
||||
/tools/quantize/ @ggerganov
|
||||
/tools/rpc/ @rgerganov
|
||||
/tools/run/ @ericcurtin
|
||||
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
|
||||
/tools/server/webui/ @allozaur
|
||||
/tools/tokenize/ @ggerganov
|
||||
/tools/tts/ @ggerganov
|
||||
/vendor/ @ggerganov
|
||||
.clang-format @slaren
|
||||
.clang-tidy @slaren
|
||||
AUTHORS @ggerganov
|
||||
CMakeLists.txt @ggerganov
|
||||
CONTRIBUTING.md @ggerganov
|
||||
LICENSE @ggerganov
|
||||
README.md @ggerganov
|
||||
SECURITY.md @ggerganov
|
||||
/.clang-format @slaren
|
||||
/.clang-tidy @slaren
|
||||
/AUTHORS @ggerganov
|
||||
/CMakeLists.txt @ggerganov
|
||||
/CONTRIBUTING.md @ggerganov
|
||||
/LICENSE @ggerganov
|
||||
/README.md @ggerganov
|
||||
/SECURITY.md @ggerganov
|
||||
/build-xcframework.sh @danbev
|
||||
requirements*.txt @CISC
|
||||
|
||||
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
|
||||
- Squash-merge PRs
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers, merge their own PRs
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
|
||||
@@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
|
||||
|
||||
@@ -422,6 +422,7 @@ echo "Building for iOS devices..."
|
||||
cmake -B build-ios-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_SYSROOT=iphoneos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
|
||||
@@ -21,7 +21,7 @@ docker run --privileged -it \
|
||||
-v $HOME/llama.cpp/ci-cache:/ci-cache \
|
||||
-v $HOME/llama.cpp/ci-results:/ci-results \
|
||||
-v $PWD:/ws -w /ws \
|
||||
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
```
|
||||
|
||||
Inside the container, execute the following commands:
|
||||
|
||||
34
ci/run.sh
34
ci/run.sh
@@ -92,6 +92,12 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
@@ -103,6 +109,12 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
|
||||
MUSA_ARCH=${MUSA_ARCH:-21}
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_NO_SVE} ]; then
|
||||
# arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
|
||||
fi
|
||||
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
@@ -339,16 +351,16 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k
|
||||
./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
|
||||
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
|
||||
@@ -421,7 +433,7 @@ function gg_run_qwen3_0_6b {
|
||||
function gg_sum_qwen3_0_6b {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Pythia 2.8B:\n'
|
||||
gg_printf 'Qwen3 0.6B:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
|
||||
29
cmake/riscv64-spacemit-linux-gnu-gcc.cmake
Normal file
29
cmake/riscv64-spacemit-linux-gnu-gcc.cmake
Normal file
@@ -0,0 +1,29 @@
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
set(CMAKE_SYSTEM_PROCESSOR riscv64)
|
||||
set(CMAKE_SYSTEM_VERSION 1)
|
||||
|
||||
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
|
||||
message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
else()
|
||||
set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
|
||||
if (DEFINED ENV{RISCV_ROOT_PATH})
|
||||
file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
|
||||
else()
|
||||
message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
|
||||
endif()
|
||||
|
||||
set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
|
||||
set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
|
||||
set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
|
||||
set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
|
||||
set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
|
||||
set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
|
||||
endif()
|
||||
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
|
||||
@@ -87,7 +87,43 @@ if (LLAMA_CURL)
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
||||
include_directories(${CURL_INCLUDE_DIRS})
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
if (LLAMA_OPENSSL)
|
||||
find_package(OpenSSL)
|
||||
if (OpenSSL_FOUND)
|
||||
include(CheckCSourceCompiles)
|
||||
set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
|
||||
check_c_source_compiles("
|
||||
#include <openssl/opensslv.h>
|
||||
#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
|
||||
# if OPENSSL_VERSION_NUMBER < 0x1010107f
|
||||
# error bad version
|
||||
# endif
|
||||
#else
|
||||
# if OPENSSL_VERSION_NUMBER < 0x30000000L
|
||||
# error bad version
|
||||
# endif
|
||||
#endif
|
||||
int main() { return 0; }
|
||||
" OPENSSL_VERSION_SUPPORTED)
|
||||
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
|
||||
if (OPENSSL_VERSION_SUPPORTED)
|
||||
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
|
||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
|
||||
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
|
||||
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
|
||||
find_library(SECURITY_FRAMEWORK Security REQUIRED)
|
||||
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "OpenSSL not found, SSL support disabled")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
|
||||
401
common/arg.cpp
401
common/arg.cpp
@@ -37,6 +37,8 @@
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#else
|
||||
#include <cpp-httplib/httplib.h>
|
||||
#endif
|
||||
|
||||
#ifdef __linux__
|
||||
@@ -215,12 +217,55 @@ struct common_hf_file_res {
|
||||
std::string mmprojFile;
|
||||
};
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
bool common_has_curl() {
|
||||
return true;
|
||||
static void write_etag(const std::string & path, const std::string & etag) {
|
||||
const std::string etag_path = path + ".etag";
|
||||
write_file(etag_path, etag);
|
||||
LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
|
||||
}
|
||||
|
||||
static std::string read_etag(const std::string & path) {
|
||||
std::string none;
|
||||
const std::string etag_path = path + ".etag";
|
||||
|
||||
if (std::filesystem::exists(etag_path)) {
|
||||
std::ifstream etag_in(etag_path);
|
||||
if (!etag_in) {
|
||||
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
|
||||
return none;
|
||||
}
|
||||
std::string etag;
|
||||
std::getline(etag_in, etag);
|
||||
return etag;
|
||||
}
|
||||
|
||||
// no etag file, but maybe there is an old .json
|
||||
// remove this code later
|
||||
const std::string metadata_path = path + ".json";
|
||||
|
||||
if (std::filesystem::exists(metadata_path)) {
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
try {
|
||||
nlohmann::json metadata_json;
|
||||
metadata_in >> metadata_json;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
|
||||
metadata_json.dump().c_str());
|
||||
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
|
||||
std::string etag = metadata_json.at("etag");
|
||||
write_etag(path, etag);
|
||||
if (!std::filesystem::remove(metadata_path)) {
|
||||
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
|
||||
}
|
||||
return etag;
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
return none;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
@@ -371,36 +416,15 @@ static bool common_download_head(CURL * curl,
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token) {
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
std::string etag;
|
||||
|
||||
// Check if the file already exists locally
|
||||
const auto file_exists = std::filesystem::exists(path);
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
|
||||
metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
etag = read_etag(path);
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
@@ -438,11 +462,6 @@ static bool common_download_file_single_online(const std::string & url,
|
||||
headers.etag.c_str());
|
||||
should_download = true;
|
||||
should_download_from_scratch = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
|
||||
last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
should_download_from_scratch = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -473,15 +492,9 @@ static bool common_download_file_single_online(const std::string & url,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{ "url", url },
|
||||
{ "etag", headers.etag },
|
||||
{ "lastModified", headers.last_modified }
|
||||
});
|
||||
write_file(metadata_path, metadata.dump(4));
|
||||
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
if (head_request_ok) {
|
||||
write_etag(path, headers.etag);
|
||||
}
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
|
||||
@@ -568,21 +581,305 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
|
||||
#else
|
||||
|
||||
bool common_has_curl() {
|
||||
return false;
|
||||
}
|
||||
struct common_url {
|
||||
std::string scheme;
|
||||
std::string user;
|
||||
std::string password;
|
||||
std::string host;
|
||||
std::string path;
|
||||
};
|
||||
|
||||
static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
}
|
||||
static common_url parse_url(const std::string & url) {
|
||||
common_url parts;
|
||||
auto scheme_end = url.find("://");
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
|
||||
if (!url.empty()) {
|
||||
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
|
||||
if (scheme_end == std::string::npos) {
|
||||
throw std::runtime_error("invalid URL: no scheme");
|
||||
}
|
||||
parts.scheme = url.substr(0, scheme_end);
|
||||
|
||||
if (parts.scheme != "http" && parts.scheme != "https") {
|
||||
throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
|
||||
}
|
||||
|
||||
return {};
|
||||
auto rest = url.substr(scheme_end + 3);
|
||||
auto at_pos = rest.find('@');
|
||||
|
||||
if (at_pos != std::string::npos) {
|
||||
auto auth = rest.substr(0, at_pos);
|
||||
auto colon_pos = auth.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
parts.user = auth.substr(0, colon_pos);
|
||||
parts.password = auth.substr(colon_pos + 1);
|
||||
} else {
|
||||
parts.user = auth;
|
||||
}
|
||||
rest = rest.substr(at_pos + 1);
|
||||
}
|
||||
|
||||
auto slash_pos = rest.find('/');
|
||||
|
||||
if (slash_pos != std::string::npos) {
|
||||
parts.host = rest.substr(0, slash_pos);
|
||||
parts.path = rest.substr(slash_pos);
|
||||
} else {
|
||||
parts.host = rest;
|
||||
parts.path = "/";
|
||||
}
|
||||
return parts;
|
||||
}
|
||||
|
||||
static std::pair<httplib::Client, common_url> http_client(const std::string & url) {
|
||||
common_url parts = parse_url(url);
|
||||
|
||||
if (parts.host.empty()) {
|
||||
throw std::runtime_error("error: invalid URL format");
|
||||
}
|
||||
|
||||
if (!parts.user.empty()) {
|
||||
throw std::runtime_error("error: user:password@ not supported yet"); // TODO
|
||||
}
|
||||
|
||||
httplib::Client cli(parts.scheme + "://" + parts.host);
|
||||
cli.set_follow_location(true);
|
||||
|
||||
// TODO cert
|
||||
|
||||
return { std::move(cli), std::move(parts) };
|
||||
}
|
||||
|
||||
static std::string show_masked_url(const common_url & parts) {
|
||||
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
|
||||
}
|
||||
|
||||
static void print_progress(size_t current, size_t total) { // TODO isatty
|
||||
if (!total) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t width = 50;
|
||||
size_t pct = (100 * current) / total;
|
||||
size_t pos = (width * current) / total;
|
||||
|
||||
std::cout << "["
|
||||
<< std::string(pos, '=')
|
||||
<< (pos < width ? ">" : "")
|
||||
<< std::string(width - pos, ' ')
|
||||
<< "] " << std::setw(3) << pct << "% ("
|
||||
<< current / (1024 * 1024) << " MB / "
|
||||
<< total / (1024 * 1024) << " MB)\r";
|
||||
std::cout.flush();
|
||||
}
|
||||
|
||||
static bool common_pull_file(httplib::Client & cli,
|
||||
const std::string & resolve_path,
|
||||
const std::string & path_tmp,
|
||||
bool supports_ranges,
|
||||
size_t existing_size,
|
||||
size_t & total_size) {
|
||||
std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
|
||||
if (!ofs.is_open()) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
httplib::Headers headers;
|
||||
if (supports_ranges && existing_size > 0) {
|
||||
headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
|
||||
}
|
||||
|
||||
std::atomic<size_t> downloaded{existing_size};
|
||||
|
||||
auto res = cli.Get(resolve_path, headers,
|
||||
[&](const httplib::Response &response) {
|
||||
if (existing_size > 0 && response.status != 206) {
|
||||
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
|
||||
return false;
|
||||
}
|
||||
if (existing_size == 0 && response.status != 200) {
|
||||
LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
|
||||
return false;
|
||||
}
|
||||
if (total_size == 0 && response.has_header("Content-Length")) {
|
||||
try {
|
||||
size_t content_length = std::stoull(response.get_header_value("Content-Length"));
|
||||
total_size = existing_size + content_length;
|
||||
} catch (const std::exception &e) {
|
||||
LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
[&](const char *data, size_t len) {
|
||||
ofs.write(data, len);
|
||||
if (!ofs) {
|
||||
LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
|
||||
return false;
|
||||
}
|
||||
downloaded += len;
|
||||
print_progress(downloaded, total_size);
|
||||
return true;
|
||||
},
|
||||
nullptr
|
||||
);
|
||||
|
||||
std::cout << "\n";
|
||||
|
||||
if (!res) {
|
||||
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
auto [cli, parts] = http_client(url);
|
||||
|
||||
httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
|
||||
if (!bearer_token.empty()) {
|
||||
default_headers.insert({"Authorization", "Bearer " + bearer_token});
|
||||
}
|
||||
cli.set_default_headers(default_headers);
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
|
||||
std::string last_etag;
|
||||
if (file_exists) {
|
||||
last_etag = read_etag(path);
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
auto head = cli.Head(parts.path);
|
||||
bool head_ok = head && head->status >= 200 && head->status < 300;
|
||||
if (!head_ok) {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
|
||||
if (file_exists) {
|
||||
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
std::string etag;
|
||||
if (head_ok && head->has_header("ETag")) {
|
||||
etag = head->get_header_value("ETag");
|
||||
}
|
||||
|
||||
size_t total_size = 0;
|
||||
if (head_ok && head->has_header("Content-Length")) {
|
||||
try {
|
||||
total_size = std::stoull(head->get_header_value("Content-Length"));
|
||||
} catch (const std::exception& e) {
|
||||
LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
|
||||
}
|
||||
}
|
||||
|
||||
bool supports_ranges = false;
|
||||
if (head_ok && head->has_header("Accept-Ranges")) {
|
||||
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
|
||||
}
|
||||
|
||||
bool should_download_from_scratch = false;
|
||||
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
|
||||
last_etag.c_str(), etag.c_str());
|
||||
should_download_from_scratch = true;
|
||||
}
|
||||
|
||||
if (file_exists) {
|
||||
if (!should_download_from_scratch) {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
return true;
|
||||
}
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const std::string path_temporary = path + ".downloadInProgress";
|
||||
size_t existing_size = 0;
|
||||
|
||||
if (std::filesystem::exists(path_temporary)) {
|
||||
if (supports_ranges && !should_download_from_scratch) {
|
||||
existing_size = std::filesystem::file_size(path_temporary);
|
||||
} else if (remove(path_temporary.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
|
||||
__func__, show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
|
||||
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
|
||||
if (!was_pull_successful) {
|
||||
if (i + 1 < max_attempts) {
|
||||
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
|
||||
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
} else {
|
||||
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
if (!etag.empty()) {
|
||||
write_etag(path, etag);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
|
||||
const common_remote_params & params) {
|
||||
auto [cli, parts] = http_client(url);
|
||||
|
||||
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
|
||||
for (const auto & header : params.headers) {
|
||||
size_t pos = header.find(':');
|
||||
if (pos != std::string::npos) {
|
||||
headers.emplace(header.substr(0, pos), header.substr(pos + 1));
|
||||
} else {
|
||||
headers.emplace(header, "");
|
||||
}
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
cli.set_read_timeout(params.timeout, 0);
|
||||
cli.set_write_timeout(params.timeout, 0);
|
||||
}
|
||||
|
||||
std::vector<char> buf;
|
||||
auto res = cli.Get(parts.path, headers,
|
||||
[&](const char *data, size_t len) {
|
||||
buf.insert(buf.end(), data, data + len);
|
||||
return params.max_size == 0 ||
|
||||
buf.size() <= static_cast<size_t>(params.max_size);
|
||||
},
|
||||
nullptr
|
||||
);
|
||||
|
||||
if (!res) {
|
||||
throw std::runtime_error("error: cannot make GET request");
|
||||
}
|
||||
|
||||
return { res->status, std::move(buf) };
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
@@ -78,7 +78,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
bool common_has_curl();
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
|
||||
@@ -1616,17 +1616,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
);
|
||||
});
|
||||
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
|
||||
channel + " \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_channel, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||
if (data.grammar_lazy) {
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||
} else {
|
||||
auto not_end = builder.add_rule("not-end",
|
||||
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
|
||||
auto analysis = builder.add_rule("analysis",
|
||||
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
||||
auto commentary = builder.add_rule("commentary",
|
||||
"\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
||||
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root",
|
||||
"( " + analysis + " \"<|start|>assistant\" )? " +
|
||||
"( " + commentary + " \"<|start|>assistant\" )? " +
|
||||
"( " + recipient_in_role + " | " + recipient_in_channel + " )"
|
||||
);
|
||||
}
|
||||
|
||||
// Trigger on tool calls that appear in the commentary channel
|
||||
data.grammar_triggers.push_back({
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <codecvt>
|
||||
#include <chrono>
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
@@ -50,6 +51,11 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/types.h>
|
||||
#include <pwd.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
@@ -864,8 +870,20 @@ std::string fs_get_cache_directory() {
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
} else if (std::getenv("HOME")) {
|
||||
cache_directory = std::getenv("HOME") + std::string("/.cache/");
|
||||
} else {
|
||||
#if defined(__linux__)
|
||||
/* no $HOME is defined, fallback to getpwuid */
|
||||
struct passwd *pw = getpwuid(getuid());
|
||||
if ((!pw) || (!pw->pw_dir)) {
|
||||
throw std::runtime_error("Failed to find $HOME directory");
|
||||
}
|
||||
|
||||
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
|
||||
#else /* defined(__linux__) */
|
||||
throw std::runtime_error("Failed to find $HOME directory");
|
||||
#endif /* defined(__linux__) */
|
||||
}
|
||||
#elif defined(__APPLE__)
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
@@ -960,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
if (!has_eos && !has_sep && !has_rerank_prompt) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
|
||||
@@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
// MoE utils
|
||||
//
|
||||
|
||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
|
||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
|
||||
|
||||
static std::string llm_ffn_exps_block_regex(int idx) {
|
||||
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
||||
|
||||
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
llama_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3717,11 +3717,29 @@ class Qwen2MoeModel(TextModel):
|
||||
class Qwen3Model(Qwen2Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3
|
||||
|
||||
# extra logic for rerank models
|
||||
is_rerank: bool = False
|
||||
is_tied_embeddings: bool = False
|
||||
token_false_id: int | None = None
|
||||
token_true_id: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# track for intern-s1-mini
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# a bit hacky, but currently the only way to detect if this is a rerank model
|
||||
# ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
|
||||
readme_path = self.dir_model / "README.md"
|
||||
readme_text = ""
|
||||
if readme_path.exists():
|
||||
with readme_path.open("r", encoding="utf-8") as f:
|
||||
readme_text = f.read()
|
||||
if "# Qwen3-Reranker" in readme_text:
|
||||
self._find_rerank_config()
|
||||
|
||||
def set_vocab(self):
|
||||
# deal with intern-s1-mini
|
||||
if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
|
||||
@@ -3730,6 +3748,53 @@ class Qwen3Model(Qwen2Model):
|
||||
|
||||
super().set_vocab()
|
||||
|
||||
def _find_rerank_config(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
|
||||
self.is_rerank = True
|
||||
self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
|
||||
self.token_false_id = tokenizer.convert_tokens_to_ids("no")
|
||||
self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
|
||||
self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
|
||||
|
||||
assert self.token_false_id is not None and self.token_true_id is not None
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if self.is_rerank:
|
||||
self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
|
||||
self.gguf_writer.add_classifier_output_labels(["yes", "no"])
|
||||
self.gguf_writer.add_chat_template([{
|
||||
"name": "rerank",
|
||||
"template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
|
||||
"<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n<think>\n\n</think>\n\n"
|
||||
}])
|
||||
|
||||
def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
|
||||
# extract "yes" and "no" tokens from the output lm_head tensor
|
||||
false_row = data_torch[self.token_false_id]
|
||||
true_row = data_torch[self.token_true_id]
|
||||
return torch.stack([true_row, false_row], dim=0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self.is_rerank:
|
||||
is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
|
||||
is_real_head = not self.is_tied_embeddings and "lm_head" in name
|
||||
if is_tied_head or is_real_head:
|
||||
cls_out_head = (
|
||||
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
|
||||
self._get_cls_out_tensor(data_torch),
|
||||
)
|
||||
if is_tied_head:
|
||||
embed = (self.map_tensor_name(name), data_torch)
|
||||
return [cls_out_head, embed]
|
||||
if is_real_head:
|
||||
return [cls_out_head]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3MoeForCausalLM")
|
||||
class Qwen3MoeModel(Qwen2MoeModel):
|
||||
@@ -7930,6 +7995,121 @@ class BailingMoeModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
|
||||
class GroveMoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GROVEMOE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (n_experts := self.hparams.get("num_experts")) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
||||
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
|
||||
self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
|
||||
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
|
||||
self.gguf_writer.add_experts_per_group(2)
|
||||
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
|
||||
self.gguf_writer.add_expert_group_scale(0.05)
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
_chunk_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.endswith(".expert_bias"):
|
||||
# FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
|
||||
return []
|
||||
|
||||
# process the experts separately
|
||||
if name.find("chunk_experts") != -1:
|
||||
n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
|
||||
assert bid is not None
|
||||
|
||||
if self._chunk_experts is None:
|
||||
self._chunk_experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._chunk_experts[bid][name] = data_torch
|
||||
|
||||
if len(self._chunk_experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._chunk_experts[bid][ename])
|
||||
del self._chunk_experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
elif name.find("experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._chunk_experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
|
||||
if len(chunk_experts) > 0:
|
||||
raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ChameleonForConditionalGeneration")
|
||||
@ModelBase.register("ChameleonForCausalLM") # obsolete
|
||||
class ChameleonModel(TextModel):
|
||||
|
||||
89
docs/build-riscv64-spacemit.md
Normal file
89
docs/build-riscv64-spacemit.md
Normal file
@@ -0,0 +1,89 @@
|
||||
> [!IMPORTANT]
|
||||
> This build documentation is specific only to RISC-V SpacemiT SOCs.
|
||||
|
||||
## Build llama.cpp locally (for riscv64)
|
||||
|
||||
1. Prepare Toolchain For RISCV
|
||||
~~~
|
||||
wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
|
||||
~~~
|
||||
|
||||
2. Build
|
||||
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
|
||||
```bash
|
||||
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
|
||||
-DCMAKE_INSTALL_PREFIX=build/installed
|
||||
|
||||
cmake --build build --parallel $(nproc) --config Release
|
||||
|
||||
pushd build
|
||||
make install
|
||||
popd
|
||||
```
|
||||
|
||||
## Simulation
|
||||
You can use QEMU to perform emulation on non-RISC-V architectures.
|
||||
|
||||
1. Download QEMU
|
||||
~~~
|
||||
wget https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
|
||||
~~~
|
||||
|
||||
2. Run Simulation
|
||||
After build your llama.cpp, you can run the executable file via QEMU for simulation, for example:
|
||||
~~~
|
||||
export QEMU_ROOT_PATH={your QEMU file path}
|
||||
export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
|
||||
|
||||
${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
|
||||
~~~
|
||||
## Performance
|
||||
#### Quantization Support For Matrix
|
||||
~~~
|
||||
model name : Spacemit(R) X60
|
||||
isa : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
|
||||
mmu : sv39
|
||||
uarch : spacemit,x60
|
||||
mvendorid : 0x710
|
||||
marchid : 0x8000000058000001
|
||||
~~~
|
||||
|
||||
Q4_0
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | pp512|64.12 ± 0.26|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | tg128|10.03 ± 0.01|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | pp512|24.16 ± 0.02|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | tg128|3.83 ± 0.06|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | pp512|12.08 ± 0.02|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | tg128|2.23 ± 0.02|
|
||||
|
||||
Q4_1
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | pp512|62.07 ± 0.12|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | tg128|9.91 ± 0.01|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | pp512|22.95 ± 0.25|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | tg128|4.01 ± 0.15|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | pp512|11.55 ± 0.16|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | tg128|2.25 ± 0.04|
|
||||
|
||||
|
||||
Q4_K
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | pp512|9.29 ± 0.05|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | tg128|5.67 ± 0.04|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | pp512|10.38 ± 0.10|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | tg128|3.17 ± 0.08|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | pp512|4.23 ± 0.04|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | tg128|1.73 ± 0.00|
|
||||
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `MUSA_VERSION` set to `rc4.2.0`
|
||||
- `MUSA_VERSION` set to `rc4.3.0`
|
||||
|
||||
The resulting images, are essentially the same as the non-MUSA images:
|
||||
|
||||
|
||||
@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
|
||||
params.n_batch = params.n_ctx;
|
||||
}
|
||||
|
||||
// For non-causal models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
// for non-causal models, batch size must be equal to ubatch size
|
||||
if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
|
||||
params.n_ubatch = params.n_batch;
|
||||
}
|
||||
|
||||
// get max number of sequences per batch
|
||||
const int n_seq_max = llama_max_parallel_sequences();
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
|
||||
// get added sep and eos token, if any
|
||||
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
|
||||
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
|
||||
const char * rerank_prompt = llama_model_chat_template(model, "rerank");
|
||||
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
@@ -153,21 +159,28 @@ int main(int argc, char ** argv) {
|
||||
// split classification pairs and insert expected separator tokens
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
|
||||
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
|
||||
std::string final_prompt;
|
||||
|
||||
for (size_t i = 0; i < pairs.size(); i++) {
|
||||
final_prompt += pairs[i];
|
||||
if (i != pairs.size() - 1) {
|
||||
if (!added_eos_token.empty()) {
|
||||
final_prompt += added_eos_token;
|
||||
}
|
||||
if (!added_sep_token.empty()) {
|
||||
final_prompt += added_sep_token;
|
||||
if (rerank_prompt != nullptr) {
|
||||
const std::string query = pairs[0];
|
||||
const std::string doc = pairs[1];
|
||||
std::string final_prompt = rerank_prompt;
|
||||
string_replace_all(final_prompt, "{query}" , query);
|
||||
string_replace_all(final_prompt, "{document}", doc );
|
||||
inp = common_tokenize(vocab, final_prompt, true, true);
|
||||
} else {
|
||||
std::string final_prompt;
|
||||
for (size_t i = 0; i < pairs.size(); i++) {
|
||||
final_prompt += pairs[i];
|
||||
if (i != pairs.size() - 1) {
|
||||
if (!added_eos_token.empty()) {
|
||||
final_prompt += added_eos_token;
|
||||
}
|
||||
if (!added_sep_token.empty()) {
|
||||
final_prompt += added_sep_token;
|
||||
}
|
||||
}
|
||||
}
|
||||
inp = common_tokenize(ctx, final_prompt, true, true);
|
||||
}
|
||||
|
||||
inp = common_tokenize(ctx, final_prompt, true, true);
|
||||
} else {
|
||||
inp = common_tokenize(ctx, prompt, true, true);
|
||||
}
|
||||
@@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
|
||||
@@ -5,6 +5,11 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
else()
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
|
||||
endif()
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||
|
||||
@@ -118,13 +118,17 @@ embedding-convert-model:
|
||||
|
||||
embedding-run-original-model:
|
||||
$(call validate_embedding_model_path,embedding-run-original-model)
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||
./scripts/embedding/run-original-model.py \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-run-converted-model:
|
||||
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
|
||||
@./scripts/embedding/compare-embeddings-logits.sh
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-inspect-original-model:
|
||||
$(call validate_embedding_model_path,embedding-inspect-original-model)
|
||||
@@ -156,7 +160,8 @@ embedding-quantize-model:
|
||||
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
|
||||
|
||||
embedding-run-quantized-model:
|
||||
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
|
||||
@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
###
|
||||
### Perplexity targets/recipes
|
||||
|
||||
@@ -105,12 +105,12 @@ new model, the model can be converted to GGUF format using the following command
|
||||
### Inspecting the converted model
|
||||
The converted model can be inspected using the following command:
|
||||
```console
|
||||
(venv) $ make inspect-converted-model
|
||||
(venv) $ make causal-inspect-converted-model
|
||||
```
|
||||
|
||||
### Running the converted model
|
||||
```console
|
||||
(venv) $ make run-converted-model
|
||||
(venv) $ make causal-run-converted-model
|
||||
```
|
||||
|
||||
### Model logits verfication
|
||||
|
||||
@@ -151,6 +151,35 @@ int main(int argc, char ** argv) {
|
||||
logits = llama_get_embeddings(ctx);
|
||||
n_logits = llama_model_n_embd(model) * batch.n_tokens;
|
||||
type = "-embeddings";
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd_count = batch.n_tokens;
|
||||
|
||||
printf("Embedding dimension: %d\n", n_embd);
|
||||
printf("\n");
|
||||
|
||||
// Print embeddings in the specified format
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
printf("embedding %d: ", j);
|
||||
|
||||
// Print first 3 values
|
||||
for (int i = 0; i < 3 && i < n_embd; i++) {
|
||||
printf("%9.6f ", logits[j * n_embd + i]);
|
||||
}
|
||||
|
||||
printf(" ... ");
|
||||
|
||||
// Print last 3 values
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
if (i >= 0) {
|
||||
printf("%9.6f ", logits[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Embeddings size: %d\n", n_logits);
|
||||
} else {
|
||||
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
@@ -183,22 +212,23 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < n_logits; i++) {
|
||||
fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
|
||||
fprintf(f, "%d: %.6f\n", i, logits[i]);
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// Print first and last 10 logits for quick verification
|
||||
printf("First 10 logits: ");
|
||||
for (int i = 0; i < 10 && i < n_logits; i++) {
|
||||
printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n");
|
||||
if (!embedding_mode) {
|
||||
printf("First 10 logits: ");
|
||||
for (int i = 0; i < 10 && i < n_logits; i++) {
|
||||
printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Last 10 logits: ");
|
||||
for (int i = n_logits - 10; i < n_logits; i++) {
|
||||
if (i >= 0) printf("%.6f ", logits[i]);
|
||||
printf("Last 10 logits: ");
|
||||
for (int i = n_logits - 10; i < n_logits; i++) {
|
||||
if (i >= 0) printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
printf("Logits saved to %s\n", bin_filename);
|
||||
printf("Logits saved to %s\n", txt_filename);
|
||||
|
||||
@@ -48,7 +48,7 @@ def main():
|
||||
print(f"Error: Model file not found: {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
model_name = os.path.splitext(os.path.basename(model_path))[0]
|
||||
model_name = os.path.basename(model_path)
|
||||
data_dir = Path("data")
|
||||
|
||||
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||
|
||||
@@ -193,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(input_ids)
|
||||
outputs = model(input_ids.to(model.device))
|
||||
logits = outputs.logits
|
||||
|
||||
# Extract logits for the last token (next token prediction)
|
||||
|
||||
@@ -2,8 +2,37 @@
|
||||
|
||||
set -e
|
||||
|
||||
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
|
||||
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
|
||||
# Parse command line arguments
|
||||
MODEL_PATH=""
|
||||
MODEL_NAME=""
|
||||
PROMPTS_FILE=""
|
||||
|
||||
# First argument is always model path
|
||||
if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
|
||||
MODEL_PATH="$1"
|
||||
shift
|
||||
fi
|
||||
|
||||
# Parse remaining arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--prompts-file|-pf)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
# If MODEL_NAME not set and this isn't a flag, use as model name
|
||||
if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
|
||||
MODEL_NAME="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Set defaults
|
||||
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
|
||||
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
|
||||
|
||||
if [ -t 0 ]; then
|
||||
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
|
||||
@@ -35,8 +64,18 @@ with open('$TEMP_FILE', 'wb') as f:
|
||||
trap "rm -f $TEMP_FILE" EXIT
|
||||
fi
|
||||
|
||||
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
# Build the semantic_check.py command
|
||||
SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
|
||||
--cpp-embeddings $CPP_EMBEDDINGS \
|
||||
--prompt "Hello world today"
|
||||
--cpp-embeddings $CPP_EMBEDDINGS"
|
||||
|
||||
# Add prompts file if specified, otherwise use default prompt
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
|
||||
else
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
|
||||
fi
|
||||
|
||||
# Execute the command
|
||||
eval $SEMANTIC_CMD
|
||||
|
||||
|
||||
@@ -2,8 +2,27 @@
|
||||
|
||||
set -e
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
# Parse command line arguments
|
||||
CONVERTED_MODEL=""
|
||||
PROMPTS_FILE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-p|--prompts-file)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
CONVERTED_MODEL="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# First try command line argument, then environment variable
|
||||
CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
@@ -13,8 +32,19 @@ if [ -z "$CONVERTED_MODEL" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Read prompt from file or use default
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
if [ ! -f "$PROMPTS_FILE" ]; then
|
||||
echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
PROMPT=$(cat "$PROMPTS_FILE")
|
||||
else
|
||||
PROMPT="Hello world today"
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-logits -j8
|
||||
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
|
||||
# TODO: update logits.cpp to accept a --file/-f option for the prompt
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
|
||||
|
||||
@@ -13,14 +13,37 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
|
||||
args = parser.parse_args()
|
||||
|
||||
def read_prompt_from_file(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{file_path}' not found")
|
||||
exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
exit(1)
|
||||
|
||||
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
# This can be used to override the sliding window size for manual testing. This
|
||||
# can be useful to verify the sliding window attention mask in the original model
|
||||
# and compare it with the converted .gguf model.
|
||||
if hasattr(config, 'sliding_window'):
|
||||
original_sliding_window = config.sliding_window
|
||||
#original_sliding_window = 6
|
||||
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
|
||||
|
||||
print(f"Using unreleased model: {unreleased_model_name}")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
@@ -29,19 +52,28 @@ if unreleased_model_name:
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
|
||||
model = model_class.from_pretrained(model_path, config=config)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(model_path)
|
||||
model = AutoModel.from_pretrained(model_path, config=config)
|
||||
print(f"Model class: {type(model)}")
|
||||
#print(f"Model file: {type(model).__module__}")
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
print(f"Model file: {type(model).__module__}")
|
||||
|
||||
# Verify the model is using the correct sliding window
|
||||
if hasattr(model.config, 'sliding_window'):
|
||||
print(f"Model's sliding_window: {model.config.sliding_window}")
|
||||
else:
|
||||
print("Model config does not have sliding_window attribute")
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
texts = [ "Hello world today" ]
|
||||
if args.prompts_file:
|
||||
prompt_text = read_prompt_from_file(args.prompts_file)
|
||||
texts = [prompt_text]
|
||||
else:
|
||||
texts = ["Hello world today"]
|
||||
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
|
||||
@@ -67,7 +67,7 @@ def main():
|
||||
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
|
||||
args = parser.parse_args()
|
||||
|
||||
model_name = os.path.splitext(os.path.basename(args.model_path))[0]
|
||||
model_name = os.path.basename(args.model_path)
|
||||
data_dir = Path("data")
|
||||
|
||||
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||
|
||||
@@ -40,7 +40,7 @@ if os.path.exists(index_path):
|
||||
file_path = os.path.join(model_path, file_name)
|
||||
print(f"\n--- From {file_name} ---")
|
||||
|
||||
with safe_open(file_path, framework="pt") as f: # type: ignore
|
||||
with safe_open(file_path, framework="pt") as f:
|
||||
for tensor_name in sorted(tensor_names):
|
||||
tensor = f.get_tensor(tensor_name)
|
||||
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
|
||||
# Single file model (original behavior)
|
||||
print("Single-file model detected")
|
||||
|
||||
with safe_open(single_file_path, framework="pt") as f: # type: ignore
|
||||
with safe_open(single_file_path, framework="pt") as f:
|
||||
keys = f.keys()
|
||||
print("Tensors in model:")
|
||||
for key in sorted(keys):
|
||||
|
||||
@@ -101,6 +101,17 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
|
||||
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
|
||||
}
|
||||
|
||||
def read_prompt_from_file(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{file_path}' not found")
|
||||
exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
exit(1)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
|
||||
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
|
||||
@@ -108,14 +119,20 @@ def main():
|
||||
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
|
||||
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
|
||||
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
|
||||
parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.prompts_file:
|
||||
prompt = read_prompt_from_file(args.prompts_file)
|
||||
else:
|
||||
prompt = args.prompt
|
||||
|
||||
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
|
||||
print("=" * 70)
|
||||
|
||||
# Single prompt detailed comparison
|
||||
print(f"\nTesting with prompt: '{args.prompt}'")
|
||||
print(f"\nTesting with prompt: '{prompt}'")
|
||||
|
||||
# Load the python model to get configuration information and also to load the tokenizer.
|
||||
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
|
||||
@@ -144,7 +161,7 @@ def main():
|
||||
else:
|
||||
model = AutoModel.from_pretrained(args.model_path)
|
||||
|
||||
encoded = tokenizer(args.prompt, return_tensors="pt")
|
||||
encoded = tokenizer(prompt, return_tensors="pt")
|
||||
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
|
||||
n_tokens = len(tokens)
|
||||
print(f"n_tokens: {n_tokens}");
|
||||
@@ -155,7 +172,7 @@ def main():
|
||||
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
|
||||
|
||||
# Run comparison
|
||||
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
|
||||
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
|
||||
|
||||
# Summary
|
||||
print(f"\n=== SUMMARY ===")
|
||||
|
||||
@@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
|
||||
set(GGML_VERSION_PATCH 4)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
||||
@@ -26,8 +25,8 @@ if(GIT_EXE)
|
||||
)
|
||||
endif()
|
||||
|
||||
# Build the version string with optional -dev suffix and dirty flag
|
||||
set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
|
||||
# Build the version string with optional dirty flag
|
||||
set(GGML_VERSION "${GGML_VERSION_BASE}")
|
||||
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
|
||||
set(GGML_VERSION "${GGML_VERSION}-dirty")
|
||||
endif()
|
||||
@@ -177,7 +176,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
||||
|
||||
|
||||
if (MINGW)
|
||||
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
||||
set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
|
||||
endif()
|
||||
|
||||
# ggml core
|
||||
|
||||
@@ -314,7 +314,8 @@ extern "C" {
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
||||
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
|
||||
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
|
||||
}
|
||||
|
||||
// ops that return true for this function must not use restrict pointers for their backend implementations
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
@@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
|
||||
|
||||
// dynamic tensor allocator
|
||||
|
||||
#define GGML_VBUFFER_MAX_CHUNKS 16
|
||||
|
||||
// relative memory address within an allocation that can be split into multiple buffers (chunks)
|
||||
struct buffer_address {
|
||||
int chunk; // index of a backend buffer
|
||||
size_t offset; // local memory offset within the buffer
|
||||
};
|
||||
|
||||
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
|
||||
|
||||
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
|
||||
return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
|
||||
}
|
||||
|
||||
struct free_block {
|
||||
size_t offset;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct tallocr_chunk {
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
int n_free_blocks;
|
||||
size_t max_size;
|
||||
};
|
||||
|
||||
struct ggml_dyn_tallocr {
|
||||
size_t alignment;
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
size_t max_size;
|
||||
size_t max_chunk_size;
|
||||
struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
|
||||
int n_chunks;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
struct {
|
||||
const struct ggml_tensor * tensor;
|
||||
size_t offset;
|
||||
struct buffer_address addr;
|
||||
} allocated_tensors[1024];
|
||||
#endif
|
||||
};
|
||||
|
||||
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
|
||||
GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
||||
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
||||
int insert_pos = 0;
|
||||
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
|
||||
insert_pos++;
|
||||
}
|
||||
// shift all blocks from insert_pos onward to make room for the new block
|
||||
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
|
||||
chunk->free_blocks[i] = chunk->free_blocks[i-1];
|
||||
}
|
||||
// insert the new block
|
||||
chunk->free_blocks[insert_pos].offset = offset;
|
||||
chunk->free_blocks[insert_pos].size = size;
|
||||
chunk->n_free_blocks++;
|
||||
}
|
||||
|
||||
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
|
||||
// shift all elements after idx by 1 to the left, overwriting the element at idx
|
||||
for (int i = idx; i < chunk->n_free_blocks; i++) {
|
||||
chunk->free_blocks[i] = chunk->free_blocks[i+1];
|
||||
}
|
||||
chunk->n_free_blocks--;
|
||||
}
|
||||
|
||||
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
|
||||
if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
|
||||
return -1;
|
||||
}
|
||||
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
|
||||
chunk->n_free_blocks = 1;
|
||||
chunk->free_blocks[0].offset = 0;
|
||||
// available space in a chunk is limited to max_chunk_size, but can be higher if:
|
||||
// 1. a single tensor exceeds the maximum, and cannot fit any other way
|
||||
// 2. we are running out of chunks
|
||||
// backends will either manage to allocate the larger size, or report an error.
|
||||
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
|
||||
if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
|
||||
chunk->free_blocks[0].size = SIZE_MAX/2;
|
||||
}
|
||||
alloc->chunks[alloc->n_chunks] = chunk;
|
||||
alloc->n_chunks++;
|
||||
return alloc->n_chunks - 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
|
||||
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i].tensor == NULL) {
|
||||
alloc->allocated_tensors[i].tensor = tensor;
|
||||
alloc->allocated_tensors[i].offset = offset;
|
||||
alloc->allocated_tensors[i].addr = addr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
GGML_ABORT("out of allocated_tensors");
|
||||
}
|
||||
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
|
||||
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i].offset == offset) {
|
||||
if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
|
||||
alloc->allocated_tensors[i].tensor = NULL;
|
||||
return;
|
||||
}
|
||||
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
|
||||
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
|
||||
int best_fit_chunk = -1;
|
||||
int best_fit_block = -1;
|
||||
size_t max_avail = 0;
|
||||
|
||||
// find the best fitting free block besides the last block
|
||||
int best_fit_block = -1;
|
||||
size_t best_fit_size = SIZE_MAX;
|
||||
for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
|
||||
struct free_block * block = &alloc->free_blocks[i];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size && block->size <= best_fit_size) {
|
||||
best_fit_block = i;
|
||||
best_fit_size = block->size;
|
||||
// find the best fitting free block besides the last block, within any chunk
|
||||
for (int c = 0; c < alloc->n_chunks; ++c) {
|
||||
struct tallocr_chunk * chunk = alloc->chunks[c];
|
||||
size_t best_fit_size = SIZE_MAX;
|
||||
for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
|
||||
struct free_block * block = &chunk->free_blocks[i];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size && block->size <= best_fit_size) {
|
||||
best_fit_chunk = c;
|
||||
best_fit_block = i;
|
||||
best_fit_size = block->size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best_fit_block == -1) {
|
||||
// the last block is our last resort
|
||||
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size) {
|
||||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
} else {
|
||||
// this should never happen
|
||||
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
||||
__func__, size, max_avail);
|
||||
GGML_ABORT("not enough space in the buffer");
|
||||
}
|
||||
}
|
||||
|
||||
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
||||
size_t offset = block->offset;
|
||||
block->offset = offset + size;
|
||||
block->size -= size;
|
||||
if (block->size == 0) {
|
||||
// remove block if empty
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
}
|
||||
}
|
||||
|
||||
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
add_allocated_tensor(alloc, offset, tensor);
|
||||
size_t cur_max = offset + size;
|
||||
if (cur_max > alloc->max_size) {
|
||||
// sort allocated_tensors by offset
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
for (int j = i + 1; j < 1024; j++) {
|
||||
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
|
||||
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
|
||||
size_t tmp_offset = alloc->allocated_tensors[i].offset;
|
||||
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
|
||||
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
|
||||
alloc->allocated_tensors[j].tensor = tmp_tensor;
|
||||
alloc->allocated_tensors[j].offset = tmp_offset;
|
||||
// no suitable block found, try the last block (this will grow a chunks size)
|
||||
for (int c = 0; c < alloc->n_chunks; ++c) {
|
||||
struct tallocr_chunk * chunk = alloc->chunks[c];
|
||||
if (chunk->n_free_blocks > 0) {
|
||||
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size) {
|
||||
best_fit_chunk = c;
|
||||
best_fit_block = chunk->n_free_blocks - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
if (best_fit_block == -1) {
|
||||
// none of the existing chunks have enough space left
|
||||
best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
|
||||
best_fit_block = 0;
|
||||
}
|
||||
if (best_fit_chunk == -1) {
|
||||
// since the last chunk always has virtually endless memory, this should never happen
|
||||
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
||||
__func__, size, max_avail);
|
||||
GGML_ABORT("graph allocation: failed to reserve memory");
|
||||
}
|
||||
|
||||
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
|
||||
struct free_block * block = &chunk->free_blocks[best_fit_block];
|
||||
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
|
||||
block->offset += size;
|
||||
block->size -= size;
|
||||
if (block->size == 0) {
|
||||
// remove block if empty
|
||||
ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
|
||||
}
|
||||
|
||||
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
add_allocated_tensor(alloc, addr, tensor);
|
||||
size_t cur_max = addr.offset + size;
|
||||
if (cur_max > alloc->max_size[addr.chunk]) {
|
||||
// sort allocated_tensors by chunk/offset
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
for (int j = i + 1; j < 1024; j++) {
|
||||
if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
|
||||
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
|
||||
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
|
||||
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
|
||||
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
|
||||
alloc->allocated_tensors[j].tensor = tmp_tensor;
|
||||
alloc->allocated_tensors[j].addr = tmp_addr;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i].tensor) {
|
||||
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
||||
alloc->allocated_tensors[i].offset,
|
||||
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
||||
GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
||||
alloc->allocated_tensors[i].addr.chunk,
|
||||
alloc->allocated_tensors[i].addr.offset,
|
||||
alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
||||
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
@@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
||||
}
|
||||
#endif
|
||||
|
||||
alloc->max_size = MAX(alloc->max_size, offset + size);
|
||||
chunk->max_size = MAX(chunk->max_size, addr.offset + size);
|
||||
|
||||
return offset;
|
||||
return addr;
|
||||
|
||||
GGML_UNUSED(tensor);
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
|
||||
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
|
||||
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
||||
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, offset, tensor);
|
||||
remove_allocated_tensor(alloc, addr, tensor);
|
||||
#endif
|
||||
|
||||
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
|
||||
|
||||
// see if we can merge with an existing block
|
||||
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
||||
struct free_block * block = &alloc->free_blocks[i];
|
||||
for (int i = 0; i < chunk->n_free_blocks; i++) {
|
||||
struct free_block * block = &chunk->free_blocks[i];
|
||||
// check if ptr is at the end of the block
|
||||
if (block->offset + block->size == offset) {
|
||||
if (block->offset + block->size == addr.offset) {
|
||||
block->size += size;
|
||||
// check if we can merge with the next block
|
||||
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
|
||||
block->size += alloc->free_blocks[i+1].size;
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = i+1; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
if (i < chunk->n_free_blocks - 1) {
|
||||
struct free_block * next = &chunk->free_blocks[i+1];
|
||||
if (block->offset + block->size == next->offset) {
|
||||
block->size += next->size;
|
||||
ggml_dyn_tallocr_remove_block(chunk, i+1);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
// check if ptr is at the beginning of the block
|
||||
if (offset + size == block->offset) {
|
||||
block->offset = offset;
|
||||
if (addr.offset + size == block->offset) {
|
||||
block->offset = addr.offset;
|
||||
block->size += size;
|
||||
// check if we can merge with the previous block
|
||||
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
|
||||
alloc->free_blocks[i-1].size += block->size;
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = i; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
if (i > 0) {
|
||||
struct free_block * prev = &chunk->free_blocks[i-1];
|
||||
if (prev->offset + prev->size == block->offset) {
|
||||
prev->size += block->size;
|
||||
ggml_dyn_tallocr_remove_block(chunk, i);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
// otherwise, add a new block
|
||||
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
||||
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
||||
int insert_pos = 0;
|
||||
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
|
||||
insert_pos++;
|
||||
}
|
||||
// shift all blocks from insert_pos onward to make room for the new block
|
||||
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
|
||||
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
||||
}
|
||||
// insert the new block
|
||||
alloc->free_blocks[insert_pos].offset = offset;
|
||||
alloc->free_blocks[insert_pos].size = size;
|
||||
alloc->n_free_blocks++;
|
||||
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
|
||||
|
||||
GGML_UNUSED(tensor);
|
||||
}
|
||||
|
||||
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
||||
alloc->n_free_blocks = 1;
|
||||
alloc->free_blocks[0].offset = 0;
|
||||
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
||||
alloc->max_size = 0;
|
||||
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
|
||||
free(alloc->chunks[i]);
|
||||
alloc->chunks[i] = NULL;
|
||||
}
|
||||
alloc->n_chunks = 0;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
@@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
||||
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
|
||||
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
|
||||
|
||||
*alloc = (struct ggml_dyn_tallocr) {
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.alignment = */ alignment,
|
||||
/*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
|
||||
/*.chunks = */ {NULL},
|
||||
/*.n_chunks = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ {{0}},
|
||||
#endif
|
||||
@@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
||||
}
|
||||
|
||||
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
|
||||
for (int i = 0; i < alloc->n_chunks; ++i) {
|
||||
free(alloc->chunks[i]);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
|
||||
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
||||
return alloc->max_size;
|
||||
size_t max_size = 0;
|
||||
for (int i = 0; i < alloc->n_chunks; i++) {
|
||||
max_size += alloc->chunks[i]->max_size;
|
||||
}
|
||||
return max_size;
|
||||
}
|
||||
|
||||
|
||||
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
|
||||
|
||||
struct vbuffer {
|
||||
ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
|
||||
};
|
||||
|
||||
static void ggml_vbuffer_free(struct vbuffer * buf) {
|
||||
if (buf == NULL) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
|
||||
ggml_backend_buffer_free(buf->chunks[i]);
|
||||
}
|
||||
free(buf);
|
||||
}
|
||||
|
||||
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
|
||||
int n = 0;
|
||||
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
|
||||
return n;
|
||||
}
|
||||
|
||||
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
|
||||
size_t size = 0;
|
||||
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
|
||||
size += ggml_backend_buffer_get_size(buf->chunks[i]);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
|
||||
struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
|
||||
if (buf == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (int n = 0; n < talloc->n_chunks; n++) {
|
||||
size_t chunk_size = talloc->chunks[n]->max_size;
|
||||
buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
|
||||
if (buf->chunks[n] == NULL) {
|
||||
ggml_vbuffer_free(buf);
|
||||
return NULL;
|
||||
}
|
||||
ggml_backend_buffer_set_usage(buf->chunks[n], usage);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
|
||||
void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
|
||||
void * addr = (char *)base + buf_addr.offset;
|
||||
ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
|
||||
}
|
||||
|
||||
static void ggml_vbuffer_reset(struct vbuffer * buf) {
|
||||
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
|
||||
ggml_backend_buffer_reset(buf->chunks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -328,13 +470,13 @@ struct hash_node {
|
||||
int n_children;
|
||||
int n_views;
|
||||
int buffer_id;
|
||||
size_t offset; // offset within the buffer
|
||||
struct buffer_address addr;
|
||||
bool allocated;
|
||||
};
|
||||
|
||||
struct tensor_alloc {
|
||||
int buffer_id;
|
||||
size_t offset;
|
||||
struct buffer_address addr;
|
||||
size_t size_max; // 0 = pre-allocated, unused, or view
|
||||
};
|
||||
|
||||
@@ -349,7 +491,7 @@ struct node_alloc {
|
||||
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||
struct vbuffer ** buffers; // [n_buffers]
|
||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -370,7 +512,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
||||
GGML_ASSERT(galloc->bufts != NULL);
|
||||
|
||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
@@ -390,7 +532,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
|
||||
if (galloc->buf_tallocs[i] == NULL) {
|
||||
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
||||
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
||||
size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
|
||||
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
|
||||
}
|
||||
}
|
||||
galloc->n_buffers = n_bufs;
|
||||
@@ -418,7 +561,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
}
|
||||
}
|
||||
if (!freed) {
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
}
|
||||
}
|
||||
if (galloc->buf_tallocs != NULL) {
|
||||
@@ -467,7 +610,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
|
||||
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
||||
hn->allocated = true;
|
||||
assert(hn->offset == 0);
|
||||
assert(hn->addr.offset == 0);
|
||||
|
||||
// try to reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
@@ -501,9 +644,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
|
||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||
assert(view_src_hn->offset == p_hn->offset);
|
||||
assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
|
||||
hn->buffer_id = p_hn->buffer_id;
|
||||
hn->offset = p_hn->offset;
|
||||
hn->addr = p_hn->addr;
|
||||
p_hn->allocated = false; // avoid freeing the parent
|
||||
view_src_hn->allocated = false;
|
||||
return;
|
||||
@@ -511,7 +654,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
} else {
|
||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||
hn->buffer_id = p_hn->buffer_id;
|
||||
hn->offset = p_hn->offset;
|
||||
hn->addr = p_hn->addr;
|
||||
p_hn->allocated = false; // avoid freeing the parent
|
||||
return;
|
||||
}
|
||||
@@ -522,9 +665,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
||||
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
||||
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
||||
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
|
||||
hn->buffer_id = buffer_id;
|
||||
hn->offset = offset;
|
||||
hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -536,12 +678,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
|
||||
}
|
||||
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||
size_t offset = hn->offset;
|
||||
int buffer_id = hn->buffer_id;
|
||||
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
||||
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
||||
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
||||
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
||||
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
|
||||
hn->allocated = false;
|
||||
}
|
||||
|
||||
@@ -692,24 +833,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
||||
if (node->view_src || node->data) {
|
||||
node_alloc->dst.buffer_id = -1;
|
||||
node_alloc->dst.offset = SIZE_MAX;
|
||||
node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
|
||||
node_alloc->dst.size_max = 0;
|
||||
} else {
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||
node_alloc->dst.buffer_id = hn->buffer_id;
|
||||
node_alloc->dst.offset = hn->offset;
|
||||
node_alloc->dst.addr = hn->addr;
|
||||
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (!src || src->view_src || src->data) {
|
||||
node_alloc->src[j].buffer_id = -1;
|
||||
node_alloc->src[j].offset = SIZE_MAX;
|
||||
node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
|
||||
node_alloc->src[j].size_max = 0;
|
||||
} else {
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
||||
node_alloc->src[j].buffer_id = hn->buffer_id;
|
||||
node_alloc->src[j].offset = hn->offset;
|
||||
node_alloc->src[j].addr = hn->addr;
|
||||
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
||||
}
|
||||
}
|
||||
@@ -725,11 +866,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||
if (leaf->view_src || leaf->data) {
|
||||
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
||||
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
||||
galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
|
||||
galloc->leaf_allocs[i].leaf.size_max = 0;
|
||||
} else {
|
||||
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
|
||||
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
||||
galloc->leaf_allocs[i].leaf.addr = hn->addr;
|
||||
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
||||
}
|
||||
}
|
||||
@@ -744,7 +885,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
||||
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
||||
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
||||
|
||||
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
||||
@@ -753,13 +894,12 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
if (galloc->buffers[i] == NULL) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -772,11 +912,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
|
||||
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
|
||||
int buffer_id = tensor_alloc->buffer_id;
|
||||
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
||||
assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
if (tensor->buffer == NULL) {
|
||||
assert(tensor_alloc->offset == SIZE_MAX);
|
||||
assert(tensor_alloc->addr.offset == SIZE_MAX);
|
||||
if (tensor->view_src->buffer == NULL) {
|
||||
// this tensor was allocated without ggml-backend
|
||||
return;
|
||||
@@ -785,11 +925,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
}
|
||||
} else {
|
||||
if (tensor->data == NULL) {
|
||||
assert(tensor_alloc->offset != SIZE_MAX);
|
||||
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
||||
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
|
||||
void * addr = (char *)base + tensor_alloc->offset;
|
||||
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
|
||||
assert(tensor_alloc->addr.offset != SIZE_MAX);
|
||||
assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
|
||||
ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
|
||||
} else {
|
||||
if (tensor->buffer == NULL) {
|
||||
// this tensor was allocated without ggml-backend
|
||||
@@ -874,7 +1012,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
||||
// reset buffers
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
if (galloc->buffers[i] != NULL) {
|
||||
ggml_backend_buffer_reset(galloc->buffers[i]);
|
||||
ggml_vbuffer_reset(galloc->buffers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -917,7 +1055,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
// utils
|
||||
|
||||
@@ -135,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return p;
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
@@ -155,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
@@ -240,7 +249,7 @@ struct ggml_backend_registry {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -530,7 +539,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
|
||||
@@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
|
||||
return sched->backends[i];
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
GGML_ASSERT(sched);
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
return sched->bufts[backend_index];
|
||||
}
|
||||
|
||||
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
GGML_ASSERT(sched);
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
|
||||
@@ -74,7 +74,7 @@ if (BLAS_FOUND)
|
||||
|
||||
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
|
||||
|
||||
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
|
||||
if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
|
||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/arch/riscv/quants.c
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/spacemit/ime.cpp
|
||||
ggml-cpu/spacemit/ime.h
|
||||
ggml-cpu/spacemit/ime1_kernels.cpp
|
||||
ggml-cpu/spacemit/ime_kernels.h
|
||||
)
|
||||
endif()
|
||||
set(MARCH_STR "rv64gc")
|
||||
if (GGML_RV_ZFH)
|
||||
string(APPEND MARCH_STR "_zfh")
|
||||
@@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.13.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.14.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
@@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
|
||||
|
||||
@@ -160,7 +160,6 @@
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
||||
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||
|
||||
return ((v4f32)res)[0];
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
|
||||
}
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
|
||||
// horizontally add 8 floats
|
||||
static inline float hsum_float_8(const __m256 x) {
|
||||
__m128 res = lasx_extractf128(x, 1);
|
||||
|
||||
@@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
||||
const int32x4_t vi = vec_signed(v);
|
||||
/* Uses non-default rounding for vec_signed or vec_round */
|
||||
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
|
||||
|
||||
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
||||
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
||||
@@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
||||
const int32x4_t vi = vec_signed(v);
|
||||
/* Uses non-default rounding for vec_signed or vec_round */
|
||||
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
|
||||
|
||||
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
||||
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
||||
@@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
assert(n % QK_MXFP4 == 0);
|
||||
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
|
||||
|
||||
const int qk = QK_MXFP4;
|
||||
const int nb = n / qk;
|
||||
|
||||
const block_mxfp4 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
int ib = 0;
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
|
||||
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
|
||||
|
||||
float32x4_t v_acc = vec_splats(0.0f);
|
||||
|
||||
#pragma GCC unroll 8
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
|
||||
const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
|
||||
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
|
||||
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
|
||||
|
||||
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
|
||||
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
|
||||
|
||||
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
|
||||
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
|
||||
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
|
||||
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
|
||||
|
||||
v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
|
||||
v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
|
||||
v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
|
||||
v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
|
||||
|
||||
const int8x16_t v_y0l = vec_xl(0, y0->qs);
|
||||
const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
|
||||
const int8x16_t v_y1l = vec_xl(0, y1->qs);
|
||||
const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
|
||||
|
||||
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
|
||||
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
|
||||
|
||||
const float32x4_t v_xy0f = vec_float(v_xy0);
|
||||
const float32x4_t v_xy1f = vec_float(v_xy1);
|
||||
|
||||
const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
|
||||
const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
|
||||
|
||||
v_acc = vec_madd(v_xy0f, v_d0, v_acc);
|
||||
v_acc = vec_madd(v_xy1f, v_d1, v_acc);
|
||||
}
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
|
||||
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
|
||||
|
||||
const uint8x16_t v_x = vec_xl(0, x0->qs);
|
||||
|
||||
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
|
||||
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
|
||||
|
||||
v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
|
||||
v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
|
||||
|
||||
const int8x16_t v_yl = vec_xl(0, y0->qs);
|
||||
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
||||
|
||||
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
||||
const float32x4_t v_xyf = vec_float(v_xy);
|
||||
|
||||
const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
|
||||
v_acc = vec_madd(v_xyf, v_d, v_acc);
|
||||
}
|
||||
|
||||
sumf = vec_hsum_f32x4(v_acc);
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
@@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
uint8x16_t q3h[4];
|
||||
uint8x16_t q3b[2];
|
||||
int8x16_t q3bytes[4];
|
||||
int8x16_t q8bytes[4];
|
||||
int8x16_t q8bytes[8];
|
||||
uint8x16_t qhbits[2];
|
||||
|
||||
float sum = 0;
|
||||
|
||||
@@ -473,10 +473,10 @@ struct ggml_threadpool {
|
||||
struct ggml_compute_state {
|
||||
#ifndef GGML_USE_OPENMP
|
||||
ggml_thread_t thrd;
|
||||
bool cpumask[GGML_MAX_N_THREADS];
|
||||
int last_graph;
|
||||
bool pending;
|
||||
#endif
|
||||
bool cpumask[GGML_MAX_N_THREADS];
|
||||
struct ggml_threadpool * threadpool;
|
||||
int ith;
|
||||
};
|
||||
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
|
||||
threadpool->workers = workers;
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
#ifdef GGML_USE_OPENMP
|
||||
int32_t cpumask_iter = 0;
|
||||
|
||||
// Compute CPU masks for each thread
|
||||
for (int j = 0; j < tpp->n_threads; j++) {
|
||||
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
|
||||
}
|
||||
#else // GGML_USE_OPENMP
|
||||
ggml_mutex_init(&threadpool->mutex);
|
||||
ggml_cond_init(&threadpool->cond);
|
||||
|
||||
@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
|
||||
}
|
||||
|
||||
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
|
||||
// Apply thread CPU mask and priority
|
||||
int ith = omp_get_thread_num();
|
||||
|
||||
ggml_thread_apply_priority(threadpool->prio);
|
||||
if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
|
||||
ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
|
||||
}
|
||||
ggml_graph_compute_thread(&threadpool->workers[ith]);
|
||||
}
|
||||
} else {
|
||||
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
# include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
# include "spacemit/ime.h"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
@@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
if (ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
|
||||
|
||||
@@ -87,15 +87,38 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
|
||||
return tensor->ne[dim];
|
||||
}
|
||||
|
||||
template <typename Variant, typename Ret, typename... Args, std::size_t... Is>
|
||||
constexpr bool variant_any_invocable_impl(std::index_sequence<Is...>) {
|
||||
using V = std::remove_reference_t<Variant>;
|
||||
return (std::is_invocable_r_v<
|
||||
Ret,
|
||||
std::variant_alternative_t<Is, V>,
|
||||
Args...> || ...);
|
||||
}
|
||||
|
||||
template <typename Variant, typename Ret, typename... Args>
|
||||
constexpr bool variant_any_invocable_v =
|
||||
variant_any_invocable_impl<Variant, Ret, Args...>(
|
||||
std::make_index_sequence<
|
||||
std::variant_size_v<std::remove_reference_t<Variant>>>{});
|
||||
|
||||
template<typename Ret, typename Variant, typename... Args>
|
||||
static Ret variant_call(const Variant & var, Args&&... args) {
|
||||
return std::visit([&](auto&& func) -> Ret {
|
||||
if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
|
||||
return func(std::forward<Args>(args)...);
|
||||
} else {
|
||||
throw std::runtime_error("Invalid function type in variant_call");
|
||||
}
|
||||
}, var);
|
||||
static inline Ret variant_call(Variant && var, Args&&... args) {
|
||||
static_assert(variant_any_invocable_v<std::remove_reference_t<Variant>, Ret, Args...>,
|
||||
"No alternative in Variant is invocable with the provided arguments and return type.");
|
||||
|
||||
return std::visit(
|
||||
[&](auto && f) -> Ret {
|
||||
using F = std::decay_t<decltype(f)>;
|
||||
if constexpr (std::is_invocable_r_v<Ret, F, Args...>) {
|
||||
return std::invoke(std::forward<decltype(f)>(f), std::forward<Args>(args)...);
|
||||
} else {
|
||||
GGML_ABORT("Invalid function type in variant_call");
|
||||
GGML_UNREACHABLE();
|
||||
}
|
||||
},
|
||||
std::forward<Variant>(var)
|
||||
);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::kleidiai {
|
||||
@@ -138,7 +161,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
|
||||
size = variant_call<size_t>(lhs_info->packed_size, m, k, QK4_0, mr, kr, sr);
|
||||
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
||||
size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr) +
|
||||
const int64_t lhs_batch_size0 = op->src[1]->ne[2];
|
||||
const int64_t rhs_batch_size0 = op->src[0]->ne[2];
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
size = variant_call<size_t>(lhs_info->packed_size, m * r, k, mr, kr, sr) +
|
||||
variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
|
||||
k * n * sizeof(float) + n * sizeof(float);
|
||||
} else {
|
||||
@@ -148,7 +174,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
|
||||
if (dst->op == GGML_OP_MUL_MAT) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
@@ -165,8 +190,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
}
|
||||
|
||||
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
@@ -175,7 +198,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
GGML_ASSERT(kernels);
|
||||
|
||||
bool is_gemv = src1->ne[1] == 1;
|
||||
const bool is_gemv = src1->ne[1] == 1;
|
||||
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||
GGML_ASSERT(kernel);
|
||||
@@ -185,27 +208,30 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
|
||||
const int64_t lhs_batch_size0 = ne12;
|
||||
const int64_t rhs_batch_size0 = ne02;
|
||||
const int64_t batch_size = rhs_batch_size0;
|
||||
const int64_t batch_size = lhs_batch_size0;
|
||||
|
||||
GGML_ASSERT(rhs_batch_size0 > 0);
|
||||
GGML_ASSERT(lhs_batch_size0 % rhs_batch_size0 == 0);
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
|
||||
const int64_t m = ne11 * r;
|
||||
const int64_t n = ne01;
|
||||
const int64_t k = ne00;
|
||||
const int64_t m_group = ne11;
|
||||
const int64_t m = m_group;
|
||||
const int64_t n = ne01;
|
||||
const int64_t k = ne00;
|
||||
|
||||
const size_t lhs_stride = src1->nb[1];
|
||||
const size_t rhs_stride = src0->nb[1];
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
|
||||
const int64_t mr = static_cast<int64_t>(kernel->get_mr());
|
||||
const int64_t nr = static_cast<int64_t>(kernel->get_nr());
|
||||
const int64_t kr = static_cast<int64_t>(kernel->get_kr());
|
||||
const int64_t sr = static_cast<int64_t>(kernel->get_sr());
|
||||
const int64_t mr = (int64_t) kernel->get_mr();
|
||||
const int64_t nr = (int64_t) kernel->get_nr();
|
||||
const int64_t kr = (int64_t) kernel->get_kr();
|
||||
const int64_t sr = (int64_t) kernel->get_sr();
|
||||
|
||||
const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr);
|
||||
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
|
||||
const size_t kxn_size = k * n * sizeof(float);
|
||||
const size_t bias_size = n * sizeof(float);
|
||||
const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, (size_t)m, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
|
||||
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, (size_t)n, (size_t)k);
|
||||
const size_t kxn_size = (size_t)k * (size_t)n * sizeof(float);
|
||||
const size_t bias_size = (size_t)n * sizeof(float);
|
||||
|
||||
const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
|
||||
GGML_ASSERT(wsize_required <= params->wsize);
|
||||
@@ -216,82 +242,102 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
uint8_t * bias = rhs_kxn + kxn_size;
|
||||
|
||||
for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
|
||||
const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
|
||||
uint8_t * dst_batch = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
|
||||
const int64_t rhs_batch_idx = batch_idx / r;
|
||||
const uint8_t * rhs_batch_base = static_cast<const uint8_t *>(src0->data) + rhs_batch_idx * src0->nb[2];
|
||||
uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2];
|
||||
|
||||
// LHS packing
|
||||
// LHS packing (threaded over m, honoring mr alignment and KV groups)
|
||||
{
|
||||
const int64_t m_roundup_mr = kai_roundup(m, mr);
|
||||
const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_m_per_thread0 = round_down(m_roundup_mr / num_threads, mr);
|
||||
const int64_t num_m_per_thread0 = round_down((size_t)(m_roundup_mr / num_threads), (size_t)mr);
|
||||
const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
|
||||
|
||||
const int64_t m_start = ith * num_m_per_thread0;
|
||||
const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
|
||||
const int64_t m_start = ith * num_m_per_thread0;
|
||||
const int64_t m_count = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
|
||||
|
||||
const size_t lhs_offset = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, mr, kr, sr);
|
||||
// Base packed offset (aligned) and per-row stride in bytes
|
||||
const size_t base_packed_off = variant_call<size_t>(
|
||||
lhs_info->get_packed_offset, (size_t)m_start, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
|
||||
const size_t next_block_off = variant_call<size_t>(
|
||||
lhs_info->get_packed_offset, (size_t)(m_start + mr), (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
|
||||
const size_t row_stride_bytes = (next_block_off - base_packed_off) / (size_t)mr;
|
||||
|
||||
const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
|
||||
void * dst_ptr = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
|
||||
int64_t remaining = m_count;
|
||||
int64_t cur = m_start;
|
||||
|
||||
variant_call<void>(lhs_info->pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
|
||||
while (remaining > 0) {
|
||||
const int64_t row_in_group = cur;
|
||||
const int64_t avail = m_group - row_in_group;
|
||||
const int64_t take = std::min(avail, remaining);
|
||||
|
||||
const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2];
|
||||
const void * src_ptr = lhs_batch_base + (size_t)row_in_group * lhs_stride;
|
||||
const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes;
|
||||
void * dst_ptr = lhs_packed + dst_off;
|
||||
|
||||
variant_call<void>(lhs_info->pack_func,
|
||||
(size_t)take, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr,
|
||||
/*m_idx_start*/ 0, src_ptr, lhs_stride, dst_ptr);
|
||||
|
||||
cur += take;
|
||||
remaining -= take;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RHS packing
|
||||
if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
|
||||
// First thread to reach this point handles RHS packing
|
||||
memset(bias, 0, n * sizeof(float));
|
||||
transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
|
||||
reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);
|
||||
// RHS packing (single thread), then synchronize
|
||||
if (ith == 0) {
|
||||
memset(bias, 0, (size_t)n * sizeof(float));
|
||||
transpose_f32kxn_f16nxk((size_t)n, (size_t)k,
|
||||
reinterpret_cast<float *>(rhs_kxn),
|
||||
reinterpret_cast<const uint16_t *>(rhs_batch_base),
|
||||
rhs_stride);
|
||||
|
||||
variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
|
||||
rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
|
||||
variant_call<void>(kernels->rhs_info.pack_func,
|
||||
/*num_groups*/ 1, (size_t)n, (size_t)k, (size_t)nr, (size_t)kr, (size_t)sr,
|
||||
/*rhs_stride (bytes)*/ (size_t)(n * sizeof(float)),
|
||||
rhs_kxn, bias, nullptr, rhs_packed, /*extra_bytes*/ 0, /*params*/ nullptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
first_to_arrive.clear(std::memory_order_release);
|
||||
|
||||
// Perform the matmul
|
||||
// Matmul (threaded over n)
|
||||
{
|
||||
const int64_t m_to_process = m;
|
||||
const int64_t m_start = 0;
|
||||
|
||||
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
|
||||
int64_t num_threads = KAI_MIN(n / n_step, nth);
|
||||
if (num_threads <= 0) {
|
||||
num_threads = 1;
|
||||
const int64_t n_step = (int64_t) kernel->get_n_step();
|
||||
int64_t num_threads_n = KAI_MIN(n / n_step, nth);
|
||||
if (num_threads_n <= 0) {
|
||||
num_threads_n = 1;
|
||||
}
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
|
||||
const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
|
||||
if (ith < num_threads_n) {
|
||||
const int64_t num_n_per_thread0 = round_down((size_t)(n / num_threads_n), (size_t)n_step);
|
||||
const int64_t num_n_per_threadN_1 = n - (num_threads_n - 1) * num_n_per_thread0;
|
||||
|
||||
const int64_t n_start = ith * num_n_per_thread0;
|
||||
const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
|
||||
const int64_t n_to_process = (ith == num_threads_n - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
|
||||
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
|
||||
const size_t dst_offset = kernel->get_dst_offset(m_start, n_start, dst_stride);
|
||||
// LHS packed base at row 0 (consistent with packing above)
|
||||
const size_t lhs_packed_offset0 = variant_call<size_t>(
|
||||
lhs_info->get_packed_offset, (size_t)0, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, (size_t)n_start, (size_t)k);
|
||||
const size_t dst_offset = kernel->get_dst_offset((size_t)0, (size_t)n_start, dst_stride);
|
||||
|
||||
const void * lhs_ptr = lhs_packed + lhs_packed_offset;
|
||||
const void * lhs_ptr = lhs_packed + lhs_packed_offset0;
|
||||
const void * rhs_ptr = rhs_packed + rhs_packed_offset;
|
||||
float * dst_ptr = reinterpret_cast<float *>(dst_batch + dst_offset);
|
||||
float * dst_ptr = reinterpret_cast<float *>(dst_batch_base + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
variant_call<void>(kernel->run_kernel,
|
||||
(size_t)m, (size_t)n_to_process, (size_t)k,
|
||||
lhs_ptr, rhs_ptr,
|
||||
dst_ptr, dst_stride, sizeof(float),
|
||||
-FLT_MAX, FLT_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
if (batch_idx != batch_size - 1) {
|
||||
// This barrier is necessary when the batch size is larger than 1. While processing a batch,
|
||||
// the work data buffer (params->wdata) is used as temporary storage which means that only
|
||||
// a single batch can be processed at any given time. No barrier is needed for the last
|
||||
// batch since GGML inserts a barrier between the execution of every operator.
|
||||
ggml_barrier(params->threadpool);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 __m128
|
||||
#define GGML_F32x4_ZERO __lsx_vldi(0)
|
||||
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
|
||||
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
|
||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
|
||||
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
|
||||
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
||||
#define GGML_F32x4_ADD __lsx_vfadd_s
|
||||
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
|
||||
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
|
||||
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
|
||||
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
|
||||
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
|
||||
|
||||
return __lsx_vld(tmp, 0);
|
||||
return (__m128)__lsx_vld(tmp, 0);
|
||||
}
|
||||
|
||||
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
}
|
||||
|
||||
#define GGML_F32Cx4 __m128
|
||||
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
|
||||
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
|
||||
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
|
||||
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
|
||||
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
|
||||
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32Cx4_ADD __lsx_vfadd_s
|
||||
|
||||
1024
ggml/src/ggml-cpu/spacemit/ime.cpp
Normal file
1024
ggml/src/ggml-cpu/spacemit/ime.cpp
Normal file
File diff suppressed because it is too large
Load Diff
13
ggml/src/ggml-cpu/spacemit/ime.h
Normal file
13
ggml/src/ggml-cpu/spacemit/ime.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
3196
ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
Normal file
3196
ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
Normal file
File diff suppressed because it is too large
Load Diff
26
ggml/src/ggml-cpu/spacemit/ime_kernels.h
Normal file
26
ggml/src/ggml-cpu/spacemit/ime_kernels.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace sqnbitgemm_spacemit_ime {
|
||||
namespace ime1 {
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const std::byte * quant_a_ptr,
|
||||
const std::byte * quant_b_data,
|
||||
const float * quant_b_scale,
|
||||
const std::byte * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t count_k,
|
||||
size_t block_count_k,
|
||||
size_t ldc,
|
||||
const float * bias,
|
||||
const size_t scale_stride);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
|
||||
} // namespace ime1
|
||||
} // namespace sqnbitgemm_spacemit_ime
|
||||
@@ -610,7 +610,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
|
||||
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
||||
for (int j = 0; j < GGML_F32_ARR; j++) {
|
||||
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
||||
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
|
||||
ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -586,17 +586,42 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
|
||||
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
acc += v*u;
|
||||
#else
|
||||
const float2 tmpv = __half22float2(v);
|
||||
const float2 tmpu = __half22float2(u);
|
||||
float2 tmpacc = __half22float2(acc);
|
||||
tmpacc.x += tmpv.x * tmpu.x;
|
||||
tmpacc.y += tmpv.y * tmpu.y;
|
||||
acc = make_half2(tmpacc.x, tmpacc.y);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
|
||||
template <int nbytes>
|
||||
template <int nbytes, int alignment = 0>
|
||||
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
|
||||
if constexpr (nbytes == 4) {
|
||||
*(int *) dst = *(const int *) src;
|
||||
} else if constexpr (nbytes == 8) {
|
||||
*(int2 *) dst = *(const int2 *) src;
|
||||
} else if constexpr (nbytes == 16) {
|
||||
*(int4 *) dst = *(const int4 *) src;
|
||||
} else {
|
||||
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
|
||||
if constexpr (alignment != 0) {
|
||||
static_assert(nbytes % alignment == 0, "bad alignment");
|
||||
}
|
||||
constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
|
||||
if constexpr (nb_per_cpy == 1) {
|
||||
((char *) dst)[i] = ((const char *) src)[i];
|
||||
} else if constexpr (nb_per_cpy == 2) {
|
||||
((short *) dst)[i] = ((const short *) src)[i];
|
||||
} else if constexpr (nb_per_cpy == 4) {
|
||||
((int *) dst)[i] = ((const int *) src)[i];
|
||||
} else if constexpr (nb_per_cpy == 8) {
|
||||
((int2 *) dst)[i] = ((const int2 *) src)[i];
|
||||
} else if constexpr (nb_per_cpy == 16) {
|
||||
((int4 *) dst)[i] = ((const int4 *) src)[i];
|
||||
} else {
|
||||
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
} else
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
{
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
return nullptr;
|
||||
// Prioritize CUDA graph compatibility over direct memory copy optimization.
|
||||
// Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_flt<cpy_1_flt<float, float>>;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_flt<cpy_1_flt<float, float>>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
|
||||
@@ -33,276 +33,230 @@ typedef void (* fattn_kernel_t)(
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33);
|
||||
|
||||
typedef half (*vec_dot_KQ_f16_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
typedef float (*vec_dot_KQ_f32_t)(
|
||||
typedef float (*vec_dot_KQ_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI4_0;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/warp_size];
|
||||
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (8/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI4_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/warp_size];
|
||||
const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
|
||||
sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
|
||||
const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
|
||||
|
||||
sum += (T) (sumid4d8 + m4s8scaled);
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI5_0;
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v = (get_int_b2(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_b2(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/warp_size];
|
||||
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (16/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI5_1;
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v = (get_int_b2(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_b2(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/warp_size];
|
||||
const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
|
||||
sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
|
||||
const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
|
||||
|
||||
sum += (T) (sumid5d8 + m5s8scaled);
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_0;
|
||||
const int iqs = k_KQ % QI8_0;
|
||||
|
||||
const int v = get_int_b2(K_q8_0[ib].qs, iqs);
|
||||
|
||||
T Q_d;
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
Q_d = __low2half(Q_ds[k_KQ_0/warp_size]);
|
||||
} else {
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
Q_d = Q_ds[k_KQ_0/warp_size].x;
|
||||
}
|
||||
|
||||
sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/warp_size], K_q8_0[ib].d, Q_d);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename T, int D, int warp_size>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const half2 * K_h2 = (const half2 *) K_c;
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_h2 = (const half2 *) Q_v;
|
||||
|
||||
half2 sum2 = make_half2(0.0f, 0.0f);
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[k_KQ];
|
||||
sum2 += K_ik * Q_h2[k_KQ_0/warp_size];
|
||||
}
|
||||
|
||||
return __low2half(sum2) + __high2half(sum2);
|
||||
}
|
||||
#endif // FP16_AVAILABLE
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) Q_v;
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[k_KQ];
|
||||
sum += __low2float(K_ik) * Q_f2[k_KQ_0/warp_size].x;
|
||||
sum += __high2float(K_ik) * Q_f2[k_KQ_0/warp_size].y;
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
half2 tmp[cpy_ne];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
ggml_cuda_mad(sum, tmp[k_KQ_1] , ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
|
||||
#else
|
||||
ggml_cuda_mad(sum, __half22float2(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename Tds>
|
||||
template<int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
|
||||
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI4_0;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v;
|
||||
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q4_0[ib].qs + sizeof(int)*iqs4);
|
||||
v = (v >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/nthreads];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
|
||||
sum += __half2float(K_q4_0[ib].d) * (sumi*Q_ds.x - (8/QI8_1)*Q_ds.y);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
|
||||
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI4_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v;
|
||||
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q4_1[ib].qs + sizeof(int)*iqs4);
|
||||
v = (v >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/nthreads];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
const float2 K_dm = __half22float2(K_q4_1[ib].dm);
|
||||
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
|
||||
|
||||
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
|
||||
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI5_0;
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v;
|
||||
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q5_0[ib].qs + sizeof(int)*iqs4);
|
||||
v = (v >> shift) & 0x0F0F0F0F;
|
||||
|
||||
{
|
||||
int vh;
|
||||
ggml_cuda_memcpy_1<sizeof(int), 2>(&vh, K_q5_0[ib].qh);
|
||||
vh >>= iqs8 * QI5_0;
|
||||
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
}
|
||||
|
||||
const int u = Q_q8[k_KQ_0/nthreads];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
|
||||
|
||||
sum += __half2float(K_q5_0[ib].d) * (sumi*Q_ds.x - (16/QI8_1)*Q_ds.y);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
|
||||
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
const int iqs4 = k_KQ % QI5_1;
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v;
|
||||
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q5_1[ib].qs + sizeof(int)*iqs4);
|
||||
v = (v >> shift) & 0x0F0F0F0F;
|
||||
|
||||
{
|
||||
int vh;
|
||||
ggml_cuda_memcpy_1<sizeof(int)>(&vh, K_q5_1[ib].qh);
|
||||
vh >>= iqs8 * QI5_0;
|
||||
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
}
|
||||
|
||||
const int u = Q_q8[k_KQ_0/nthreads];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
const float2 K_dm = __half22float2(K_q5_1[ib].dm);
|
||||
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
|
||||
|
||||
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
|
||||
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
|
||||
|
||||
const int ib = k_KQ / QI8_0;
|
||||
const int iqs = k_KQ % QI8_0;
|
||||
|
||||
int v;
|
||||
ggml_cuda_memcpy_1<sizeof(v), 2>(&v, K_q8_0[ib].qs + 4*iqs);
|
||||
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
const float Q_d = Q_ds[k_KQ_0/nthreads].x;
|
||||
|
||||
sum += vec_dot_q8_0_q8_1_impl<float, 1>(&v, &Q_q8[k_KQ_0/nthreads], K_q8_0[ib].d, Q_d);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename Tds, int ni>
|
||||
static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
|
||||
|
||||
float vals[sizeof(int)] = {0.0f};
|
||||
#pragma unroll
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
vals[l] = scale * x[4*threadIdx.x + l];
|
||||
vals[l] = (ni == WARP_SIZE || threadIdx.x < ni) ? scale * x[4*threadIdx.x + l] : 0.0f;
|
||||
}
|
||||
|
||||
float amax = fabsf(vals[0]);
|
||||
@@ -330,7 +284,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
}
|
||||
|
||||
yq32[threadIdx.x] = q32;
|
||||
if (threadIdx.x % QI8_1 == 0) {
|
||||
if (threadIdx.x % QI8_1 == 0 && (ni == WARP_SIZE || threadIdx.x < ni)) {
|
||||
if (std::is_same<Tds, half2>::value) {
|
||||
((half2 *) yds)[threadIdx.x/QI8_1] = make_half2(d, sum);
|
||||
} else {
|
||||
@@ -339,167 +293,276 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
}
|
||||
}
|
||||
|
||||
typedef half (*dequantize_1_f16_t)(const void *, const int64_t);
|
||||
typedef float (*dequantize_1_f32_t)(const void *, const int64_t);
|
||||
typedef void (*dequantize_V_t)(const void *, void *, const int64_t);
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) {
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
ggml_cuda_memcpy_1<ne*sizeof(half)>(dst, (const half *) vx + i0);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
static_assert(ne % 2 == 0, "bad ne");
|
||||
half2 tmp[ne/2];
|
||||
ggml_cuda_memcpy_1<ne*sizeof(half)>(tmp, (const half *) vx + i0);
|
||||
float2 * dst_f2 = (float2 *) dst;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne/2; ++l) {
|
||||
dst_f2[l] = __half22float2(tmp[l]);
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
const int64_t ib = i / QK4_0;
|
||||
const int iqs = i % (QK4_0/2);
|
||||
const int shift = (i % QK4_0) / (QK4_0/2);
|
||||
const int64_t ib = i0 / QK4_0;
|
||||
const int iqs = i0 % (QK4_0/2);
|
||||
const int shift = (i0 % QK4_0) / (QK4_0/2);
|
||||
|
||||
const T d = x[ib].d;
|
||||
const int q0 = x[ib].qs[iqs];
|
||||
const int q = ((q0 >> (4*shift)) & 0x0F) - 8;
|
||||
int q;
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
|
||||
q >>= 4*shift;
|
||||
q &= 0x0F0F0F0F;
|
||||
q = __vsubss4(q, 0x08080808);
|
||||
|
||||
const int8_t * q8 = (const int8_t *) &q;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 d = __half2half2(x[ib].d);
|
||||
|
||||
return ((float) d)*((float) q);
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < ne; l0 += 2) {
|
||||
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
|
||||
}
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float d = x[ib].d;
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
((float *) dst)[l] = d * q8[l];
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "bad type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) {
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const int64_t ib = i / QK4_1;
|
||||
const int iqs = i % (QK4_1/2);
|
||||
const int shift = (i % QK4_1) / (QK4_1/2);
|
||||
const int64_t ib = i0 / QK4_1;
|
||||
const int iqs = i0 % (QK4_1/2);
|
||||
const int shift = (i0 % QK4_1) / (QK4_1/2);
|
||||
|
||||
const half2 dm = x[ib].dm;
|
||||
const int q0 = x[ib].qs[iqs];
|
||||
const int q = ((q0 >> (4*shift)) & 0x0F);
|
||||
int q;
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
|
||||
q >>= 4*shift;
|
||||
q &= 0x0F0F0F0F;
|
||||
|
||||
const int8_t * q8 = (const int8_t *) &q;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return __low2half(dm)*((half) q) + __high2half(dm);
|
||||
}
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 dm = x[ib].dm;
|
||||
const half2 d = __half2half2( __low2half(dm));
|
||||
const half2 m = __half2half2(__high2half(dm));
|
||||
|
||||
return __low2float(dm)*((float) q) + __high2float(dm);
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < ne; l0 += 2) {
|
||||
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
|
||||
}
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float2 dm = __half22float2(x[ib].dm);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
((float *) dst)[l] = dm.x * q8[l] + dm.y;
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "bad type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) {
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const int64_t ib = i / QK5_0;
|
||||
const int idq = i % QK5_0;
|
||||
const int iqs = i % (QK5_0/2);
|
||||
const int shift = (i % QK5_0) / (QK5_0/2);
|
||||
const int64_t ib = i0 / QK5_0;
|
||||
const int idq = i0 % QK5_0;
|
||||
const int iqs = i0 % (QK5_0/2);
|
||||
const int shift = (i0 % QK5_0) / (QK5_0/2);
|
||||
|
||||
const T d = x[ib].d;
|
||||
const int ql0 = x[ib].qs[iqs];
|
||||
const int qh0 = get_int_b2(x[ib].qh, 0);
|
||||
const int ql = ((ql0 >> (4*shift)) & 0x0F);
|
||||
const int qh = ((qh0 >> idq) << 4) & 0x10;
|
||||
const int q = (ql | qh) - 16;
|
||||
int q;
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
|
||||
q >>= 4*shift;
|
||||
q &= 0x0F0F0F0F;
|
||||
|
||||
{
|
||||
int qh;
|
||||
ggml_cuda_memcpy_1<ne, 2>(&qh, x[ib].qh);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
|
||||
}
|
||||
}
|
||||
|
||||
q = __vsubss4(q, 0x10101010);
|
||||
|
||||
const int8_t * q8 = (const int8_t *) &q;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 d = __half2half2(x[ib].d);
|
||||
|
||||
return ((float) d)*((float) q);
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < ne; l0 += 2) {
|
||||
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
|
||||
}
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float d = x[ib].d;
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
((float *) dst)[l] = d * q8[l];
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "bad type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) {
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const int64_t ib = i / QK5_1;
|
||||
const int idq = i % QK5_1;
|
||||
const int iqs = i % (QK5_1/2);
|
||||
const int shift = (i % QK5_1) / (QK5_1/2);
|
||||
const int64_t ib = i0 / QK5_1;
|
||||
const int idq = i0 % QK5_1;
|
||||
const int iqs = i0 % (QK5_1/2);
|
||||
const int shift = (i0 % QK5_1) / (QK5_1/2);
|
||||
|
||||
const half2 dm = x[ib].dm;
|
||||
const int ql0 = x[ib].qs[iqs];
|
||||
const int qh0 = get_int_b4(x[ib].qh, 0);
|
||||
const int ql = ((ql0 >> (4*shift)) & 0x0F);
|
||||
const int qh = ((qh0 >> idq) << 4) & 0x10;
|
||||
const int q = (ql | qh);
|
||||
int q;
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
|
||||
q >>= 4*shift;
|
||||
q &= 0x0F0F0F0F;
|
||||
|
||||
{
|
||||
int qh;
|
||||
ggml_cuda_memcpy_1<ne>(&qh, x[ib].qh);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
|
||||
}
|
||||
}
|
||||
|
||||
const int8_t * q8 = (const int8_t *) &q;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return __low2half(dm)*((half) q) + __high2half(dm);
|
||||
}
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 dm = x[ib].dm;
|
||||
const half2 d = __half2half2( __low2half(dm));
|
||||
const half2 m = __half2half2(__high2half(dm));
|
||||
|
||||
return __low2float(dm)*((float) q) + __high2float(dm);
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < ne; l0 += 2) {
|
||||
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
|
||||
}
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float2 dm = __half22float2(x[ib].dm);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
((float *) dst)[l] = dm.x * q8[l] + dm.y;
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "bad type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) {
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const int64_t ib = i / QK8_0;
|
||||
const int iqs = i % QK8_0;
|
||||
const int64_t ib = i0 / QK8_0;
|
||||
const int iqs = i0 % QK8_0;
|
||||
|
||||
const T d = x[ib].d;
|
||||
const int q = x[ib].qs[iqs];
|
||||
static_assert(ne % 2 == 0, "bad ne");
|
||||
int8_t qs[ne];
|
||||
ggml_cuda_memcpy_1<ne, 2>(qs, x[ib].qs + iqs);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
if constexpr (std::is_same<T, half>::value) {
|
||||
const half2 d = __half2half2(x[ib].d);
|
||||
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < ne; l0 += 2) {
|
||||
((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]);
|
||||
}
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
const float d = x[ib].d;
|
||||
|
||||
return ((float) d)*((float) q);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
((float *) dst)[l] = d * qs[l];
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) {
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
return x[i];
|
||||
template <ggml_type type_K, int D, int nthreads>
|
||||
constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
|
||||
if constexpr (type_K == GGML_TYPE_F16) {
|
||||
return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q4_0) {
|
||||
return vec_dot_fattn_vec_KQ_q4_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q4_1) {
|
||||
return vec_dot_fattn_vec_KQ_q4_1<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q5_0) {
|
||||
return vec_dot_fattn_vec_KQ_q5_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q5_1) {
|
||||
return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_Q8_0) {
|
||||
return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
|
||||
} else {
|
||||
static_assert(type_K == -1, "bad type");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template <int D, int warp_size = WARP_SIZE>
|
||||
constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
|
||||
nullptr;
|
||||
}
|
||||
|
||||
template <int D, int warp_size = WARP_SIZE>
|
||||
constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
|
||||
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
|
||||
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
|
||||
nullptr;
|
||||
}
|
||||
|
||||
constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) {
|
||||
return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<half> :
|
||||
type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<half> :
|
||||
type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<half> :
|
||||
type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<half> :
|
||||
type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<half> :
|
||||
type_V == GGML_TYPE_F16 ? dequantize_1_f16<half> :
|
||||
nullptr;
|
||||
}
|
||||
|
||||
constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||
return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<float> :
|
||||
type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<float> :
|
||||
type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<float> :
|
||||
type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<float> :
|
||||
type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<float> :
|
||||
type_V == GGML_TYPE_F16 ? dequantize_1_f16<float> :
|
||||
nullptr;
|
||||
template <ggml_type type_V, typename T, int ne>
|
||||
constexpr __device__ dequantize_V_t get_dequantize_V() {
|
||||
if constexpr (type_V == GGML_TYPE_F16) {
|
||||
return dequantize_V_f16<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q4_0) {
|
||||
return dequantize_V_q4_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q4_1) {
|
||||
return dequantize_V_q4_1<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q5_0) {
|
||||
return dequantize_V_q5_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q5_1) {
|
||||
return dequantize_V_q5_1<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_Q8_0) {
|
||||
return dequantize_V_q8_0<T, ne>;
|
||||
} else {
|
||||
static_assert(type_V == -1, "bad type");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template <int ncols1>
|
||||
@@ -870,7 +933,7 @@ void launch_fattn(
|
||||
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
|
||||
|
||||
// Stop trying configurations with more waves if we already have good efficiency to avoid excessive overhead.
|
||||
if (efficiency_percent_best >= 90 && nwaves > nwaves_best) {
|
||||
if (efficiency_percent_best >= 95 && nwaves > nwaves_best) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,495 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // GGML_USE_HIP
|
||||
static __global__ void flash_attn_vec_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
if (ncols > 1) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
|
||||
constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
Q += nb03*sequence + nb02* head + nb01*ic0;
|
||||
K += nb13*sequence + nb12*(head / gqa_ratio);
|
||||
V += nb23*sequence + nb22*(head / gqa_ratio);
|
||||
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ half KQ[ncols*D];
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
half kqmax[ncols];
|
||||
half kqsum[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax[j] = -HALF_MAX_HALF;
|
||||
kqsum[j] = 0.0f;
|
||||
}
|
||||
|
||||
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
||||
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ half maskh_shared[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskh_shared[j*D + tid] = 0.0f;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
|
||||
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
|
||||
half2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
|
||||
if (Q_q8_1) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > ncols && j >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Reuse KQ as temporary storage for converting Q to q8_1:
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
// Set memory to zero if out of bounds:
|
||||
if (ncols > 2 && ic0 + j >= ne01) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
tmp_q_i32[i] = 0;
|
||||
}
|
||||
if (threadIdx.x < D/QK8_1) {
|
||||
tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * Q_f = (const float *) (Q + j*nb01);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
|
||||
quantize_q8_1_to_shared<half2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
|
||||
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
||||
half kqmax_new = kqmax[0];
|
||||
half kqmax_new_arr[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax_new_arr[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum((float)sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
}
|
||||
|
||||
sum += maskh_shared[j*D + i_KQ];
|
||||
|
||||
if (ncols == 1) {
|
||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||
} else {
|
||||
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[j*D + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||
KQ[j*D + tid] = val;
|
||||
|
||||
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const half sink = __float2half(sinksf[head]);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(sink - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale;
|
||||
|
||||
if (tid == 0) {
|
||||
kqsum[j] += val;
|
||||
}
|
||||
|
||||
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqsum[j] = warp_reduce_sum((float)kqsum[j]);
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||
break;
|
||||
}
|
||||
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
|
||||
|
||||
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||
if (gridDim.y == 1) {
|
||||
dst_val /= kqsum[j_VKQ];
|
||||
}
|
||||
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, type_K, type_V, use_logit_softcap>;
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
GGML_ASSERT(K->type == type_K);
|
||||
GGML_ASSERT(V->type == type_V);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
constexpr int cols_per_block = 1;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \
|
||||
template void ggml_cuda_flash_attn_ext_vec_f16_case \
|
||||
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
|
||||
extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
@@ -1,486 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // GGML_USE_HIP
|
||||
static __global__ void flash_attn_vec_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
if (ncols > 1) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
|
||||
constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
Q += nb03*sequence + nb02* head + nb01*ic0;
|
||||
K += nb13*sequence + nb12*(head / gqa_ratio);
|
||||
V += nb23*sequence + nb22*(head / gqa_ratio);
|
||||
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ float KQ[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -FLT_MAX/2.0f;
|
||||
}
|
||||
|
||||
float kqmax[ncols];
|
||||
float kqsum[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax[j] = -FLT_MAX/2.0f;
|
||||
kqsum[j] = 0.0f;
|
||||
}
|
||||
|
||||
__shared__ float kqmax_shared[ncols][WARP_SIZE];
|
||||
__shared__ float kqsum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
|
||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ float maskf_shared[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskf_shared[j*D + tid] = 0.0f;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
|
||||
float2 Q_f2[ncols][D/(2*WARP_SIZE)];
|
||||
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D >= D/(sizeof(int)*QK8_1)];
|
||||
float2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
|
||||
if (Q_q8_1) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > ncols && j >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Reuse KQ as temporary storage for converting Q to q8_1:
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
// Set memory to zero if out of bounds:
|
||||
if (ncols > 2 && ic0 + j >= ne01) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
tmp_q_i32[i] = 0;
|
||||
}
|
||||
if (threadIdx.x < D/QK8_1) {
|
||||
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * Q_f = (const float *) (Q + j*nb01);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
|
||||
quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
|
||||
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
|
||||
Q_f2[j][i0/WARP_SIZE].x *= scale;
|
||||
Q_f2[j][i0/WARP_SIZE].y *= scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float VKQ[ncols] = {0.0f};
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float kqmax_new_arr[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax_new_arr[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
}
|
||||
|
||||
sum += maskf_shared[j*D + i_KQ];
|
||||
|
||||
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[j*D + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float kqmax_new_j = kqmax_new_arr[j];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const float val = expf(KQ[j*D + tid] - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||
KQ[j*D + tid] = val;
|
||||
|
||||
VKQ[j] *= KQ_max_scale;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < D; ++k) {
|
||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float V_ki = dequantize_1_v(V + k*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_ki*KQ[j*D + k];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale;
|
||||
|
||||
if (tid == 0) {
|
||||
kqsum[j] += val;
|
||||
}
|
||||
|
||||
VKQ[j] *= KQ_max_scale;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||
break;
|
||||
}
|
||||
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
float dst_val = VKQ[j_VKQ];
|
||||
if (gridDim.y == 1) {
|
||||
dst_val /= kqsum[j_VKQ];
|
||||
}
|
||||
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, type_K, type_V, use_logit_softcap>;
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
GGML_ASSERT(K->type == type_K);
|
||||
GGML_ASSERT(V->type == type_V);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
constexpr int cols_per_block = 1;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \
|
||||
template void ggml_cuda_flash_attn_ext_vec_f32_case \
|
||||
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
|
||||
extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
593
ggml/src/ggml-cuda/fattn-vec.cuh
Normal file
593
ggml/src/ggml-cuda/fattn-vec.cuh
Normal file
@@ -0,0 +1,593 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
static int ggml_cuda_fattn_vec_get_nthreads_host(const int cc) {
|
||||
return 128;
|
||||
GGML_UNUSED(cc);
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
|
||||
return 128;
|
||||
}
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
|
||||
static __global__ void flash_attn_ext_vec(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
#ifdef GGML_USE_HIP
|
||||
#ifdef RDNA
|
||||
constexpr int nthreads_KQ_q = 2;
|
||||
#else
|
||||
constexpr int nthreads_KQ_q = 4;
|
||||
#endif // RDNA
|
||||
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
|
||||
#else
|
||||
constexpr int nthreads_KQ_q = (D/4 < 32 ? D/4 : 32);
|
||||
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
|
||||
constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
|
||||
constexpr int nthreads_V = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
|
||||
|
||||
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
|
||||
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
|
||||
|
||||
constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
|
||||
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
|
||||
|
||||
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
|
||||
#else
|
||||
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, float, V_rows_per_thread>();
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
Q += nb03*sequence + nb02* head + nb01*ic0;
|
||||
K += nb13*sequence + nb12*(head / gqa_ratio);
|
||||
V += nb23*sequence + nb22*(head / gqa_ratio);
|
||||
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = nthreads / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < nthreads);
|
||||
|
||||
constexpr int ne_KQ = ncols*D;
|
||||
constexpr int ne_combine = nwarps*V_cols_per_iter*D;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
|
||||
__shared__ half KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
|
||||
#else
|
||||
float2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
|
||||
__shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
float KQ_max[ncols];
|
||||
float KQ_sum[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_max[j] = -FLT_MAX/2.0f;
|
||||
KQ_sum[j] = 0.0f;
|
||||
}
|
||||
|
||||
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
|
||||
#else
|
||||
float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
if constexpr (Q_q8_1) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > ncols && j >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Reuse KQ as temporary storage for converting Q to q8_1:
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
// Set memory to zero if out of bounds:
|
||||
if (ncols > 1 && ic0 + j >= ne01) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (i0 + WARP_SIZE <= D/sizeof(int) || i < D/sizeof(int)) {
|
||||
tmp_q_i32[i] = 0;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x < D/QK8_1) {
|
||||
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
|
||||
}
|
||||
} else {
|
||||
const float * Q_f = (const float *) (Q + j*nb01);
|
||||
constexpr int nthreads_quantize = D/sizeof(int) < WARP_SIZE ? D/sizeof(int) : WARP_SIZE;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_quantize) {
|
||||
quantize_q8_1_to_shared<float2, nthreads_quantize>
|
||||
(Q_f + i0*sizeof(int), scale, tmp_q_i32 + i0, tmp_q_ds + i0/QI8_1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
int * tmp_q_i32 = (int *) &KQ[j*D];
|
||||
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_KQ) {
|
||||
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ);
|
||||
|
||||
Q_i32[j][i0/nthreads_KQ] = tmp_q_i32[i];
|
||||
Q_ds[j][i0/nthreads_KQ] = tmp_q_ds[i/QI8_1];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
} else {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 scale_h2 = make_half2(scale, scale);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
const float2 * Q_j = (const float2 *) (Q + j*nb01);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
|
||||
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
|
||||
|
||||
float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
|
||||
if (ncols == 1 || ic0 + j < ne01) {
|
||||
ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne; ++i1) {
|
||||
Q_reg[j][i0/nthreads_KQ + i1] = make_half2(tmp[i1].x, tmp[i1].y);
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
|
||||
Q_reg[j][k] *= scale_h2;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
const float2 * Q_j = (const float2 *) (Q + j*nb01);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
|
||||
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
|
||||
if (ncols == 1 || ic0 + j < ne01) {
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ], &Q_j[i]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
|
||||
Q_reg[j][k].x *= scale;
|
||||
Q_reg[j][k].y *= scale;
|
||||
}
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*nthreads * nb11;
|
||||
V += blockIdx.y*nthreads * nb21;
|
||||
maskh += blockIdx.y*nthreads;
|
||||
for (int k_VKQ_0 = blockIdx.y*nthreads; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nthreads,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*nthreads*nb11, V += gridDim.y*nthreads*nb21, maskh += gridDim.y*nthreads) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
float KQ_reg[ncols]; // KQ in registers.
|
||||
|
||||
float KQ_max_new[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_max_new[j] = KQ_max[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < nthreads_KQ; ++i_KQ_0) {
|
||||
const int i_KQ = threadIdx.y*WARP_SIZE + (nthreads_KQ == WARP_SIZE ? 0 : (threadIdx.x & ~(nthreads_KQ-1))) + i_KQ_0;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum<nthreads_KQ>(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
}
|
||||
|
||||
if (mask) {
|
||||
sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
|
||||
}
|
||||
|
||||
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
|
||||
|
||||
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == i_KQ_0) {
|
||||
KQ_reg[j] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int offset = nthreads_KQ; offset < WARP_SIZE; offset <<= 1) {
|
||||
KQ_max_new[j] = fmaxf(KQ_max_new[j], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[j], offset, WARP_SIZE));
|
||||
}
|
||||
const float KQ_max_scale = expf(KQ_max[j] - KQ_max_new[j]);
|
||||
KQ_max[j] = KQ_max_new[j];
|
||||
|
||||
KQ_reg[j] = expf(KQ_reg[j] - KQ_max[j]);
|
||||
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
|
||||
KQ[j*nthreads + tid] = KQ_reg[j];
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
|
||||
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#ifndef GGML_USE_HIP
|
||||
__syncwarp();
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
|
||||
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 KQ_k[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
half2 tmp[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1] += tmp[i_VKQ_1]*KQ_k[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
float KQ_k[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_k[j] = KQ[j*nthreads + k];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
float2 tmp[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
if (sinks && blockIdx.y == 0) {
|
||||
const float sink = ((const float *) sinks)[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > ncols && j >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float kqmax_new_j = fmaxf(sink, KQ_max[j]);
|
||||
const float KQ_max_scale = expf(KQ_max[j] - kqmax_new_j);
|
||||
KQ_max[j] = kqmax_new_j;
|
||||
|
||||
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
|
||||
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ float KQ_max_shared[ncols][WARP_SIZE];
|
||||
__shared__ float KQ_sum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
KQ_max_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
|
||||
KQ_sum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.x == 0) {
|
||||
KQ_max_shared[j][threadIdx.y] = KQ_max[j];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
if (ncols > 1 && ic0 + j_VKQ >= ne01) {
|
||||
break;
|
||||
}
|
||||
|
||||
float kqmax_new = KQ_max_shared[j_VKQ][threadIdx.x];
|
||||
kqmax_new = warp_reduce_max(kqmax_new);
|
||||
const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
|
||||
KQ_max[j_VKQ] = kqmax_new;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 * VKQ_tmp = (half2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
|
||||
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
|
||||
|
||||
const half2 kqmax_scale_h2 = make_half2(kqmax_scale, kqmax_scale);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j_VKQ][i_VKQ_0/nthreads_V] *= kqmax_scale_h2;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
|
||||
|
||||
ggml_cuda_memcpy_1<V_rows_per_thread*sizeof(half)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
|
||||
}
|
||||
#else
|
||||
float2 * VKQ_tmp = (float2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
|
||||
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
|
||||
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
|
||||
VKQ[j_VKQ][i_VKQ_0/nthreads_V].x *= kqmax_scale;
|
||||
VKQ[j_VKQ][i_VKQ_0/nthreads_V].y *= kqmax_scale;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
|
||||
|
||||
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
|
||||
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ + V_rows_per_thread/4, &VKQ[j_VKQ][i_VKQ_0/nthreads_V + V_rows_per_thread/4]);
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
KQ_sum[j_VKQ] *= kqmax_scale;
|
||||
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
|
||||
if (threadIdx.x == 0) {
|
||||
KQ_sum_shared[j_VKQ][threadIdx.y] = KQ_sum[j_VKQ];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (nthreads <= D || tid < D) {
|
||||
KQ_sum[j_VKQ] = KQ_sum_shared[j_VKQ][threadIdx.x];
|
||||
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += nthreads) {
|
||||
float dst_val = 0;
|
||||
#pragma unroll
|
||||
for (int w = 0; w < nwarps; ++w) {
|
||||
#pragma unroll
|
||||
for (int v = 0; v < V_cols_per_iter; ++v) {
|
||||
dst_val += float(KQ[w*V_cols_per_iter*D + v*D + i0 + tid]);
|
||||
}
|
||||
}
|
||||
if (gridDim.y == 1) {
|
||||
dst_val /= KQ_sum[j_VKQ];
|
||||
}
|
||||
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
|
||||
}
|
||||
}
|
||||
|
||||
if (j_VKQ < ncols-1) {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
|
||||
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
const int nthreads = ggml_cuda_fattn_vec_get_nthreads_host(cc);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_vec<D, cols_per_block, type_K, type_V, use_logit_softcap>;
|
||||
constexpr bool need_f16_K = false;
|
||||
constexpr bool need_f16_V = false;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
GGML_ASSERT(K->type == type_K);
|
||||
GGML_ASSERT(V->type == type_V);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 2;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_FATTN_VEC_CASE(D, type_K, type_V) \
|
||||
template void ggml_cuda_flash_attn_ext_vec_case \
|
||||
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
|
||||
|
||||
#define EXTERN_DECL_FATTN_VEC_CASES(D, type_K) \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_F16); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_0); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_1); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
|
||||
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
|
||||
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-mma-f16.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
#include "fattn-vec.cuh"
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
#include "fattn.cuh"
|
||||
|
||||
@@ -117,151 +116,68 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
|
||||
}
|
||||
}
|
||||
|
||||
#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
|
||||
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
|
||||
return; \
|
||||
} \
|
||||
#define FATTN_VEC_CASE(D, type_K, type_V) \
|
||||
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
|
||||
ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
|
||||
return; \
|
||||
} \
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
#define FATTN_VEC_CASES_ALL_D(type_K, type_V) \
|
||||
FATTN_VEC_CASE( 64, type_K, type_V) \
|
||||
FATTN_VEC_CASE(128, type_K, type_V) \
|
||||
FATTN_VEC_CASE(256, type_K, type_V) \
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * Q = dst->src[0];
|
||||
ggml_tensor * K = dst->src[1];
|
||||
ggml_tensor * V = dst->src[2];
|
||||
|
||||
#ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 )
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_F16)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
|
||||
FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
#else
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
|
||||
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
#define FATTN_VEC_F32_CASE(D, type_K, type_V) \
|
||||
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
|
||||
return; \
|
||||
} \
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * Q = dst->src[0];
|
||||
ggml_tensor * K = dst->src[1];
|
||||
ggml_tensor * V = dst->src[2];
|
||||
|
||||
#ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
|
||||
FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
#else
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
|
||||
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
|
||||
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
|
||||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -271,8 +187,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
|
||||
enum best_fattn_kernel {
|
||||
BEST_FATTN_KERNEL_NONE = 0,
|
||||
BEST_FATTN_KERNEL_TILE = 200,
|
||||
BEST_FATTN_KERNEL_VEC_F32 = 100,
|
||||
BEST_FATTN_KERNEL_VEC_F16 = 110,
|
||||
BEST_FATTN_KERNEL_VEC = 100,
|
||||
BEST_FATTN_KERNEL_WMMA_F16 = 300,
|
||||
BEST_FATTN_KERNEL_MMA_F16 = 400,
|
||||
};
|
||||
@@ -283,7 +198,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
#endif// FLASH_ATTN_AVAILABLE
|
||||
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
@@ -293,8 +207,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
|
||||
const int cc = ggml_cuda_info().devices[device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
switch (K->ne[0]) {
|
||||
case 64:
|
||||
@@ -343,31 +255,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
#ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||
if (K->ne[0] != 128 && K->ne[0] != 64) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
#else
|
||||
if (K->ne[0] != 128) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
#endif // GGML_CUDA_FA_ALL_QUANTS
|
||||
break;
|
||||
default:
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
|
||||
switch (V->type) {
|
||||
case GGML_TYPE_F16:
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (K->ne[0] != 128) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
@@ -377,30 +264,39 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0;
|
||||
|
||||
// If Turing tensor cores available, use them except for some cases with batch size 1:
|
||||
if (turing_mma_available(cc)) {
|
||||
const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations
|
||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192);
|
||||
const bool mma_faster_for_bs1 = gqa_opt_applies && !mma_needs_data_conversion &&
|
||||
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
|
||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
return BEST_FATTN_KERNEL_VEC_F16;
|
||||
best_fattn_kernel best = BEST_FATTN_KERNEL_MMA_F16;
|
||||
|
||||
if (can_use_vector_kernel) {
|
||||
if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
|
||||
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
|
||||
best = BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
} else {
|
||||
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
if (Q->ne[1] <= 2) {
|
||||
best = BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
} else {
|
||||
if (Q->ne[1] == 1) {
|
||||
best = BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((gqa_ratio % 2 != 0 || !mask) && Q->ne[1] == 1) {
|
||||
best = BEST_FATTN_KERNEL_VEC; // GQA-specific optimizations in the mma kernel do not apply.
|
||||
}
|
||||
return BEST_FATTN_KERNEL_VEC_F32;
|
||||
}
|
||||
return BEST_FATTN_KERNEL_MMA_F16;
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
// Use kernels specializes for small batch sizes if possible:
|
||||
// Use kernels specialized for small batch sizes if possible:
|
||||
if (Q->ne[1] <= 8 && can_use_vector_kernel) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
return BEST_FATTN_KERNEL_VEC_F16;
|
||||
}
|
||||
return BEST_FATTN_KERNEL_VEC_F32;
|
||||
return BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
|
||||
// For large batch sizes, use the WMMA kernel if possible:
|
||||
@@ -420,11 +316,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
case BEST_FATTN_KERNEL_TILE:
|
||||
ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_VEC_F32:
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_VEC_F16:
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
case BEST_FATTN_KERNEL_VEC:
|
||||
ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_WMMA_F16:
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
#include "ggml-cuda/sumrows.cuh"
|
||||
#include "ggml-cuda/mean.cuh"
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/topk-moe.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/wkv.cuh"
|
||||
@@ -2030,7 +2031,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
@@ -2038,7 +2039,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
@@ -2110,7 +2111,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
|
||||
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
|
||||
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
@@ -2640,6 +2641,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
||||
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
||||
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
||||
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
|
||||
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -2668,7 +2671,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
|
||||
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
|
||||
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
|
||||
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
|
||||
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
|
||||
// by means of matching node names. See
|
||||
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
|
||||
@@ -2825,6 +2830,44 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
GGML_ASSERT(unary_ops.size() == num_unary);
|
||||
#endif
|
||||
|
||||
//TODO: remove special case once ggml_can_fuse can handle empty nodes
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops = ggml_cuda_topk_moe_ops(false);
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
|
||||
|
||||
if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
|
||||
}
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+8];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
|
||||
}
|
||||
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+4];
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
@@ -2915,6 +2958,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i+8];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i+3];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
|
||||
i += 8;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i+4];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i+3];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_ADD) {
|
||||
int n_fuse = 0;
|
||||
ggml_op ops[8];
|
||||
@@ -3584,9 +3643,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
return true;
|
||||
case GGML_OP_ARGSORT:
|
||||
// TODO: Support arbitrary column width
|
||||
return op->src[0]->ne[0] <= 1024;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
|
||||
@@ -84,7 +84,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
|
||||
|
||||
if (ggml_is_quantized(type)) {
|
||||
return false;
|
||||
@@ -96,8 +96,18 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
|
||||
return false;
|
||||
}
|
||||
if (src1_ncols > 16) {
|
||||
return false;
|
||||
|
||||
if (mul_mat_id) {
|
||||
if (type == GGML_TYPE_F32 && src1_ncols > 32) {
|
||||
return false;
|
||||
}
|
||||
if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (src1_ncols > 16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
|
||||
@@ -9,13 +9,13 @@ using namespace ggml_cuda_mma;
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
|
||||
|
||||
template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
|
||||
static __global__ void mul_mat_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
|
||||
const int ncols, const int ncols_dst_total, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
|
||||
const int stride_col_id, const int stride_row_id,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
@@ -31,9 +31,20 @@ static __global__ void mul_mat_f(
|
||||
|
||||
const int row0 = blockIdx.x * rows_per_block;
|
||||
|
||||
const int expert_idx = has_ids ? blockIdx.y : 0;
|
||||
int expert_idx = 0;
|
||||
int col_base = 0;
|
||||
|
||||
const int channel_dst = has_ids ? 0 : blockIdx.y;
|
||||
|
||||
if constexpr (has_ids) {
|
||||
// experts + tiles of ncols_dst are packed in the y dimension
|
||||
int col_tiles = (ncols_dst_total + cols_per_block - 1) / cols_per_block;
|
||||
const int nchannels_x = gridDim.y / col_tiles;
|
||||
const int tile_idx = blockIdx.y / nchannels_x;
|
||||
expert_idx = blockIdx.y - tile_idx * nchannels_x;
|
||||
col_base = tile_idx * cols_per_block;
|
||||
}
|
||||
|
||||
const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio);
|
||||
const int channel_y = channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
@@ -44,6 +55,14 @@ static __global__ void mul_mat_f(
|
||||
y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y);
|
||||
dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst);
|
||||
|
||||
if constexpr (has_ids) {
|
||||
constexpr int y_stride_scale = std::is_same_v<T, float> ? 1 : 2;
|
||||
const int64_t col_offset = col_base;
|
||||
y += col_offset * stride_col_y * y_stride_scale;
|
||||
dst += col_offset * stride_col_dst;
|
||||
ids += col_offset * stride_row_id;
|
||||
}
|
||||
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
@@ -61,12 +80,17 @@ static __global__ void mul_mat_f(
|
||||
|
||||
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
slot_map[j] = -1;
|
||||
}
|
||||
|
||||
if (col_base + j >= ncols_dst_total) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
|
||||
|
||||
for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
|
||||
int match = id_row[k*stride_col_id] == expert_idx;
|
||||
|
||||
@@ -108,7 +132,8 @@ static __global__ void mul_mat_f(
|
||||
if constexpr (!has_ids) {
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
|
||||
} else {
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
|
||||
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = valid ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
@@ -120,7 +145,8 @@ static __global__ void mul_mat_f(
|
||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
} else {
|
||||
float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
|
||||
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
|
||||
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
}
|
||||
}
|
||||
@@ -183,14 +209,14 @@ static __global__ void mul_mat_f(
|
||||
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
} else {
|
||||
const int slot = (j < cols_per_block) ? slot_map[j] : -1;
|
||||
if (slot >= 0) {
|
||||
if (slot >= 0 && (col_base + j) < ncols_dst_total) {
|
||||
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(x, y, ids, dst,
|
||||
ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
@@ -201,20 +227,23 @@ static __global__ void mul_mat_f(
|
||||
template<typename T, int cols_per_block, int nwarps>
|
||||
static inline void mul_mat_f_switch_ids(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nchannels_dst,
|
||||
const int64_t ncols_x, const int64_t ncols_dst, const int64_t nchannels_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int64_t stride_row_id,
|
||||
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
|
||||
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
|
||||
if (ids) {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
const int64_t col_tiles = (ncols_dst + cols_per_block - 1) / cols_per_block;
|
||||
dim3 block_nums_ids = block_nums;
|
||||
block_nums_ids.y *= col_tiles;
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} else {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols_x, cols_per_block, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
}
|
||||
@@ -223,7 +252,8 @@ static inline void mul_mat_f_switch_ids(
|
||||
template <typename T, int cols_per_block>
|
||||
void mul_mat_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int64_t stride_row_id,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
@@ -268,49 +298,49 @@ void mul_mat_f_cuda(
|
||||
switch (nwarps_best) {
|
||||
case 1: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 1>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 2>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 3>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 4>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 5>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 6>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 7>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 8>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
@@ -332,84 +362,89 @@ static void mul_mat_f_switch_cols_per_block(
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
switch (ncols_dst) {
|
||||
|
||||
const int ncols_case = (ids && ncols_dst > 16) ? 16 : ncols_dst;
|
||||
|
||||
GGML_ASSERT(ids || ncols_dst <= 16);
|
||||
|
||||
switch (ncols_case) {
|
||||
case 1: {
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 9: {
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 10: {
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 11: {
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 12: {
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 13: {
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 14: {
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 15: {
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 16: {
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
@@ -422,7 +457,7 @@ static void mul_mat_f_switch_cols_per_block(
|
||||
#define DECL_MMF_CASE_HELPER(T, ncols_dst) \
|
||||
template void mul_mat_f_cuda<T, ncols_dst>( \
|
||||
const T * x, const float * y, const int32_t * ids, float * dst, \
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
|
||||
const int64_t ncols_x, const int64_t nrows_x, int64_t ncols_dst_total, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
|
||||
const int64_t stride_col_id, const int64_t stride_row_id, \
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\
|
||||
|
||||
@@ -81,7 +81,7 @@ static __global__ void mmq_ids_helper(
|
||||
#pragma unroll
|
||||
for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
|
||||
const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
|
||||
if (threadIdx.x >= offset) {
|
||||
if (threadIdx.x >= static_cast<unsigned int>(offset)) {
|
||||
it_compact_add_lower += tmp;
|
||||
}
|
||||
}
|
||||
@@ -110,7 +110,7 @@ static __global__ void mmq_ids_helper(
|
||||
|
||||
expert_bounds[expert] = nex_prev;
|
||||
|
||||
if (expert < gridDim.x - 1) {
|
||||
if (expert < static_cast<int>(gridDim.x) - 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -220,7 +220,7 @@ static __global__ void mul_mat_vec_q(
|
||||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
|
||||
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,6 +51,8 @@ static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
|
||||
}
|
||||
const float value = *(const float *) (src0_ptr + src_idx * nb00);
|
||||
*(float *) (dst_ptr + i0 * nb0) = value;
|
||||
|
||||
GGML_UNUSED(p1);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f16.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user