mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-03 07:34:07 +00:00
Compare commits
58 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5e2d57b2b2 | ||
|
|
f1648e91cf | ||
|
|
d6c95b0740 | ||
|
|
d76a86d967 | ||
|
|
776f9e59cc | ||
|
|
3d652bfddf | ||
|
|
5220a16d18 | ||
|
|
3ffbbd5ce1 | ||
|
|
42994048a3 | ||
|
|
e9b2f84f14 | ||
|
|
e721c05c93 | ||
|
|
57b6abf85a | ||
|
|
94bb63e4f0 | ||
|
|
f79243992c | ||
|
|
ed4ce0dda2 | ||
|
|
07d1572347 | ||
|
|
5e43f104cc | ||
|
|
16e4b22c5e | ||
|
|
074c4fd39d | ||
|
|
669912d9a5 | ||
|
|
fa31c438e0 | ||
|
|
3ccbfe5a71 | ||
|
|
06a92a193a | ||
|
|
a057897ad4 | ||
|
|
5bbe6a9fe9 | ||
|
|
20a9b8f5e1 | ||
|
|
56d7a9f812 | ||
|
|
1a24c4621f | ||
|
|
becade5de7 | ||
|
|
dfd6b2c0be | ||
|
|
b64d7cc272 | ||
|
|
3d1cf3cf33 | ||
|
|
0cbee131ad | ||
|
|
8371d44595 | ||
|
|
87abb7e903 | ||
|
|
6d4c23b81b | ||
|
|
6512a90037 | ||
|
|
4512055792 | ||
|
|
f54a4ba11e | ||
|
|
aede2074f6 | ||
|
|
2679c3b55d | ||
|
|
c43af9276b | ||
|
|
d5c63cd7f9 | ||
|
|
9660ffef58 | ||
|
|
c950a1f692 | ||
|
|
7b69003af7 | ||
|
|
ece9745bb8 | ||
|
|
cc473cac7c | ||
|
|
14dec0c2f2 | ||
|
|
1782cdfed6 | ||
|
|
45a8e76745 | ||
|
|
80c41ddd8f | ||
|
|
2cc4a5e44a | ||
|
|
06c2b1561d | ||
|
|
70680c48e5 | ||
|
|
c43a3e7996 | ||
|
|
84d5f4bc19 | ||
|
|
438a83926a |
52
.github/workflows/build.yml
vendored
52
.github/workflows/build.yml
vendored
@@ -467,6 +467,7 @@ jobs:
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -476,6 +477,7 @@ jobs:
|
||||
cmake -B build2 -S . \
|
||||
-DCMAKE_C_COMPILER=hipcc \
|
||||
-DCMAKE_CXX_COMPILER=hipcc \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
@@ -710,12 +712,11 @@ jobs:
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
sudo cmake --install build --config Release
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
|
||||
./build-xcframework.sh
|
||||
|
||||
windows-msys2:
|
||||
runs-on: windows-latest
|
||||
@@ -1203,6 +1204,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
@@ -1232,8 +1238,10 @@ jobs:
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
@@ -1252,6 +1260,11 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
@@ -1281,8 +1294,10 @@ jobs:
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
@@ -1321,6 +1336,8 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -1336,15 +1353,40 @@ jobs:
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
sudo cmake --install build --config Release
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
2
.github/workflows/server.yml
vendored
2
.github/workflows/server.yml
vendored
@@ -161,6 +161,8 @@ jobs:
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ matrix.sanitizer == '' }}
|
||||
env:
|
||||
GITHUB_ACTIONS: "true"
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
./tests.sh
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -45,6 +45,8 @@ lcov-report/
|
||||
tags
|
||||
.build/
|
||||
build*
|
||||
release
|
||||
debug
|
||||
!build-info.cmake
|
||||
!build-info.cpp.in
|
||||
!build-info.sh
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
|
||||
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
|
||||
|
||||
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
|
||||
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
|
||||
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
|
||||
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
// swift-tools-version:5.5
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "llama",
|
||||
platforms: [
|
||||
.macOS(.v12),
|
||||
.iOS(.v14),
|
||||
.watchOS(.v4),
|
||||
.tvOS(.v14)
|
||||
],
|
||||
products: [
|
||||
.library(name: "llama", targets: ["llama"]),
|
||||
],
|
||||
targets: [
|
||||
.systemLibrary(name: "llama", pkgConfig: "llama"),
|
||||
]
|
||||
)
|
||||
@@ -5,7 +5,7 @@
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
||||
|
||||
[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
@@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
@@ -157,6 +157,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <llama.h>
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
module llama [system] {
|
||||
header "llama.h"
|
||||
link "llama"
|
||||
export *
|
||||
}
|
||||
519
build-xcframework.sh
Executable file
519
build-xcframework.sh
Executable file
@@ -0,0 +1,519 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Options
|
||||
IOS_MIN_OS_VERSION=16.4
|
||||
MACOS_MIN_OS_VERSION=13.3
|
||||
VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
GGML_METAL_USE_BF16=ON
|
||||
GGML_OPENMP=OFF
|
||||
|
||||
COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
|
||||
# Common options for all builds
|
||||
COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
|
||||
-DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
|
||||
-DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
-DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
|
||||
-DGGML_NATIVE=OFF
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
|
||||
if ! command -v $tool &> /dev/null; then
|
||||
echo "Error: $tool is required but not found."
|
||||
echo "$install_message"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "Checking for required tools..."
|
||||
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
|
||||
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
|
||||
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
|
||||
set -e
|
||||
|
||||
## Clean up previous builds
|
||||
rm -rf build-apple
|
||||
rm -rf build-ios-sim
|
||||
rm -rf build-ios-device
|
||||
rm -rf build-macos
|
||||
rm -rf build-visionos
|
||||
rm -rf build-visionos-sim
|
||||
rm -rf build-tvos-sim
|
||||
rm -rf build-tvos-device
|
||||
|
||||
# Setup the xcframework build directory structure
|
||||
setup_framework_structure() {
|
||||
local build_dir=$1
|
||||
local min_os_version=$2
|
||||
local platform=$3 # "ios", "macos", "visionos", or "tvos"
|
||||
local framework_name="llama"
|
||||
|
||||
echo "Creating ${platform}-style framework structure for ${build_dir}"
|
||||
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS versioned structure uses versioned directories
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
|
||||
|
||||
# Create symbolic links
|
||||
ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
|
||||
ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
|
||||
ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
|
||||
else
|
||||
# iOS/VisionOS/tvOS use a flat structure
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
|
||||
# Remove any existing structure to ensure clean build
|
||||
rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
|
||||
fi
|
||||
|
||||
# Copy all required headers (common for all platforms)
|
||||
cp include/llama.h ${header_path}
|
||||
cp ggml/include/ggml.h ${header_path}
|
||||
cp ggml/include/ggml-alloc.h ${header_path}
|
||||
cp ggml/include/ggml-backend.h ${header_path}
|
||||
cp ggml/include/ggml-metal.h ${header_path}
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
framework module llama {
|
||||
header "llama.h"
|
||||
header "ggml.h"
|
||||
header "ggml-alloc.h"
|
||||
header "ggml-backend.h"
|
||||
header "ggml-metal.h"
|
||||
header "ggml-cpu.h"
|
||||
header "ggml-blas.h"
|
||||
header "gguf.h"
|
||||
|
||||
link "c++"
|
||||
link framework "Accelerate"
|
||||
link framework "Metal"
|
||||
link framework "Foundation"
|
||||
|
||||
export *
|
||||
}
|
||||
EOF
|
||||
|
||||
# Platform-specific settings for Info.plist
|
||||
local platform_name=""
|
||||
local sdk_name=""
|
||||
local supported_platform=""
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
platform_name="iphoneos"
|
||||
sdk_name="iphoneos${min_os_version}"
|
||||
supported_platform="iPhoneOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>1</integer>
|
||||
<integer>2</integer>
|
||||
</array>'
|
||||
;;
|
||||
"macos")
|
||||
platform_name="macosx"
|
||||
sdk_name="macosx${min_os_version}"
|
||||
supported_platform="MacOSX"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"visionos")
|
||||
platform_name="xros"
|
||||
sdk_name="xros${min_os_version}"
|
||||
supported_platform="XRPlatform"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"tvos")
|
||||
platform_name="appletvos"
|
||||
sdk_name="appletvos${min_os_version}"
|
||||
supported_platform="AppleTVOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>3</integer>
|
||||
</array>'
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create Info.plist
|
||||
cat > ${plist_path} << EOF
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>en</string>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>llama</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>org.ggml.llama</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>llama</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>FMWK</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
<key>MinimumOSVersion</key>
|
||||
<string>${min_os_version}</string>
|
||||
<key>CFBundleSupportedPlatforms</key>
|
||||
<array>
|
||||
<string>${supported_platform}</string>
|
||||
</array>${device_family}
|
||||
<key>DTPlatformName</key>
|
||||
<string>${platform_name}</string>
|
||||
<key>DTSDKName</key>
|
||||
<string>${sdk_name}</string>
|
||||
</dict>
|
||||
</plist>
|
||||
EOF
|
||||
}
|
||||
|
||||
# Create dynamic libraries from static libraries.
|
||||
combine_static_libraries() {
|
||||
local build_dir="$1"
|
||||
local release_dir="$2"
|
||||
local platform="$3" # "ios", "macos", "visionos", or "tvos"
|
||||
local is_simulator="$4"
|
||||
local base_dir="$(pwd)"
|
||||
local framework_name="llama"
|
||||
|
||||
# Determine output path based on platform
|
||||
local output_lib=""
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS uses versioned structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
|
||||
else
|
||||
# iOS, visionOS, and tvOS use a directory flat structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
|
||||
fi
|
||||
|
||||
local libs=(
|
||||
"${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
)
|
||||
|
||||
# Create temporary directory for processing
|
||||
local temp_dir="${base_dir}/${build_dir}/temp"
|
||||
mkdir -p "${temp_dir}"
|
||||
|
||||
# Since we have multiple architectures libtool will find object files that do not
|
||||
# match the target architecture. We suppress these warnings.
|
||||
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
|
||||
|
||||
# Determine SDK, architectures, and install_name based on platform and simulator flag.
|
||||
local sdk=""
|
||||
local archs=""
|
||||
local min_version_flag=""
|
||||
local install_name=""
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="iphonesimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="iphoneos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
"macos")
|
||||
sdk="macosx"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
|
||||
install_name="@rpath/llama.framework/Versions/Current/llama"
|
||||
;;
|
||||
"visionos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="xrsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
|
||||
else
|
||||
sdk="xros"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
# Use flat structure for visionOS, same as iOS
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
"tvos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="appletvsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="appletvos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/llama.framework/llama"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Build architecture flags
|
||||
local arch_flags=""
|
||||
for arch in $archs; do
|
||||
arch_flags+=" -arch $arch"
|
||||
done
|
||||
|
||||
# Create dynamic library
|
||||
echo "Creating dynamic library for ${platform}."
|
||||
xcrun -sdk $sdk clang++ -dynamiclib \
|
||||
-isysroot $(xcrun --sdk $sdk --show-sdk-path) \
|
||||
$arch_flags \
|
||||
$min_version_flag \
|
||||
-Wl,-force_load,"${temp_dir}/combined.a" \
|
||||
-framework Foundation -framework Metal -framework Accelerate \
|
||||
-install_name "$install_name" \
|
||||
-o "${base_dir}/${output_lib}"
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
echo "Warning: vtool not found. Binary may not pass App Store validation."
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Creating properly formatted dSYM..."
|
||||
# Create a separate directory for dSYMs for all platforms
|
||||
mkdir -p "${base_dir}/${build_dir}/dSYMs"
|
||||
|
||||
# iOS and visionOS style dSYM (flat structure)
|
||||
if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
|
||||
|
||||
# Create a copy of the binary that will be stripped
|
||||
cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
|
||||
|
||||
# Strip debug symbols from the copy
|
||||
xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Replace the original with the stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
else
|
||||
# macOS style dSYM
|
||||
# First strip debug info to a separate file
|
||||
xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
|
||||
|
||||
# Replace original binary with stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
fi
|
||||
|
||||
# Remove any automatically generated dSYM files in the framework structure as they will
|
||||
# otherwise case Invalid Bundle Structure validation errors.
|
||||
if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
|
||||
echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
|
||||
rm -rf "${base_dir}/${output_lib}.dSYM"
|
||||
fi
|
||||
|
||||
# Clean up
|
||||
rm -rf "${temp_dir}"
|
||||
}
|
||||
|
||||
echo "Building for iOS simulator..."
|
||||
cmake -B build-ios-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DIOS=ON \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_SYSROOT=iphonesimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for iOS devices..."
|
||||
cmake -B build-ios-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_SYSROOT=iphoneos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
echo "Building for macOS..."
|
||||
cmake -B build-macos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS..."
|
||||
cmake -B build-visionos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xros \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS simulator..."
|
||||
cmake -B build-visionos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xrsimulator \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
|
||||
# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
|
||||
echo "Building for tvOS simulator..."
|
||||
cmake -B build-tvos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvsimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for tvOS devices..."
|
||||
cmake -B build-tvos-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
|
||||
# Setup frameworks and copy binaries and headers
|
||||
echo "Setting up framework structures..."
|
||||
setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
|
||||
setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
|
||||
# Create dynamic libraries from static libraries
|
||||
echo "Creating dynamic libraries from static libraries..."
|
||||
combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
|
||||
combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
|
||||
combine_static_libraries "build-macos" "Release" "macos" "false"
|
||||
combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
|
||||
combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
|
||||
combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
|
||||
combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
|
||||
|
||||
# Create XCFramework with correct debug symbols paths
|
||||
echo "Creating XCFramework..."
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-ios-device/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-macos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-tvos-device/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-tvos-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
|
||||
-output $(pwd)/build-apple/llama.xcframework
|
||||
@@ -813,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
||||
add_opt(common_arg(
|
||||
{"-p", "--prompt"}, "PROMPT",
|
||||
ex == LLAMA_EXAMPLE_MAIN
|
||||
? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
|
||||
: "prompt to start generation with",
|
||||
"prompt to start generation with; for system message, use -sys",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.prompt = value;
|
||||
}
|
||||
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-sys", "--system-prompt"}, "PROMPT",
|
||||
"system prompt to use with model (if applicable, depending on chat template)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.system_prompt = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--no-perf"},
|
||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||
@@ -944,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"-st", "--single-turn"},
|
||||
"run conversation for a single turn only, then exit when done\n"
|
||||
"will not be interactive if first turn is predefined with --prompt\n"
|
||||
"(default: false)",
|
||||
[](common_params & params) {
|
||||
params.single_turn = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"-i", "--interactive"},
|
||||
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
||||
@@ -2447,6 +2461,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.vocoder.use_guide_tokens = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--tts-speaker-file"}, "FNAME",
|
||||
"speaker file path for audio generation",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.speaker_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS}));
|
||||
|
||||
// model-specific
|
||||
add_opt(common_arg(
|
||||
|
||||
437
common/chat.cpp
437
common/chat.cpp
@@ -449,12 +449,6 @@ std::string common_chat_format_name(common_chat_format format) {
|
||||
}
|
||||
}
|
||||
|
||||
const common_grammar_options grammar_options {
|
||||
/* .dotall = */ false,
|
||||
/* .compact_spaces = */ false,
|
||||
// /* .compact_spaces = */ true,
|
||||
};
|
||||
|
||||
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
@@ -500,6 +494,34 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons
|
||||
}
|
||||
}
|
||||
|
||||
static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
|
||||
auto expected_it = expected.begin();
|
||||
auto tmp_it = it;
|
||||
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
|
||||
++tmp_it;
|
||||
++expected_it;
|
||||
}
|
||||
if (expected_it == expected.end()) {
|
||||
it = tmp_it;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static std::optional<std::smatch> parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) {
|
||||
std::smatch match;
|
||||
if (std::regex_match(it, end, match, expected)) {
|
||||
it = match.suffix().first;
|
||||
return match;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) {
|
||||
while (it != end && std::isspace(*it)) {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
|
||||
@@ -509,7 +531,8 @@ static common_chat_msg parse_json_tool_calls(
|
||||
const std::string& input,
|
||||
const std::optional<std::regex> & trigger_opt,
|
||||
const std::regex & function_regex,
|
||||
const std::regex & close_regex) {
|
||||
const std::regex & close_regex,
|
||||
bool allow_raw_python = false) {
|
||||
std::smatch match;
|
||||
|
||||
common_chat_msg result;
|
||||
@@ -540,14 +563,19 @@ static common_chat_msg parse_json_tool_calls(
|
||||
it = rit->suffix().first;
|
||||
|
||||
json arguments;
|
||||
if (!parse_json(it, end, arguments)) {
|
||||
if (parse_json(it, end, arguments)) {
|
||||
if (!std::regex_search(it, end, match, close_regex)) {
|
||||
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
|
||||
}
|
||||
it = match.suffix().first;
|
||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
||||
} else {
|
||||
if (allow_raw_python && name == "python") {
|
||||
result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""});
|
||||
break;
|
||||
}
|
||||
throw std::runtime_error("Failed to parse json tool call arguments: " + input);
|
||||
}
|
||||
if (!std::regex_search(it, end, match, close_regex)) {
|
||||
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
|
||||
}
|
||||
it = match.suffix().first;
|
||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
||||
}
|
||||
|
||||
if (!result.tool_calls.empty()) {
|
||||
@@ -559,29 +587,29 @@ static common_chat_msg parse_json_tool_calls(
|
||||
return result;
|
||||
}
|
||||
|
||||
static common_chat_tool_call process_tool_call(const json & tool_call) {
|
||||
const auto & arguments = tool_call.at("arguments");
|
||||
return {
|
||||
/* .name = */ tool_call.at("name"),
|
||||
/* .arguments = */ arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
/* .id = */ tool_call.contains("id") ? tool_call.at("id") : "",
|
||||
};
|
||||
}
|
||||
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
|
||||
auto content_end = input.find(prefix);
|
||||
size_t tc_start = std::string::npos;
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
const auto process_tool_calls = [&](const json & tool_calls) {
|
||||
for (const auto & tool_call : tool_calls) {
|
||||
const auto & arguments = tool_call.at("arguments");
|
||||
result.tool_calls.push_back({
|
||||
tool_call.at("name"),
|
||||
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
tool_call.contains("id") ? tool_call.at("id") : "",
|
||||
});
|
||||
}
|
||||
};
|
||||
if (content_end == std::string::npos) {
|
||||
result.content = input;
|
||||
} else {
|
||||
tc_start = content_end + prefix.size() - rstrip_prefix;
|
||||
result.content = input.substr(0, content_end);
|
||||
auto tool_calls = json::parse(input.substr(tc_start));
|
||||
process_tool_calls(tool_calls);
|
||||
for (const auto & tool_call : tool_calls) {
|
||||
result.tool_calls.emplace_back(process_tool_call(tool_call));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -700,7 +728,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
||||
data.grammar_lazy = false;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
builder.add_schema("root", schema);
|
||||
}, grammar_options);
|
||||
});
|
||||
|
||||
auto tweaked_messages = common_chat_template::add_system(
|
||||
inputs.messages,
|
||||
@@ -770,8 +798,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
|
||||
});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
|
||||
data.preserved_tokens = {
|
||||
"[TOOL_CALLS]",
|
||||
};
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
||||
return data;
|
||||
@@ -813,14 +844,18 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
"<|START_ACTION|>",
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<|START_ACTION|>",
|
||||
"<|END_ACTION|>",
|
||||
"<|START_RESPONSE|>",
|
||||
"<|END_RESPONSE|>",
|
||||
"<|START_THINKING|>",
|
||||
"<|END_THINKING|>",
|
||||
"<|END_ACTION|>",
|
||||
};
|
||||
auto adjusted_messages = json::array();
|
||||
for (const auto & msg : inputs.messages) {
|
||||
@@ -840,9 +875,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
|
||||
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
|
||||
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
|
||||
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
|
||||
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
|
||||
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
|
||||
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
|
||||
|
||||
std::smatch match;
|
||||
|
||||
@@ -945,23 +980,23 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
|
||||
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
|
||||
builder.add_schema(name + "-args", parameters) +
|
||||
" \"}\""));
|
||||
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
}, grammar_options);
|
||||
});
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
||||
{"tools_in_user_message", false},
|
||||
@@ -974,33 +1009,33 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
|
||||
}
|
||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
||||
// TODO: tighten & simplify the parser, don't accept leading text context.
|
||||
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
|
||||
static std::regex close_regex("\\}");
|
||||
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
|
||||
static std::regex function_regex(
|
||||
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
|
||||
static std::regex close_regex("\\}\\s*");
|
||||
static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
|
||||
|
||||
if (with_builtin_tools) {
|
||||
std::smatch match;
|
||||
if (std::regex_match(input, match, builtin_call_regex)) {
|
||||
auto name = match[1].str();
|
||||
auto raw_args = match[2].str();
|
||||
try {
|
||||
auto name = match[1].str();
|
||||
auto arg_name = match[2].str();
|
||||
auto arg_value_str = match[3].str();
|
||||
auto arg_value = json::parse(arg_value_str);
|
||||
|
||||
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
|
||||
auto it_eq = raw_args.find('=');
|
||||
auto arg_name = raw_args.substr(0, it_eq);
|
||||
auto arg_value_str = raw_args.substr(it_eq + 1);
|
||||
auto arg_value = json::parse(arg_value_str);
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.content = match.prefix().str();
|
||||
msg.tool_calls.push_back({
|
||||
/* .name = */ name,
|
||||
/* .arguments = */ (json {
|
||||
{arg_name, arg_value},
|
||||
}).dump(),
|
||||
/* .id = */ "",
|
||||
});
|
||||
return msg;
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.tool_calls.push_back({
|
||||
/* .name = */ name,
|
||||
/* .arguments = */ (json {
|
||||
{arg_name, arg_value},
|
||||
}).dump(),
|
||||
/* .id = */ "",
|
||||
});
|
||||
return msg;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||
@@ -1017,10 +1052,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
tool_rules.push_back(builder.add_rule(name + "-call",
|
||||
"\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n"
|
||||
"```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
|
||||
"```json\\n\" " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"```<|tool▁call▁end|>\""));
|
||||
});
|
||||
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
|
||||
// so we accept common variants (then it's all constrained)
|
||||
@@ -1029,18 +1064,20 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
|
||||
"\"<|tool▁calls▁end|>\""
|
||||
" space");
|
||||
data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<|tool▁calls▁begin|>",
|
||||
"<|tool▁call▁begin|>",
|
||||
"<|tool▁sep|>",
|
||||
"<|tool▁calls▁end|",
|
||||
"<|tool▁call▁end|>",
|
||||
"<|tool▁calls▁end|",
|
||||
};
|
||||
}, grammar_options);
|
||||
});
|
||||
}
|
||||
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
|
||||
@@ -1129,8 +1166,11 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
|
||||
});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["});
|
||||
data.preserved_tokens = {
|
||||
" functools[",
|
||||
};
|
||||
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
@@ -1158,11 +1198,28 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
|
||||
first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule));
|
||||
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
|
||||
data.grammar_triggers.push_back({name, /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
regex_escape(name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
regex_escape("assistant<|end_header_id|>\n" + name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
regex_escape(">>>" + name + "\n"),
|
||||
});
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
">>>assistant<|end_header_id|>\n" + name,
|
||||
});
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<|end_header_id|>",
|
||||
};
|
||||
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
|
||||
if (inputs.parallel_tool_calls) {
|
||||
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
|
||||
@@ -1171,34 +1228,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
builder.add_rule("root", first_rule);
|
||||
}
|
||||
|
||||
}, grammar_options);
|
||||
});
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
|
||||
auto expected_it = expected.begin();
|
||||
auto tmp_it = it;
|
||||
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
|
||||
++tmp_it;
|
||||
++expected_it;
|
||||
}
|
||||
if (expected_it == expected.end()) {
|
||||
it = tmp_it;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
|
||||
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
|
||||
static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
|
||||
static std::regex close_regex(R"($|(?=>>>))");
|
||||
|
||||
std::string content;
|
||||
auto it = input.begin();
|
||||
const auto end = input.end();
|
||||
|
||||
if (consume(it, end, "all\n")) {
|
||||
if (parse_literal(it, end, "all\n")) {
|
||||
std::smatch match;
|
||||
if (std::regex_search(it, end, match, function_regex)) {
|
||||
auto fun_it = match.prefix().second;
|
||||
@@ -1213,7 +1256,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
|
||||
}
|
||||
// TODO: tighten & simplify.
|
||||
try {
|
||||
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
|
||||
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true);
|
||||
res.content = content + res.content;
|
||||
return res;
|
||||
} catch (const std::exception & e) {
|
||||
@@ -1266,12 +1309,13 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
||||
});
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
// TODO: if (has_raw_python)
|
||||
@@ -1306,6 +1350,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
std::vector<std::string> tool_call_alts;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
@@ -1319,57 +1364,173 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
||||
}},
|
||||
{"required", json::array({"name", "arguments"})},
|
||||
}));
|
||||
tool_call_alts.push_back(builder.add_rule(
|
||||
name + "-function-tag",
|
||||
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
|
||||
builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"</function>\" space"));
|
||||
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
"<function=" + name + ">",
|
||||
});
|
||||
auto escaped_name = regex_escape(name);
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
|
||||
});
|
||||
});
|
||||
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
|
||||
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
|
||||
std::vector<std::string> alt_tags {
|
||||
any_tool_call,
|
||||
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
|
||||
// The rest is just to accommodate common "good bad" outputs.
|
||||
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
|
||||
"\"<response>\" space " + any_tool_call + " \"</response>\"",
|
||||
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
|
||||
"\"<json>\" space " + any_tool_call + " \"</json>\"",
|
||||
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
|
||||
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
|
||||
};
|
||||
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
|
||||
tool_call_alts.push_back(wrappable_tool_call);
|
||||
tool_call_alts.push_back(
|
||||
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
|
||||
data.preserved_tokens = { "</tool_call>" };
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function"});
|
||||
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<tool_call>",
|
||||
"</tool_call>",
|
||||
"<function",
|
||||
"<tools>",
|
||||
"</tools>",
|
||||
"<response>",
|
||||
"</response>",
|
||||
"<function_call>",
|
||||
"</function_call>",
|
||||
"<json>",
|
||||
"</json>",
|
||||
"<JSON>",
|
||||
"</JSON>",
|
||||
"```",
|
||||
"```json",
|
||||
"```xml",
|
||||
};
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
|
||||
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input) {
|
||||
const static std::regex open_regex(
|
||||
"(?:"
|
||||
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
|
||||
"(<tool_call>" // match 2 (open_tag)
|
||||
"|<function_call>"
|
||||
"|<tool>"
|
||||
"|<tools>"
|
||||
"|<response>"
|
||||
"|<json>"
|
||||
"|<xml>"
|
||||
"|<JSON>"
|
||||
")?"
|
||||
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
|
||||
")"
|
||||
"|"
|
||||
"(?:<function=([^>]+)>" // match 4 (function name)
|
||||
"|<function name=\"([^\"]+)\">)" // match 5 (function name again)
|
||||
"([\\s\\S]*)" // match 6 (function arguments + rest)})"
|
||||
);
|
||||
|
||||
try {
|
||||
std::regex start_pattern(R"([\n\s]*<tool_call>)");
|
||||
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
|
||||
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
|
||||
auto end = input.end();
|
||||
std::sregex_iterator rend;
|
||||
std::sregex_iterator rit(input.begin(), end, start_pattern);
|
||||
if (rit == rend) {
|
||||
msg.content = input;
|
||||
return msg;
|
||||
}
|
||||
std::string::const_iterator it = input.begin();
|
||||
const std::string::const_iterator end = input.end();
|
||||
std::smatch match;
|
||||
|
||||
msg.content = rit->prefix();
|
||||
|
||||
auto it = rit->suffix().first;
|
||||
while (it != end) {
|
||||
json call;
|
||||
if (!parse_json(it, end, call)) {
|
||||
throw std::runtime_error("Failed to parse json tool call");
|
||||
}
|
||||
const auto & arguments = call.at("arguments");
|
||||
msg.tool_calls.push_back({
|
||||
call.at("name"),
|
||||
arguments.dump(),
|
||||
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
/* id= */ "",
|
||||
});
|
||||
rit = {it, end, middle_pattern};
|
||||
if (rit != rend) {
|
||||
it = rit->suffix().first;
|
||||
} else {
|
||||
rit = {it, end, end_pattern};
|
||||
if (rit == rend) {
|
||||
throw std::runtime_error("Malformed input, missing </tool_call>");
|
||||
if (std::regex_search(it, end, match, open_regex)) {
|
||||
// Add content before the match
|
||||
msg.content += std::string(it, match[0].first);
|
||||
|
||||
auto block_start = match[1].str();
|
||||
std::string block_end = block_start.empty() ? "" : "```";
|
||||
|
||||
auto open_tag = match[2].str();
|
||||
std::string close_tag;
|
||||
|
||||
if (match[3].matched) {
|
||||
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
|
||||
auto json_it = match[3].first;
|
||||
json tool_call;
|
||||
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
|
||||
|
||||
msg.tool_calls.emplace_back(process_tool_call(tool_call));
|
||||
it = json_it; // Move iterator past parsed JSON
|
||||
|
||||
// Handle close tags
|
||||
consume_spaces(it, end);
|
||||
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
|
||||
throw std::runtime_error("Failed to parse closing tag");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
|
||||
throw std::runtime_error("Failed to parse block end");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
} else {
|
||||
// Not a valid tool call, treat as content
|
||||
msg.content += std::string(match[0].first, match[0].second);
|
||||
it = match[0].second;
|
||||
}
|
||||
} else {
|
||||
auto function_name = match[4].str();
|
||||
if (function_name.empty()) {
|
||||
function_name = match[5].str();
|
||||
}
|
||||
GGML_ASSERT(!function_name.empty());
|
||||
|
||||
close_tag = "</function>";
|
||||
// Start parsing from after the opening tags
|
||||
auto json_it = match[6].first;
|
||||
json arguments;
|
||||
if (parse_json(json_it, end, arguments)) {
|
||||
msg.tool_calls.emplace_back(process_tool_call({
|
||||
{"name", function_name},
|
||||
{"arguments", arguments},
|
||||
}));
|
||||
it = json_it; // Move iterator past parsed JSON
|
||||
|
||||
// Handle close tags
|
||||
consume_spaces(it, end);
|
||||
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
|
||||
throw std::runtime_error("Failed to parse closing tag");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
|
||||
throw std::runtime_error("Failed to parse block end");
|
||||
}
|
||||
consume_spaces(it, end);
|
||||
} else {
|
||||
// Not a valid tool call, treat as content
|
||||
msg.content += std::string(match[0].first, match[0].second);
|
||||
it = match[0].second;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Add remaining content
|
||||
msg.content += std::string(it, end);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -483,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
}
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
||||
std::ostringstream result;
|
||||
for (size_t i = 0; i < values.size(); ++i) {
|
||||
@@ -2026,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
return result;
|
||||
}
|
||||
|
||||
template <>
|
||||
json common_grammar_trigger::to_json() const {
|
||||
json out {
|
||||
{"type", (int) type},
|
||||
{"value", value},
|
||||
};
|
||||
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <>
|
||||
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
|
||||
common_grammar_trigger out;
|
||||
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
out.value = in.at("value").get<std::string>();
|
||||
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
@@ -110,9 +110,21 @@ enum common_conversation_mode {
|
||||
COMMON_CONVERSATION_MODE_AUTO = 2,
|
||||
};
|
||||
|
||||
enum common_grammar_trigger_type {
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
};
|
||||
|
||||
struct common_grammar_trigger {
|
||||
std::string word;
|
||||
bool at_start;
|
||||
common_grammar_trigger_type type;
|
||||
std::string value;
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
|
||||
// T can only be nlohmann::ordered_json
|
||||
template <class T> T to_json() const;
|
||||
template <class T> static common_grammar_trigger from_json(const T & in);
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
@@ -163,8 +175,7 @@ struct common_params_sampling {
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
|
||||
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
|
||||
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
|
||||
std::set<llama_token> preserved_tokens;
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
@@ -200,6 +211,8 @@ struct common_params_vocoder {
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
|
||||
std::string speaker_file = ""; // speaker file path // NOLINT
|
||||
|
||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
||||
};
|
||||
|
||||
@@ -261,6 +274,7 @@ struct common_params {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||
@@ -325,6 +339,8 @@ struct common_params {
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
|
||||
@@ -453,6 +469,8 @@ std::string string_repeat(const std::string & str, size_t n);
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||
|
||||
std::string regex_escape(const std::string & s);
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
|
||||
|
||||
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
||||
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||
}
|
||||
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
|
||||
|
||||
struct BuiltinRule {
|
||||
std::string content;
|
||||
@@ -764,11 +764,10 @@ private:
|
||||
public:
|
||||
SchemaConverter(
|
||||
const std::function<json(const std::string &)> & fetch_json,
|
||||
bool dotall,
|
||||
bool compact_spaces)
|
||||
bool dotall)
|
||||
: _fetch_json(fetch_json), _dotall(dotall)
|
||||
{
|
||||
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
|
||||
_rules["space"] = SPACE_RULE;
|
||||
}
|
||||
|
||||
void resolve_refs(json & schema, const std::string & url) {
|
||||
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
||||
}
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
|
||||
common_grammar_builder builder {
|
||||
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
||||
return converter._add_rule(name, rule);
|
||||
|
||||
@@ -16,7 +16,6 @@ struct common_grammar_builder {
|
||||
|
||||
struct common_grammar_options {
|
||||
bool dotall = false;
|
||||
bool compact_spaces = false;
|
||||
};
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <algorithm>
|
||||
|
||||
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
@@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
} else {
|
||||
std::vector<const char *> trigger_words;
|
||||
trigger_words.reserve(params.grammar_trigger_words.size());
|
||||
for (const auto & str : params.grammar_trigger_words) {
|
||||
trigger_words.push_back(str.word.c_str());
|
||||
std::vector<std::string> patterns_at_start;
|
||||
std::vector<std::string> patterns_anywhere;
|
||||
std::vector<llama_token> trigger_tokens;
|
||||
for (const auto & trigger : params.grammar_triggers) {
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
patterns_anywhere.push_back(regex_escape(word));
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
|
||||
{
|
||||
const auto & pattern = trigger.value;
|
||||
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
|
||||
{
|
||||
const auto token = trigger.token;
|
||||
trigger_tokens.push_back(token);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown trigger type");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> trigger_patterns;
|
||||
if (!patterns_at_start.empty()) {
|
||||
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
if (!patterns_anywhere.empty()) {
|
||||
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
|
||||
}
|
||||
|
||||
std::vector<const char *> trigger_patterns_c;
|
||||
trigger_patterns_c.reserve(trigger_patterns.size());
|
||||
for (const auto & regex : trigger_patterns) {
|
||||
trigger_patterns_c.push_back(regex.c_str());
|
||||
}
|
||||
|
||||
grmr = params.grammar_lazy
|
||||
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
|
||||
trigger_words.data(), trigger_words.size(),
|
||||
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
|
||||
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size())
|
||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
@@ -699,6 +699,9 @@ class Model:
|
||||
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
|
||||
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||
res = "deepseek-r1-qwen"
|
||||
if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
|
||||
# ref: https://huggingface.co/Xenova/gpt-4o
|
||||
res = "gpt-4o"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2512,7 +2515,8 @@ class Phi3MiniModel(Model):
|
||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rope_dims = n_embd // n_head
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
self.gguf_writer.add_context_length(max_pos_embds)
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
||||
@@ -2536,7 +2540,8 @@ class Phi3MiniModel(Model):
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rope_dims = n_embd // n_head
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
# write rope scaling for long context (128k) model
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
@@ -2565,7 +2570,7 @@ class Phi3MiniModel(Model):
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
|
||||
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
||||
|
||||
@@ -109,6 +109,7 @@ models = [
|
||||
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
|
||||
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
|
||||
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
|
||||
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
|
||||
]
|
||||
|
||||
|
||||
@@ -131,6 +132,10 @@ def download_model(model):
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
|
||||
if name == "gpt-4o":
|
||||
# Xenova/gpt-4o is tokenizer-only, it does not contain config.json
|
||||
files = ["tokenizer.json", "tokenizer_config.json"]
|
||||
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
files.append("tokenizer.model")
|
||||
|
||||
|
||||
@@ -235,6 +235,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
|
||||
|
||||
The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
|
||||
|
||||
As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
|
||||
|
||||
Note that if you get the following error:
|
||||
```
|
||||
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
|
||||
|
||||
@@ -287,30 +287,32 @@ Here are some models known to work (w/ chat template override when needed):
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
|
||||
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
|
||||
|
||||
# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
|
||||
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
|
||||
|
||||
# Native support requires the right template for these GGUFs:
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
|
||||
--chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
|
||||
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
|
||||
--chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
|
||||
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
|
||||
--chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
|
||||
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
|
||||
--chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
|
||||
|
||||
llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
|
||||
--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
|
||||
--chat-template-file models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
|
||||
|
||||
# Generic format support
|
||||
llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
|
||||
@@ -318,6 +320,8 @@ llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
|
||||
llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
|
||||
```
|
||||
|
||||
To get the official template from original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see examples invocations in [models/templates/README.md](../models/templates/README.md))
|
||||
|
||||
> [!TIP]
|
||||
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <ctime>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
||||
@@ -195,7 +195,7 @@ class BuiltinRule:
|
||||
self.deps = deps or []
|
||||
|
||||
# Constraining spaces to prevent model "running away".
|
||||
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
|
||||
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
|
||||
|
||||
PRIMITIVE_RULES = {
|
||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||
|
||||
@@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
const auto tokens_list = common_tokenize(context, text, true, parse_special);
|
||||
|
||||
auto n_ctx = llama_n_ctx(context);
|
||||
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
auto n_kv_req = tokens_list.size() + n_len;
|
||||
|
||||
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
|
||||
|
||||
|
||||
@@ -5,6 +5,21 @@ point for more advanced projects.
|
||||
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
|
||||
|
||||
|
||||
### Building
|
||||
First llama.cpp need to be built and a XCFramework needs to be created. This can be done by running
|
||||
the following script from the llama.cpp project root:
|
||||
```console
|
||||
$ ./build-xcframework.sh
|
||||
```
|
||||
Open `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on
|
||||
a simulator or a real device.
|
||||
|
||||
To use the framework with a different project, the XCFramework can be added to the project by
|
||||
adding `build-ios/llama.xcframework` by dragging and dropping it into the project navigator, or
|
||||
by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section
|
||||
of the project settings.
|
||||
|
||||

|
||||
|
||||
Video demonstration:
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
|
||||
@@ -18,9 +17,25 @@
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; };
|
||||
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
|
||||
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXCopyFilesBuildPhase section */
|
||||
DD84C9FF2D747FED007778EC /* Embed Frameworks */ = {
|
||||
isa = PBXCopyFilesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
dstPath = "";
|
||||
dstSubfolderSpec = 10;
|
||||
files = (
|
||||
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */,
|
||||
);
|
||||
name = "Embed Frameworks";
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXCopyFilesBuildPhase section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
|
||||
@@ -33,6 +48,7 @@
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = "../../build-apple/llama.xcframework"; sourceTree = "<group>"; };
|
||||
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
|
||||
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
@@ -42,9 +58,9 @@
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */,
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
@@ -86,6 +102,7 @@
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
DD84C9FC2D747FED007778EC /* llama.xcframework */,
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
@@ -144,6 +161,7 @@
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
DD84C9FF2D747FED007778EC /* Embed Frameworks */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
@@ -151,7 +169,6 @@
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
1809696C2D05A39F00400EE8 /* llama */,
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
@@ -427,13 +444,6 @@
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
|
||||
/* Begin XCSwiftPackageProductDependency section */
|
||||
1809696C2D05A39F00400EE8 /* llama */ = {
|
||||
isa = XCSwiftPackageProductDependency;
|
||||
productName = llama;
|
||||
};
|
||||
/* End XCSwiftPackageProductDependency section */
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
}
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
Download the model and point your `GRANITE_MODEL` environment variable to the path.
|
||||
|
||||
```bash
|
||||
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview
|
||||
$ export GRANITE_MODEL=./granite-vision-3.1-2b-preview
|
||||
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
|
||||
$ export GRANITE_MODEL=./granite-vision-3.2-2b
|
||||
```
|
||||
|
||||
|
||||
@@ -41,10 +41,18 @@ If you actually inspect the `.keys()` of the loaded tensors, you should see a lo
|
||||
|
||||
|
||||
### 2. Creating the Visual Component GGUF
|
||||
To create the GGUF for the visual components, we need to write a config for the visual encoder; make sure the config contains the correct `image_grid_pinpoints`
|
||||
Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
|
||||
|
||||
```bash
|
||||
$ ENCODER_PATH=$PWD/visual_encoder
|
||||
$ mkdir $ENCODER_PATH
|
||||
|
||||
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
|
||||
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
|
||||
```
|
||||
|
||||
Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
|
||||
|
||||
Note: we refer to this file as `$VISION_CONFIG` later on.
|
||||
```json
|
||||
{
|
||||
"_name_or_path": "siglip-model",
|
||||
@@ -52,6 +60,7 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
|
||||
"SiglipVisionModel"
|
||||
],
|
||||
"image_grid_pinpoints": [
|
||||
[384,384],
|
||||
[384,768],
|
||||
[384,1152],
|
||||
[384,1536],
|
||||
@@ -94,24 +103,13 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
|
||||
}
|
||||
```
|
||||
|
||||
Create a new directory to hold the visual components, and copy the llava.clip/projector files, as well as the vision config into it.
|
||||
|
||||
```bash
|
||||
$ ENCODER_PATH=$PWD/visual_encoder
|
||||
$ mkdir $ENCODER_PATH
|
||||
|
||||
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
|
||||
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
|
||||
$ cp $VISION_CONFIG $ENCODER_PATH/config.json
|
||||
```
|
||||
|
||||
At which point you should have something like this:
|
||||
At this point you should have something like this:
|
||||
```bash
|
||||
$ ls $ENCODER_PATH
|
||||
config.json llava.projector pytorch_model.bin
|
||||
```
|
||||
|
||||
Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the siglip visual encoder - in the transformers model, you can find these numbers in the [preprocessor_config.json](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview/blob/main/preprocessor_config.json).
|
||||
Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
|
||||
```bash
|
||||
$ python convert_image_encoder_to_gguf.py \
|
||||
-m $ENCODER_PATH \
|
||||
@@ -119,17 +117,18 @@ $ python convert_image_encoder_to_gguf.py \
|
||||
--output-dir $ENCODER_PATH \
|
||||
--clip-model-is-vision \
|
||||
--clip-model-is-siglip \
|
||||
--image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
|
||||
--image-mean 0.5 0.5 0.5 \
|
||||
--image-std 0.5 0.5 0.5
|
||||
```
|
||||
|
||||
this will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the abs path of this file as the `$VISUAL_GGUF_PATH.`
|
||||
This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.`
|
||||
|
||||
|
||||
### 3. Creating the LLM GGUF.
|
||||
The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
|
||||
|
||||
First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
|
||||
```
|
||||
```bash
|
||||
$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
|
||||
```
|
||||
|
||||
@@ -142,7 +141,7 @@ if not MODEL_PATH:
|
||||
raise ValueError("env var GRANITE_MODEL is unset!")
|
||||
|
||||
LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
|
||||
if not MODEL_PATH:
|
||||
if not LLM_EXPORT_PATH:
|
||||
raise ValueError("env var LLM_EXPORT_PATH is unset!")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
|
||||
@@ -166,18 +165,26 @@ $ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
|
||||
```
|
||||
|
||||
|
||||
### 4. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. Sample usage:
|
||||
### 4. Quantization
|
||||
If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
|
||||
```bash
|
||||
$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
|
||||
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
|
||||
```
|
||||
|
||||
Note - the test image shown below can be found [here](https://github-production-user-asset-6210df.s3.amazonaws.com/10740300/415512792-d90d5562-8844-4f34-a0a5-77f62d5a58b5.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T054145Z&X-Amz-Expires=300&X-Amz-Signature=86c60be490aa49ef7d53f25d6c973580a8273904fed11ed2453d0a38240ee40a&X-Amz-SignedHeaders=host).
|
||||
Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
|
||||
|
||||
|
||||
### 5. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
|
||||
--mmproj $VISUAL_GGUF_PATH \
|
||||
--image cherry_blossom.jpg \
|
||||
--image ./media/llama0-banner.png \
|
||||
-c 16384 \
|
||||
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat type of flowers are in this picture?\n<|assistant|>\n" \
|
||||
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
|
||||
--temp 0
|
||||
```
|
||||
|
||||
Sample response: `The flowers in the picture are cherry blossoms, which are known for their delicate pink petals and are often associated with the beauty of spring.`
|
||||
Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
|
||||
|
||||
@@ -89,6 +89,7 @@ def bytes_to_unicode():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
|
||||
ap.add_argument("--text-only", action="store_true", required=False,
|
||||
help="Save a text-only model. It can't be used to encode images")
|
||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
@@ -191,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)
|
||||
|
||||
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
struct ngram_data {
|
||||
bool active = false;
|
||||
|
||||
@@ -31,8 +31,6 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
@@ -47,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
|
||||
(void) argc;
|
||||
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
|
||||
LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
@@ -219,6 +217,10 @@ int main(int argc, char ** argv) {
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation_mode) {
|
||||
if (params.enable_chat_template) {
|
||||
if (!params.prompt.empty() && params.system_prompt.empty()) {
|
||||
LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
|
||||
}
|
||||
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
|
||||
} else {
|
||||
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
@@ -263,6 +265,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
bool waiting_for_first_input = false;
|
||||
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
|
||||
common_chat_msg new_msg;
|
||||
new_msg.role = role;
|
||||
@@ -273,13 +276,34 @@ int main(int argc, char ** argv) {
|
||||
return formatted;
|
||||
};
|
||||
|
||||
std::string prompt;
|
||||
{
|
||||
auto prompt = (params.conversation_mode && params.enable_chat_template)
|
||||
// format the system prompt in conversation mode (fallback to default if empty)
|
||||
? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
|
||||
if (params.conversation_mode && params.enable_chat_template) {
|
||||
if (!params.system_prompt.empty()) {
|
||||
// format the system prompt (will use template default if empty)
|
||||
chat_add_and_format("system", params.system_prompt);
|
||||
}
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
// format and append the user prompt
|
||||
chat_add_and_format("user", params.prompt);
|
||||
} else {
|
||||
waiting_for_first_input = true;
|
||||
}
|
||||
|
||||
if (!params.system_prompt.empty() || !params.prompt.empty()) {
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = chat_msgs;
|
||||
inputs.add_generation_prompt = !params.prompt.empty();
|
||||
|
||||
prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
|
||||
}
|
||||
} else {
|
||||
// otherwise use the prompt as is
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
prompt = params.prompt;
|
||||
}
|
||||
|
||||
if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
|
||||
LOG_DBG("tokenize the prompt\n");
|
||||
embd_inp = common_tokenize(ctx, prompt, true, true);
|
||||
} else {
|
||||
@@ -292,7 +316,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
if (!waiting_for_first_input && embd_inp.empty()) {
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
@@ -352,7 +376,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.conversation_mode) {
|
||||
params.interactive_first = true;
|
||||
if (params.single_turn && !params.prompt.empty()) {
|
||||
params.interactive = false;
|
||||
params.interactive_first = false;
|
||||
} else {
|
||||
params.interactive_first = true;
|
||||
}
|
||||
}
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
@@ -476,8 +505,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s", control_message);
|
||||
if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
|
||||
LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
|
||||
if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
|
||||
LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
@@ -773,7 +802,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -793,12 +822,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation_mode) {
|
||||
if (params.conversation_mode && !waiting_for_first_input) {
|
||||
const auto id = common_sampler_last(smpl);
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
|
||||
if (!prompt.empty()) {
|
||||
prompt.clear();
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.conversation_mode) {
|
||||
@@ -888,11 +922,17 @@ int main(int argc, char ** argv) {
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (n_past > 0 || waiting_for_first_input) {
|
||||
if (is_interacting) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
|
||||
if (waiting_for_first_input && params.single_turn) {
|
||||
params.interactive = false;
|
||||
params.interactive_first = false;
|
||||
}
|
||||
waiting_for_first_input = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <ctime>
|
||||
#include <algorithm>
|
||||
|
||||
// trim whitespace from the beginning and end of a string
|
||||
static std::string trim(const std::string & str) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG("\nexample usage:\n");
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <unordered_map>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <cctype>
|
||||
|
||||
struct quant_option {
|
||||
std::string name;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -47,27 +47,27 @@ extern "C" {
|
||||
#include <stddef.h> /* For size_t. */
|
||||
#include <stdlib.h>
|
||||
|
||||
extern const char *linenoiseEditMore;
|
||||
extern const char * linenoiseEditMore;
|
||||
|
||||
/* The linenoiseState structure represents the state during line editing.
|
||||
* We pass this state to functions implementing specific editing
|
||||
* functionalities. */
|
||||
struct linenoiseState {
|
||||
int in_completion; /* The user pressed TAB and we are now in completion
|
||||
int in_completion; /* The user pressed TAB and we are now in completion
|
||||
* mode, so input is handled by completeLine(). */
|
||||
size_t completion_idx; /* Index of next completion to propose. */
|
||||
int ifd; /* Terminal stdin file descriptor. */
|
||||
int ofd; /* Terminal stdout file descriptor. */
|
||||
char *buf; /* Edited line buffer. */
|
||||
size_t buflen; /* Edited line buffer size. */
|
||||
const char *prompt; /* Prompt to display. */
|
||||
size_t plen; /* Prompt length. */
|
||||
size_t pos; /* Current cursor position. */
|
||||
size_t oldpos; /* Previous refresh cursor position. */
|
||||
size_t len; /* Current edited line length. */
|
||||
size_t cols; /* Number of columns in terminal. */
|
||||
size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */
|
||||
int history_index; /* The history index we are currently editing. */
|
||||
size_t completion_idx; /* Index of next completion to propose. */
|
||||
int ifd; /* Terminal stdin file descriptor. */
|
||||
int ofd; /* Terminal stdout file descriptor. */
|
||||
char * buf; /* Edited line buffer. */
|
||||
size_t buflen; /* Edited line buffer size. */
|
||||
const char * prompt; /* Prompt to display. */
|
||||
size_t plen; /* Prompt length. */
|
||||
size_t pos; /* Current cursor position. */
|
||||
size_t oldcolpos; /* Previous refresh cursor column position. */
|
||||
size_t len; /* Current edited line length. */
|
||||
size_t cols; /* Number of columns in terminal. */
|
||||
size_t oldrows; /* Rows used by last refreshed line (multiline mode) */
|
||||
int history_index; /* The history index we are currently editing. */
|
||||
};
|
||||
|
||||
struct linenoiseCompletions {
|
||||
@@ -89,19 +89,20 @@ struct linenoiseCompletions {
|
||||
};
|
||||
|
||||
/* Non blocking API. */
|
||||
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
|
||||
const char *linenoiseEditFeed(struct linenoiseState *l);
|
||||
void linenoiseEditStop(struct linenoiseState *l);
|
||||
void linenoiseHide(struct linenoiseState *l);
|
||||
void linenoiseShow(struct linenoiseState *l);
|
||||
int linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen,
|
||||
const char * prompt);
|
||||
const char * linenoiseEditFeed(struct linenoiseState * l);
|
||||
void linenoiseEditStop(struct linenoiseState * l);
|
||||
void linenoiseHide(struct linenoiseState * l);
|
||||
void linenoiseShow(struct linenoiseState * l);
|
||||
|
||||
/* Blocking API. */
|
||||
const char *linenoise(const char *prompt);
|
||||
void linenoiseFree(void *ptr);
|
||||
const char * linenoise(const char * prompt);
|
||||
void linenoiseFree(void * ptr);
|
||||
|
||||
/* Completion API. */
|
||||
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
|
||||
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
|
||||
typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold);
|
||||
typedef void(linenoiseFreeHintsCallback)(const char *);
|
||||
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
|
||||
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
|
||||
@@ -109,10 +110,10 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
|
||||
void linenoiseAddCompletion(linenoiseCompletions *, const char *);
|
||||
|
||||
/* History API. */
|
||||
int linenoiseHistoryAdd(const char *line);
|
||||
int linenoiseHistoryAdd(const char * line);
|
||||
int linenoiseHistorySetMaxLen(int len);
|
||||
int linenoiseHistorySave(const char *filename);
|
||||
int linenoiseHistoryLoad(const char *filename);
|
||||
int linenoiseHistorySave(const char * filename);
|
||||
int linenoiseHistoryLoad(const char * filename);
|
||||
|
||||
/* Other utilities. */
|
||||
void linenoiseClearScreen(void);
|
||||
@@ -121,6 +122,14 @@ void linenoisePrintKeyCodes(void);
|
||||
void linenoiseMaskModeEnable(void);
|
||||
void linenoiseMaskModeDisable(void);
|
||||
|
||||
/* Encoding functions. */
|
||||
typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
|
||||
typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
|
||||
typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c);
|
||||
|
||||
void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc,
|
||||
linenoiseReadCode * readCodeFunc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
|
||||
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
|
||||
const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}';
|
||||
|
||||
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
||||
if (minItems === 0 && maxItems === 1) {
|
||||
|
||||
@@ -131,9 +131,9 @@ struct slot_params {
|
||||
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
|
||||
}
|
||||
|
||||
std::vector<std::string> grammar_trigger_words;
|
||||
for (const auto & trigger : sampling.grammar_trigger_words) {
|
||||
grammar_trigger_words.push_back(trigger.word);
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -170,8 +170,8 @@ struct slot_params {
|
||||
{"n_probs", sampling.n_probs},
|
||||
{"min_keep", sampling.min_keep},
|
||||
{"grammar", sampling.grammar},
|
||||
{"grammar_trigger_words", grammar_trigger_words},
|
||||
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
|
||||
{"grammar_lazy", sampling.grammar_lazy},
|
||||
{"grammar_triggers", grammar_triggers},
|
||||
{"preserved_tokens", sampling.preserved_tokens},
|
||||
{"chat_format", common_chat_format_name(oaicompat_chat_format)},
|
||||
{"samplers", samplers},
|
||||
@@ -356,24 +356,6 @@ struct server_task {
|
||||
}
|
||||
|
||||
{
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
common_grammar_trigger trigger;
|
||||
trigger.word = t.at("word");
|
||||
trigger.at_start = t.at("at_start");
|
||||
|
||||
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
|
||||
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
|
||||
params.sampling.preserved_tokens.insert(ids[0]);
|
||||
continue;
|
||||
}
|
||||
SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
|
||||
params.sampling.grammar_trigger_words.push_back(trigger);
|
||||
}
|
||||
}
|
||||
const auto preserved_tokens = data.find("preserved_tokens");
|
||||
if (preserved_tokens != data.end()) {
|
||||
for (const auto & t : *preserved_tokens) {
|
||||
@@ -383,12 +365,38 @@ struct server_task {
|
||||
params.sampling.preserved_tokens.insert(ids[0]);
|
||||
} else {
|
||||
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
|
||||
SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
|
||||
SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.sampling.grammar_lazy) {
|
||||
GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
auto ct = common_grammar_trigger::from_json(t);
|
||||
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value;
|
||||
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
auto token = ids[0];
|
||||
if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
|
||||
throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
|
||||
}
|
||||
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
|
||||
common_grammar_trigger trigger;
|
||||
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
|
||||
trigger.value = (llama_token) token;
|
||||
params.sampling.grammar_triggers.push_back(trigger);
|
||||
} else {
|
||||
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
|
||||
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar_triggers.push_back(ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
|
||||
throw std::runtime_error("Error: no triggers set for lazy grammar!");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2045,7 +2053,7 @@ struct server_context {
|
||||
|
||||
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
|
||||
// Might be better to reject the request with a 400 ?
|
||||
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
|
||||
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
|
||||
slot.params.n_predict = slot.n_predict;
|
||||
}
|
||||
|
||||
@@ -3003,7 +3011,7 @@ struct server_context {
|
||||
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
|
||||
|
||||
for (size_t i = 0; i < n_match; i++) {
|
||||
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
||||
|
||||
@@ -144,6 +144,7 @@ def test_apply_chat_template():
|
||||
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
|
||||
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
|
||||
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
|
||||
({"type": "json_schema", "json_schema": {"schema": {"const": "foooooo"}}}, 10, "\"foooooo\""),
|
||||
({"type": "json_object"}, 10, "(\\{|John)+"),
|
||||
({"type": "sound"}, 0, None),
|
||||
# invalid response format (expected to fail)
|
||||
|
||||
236
examples/server/tests/unit/test_tool_call.py
Normal file → Executable file
236
examples/server/tests/unit/test_tool_call.py
Normal file → Executable file
@@ -1,4 +1,12 @@
|
||||
#!/usr/bin/env python
|
||||
import pytest
|
||||
|
||||
# ensure grandparent path is in sys.path
|
||||
from pathlib import Path
|
||||
import sys
|
||||
path = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(path))
|
||||
|
||||
from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
@@ -66,15 +74,8 @@ WEATHER_TOOL = {
|
||||
}
|
||||
|
||||
|
||||
def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -83,16 +84,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
|
||||
"tool_choice": "required",
|
||||
"tools": [tool],
|
||||
"parallel_tool_calls": False,
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
})
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
|
||||
assert expected_function_name == tool_call["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
@@ -108,7 +107,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
|
||||
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
|
||||
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, temperature=0.0, top_k=1, top_p=1.0)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -130,10 +136,17 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
|
||||
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
|
||||
("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
|
||||
do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -142,25 +155,33 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
|
||||
(TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
@@ -176,10 +197,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
|
||||
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
|
||||
# (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
# TODO: fix these
|
||||
# (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
])
|
||||
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
@@ -197,7 +218,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -215,7 +236,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
|
||||
assert expected_function_name == tool_call["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
@@ -225,13 +246,8 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
|
||||
|
||||
|
||||
def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
@@ -239,9 +255,7 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
|
||||
],
|
||||
"tools": tools if tools else None,
|
||||
"tool_choice": tool_choice,
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
@@ -254,7 +268,12 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
|
||||
("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
|
||||
])
|
||||
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -270,7 +289,12 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t
|
||||
("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
|
||||
])
|
||||
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
|
||||
do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
|
||||
global server
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -281,6 +305,12 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
@@ -324,48 +354,52 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
do_test_weather(server, max_tokens=n_predict)
|
||||
|
||||
|
||||
def do_test_weather(server: ServerProcess, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
|
||||
{"role": "user", "content": "What is the weather in Istanbul?"},
|
||||
],
|
||||
"tools": [WEATHER_TOOL],
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
|
||||
actual_arguments = json.loads(tool_call["function"]["arguments"])
|
||||
assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
|
||||
location = actual_arguments["location"]
|
||||
assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
|
||||
assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
|
||||
assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
|
||||
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
|
||||
(None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
|
||||
# TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
|
||||
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
# (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
# ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
])
|
||||
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
# n_predict = 512
|
||||
server.n_slots = 1
|
||||
server.jinja = True
|
||||
server.n_ctx = 8192 * 2
|
||||
@@ -379,10 +413,14 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
do_test_calc_result(server, result_override, n_predict)
|
||||
|
||||
|
||||
def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."},
|
||||
{"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."},
|
||||
{"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
@@ -423,7 +461,8 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
@@ -434,19 +473,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
if result_override is not None:
|
||||
assert re.match(result_override, content), f'Expected {result_override}, got {content}'
|
||||
else:
|
||||
assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
|
||||
assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \
|
||||
f'Expected something like "The y coordinate is 0.56.", got {content}'
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
|
||||
(128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, 'deepseek', "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(128, None, "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "^I need[\\s\\S]*?</think>\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'deepseek', "To find the sum of[\\s\\S]*", "I need to calculate the sum of 102 and 7[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
(1024, 'deepseek', "To find the sum of[\\s\\S]*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
])
|
||||
def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
@@ -464,7 +503,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the sum of 102 and 7?"},
|
||||
@@ -476,7 +515,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
|
||||
content = choice["message"].get("content")
|
||||
if expect_content is None:
|
||||
assert content is None, f'Expected no content in {choice["message"]}'
|
||||
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
else:
|
||||
assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'
|
||||
|
||||
@@ -488,46 +527,46 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
|
||||
(None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
|
||||
@pytest.mark.parametrize("hf_repo,template_override", [
|
||||
("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
|
||||
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
|
||||
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
|
||||
("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
|
||||
("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
|
||||
|
||||
('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
# ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
(None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
|
||||
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
|
||||
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
|
||||
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
|
||||
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
|
||||
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
|
||||
(None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
|
||||
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),
|
||||
])
|
||||
def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
n_predict = 512 # High because of DeepSeek R1
|
||||
server.n_slots = 1
|
||||
server.jinja = True
|
||||
server.n_ctx = 8192
|
||||
server.n_predict = 512 # High because of DeepSeek R1
|
||||
server.n_predict = n_predict
|
||||
server.model_hf_repo = hf_repo
|
||||
server.model_hf_file = None
|
||||
if isinstance(template_override, tuple):
|
||||
@@ -537,31 +576,28 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 256,
|
||||
|
||||
do_test_hello_world(server, max_tokens=n_predict)
|
||||
|
||||
|
||||
def do_test_hello_world(server: ServerProcess, **kwargs):
|
||||
res = server.make_request("POST", "/v1/chat/completions", data={
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
{"role": "system", "content": "You are a tool-calling agent."},
|
||||
{"role": "user", "content": "say hello world with python"},
|
||||
],
|
||||
"tools": [PYTHON_TOOL],
|
||||
# Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test.
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"top_p": 1.0,
|
||||
**kwargs,
|
||||
}, timeout=TIMEOUT_HTTP_REQUEST)
|
||||
assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
|
||||
choice = res.body["choices"][0]
|
||||
tool_calls = choice["message"].get("tool_calls")
|
||||
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
|
||||
tool_call = tool_calls[0]
|
||||
assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
|
||||
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
|
||||
assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
|
||||
actual_arguments = tool_call["function"]["arguments"]
|
||||
if expected_arguments_override is not None:
|
||||
assert actual_arguments == expected_arguments_override
|
||||
else:
|
||||
actual_arguments = json.loads(actual_arguments)
|
||||
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
|
||||
code = actual_arguments["code"]
|
||||
assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
|
||||
assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
|
||||
actual_arguments = json.loads(tool_call["function"]["arguments"])
|
||||
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
|
||||
code = actual_arguments["code"]
|
||||
assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
|
||||
assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
|
||||
|
||||
@@ -26,7 +26,10 @@ from re import RegexFlag
|
||||
import wget
|
||||
|
||||
|
||||
DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30
|
||||
DEFAULT_HTTP_TIMEOUT = 12
|
||||
|
||||
if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ:
|
||||
DEFAULT_HTTP_TIMEOUT = 30
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
@@ -64,6 +67,9 @@ class ServerProcess:
|
||||
id_slot: int | None = None
|
||||
cache_prompt: bool | None = None
|
||||
n_slots: int | None = None
|
||||
ctk: str | None = None
|
||||
ctv: str | None = None
|
||||
fa: bool | None = None
|
||||
server_continuous_batching: bool | None = False
|
||||
server_embeddings: bool | None = False
|
||||
server_reranking: bool | None = False
|
||||
@@ -81,6 +87,7 @@ class ServerProcess:
|
||||
reasoning_format: Literal['deepseek', 'none'] | None = None
|
||||
chat_template: str | None = None
|
||||
chat_template_file: str | None = None
|
||||
server_path: str | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -94,7 +101,9 @@ class ServerProcess:
|
||||
self.server_port = int(os.environ["PORT"])
|
||||
|
||||
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
if "LLAMA_SERVER_BIN_PATH" in os.environ:
|
||||
if self.server_path is not None:
|
||||
server_path = self.server_path
|
||||
elif "LLAMA_SERVER_BIN_PATH" in os.environ:
|
||||
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
|
||||
elif os.name == "nt":
|
||||
server_path = "../../../build/bin/Release/llama-server.exe"
|
||||
@@ -148,6 +157,12 @@ class ServerProcess:
|
||||
server_args.extend(["--ctx-size", self.n_ctx])
|
||||
if self.n_slots:
|
||||
server_args.extend(["--parallel", self.n_slots])
|
||||
if self.ctk:
|
||||
server_args.extend(["-ctk", self.ctk])
|
||||
if self.ctv:
|
||||
server_args.extend(["-ctv", self.ctv])
|
||||
if self.fa is not None:
|
||||
server_args.append("-fa")
|
||||
if self.n_predict:
|
||||
server_args.extend(["--n-predict", self.n_predict])
|
||||
if self.slot_save_path:
|
||||
@@ -181,7 +196,7 @@ class ServerProcess:
|
||||
server_args.extend(["--chat-template-file", self.chat_template_file])
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
print(f"tests: starting server with: {' '.join(args)}")
|
||||
|
||||
flags = 0
|
||||
if "nt" == os.name:
|
||||
@@ -212,6 +227,10 @@ class ServerProcess:
|
||||
return # server is ready
|
||||
except Exception as e:
|
||||
pass
|
||||
# Check if process died
|
||||
if self.process.poll() is not None:
|
||||
raise RuntimeError(f"Server process died with return code {self.process.returncode}")
|
||||
|
||||
print(f"Waiting for server to start...")
|
||||
time.sleep(0.5)
|
||||
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
|
||||
|
||||
@@ -590,8 +590,8 @@ static json oaicompat_completion_params_parse(
|
||||
if (response_type == "json_object") {
|
||||
json_schema = json_value(response_format, "schema", json::object());
|
||||
} else if (response_type == "json_schema") {
|
||||
json json_schema = json_value(response_format, "json_schema", json::object());
|
||||
json_schema = json_value(json_schema, "schema", json::object());
|
||||
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
|
||||
json_schema = json_value(schema_wrapper, "schema", json::object());
|
||||
} else if (!response_type.empty() && response_type != "text") {
|
||||
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
|
||||
}
|
||||
@@ -607,6 +607,7 @@ static json oaicompat_completion_params_parse(
|
||||
inputs.use_jinja = use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
|
||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||
}
|
||||
@@ -620,10 +621,7 @@ static json oaicompat_completion_params_parse(
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
grammar_triggers.push_back({
|
||||
{"word", trigger.word},
|
||||
{"at_start", trigger.at_start},
|
||||
});
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
|
||||
@@ -2,7 +2,7 @@ import { useEffect, useMemo, useRef, useState } from 'react';
|
||||
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
|
||||
import ChatMessage from './ChatMessage';
|
||||
import { CanvasType, Message, PendingMessage } from '../utils/types';
|
||||
import { classNames, throttle } from '../utils/misc';
|
||||
import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
|
||||
import CanvasPyInterpreter from './CanvasPyInterpreter';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useVSCodeContext } from '../utils/llama-vscode';
|
||||
@@ -18,6 +18,24 @@ export interface MessageDisplay {
|
||||
isPending?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* If the current URL contains "?m=...", prefill the message input with the value.
|
||||
* If the current URL contains "?q=...", prefill and SEND the message.
|
||||
*/
|
||||
const prefilledMsg = {
|
||||
content() {
|
||||
const url = new URL(window.location.href);
|
||||
return url.searchParams.get('m') ?? url.searchParams.get('q') ?? '';
|
||||
},
|
||||
shouldSend() {
|
||||
const url = new URL(window.location.href);
|
||||
return url.searchParams.has('q');
|
||||
},
|
||||
clear() {
|
||||
cleanCurrentUrl(['m', 'q']);
|
||||
},
|
||||
};
|
||||
|
||||
function getListMessageDisplay(
|
||||
msgs: Readonly<Message[]>,
|
||||
leafNodeId: Message['id']
|
||||
@@ -81,7 +99,7 @@ export default function ChatScreen() {
|
||||
canvasData,
|
||||
replaceMessageAndGenerate,
|
||||
} = useAppContext();
|
||||
const [inputMsg, setInputMsg] = useState('');
|
||||
const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
|
||||
const inputRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
const { extraContext, clearExtraContext } = useVSCodeContext(
|
||||
@@ -172,6 +190,22 @@ export default function ChatScreen() {
|
||||
|
||||
const hasCanvas = !!canvasData;
|
||||
|
||||
useEffect(() => {
|
||||
if (prefilledMsg.shouldSend()) {
|
||||
// send the prefilled message if needed
|
||||
sendNewMessage();
|
||||
} else {
|
||||
// otherwise, focus on the input and move the cursor to the end
|
||||
if (inputRef.current) {
|
||||
inputRef.current.focus();
|
||||
inputRef.current.selectionStart = inputRef.current.value.length;
|
||||
}
|
||||
}
|
||||
prefilledMsg.clear();
|
||||
// no need to keep track of sendNewMessage
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [inputRef]);
|
||||
|
||||
// due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
|
||||
const pendingMsgDisplay: MessageDisplay[] =
|
||||
pendingMsg && messages.at(-1)?.msg.id !== pendingMsg.id
|
||||
|
||||
@@ -148,13 +148,13 @@ const SETTING_SECTIONS: SettingSection[] = [
|
||||
fields: [
|
||||
{
|
||||
type: SettingInputType.CHECKBOX,
|
||||
label: 'Expand though process by default for generating message',
|
||||
label: 'Expand thought process by default when generating messages',
|
||||
key: 'showThoughtInProgress',
|
||||
},
|
||||
{
|
||||
type: SettingInputType.CHECKBOX,
|
||||
label:
|
||||
'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
|
||||
'Exclude thought process when sending requests to API (Recommended for DeepSeek-R1)',
|
||||
key: 'excludeThoughtOnReq',
|
||||
},
|
||||
],
|
||||
@@ -247,7 +247,7 @@ const SETTING_SECTIONS: SettingSection[] = [
|
||||
This feature uses{' '}
|
||||
<OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
|
||||
downloaded from CDN. To use this feature, ask the LLM to generate
|
||||
python code inside a markdown code block. You will see a "Run"
|
||||
Python code inside a Markdown code block. You will see a "Run"
|
||||
button on the code block, near the "Copy" button.
|
||||
</small>
|
||||
</>
|
||||
@@ -274,7 +274,7 @@ export default function SettingDialog({
|
||||
);
|
||||
|
||||
const resetConfig = () => {
|
||||
if (window.confirm('Are you sure to reset all settings?')) {
|
||||
if (window.confirm('Are you sure you want to reset all settings?')) {
|
||||
setLocalConfig(CONFIG_DEFAULT);
|
||||
}
|
||||
};
|
||||
@@ -296,9 +296,9 @@ export default function SettingDialog({
|
||||
return;
|
||||
}
|
||||
} else if (mustBeNumeric) {
|
||||
const trimedValue = value.toString().trim();
|
||||
const numVal = Number(trimedValue);
|
||||
if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
|
||||
const trimmedValue = value.toString().trim();
|
||||
const numVal = Number(trimmedValue);
|
||||
if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
|
||||
alert(`Value for ${key} must be numeric`);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -118,3 +118,11 @@ export const throttle = <T extends unknown[]>(
|
||||
}, delay);
|
||||
};
|
||||
};
|
||||
|
||||
export const cleanCurrentUrl = (removeQueryParams: string[]) => {
|
||||
const url = new URL(window.location.href);
|
||||
removeQueryParams.forEach((param) => {
|
||||
url.searchParams.delete(param);
|
||||
});
|
||||
window.history.replaceState({}, '', url.toString());
|
||||
};
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||
#include "json.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@@ -16,6 +17,13 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum outetts_version {
|
||||
OUTETTS_V0_2,
|
||||
OUTETTS_V0_3,
|
||||
};
|
||||
|
||||
//
|
||||
// Terminal utils
|
||||
//
|
||||
@@ -371,7 +379,7 @@ static std::string replace_numbers_with_words(const std::string & input_text) {
|
||||
}
|
||||
|
||||
// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
|
||||
static std::string process_text(const std::string & text) {
|
||||
static std::string process_text(const std::string & text, const outetts_version tts_version = OUTETTS_V0_2) {
|
||||
|
||||
// For now I skipped text romanization as I am unsure how to handle
|
||||
// uroman and MeCab implementations in C++
|
||||
@@ -401,7 +409,8 @@ static std::string process_text(const std::string & text) {
|
||||
if (c == ' ') {
|
||||
prompt_clean += "<|text_sep|>";
|
||||
*/
|
||||
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
|
||||
std::string separator = (tts_version == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
|
||||
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
|
||||
|
||||
return processed_text;
|
||||
}
|
||||
@@ -425,8 +434,8 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
|
||||
prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
|
||||
}
|
||||
|
||||
static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
|
||||
const std::string& delimiter = "<|text_sep|>";
|
||||
static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str, const outetts_version tts_version = OUTETTS_V0_2) {
|
||||
const std::string& delimiter = (tts_version == OUTETTS_V0_3 ? "<|space|>" : "<|text_sep|>");
|
||||
|
||||
std::vector<llama_token> result;
|
||||
size_t start = 0;
|
||||
@@ -452,6 +461,78 @@ static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab,
|
||||
return result;
|
||||
}
|
||||
|
||||
static json speaker_from_file(const std::string & speaker_file) {
|
||||
std::ifstream file(speaker_file);
|
||||
if (!file) {
|
||||
LOG_ERR("%s: Failed to open file '%s' for reading\n", __func__, speaker_file.c_str());
|
||||
return json();
|
||||
}
|
||||
|
||||
json speaker = json::parse(file);
|
||||
return speaker;
|
||||
}
|
||||
|
||||
static outetts_version get_tts_version(llama_model *model, json speaker = json::object()) {
|
||||
if (speaker.contains("version")) {
|
||||
std::string version = speaker["version"].get<std::string>();
|
||||
if (version == "0.2") {
|
||||
return OUTETTS_V0_2;
|
||||
} else if (version == "0.3") {
|
||||
return OUTETTS_V0_3;
|
||||
} else {
|
||||
LOG_ERR("%s: Unsupported speaker version '%s'\n", __func__, version.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// Also could get version from model itself
|
||||
const char *chat_template = llama_model_chat_template(model, nullptr);
|
||||
if (chat_template && std::string(chat_template) == "outetts-0.3") {
|
||||
return OUTETTS_V0_3;
|
||||
}
|
||||
|
||||
// Use 0.2 as the default version
|
||||
return OUTETTS_V0_2;
|
||||
}
|
||||
|
||||
static std::string audio_text_from_speaker(json speaker, const outetts_version tts_version = OUTETTS_V0_2) {
|
||||
std::string audio_text = "<|text_start|>";
|
||||
|
||||
if (tts_version == OUTETTS_V0_2 || tts_version == OUTETTS_V0_3) {
|
||||
std::string separator = (tts_version == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
|
||||
for (const auto &word : speaker["words"]) {
|
||||
audio_text += word["word"].get<std::string>() + separator;
|
||||
}
|
||||
}
|
||||
|
||||
return audio_text;
|
||||
}
|
||||
|
||||
static std::string audio_data_from_speaker(json speaker, const outetts_version tts_version = OUTETTS_V0_2) {
|
||||
std::string audio_data = "<|audio_start|>\n";
|
||||
|
||||
if (tts_version == OUTETTS_V0_2 || tts_version == OUTETTS_V0_3) {
|
||||
std::string code_start = (tts_version == OUTETTS_V0_3) ? "" : "<|code_start|>";
|
||||
std::string code_end = (tts_version == OUTETTS_V0_3) ? "<|space|>" : "<|code_end|>";
|
||||
for (const auto &word : speaker["words"]) {
|
||||
std::string word_text = word["word"].get<std::string>();
|
||||
double duration = word["duration"].get<double>();
|
||||
std::vector<int> codes = word["codes"].get<std::vector<int>>();
|
||||
|
||||
// Create the audio output entry
|
||||
std::ostringstream word_entry;
|
||||
word_entry << word_text << "<|t_" << std::fixed << std::setprecision(2)
|
||||
<< duration << "|>" + code_start;
|
||||
for (const auto &Code : codes) {
|
||||
word_entry << "<|" << Code << "|>";
|
||||
}
|
||||
word_entry << code_end << "\n";
|
||||
audio_data += word_entry.str();
|
||||
}
|
||||
}
|
||||
|
||||
return audio_data;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
@@ -523,34 +604,9 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> codes;
|
||||
std::vector<llama_token> guide_tokens;
|
||||
|
||||
// process prompt and generate voice codes
|
||||
{
|
||||
LOG_INF("%s: constructing prompt ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> prompt_inp;
|
||||
|
||||
prompt_init(prompt_inp, vocab);
|
||||
|
||||
prompt_add(prompt_inp, vocab, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
|
||||
|
||||
// convert the input text into the necessary format expected by OuteTTS
|
||||
{
|
||||
std::string prompt_clean = process_text(params.prompt);
|
||||
if (params.vocoder.use_guide_tokens) {
|
||||
guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
|
||||
}
|
||||
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
|
||||
|
||||
prompt_add(prompt_inp, vocab, prompt_clean, false, true);
|
||||
}
|
||||
|
||||
prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
|
||||
|
||||
// disabled to save time on tokenizing each time
|
||||
// TODO: load voices from the json files
|
||||
#if 0
|
||||
const std::string voice_data = R"(<|audio_start|>
|
||||
// the default speaker profile is from: https://github.com/edwko/OuteTTS/blob/main/outetts/version/v1/default_speakers/en_male_1.json
|
||||
std::string audio_text = "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>";
|
||||
std::string audio_data = R"(<|audio_start|>
|
||||
the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
|
||||
overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
|
||||
package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
|
||||
@@ -582,117 +638,170 @@ it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><
|
||||
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
|
||||
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
|
||||
|
||||
auto tmp = common_tokenize(vocab, voice_data, false, true);
|
||||
printf("\n\n");
|
||||
for (int i = 0; i < tmp.size(); ++i) {
|
||||
printf("%d, ", tmp[i]);
|
||||
// audio data for 0.3 version
|
||||
outetts_version tts_version = get_tts_version(model_ttc);
|
||||
if (tts_version == OUTETTS_V0_3) {
|
||||
audio_text = std::regex_replace(audio_text, std::regex(R"(<\|text_sep\|>)"), "<|space|>");
|
||||
audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_start\|>)"), "");
|
||||
audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_end\|>)"), "<|space|>");
|
||||
}
|
||||
|
||||
// load speaker if given
|
||||
if (!params.vocoder.speaker_file.empty()) {
|
||||
LOG_INF("%s: loading speaker ..\n", __func__);
|
||||
json speaker = speaker_from_file(params.vocoder.speaker_file);
|
||||
if (speaker.empty()) {
|
||||
LOG_ERR("%s: Failed to load speaker file '%s'\n", __func__, params.vocoder.speaker_file.c_str());
|
||||
return 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
audio_text = audio_text_from_speaker(speaker, tts_version);
|
||||
audio_data = audio_data_from_speaker(speaker, tts_version);
|
||||
}
|
||||
|
||||
// process prompt and generate voice codes
|
||||
{
|
||||
LOG_INF("%s: constructing prompt ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> prompt_inp;
|
||||
|
||||
prompt_init(prompt_inp, vocab);
|
||||
|
||||
prompt_add(prompt_inp, vocab, audio_text, false, true);
|
||||
|
||||
// convert the input text into the necessary format expected by OuteTTS
|
||||
{
|
||||
std::string prompt_clean = process_text(params.prompt, tts_version);
|
||||
if (params.vocoder.use_guide_tokens) {
|
||||
guide_tokens = prepare_guide_tokens(vocab, prompt_clean, tts_version);
|
||||
}
|
||||
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
|
||||
|
||||
prompt_add(prompt_inp, vocab, prompt_clean, false, true);
|
||||
}
|
||||
|
||||
prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
|
||||
|
||||
if (!params.vocoder.speaker_file.empty()) {
|
||||
prompt_add(prompt_inp, vocab, audio_data, false, true);
|
||||
} else {
|
||||
// disabled to save time on tokenizing each time
|
||||
#if 1
|
||||
const std::string voice_data = audio_data;
|
||||
|
||||
auto tmp = common_tokenize(vocab, voice_data, false, true);
|
||||
printf("\n\n");
|
||||
for (size_t i = 0; i < tmp.size(); ++i) {
|
||||
printf("%d, ", tmp[i]);
|
||||
}
|
||||
printf("\n\n");
|
||||
prompt_add(prompt_inp, tmp);
|
||||
#else
|
||||
prompt_add(prompt_inp, llama_tokens {
|
||||
151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585,
|
||||
152460, 153375, 151670, 198, 74455, 155808, 151669, 151799,
|
||||
151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470,
|
||||
151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040,
|
||||
153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691,
|
||||
153368, 153437, 151670, 198, 1722, 155828, 151669, 152607,
|
||||
152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198,
|
||||
152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207,
|
||||
152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267,
|
||||
153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179,
|
||||
153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904,
|
||||
152311, 151670, 198, 1499, 155791, 151669, 152276, 152454,
|
||||
153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226,
|
||||
153043, 152325, 153267, 152622, 151670, 198, 4250, 155797,
|
||||
151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
|
||||
152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213,
|
||||
152112, 153204, 151722, 152542, 151670, 198, 19789, 155796,
|
||||
151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002,
|
||||
152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224,
|
||||
152244, 153387, 153404, 151670, 198, 16069, 155811, 151669,
|
||||
152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733,
|
||||
152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589,
|
||||
152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504,
|
||||
153376, 152272, 152433, 152325, 151941, 151670, 198, 285,
|
||||
155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381,
|
||||
152474, 152680, 152157, 153255, 152324, 151682, 151670, 198,
|
||||
32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
|
||||
152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488,
|
||||
153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685,
|
||||
152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669,
|
||||
151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459,
|
||||
153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609,
|
||||
152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470,
|
||||
152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018,
|
||||
152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736,
|
||||
153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457,
|
||||
152393, 153112, 152595, 151670, 198, 19098, 155808, 151669,
|
||||
152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239,
|
||||
153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673,
|
||||
152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482,
|
||||
152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795,
|
||||
152111, 152746, 152377, 153471, 152309, 151670, 198, 19016,
|
||||
155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701,
|
||||
152939, 152536, 152091, 151815, 152733, 151672, 151670, 198,
|
||||
14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042,
|
||||
153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670,
|
||||
198, 36996, 8303, 155832, 151669, 152231, 152256, 152835,
|
||||
152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600,
|
||||
151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847,
|
||||
153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458,
|
||||
152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250,
|
||||
152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
|
||||
152228, 152733, 151670, 198, 9096, 155801, 151669, 151698,
|
||||
153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106,
|
||||
152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851,
|
||||
152901, 152885, 152594, 153446, 153080, 151670, 198, 14689,
|
||||
155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191,
|
||||
151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384,
|
||||
153364, 153188, 153246, 151670, 198, 1055, 155779, 151669,
|
||||
151869, 152388, 152711, 153334, 151736, 151670, 198, 1782,
|
||||
155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046,
|
||||
151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605,
|
||||
153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133,
|
||||
152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581,
|
||||
152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032,
|
||||
152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409,
|
||||
151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469,
|
||||
152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230,
|
||||
151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435,
|
||||
152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540,
|
||||
151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715,
|
||||
153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
|
||||
151670, 198, 8088, 155792, 151669, 152452, 153497, 153353,
|
||||
152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285,
|
||||
153411, 152495, 153141, 152320, 151670, 198, 1199, 155781,
|
||||
151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271,
|
||||
151670, 198, 43366, 155799, 151669, 152308, 151682, 152889,
|
||||
152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180,
|
||||
151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287,
|
||||
151677, 151670, 198, 53660, 155808, 151669, 151727, 152092,
|
||||
152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384,
|
||||
151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691,
|
||||
152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676,
|
||||
153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
|
||||
152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234,
|
||||
153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678,
|
||||
152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825,
|
||||
152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957,
|
||||
151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
|
||||
151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174,
|
||||
151792, 153409, 153327, 152990, 151670, 198, 275, 155781,
|
||||
151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974,
|
||||
151670, 198, 94273, 155799, 151669, 152953, 152938, 153427,
|
||||
152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331,
|
||||
152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326,
|
||||
152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268,
|
||||
153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110,
|
||||
152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444,
|
||||
152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
|
||||
152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499,
|
||||
152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349,
|
||||
151670,});
|
||||
prompt_add(prompt_inp, llama_tokens {
|
||||
151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585,
|
||||
152460, 153375, 151670, 198, 74455, 155808, 151669, 151799,
|
||||
151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470,
|
||||
151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040,
|
||||
153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691,
|
||||
153368, 153437, 151670, 198, 1722, 155828, 151669, 152607,
|
||||
152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198,
|
||||
152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207,
|
||||
152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267,
|
||||
153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179,
|
||||
153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904,
|
||||
152311, 151670, 198, 1499, 155791, 151669, 152276, 152454,
|
||||
153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226,
|
||||
153043, 152325, 153267, 152622, 151670, 198, 4250, 155797,
|
||||
151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
|
||||
152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213,
|
||||
152112, 153204, 151722, 152542, 151670, 198, 19789, 155796,
|
||||
151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002,
|
||||
152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224,
|
||||
152244, 153387, 153404, 151670, 198, 16069, 155811, 151669,
|
||||
152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733,
|
||||
152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589,
|
||||
152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504,
|
||||
153376, 152272, 152433, 152325, 151941, 151670, 198, 285,
|
||||
155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381,
|
||||
152474, 152680, 152157, 153255, 152324, 151682, 151670, 198,
|
||||
32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
|
||||
152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488,
|
||||
153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685,
|
||||
152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669,
|
||||
151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459,
|
||||
153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609,
|
||||
152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470,
|
||||
152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018,
|
||||
152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736,
|
||||
153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457,
|
||||
152393, 153112, 152595, 151670, 198, 19098, 155808, 151669,
|
||||
152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239,
|
||||
153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673,
|
||||
152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482,
|
||||
152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795,
|
||||
152111, 152746, 152377, 153471, 152309, 151670, 198, 19016,
|
||||
155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701,
|
||||
152939, 152536, 152091, 151815, 152733, 151672, 151670, 198,
|
||||
14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042,
|
||||
153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670,
|
||||
198, 36996, 8303, 155832, 151669, 152231, 152256, 152835,
|
||||
152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600,
|
||||
151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847,
|
||||
153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458,
|
||||
152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250,
|
||||
152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
|
||||
152228, 152733, 151670, 198, 9096, 155801, 151669, 151698,
|
||||
153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106,
|
||||
152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851,
|
||||
152901, 152885, 152594, 153446, 153080, 151670, 198, 14689,
|
||||
155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191,
|
||||
151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384,
|
||||
153364, 153188, 153246, 151670, 198, 1055, 155779, 151669,
|
||||
151869, 152388, 152711, 153334, 151736, 151670, 198, 1782,
|
||||
155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046,
|
||||
151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605,
|
||||
153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133,
|
||||
152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581,
|
||||
152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032,
|
||||
152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409,
|
||||
151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469,
|
||||
152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230,
|
||||
151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435,
|
||||
152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540,
|
||||
151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715,
|
||||
153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
|
||||
151670, 198, 8088, 155792, 151669, 152452, 153497, 153353,
|
||||
152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285,
|
||||
153411, 152495, 153141, 152320, 151670, 198, 1199, 155781,
|
||||
151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271,
|
||||
151670, 198, 43366, 155799, 151669, 152308, 151682, 152889,
|
||||
152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180,
|
||||
151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287,
|
||||
151677, 151670, 198, 53660, 155808, 151669, 151727, 152092,
|
||||
152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384,
|
||||
151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691,
|
||||
152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676,
|
||||
153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
|
||||
152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234,
|
||||
153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678,
|
||||
152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825,
|
||||
152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957,
|
||||
151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
|
||||
151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174,
|
||||
151792, 153409, 153327, 152990, 151670, 198, 275, 155781,
|
||||
151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974,
|
||||
151670, 198, 94273, 155799, 151669, 152953, 152938, 153427,
|
||||
152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331,
|
||||
152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326,
|
||||
152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268,
|
||||
153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110,
|
||||
152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444,
|
||||
152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
|
||||
152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499,
|
||||
152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349,
|
||||
151670,});
|
||||
#endif
|
||||
}
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
|
||||
@@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||
option(GGML_BMI2 "ggml: enable BMI2" ${INS_ENB})
|
||||
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
|
||||
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
||||
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
||||
@@ -155,10 +156,14 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
||||
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
|
||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
|
||||
"ggml: cuda link binary compression mode; requires cuda 12.8+")
|
||||
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
|
||||
|
||||
option(GGML_HIP "ggml: use HIP" OFF)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
@@ -247,6 +252,7 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/ggml-backend.h
|
||||
include/ggml-blas.h
|
||||
include/ggml-cann.h
|
||||
include/ggml-cpp.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-opt.h
|
||||
|
||||
@@ -19,7 +19,7 @@ struct ggml_tallocr {
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
|
||||
GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
|
||||
|
||||
// Graph allocator
|
||||
/*
|
||||
|
||||
@@ -56,7 +56,7 @@ extern "C" {
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
@@ -342,8 +342,8 @@ extern "C" {
|
||||
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
||||
|
||||
// Tensor initialization
|
||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
// CPU buffer types are always available
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
|
||||
@@ -80,6 +80,7 @@ extern "C" {
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_bmi2 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||
|
||||
@@ -2140,7 +2140,11 @@ extern "C" {
|
||||
# define GGML_RESTRICT
|
||||
# endif
|
||||
#else
|
||||
# define GGML_RESTRICT restrict
|
||||
# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
|
||||
# define GGML_RESTRICT __restrict
|
||||
# else
|
||||
# define GGML_RESTRICT restrict
|
||||
# endif
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
@@ -226,6 +226,9 @@ add_library(ggml-base
|
||||
gguf.cpp)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
if (GGML_BACKEND_DL)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-reg.cpp)
|
||||
@@ -233,7 +236,7 @@ add_library(ggml
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
target_link_libraries(ggml PRIVATE dl stdc++fs)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
@@ -286,7 +289,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
AVX AVX2 AVX_VNNI FMA F16C
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
@@ -306,13 +309,13 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -89,7 +89,7 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
|
||||
return talloc;
|
||||
}
|
||||
|
||||
void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
|
||||
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
|
||||
size = GGML_PAD(size, talloc->alignment);
|
||||
|
||||
@@ -104,7 +104,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
||||
|
||||
assert(((uintptr_t)addr % talloc->alignment) == 0);
|
||||
|
||||
ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
|
||||
return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
|
||||
}
|
||||
|
||||
// dynamic tensor allocator
|
||||
@@ -933,42 +933,51 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
|
||||
// utils
|
||||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free((*buffers)[i]);
|
||||
}
|
||||
free(*buffers);
|
||||
}
|
||||
|
||||
static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||
struct ggml_tensor * first, struct ggml_tensor * last,
|
||||
ggml_backend_buffer_type_t buft, size_t size,
|
||||
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
||||
if (buffer == NULL) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||
#endif
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free((*buffers)[i]);
|
||||
}
|
||||
free(*buffers);
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||
free_buffers(buffers, n_buffers);
|
||||
return false;
|
||||
}
|
||||
|
||||
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
|
||||
|
||||
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->data == NULL) {
|
||||
if (t->view_src == NULL) {
|
||||
ggml_tallocr_alloc(&tallocr, t);
|
||||
} else if (t->buffer == NULL) {
|
||||
ggml_backend_view_init(t);
|
||||
}
|
||||
} else {
|
||||
if (t->view_src != NULL && t->buffer == NULL) {
|
||||
// view of a pre-allocated tensor
|
||||
ggml_backend_view_init(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
|
||||
(*buffers)[(*n_buffers)++] = buffer;
|
||||
|
||||
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
|
||||
|
||||
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
|
||||
enum ggml_status status = GGML_STATUS_SUCCESS;
|
||||
if (t->data == NULL) {
|
||||
if (t->view_src == NULL) {
|
||||
status = ggml_tallocr_alloc(&tallocr, t);
|
||||
} else if (t->buffer == NULL) {
|
||||
status = ggml_backend_view_init(t);
|
||||
}
|
||||
} else {
|
||||
if (t->view_src != NULL && t->buffer == NULL) {
|
||||
// view of a pre-allocated tensor
|
||||
status = ggml_backend_view_init(t);
|
||||
}
|
||||
}
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
|
||||
free_buffers(buffers, n_buffers);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ extern "C" {
|
||||
// base address of the buffer
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
// tensor data access
|
||||
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
|
||||
@@ -2,14 +2,13 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <algorithm>
|
||||
#include <codecvt>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <locale>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
#include <cctype>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
@@ -72,14 +71,15 @@
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
static std::wstring utf8_to_utf16(const std::string & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.from_bytes(str);
|
||||
}
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static std::string utf16_to_utf8(const std::wstring & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.to_bytes(str);
|
||||
static std::string path_str(const fs::path & path) {
|
||||
std::string u8path;
|
||||
try {
|
||||
u8path = path.u8string();
|
||||
} catch (...) {
|
||||
}
|
||||
return u8path;
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
@@ -96,12 +96,12 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static dl_handle * dl_load_library(const std::wstring & path) {
|
||||
static dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.c_str());
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
@@ -129,8 +129,8 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const std::wstring & path) {
|
||||
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
static void * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@@ -217,11 +217,11 @@ struct ggml_backend_registry {
|
||||
devices.push_back(device);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -229,7 +229,7 @@ struct ggml_backend_registry {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -237,7 +237,7 @@ struct ggml_backend_registry {
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -246,16 +246,17 @@ struct ggml_backend_registry {
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n",
|
||||
__func__, path_str(path).c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
__func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||
|
||||
register_backend(reg, std::move(handle));
|
||||
|
||||
@@ -391,14 +392,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||
return get_reg().load_backend(path, false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
static std::wstring get_executable_path() {
|
||||
static fs::path get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
@@ -416,7 +417,7 @@ static std::wstring get_executable_path() {
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
return base_path + "/";
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
@@ -442,7 +443,7 @@ static std::wstring get_executable_path() {
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
return base_path + "/";
|
||||
#elif defined(_WIN32)
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
@@ -461,74 +462,69 @@ static std::wstring get_executable_path() {
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring backend_filename_prefix() {
|
||||
static fs::path backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
return L"ggml-";
|
||||
return fs::u8path("ggml-");
|
||||
#else
|
||||
return L"libggml-";
|
||||
return fs::u8path("libggml-");
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring backend_filename_suffix() {
|
||||
static fs::path backend_filename_extension() {
|
||||
#ifdef _WIN32
|
||||
return L".dll";
|
||||
return fs::u8path(".dll");
|
||||
#else
|
||||
return L".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring path_separator() {
|
||||
#ifdef _WIN32
|
||||
return L"\\";
|
||||
#else
|
||||
return L"/";
|
||||
return fs::u8path(".so");
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
// TODO: search system paths
|
||||
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||
std::vector<std::wstring> search_paths;
|
||||
const fs::path name_path = fs::u8path(name);
|
||||
const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
|
||||
const fs::path file_extension = backend_filename_extension();
|
||||
|
||||
std::vector<fs::path> search_paths;
|
||||
if (user_search_path == nullptr) {
|
||||
search_paths.push_back(L"." + path_separator());
|
||||
// default search paths: executable directory, current directory
|
||||
search_paths.push_back(get_executable_path());
|
||||
search_paths.push_back(fs::current_path());
|
||||
} else {
|
||||
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||
search_paths.push_back(user_search_path);
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
std::wstring best_path;
|
||||
fs::path best_path;
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
for (const auto & search_path : search_paths) {
|
||||
if (!fs::exists(search_path)) {
|
||||
GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
|
||||
continue;
|
||||
}
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
std::wstring filename = entry.path().filename().wstring();
|
||||
std::wstring ext = entry.path().extension().wstring();
|
||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||
auto filename = entry.path().filename().native();
|
||||
auto ext = entry.path().extension().native();
|
||||
if (filename.find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn) {
|
||||
int s = score_fn();
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s);
|
||||
#endif
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
best_path = entry.path().wstring();
|
||||
best_path = entry.path();
|
||||
}
|
||||
} else {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -540,7 +536,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
|
||||
fs::path path = search_path.native() + filename.native();
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <sys/types.h>
|
||||
@@ -126,11 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return base;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
// init_tensor is optional
|
||||
if (buffer->iface.init_tensor) {
|
||||
buffer->iface.init_tensor(buffer, tensor);
|
||||
return buffer->iface.init_tensor(buffer, tensor);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -1641,7 +1643,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
||||
|
||||
// utils
|
||||
|
||||
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->buffer == NULL);
|
||||
GGML_ASSERT(tensor->view_src != NULL);
|
||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||
@@ -1649,10 +1651,10 @@ void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||
|
||||
tensor->buffer = tensor->view_src->buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
||||
return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||
GGML_ASSERT(tensor->buffer == NULL);
|
||||
GGML_ASSERT(tensor->data == NULL);
|
||||
GGML_ASSERT(tensor->view_src == NULL);
|
||||
@@ -1662,7 +1664,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->data = addr;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
return ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
}
|
||||
|
||||
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
||||
@@ -1708,7 +1710,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
|
||||
struct ggml_tensor * dst = node_copies[id];
|
||||
if (dst->view_src != NULL) {
|
||||
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||
ggml_backend_view_init(dst);
|
||||
enum ggml_status status = ggml_backend_view_init(dst);
|
||||
GGML_ASSERT(status == GGML_STATUS_SUCCESS);
|
||||
}
|
||||
else {
|
||||
ggml_backend_tensor_copy(src, dst);
|
||||
@@ -1823,7 +1826,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
||||
assert(g1->n_nodes == g2->n_nodes);
|
||||
|
||||
for (int i = 0; i < g1->n_nodes; i++) {
|
||||
//printf("eval %d/%d\n", i, g1->n_nodes);
|
||||
struct ggml_tensor * t1 = g1->nodes[i];
|
||||
struct ggml_tensor * t2 = g2->nodes[i];
|
||||
|
||||
|
||||
@@ -796,11 +796,11 @@ static bool need_transform(ggml_type type) {
|
||||
* @param buffer The CANN buffer from which to initialize the tensor.
|
||||
* @param tensor Pointer to the tensor to be initialized.
|
||||
*/
|
||||
static void ggml_backend_cann_buffer_init_tensor(
|
||||
static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
||||
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return;
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// TODO: can backend doesn't support quantized yet. Just leave the code
|
||||
@@ -817,6 +817,7 @@ static void ggml_backend_cann_buffer_init_tensor(
|
||||
memset_size, 0, memset_size));
|
||||
}
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// TODO: need handle tensor which has paddings.
|
||||
|
||||
@@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_AVX_VNNI)
|
||||
list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
||||
endif()
|
||||
if (GGML_BMI2)
|
||||
# MSVC does not define macro __BMI2__
|
||||
list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
|
||||
endif()
|
||||
else ()
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
@@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_FMA)
|
||||
endif()
|
||||
if (GGML_BMI2)
|
||||
list(APPEND ARCH_FLAGS -mbmi2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_BMI2)
|
||||
endif()
|
||||
if (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
@@ -281,19 +289,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
string(FIND "${POWER10_M}" "POWER10" substring_index)
|
||||
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
|
||||
set(substring_index -1)
|
||||
endif()
|
||||
|
||||
if (${substring_index} GREATER_EQUAL 0)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
|
||||
if (${POWER_M} MATCHES "POWER10")
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (${POWER_M} MATCHES "POWER9")
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
|
||||
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *) (buffer->context);
|
||||
}
|
||||
|
||||
static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
|
||||
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
|
||||
if (!is.SSE42()) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#ifdef GGML_BMI2
|
||||
if (!is.BMI2()) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#ifdef GGML_AVX
|
||||
if (!is.AVX()) { return 0; }
|
||||
score += 1<<4;
|
||||
|
||||
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_fma()) {
|
||||
features.push_back({ "FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_bmi2()) {
|
||||
features.push_back({ "BMI2", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
features.push_back({ "AVX512", "1" });
|
||||
}
|
||||
|
||||
@@ -190,10 +190,11 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
|
||||
}
|
||||
} // namespace ggml::cpu::kleidiai
|
||||
|
||||
static void ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
|
||||
@@ -102,6 +102,15 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
# - none (not recommended)
|
||||
# - speed (nvcc's default)
|
||||
# - balance
|
||||
# - size
|
||||
list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
|
||||
endif()
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
||||
endif()
|
||||
|
||||
@@ -294,11 +294,13 @@ static void ggml_cuda_op_bin_bcast(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const half *)src1_dd, (half *) dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
|
||||
|
||||
@@ -1,34 +1,45 @@
|
||||
#include "clamp.cuh"
|
||||
|
||||
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
|
||||
static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
|
||||
return fminf(fmaxf(x, min), max);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
||||
dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
|
||||
}
|
||||
|
||||
static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
|
||||
template <class T>
|
||||
static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
|
||||
clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
|
||||
op_clamp_kernel<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
const void * src0_d = src0->data;
|
||||
void * dst_d = dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
float min;
|
||||
float max;
|
||||
memcpy(&min, dst->op_params, sizeof(float));
|
||||
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
clamp_cuda((const half *)src0_d, (half *)dst_d, (half)min, (half)max, ggml_nelements(src0), stream);
|
||||
} else {
|
||||
clamp_cuda((const float *)src0_d, (float *)dst_d, (float)min, (float)max, ggml_nelements(src0), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,6 +62,7 @@
|
||||
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
||||
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
||||
|
||||
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
@@ -196,6 +197,10 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
@@ -223,12 +228,18 @@ static bool fast_fp16_hardware_available(const int cc) {
|
||||
|
||||
// Any FP16 tensor core instructions are available for ggml code.
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
return false;
|
||||
#else
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
|
||||
GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
}
|
||||
|
||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
||||
static bool fp16_mma_hardware_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
|
||||
GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
|
||||
@@ -57,12 +57,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
@@ -70,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
@@ -78,14 +79,14 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
|
||||
const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/warp_size];
|
||||
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (8/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
|
||||
sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (8/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,12 +98,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
@@ -110,7 +112,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
@@ -118,7 +120,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
|
||||
const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/warp_size];
|
||||
const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
|
||||
sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
|
||||
} else
|
||||
@@ -126,8 +128,8 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
|
||||
const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
|
||||
const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
|
||||
const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
|
||||
|
||||
sum += (T) (sumid4d8 + m4s8scaled);
|
||||
}
|
||||
@@ -141,12 +143,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
@@ -161,7 +164,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
@@ -169,14 +172,14 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
|
||||
const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/warp_size];
|
||||
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (16/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
|
||||
sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (16/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -188,12 +191,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_1;
|
||||
@@ -208,7 +212,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
v |= (vh << 25) & 0x10000000; // 3 -> 28
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
const int u = Q_q8[k_KQ_0/warp_size];
|
||||
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
@@ -216,7 +220,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
|
||||
const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/warp_size];
|
||||
const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
|
||||
sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
|
||||
} else
|
||||
@@ -224,8 +228,8 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
{
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
|
||||
const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
|
||||
const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
|
||||
const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
|
||||
const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
|
||||
|
||||
sum += (T) (sumid5d8 + m5s8scaled);
|
||||
}
|
||||
@@ -239,12 +243,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const int ib = k_KQ / QI8_0;
|
||||
@@ -255,13 +260,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
T Q_d;
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
Q_d = __low2half(Q_ds[k_KQ_0/WARP_SIZE]);
|
||||
Q_d = __low2half(Q_ds[k_KQ_0/warp_size]);
|
||||
} else {
|
||||
const float2 * Q_ds = (const float2 *) Q_ds_v;
|
||||
Q_d = Q_ds[k_KQ_0/WARP_SIZE].x;
|
||||
Q_d = Q_ds[k_KQ_0/warp_size].x;
|
||||
}
|
||||
|
||||
sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/WARP_SIZE], K_q8_0[ib].d, Q_d);
|
||||
sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/warp_size], K_q8_0[ib].d, Q_d);
|
||||
}
|
||||
|
||||
return sum;
|
||||
@@ -272,6 +277,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const half2 * K_h2 = (const half2 *) K_c;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
@@ -282,11 +288,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
half2 sum2 = make_half2(0.0f, 0.0f);
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[k_KQ];
|
||||
sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
|
||||
sum2 += K_ik * Q_h2[k_KQ_0/warp_size];
|
||||
}
|
||||
|
||||
return __low2half(sum2) + __high2half(sum2);
|
||||
@@ -298,12 +304,12 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[k_KQ];
|
||||
sum += __low2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].x;
|
||||
sum += __high2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].y;
|
||||
sum += __low2float(K_ik) * Q_f2[k_KQ_0/warp_size].x;
|
||||
sum += __high2float(K_ik) * Q_f2[k_KQ_0/warp_size].y;
|
||||
}
|
||||
|
||||
return sum;
|
||||
@@ -698,6 +704,8 @@ void launch_fattn(
|
||||
|
||||
GGML_ASSERT(Q->ne[3] == 1);
|
||||
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -750,7 +758,7 @@ void launch_fattn(
|
||||
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
|
||||
const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
|
||||
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
dim3 blocks_num;
|
||||
if (parallel_blocks == 0) {
|
||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||
@@ -796,6 +804,8 @@ void launch_fattn(
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
GGML_ASSERT(block_dim.x % warp_size == 0);
|
||||
GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
|
||||
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
|
||||
(const char *) Q->data,
|
||||
K_data,
|
||||
|
||||
@@ -7,14 +7,19 @@
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#include <mma.h>
|
||||
namespace wmma = nvcuda::wmma;
|
||||
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
|
||||
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
||||
#include <rocwmma/rocwmma.hpp>
|
||||
namespace wmma = rocwmma;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void flash_attn_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
@@ -51,7 +56,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
@@ -60,6 +65,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
@@ -68,11 +75,11 @@ static __global__ void flash_attn_ext_f16(
|
||||
constexpr int frag_m = ncols == 8 ? 32 : 16;
|
||||
constexpr int frag_n = ncols == 8 ? 8 : 16;
|
||||
static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, frag_m, frag_n, 16, half, nvcuda::wmma::row_major> frag_a_K;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_a_V;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t> frag_c_KQ;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half> frag_c_VKQ;
|
||||
typedef wmma::fragment<wmma::matrix_a, frag_m, frag_n, 16, half, wmma::row_major> frag_a_K;
|
||||
typedef wmma::fragment<wmma::matrix_a, frag_m, frag_n, 16, half, wmma::col_major> frag_a_V;
|
||||
typedef wmma::fragment<wmma::matrix_b, frag_m, frag_n, 16, half, wmma::col_major> frag_b;
|
||||
typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t> frag_c_KQ;
|
||||
typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, half> frag_c_VKQ;
|
||||
|
||||
constexpr int KQ_stride_tc = nwarps*frag_m; // Number of KQ rows calculated in parallel.
|
||||
constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
|
||||
@@ -132,9 +139,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
|
||||
if (i0 + warp_size > D/2 && i >= D/2) {
|
||||
break;
|
||||
}
|
||||
VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f);
|
||||
@@ -146,9 +153,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + WARP_SIZE > D && i >= D) {
|
||||
if (i0 + warp_size > D && i >= D) {
|
||||
break;
|
||||
}
|
||||
KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
|
||||
@@ -162,7 +169,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
for (int i0 = 0; i0 < D; i0 += 16) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
|
||||
nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
|
||||
wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,20 +183,20 @@ static __global__ void flash_attn_ext_f16(
|
||||
frag_c_KQ KQ_c[ncols/frag_n];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
|
||||
wmma::fill_fragment(KQ_c[j], static_cast<KQ_acc_t>(0.0f));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
|
||||
frag_a_K K_a;
|
||||
nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
|
||||
wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
|
||||
nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major);
|
||||
wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, wmma::mem_col_major);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,27 +209,27 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (std::is_same<KQ_acc_t, float>::value) {
|
||||
float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE];
|
||||
float KQ_f_tmp[FATTN_KQ_STRIDE / warp_size];
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
|
||||
KQ_f_tmp[k0/warp_size] = KQ_f[j*kqs_padded + k];
|
||||
|
||||
if (use_logit_softcap) {
|
||||
KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
|
||||
KQ_f_tmp[k0/warp_size] = logit_softcap*tanhf(KQ_f_tmp[k0/warp_size]);
|
||||
}
|
||||
}
|
||||
|
||||
float KQ_max_new = KQ_max_f[j0/nwarps];
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
|
||||
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
|
||||
KQ_f_tmp[k0/warp_size] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
|
||||
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
|
||||
}
|
||||
KQ_max_new = warp_reduce_max(KQ_max_new);
|
||||
KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);
|
||||
|
||||
const float diff = KQ_max_f[j0/nwarps] - KQ_max_new;
|
||||
KQ_max_scale_f[j0/nwarps] = expf(diff);
|
||||
@@ -233,48 +240,48 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
float KQ_rowsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps];
|
||||
KQ_f_tmp[k0/WARP_SIZE] = expf(diff);
|
||||
const float diff = KQ_f_tmp[k0/warp_size] - KQ_max_f[j0/nwarps];
|
||||
KQ_f_tmp[k0/warp_size] = expf(diff);
|
||||
if (diff <= SOFTMAX_FTZ_THRESHOLD) {
|
||||
KQ_f_tmp[k0/WARP_SIZE] = 0.0f;
|
||||
KQ_f_tmp[k0/warp_size] = 0.0f;
|
||||
}
|
||||
KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE];
|
||||
KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE];
|
||||
KQ_rowsum_add += KQ_f_tmp[k0/warp_size];
|
||||
KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/warp_size];
|
||||
}
|
||||
KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
|
||||
KQ_rowsum_add = warp_reduce_sum<warp_size>(KQ_rowsum_add);
|
||||
|
||||
// Scale previous KQ_rowsum to account for a potential increase in KQ_max:
|
||||
KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add;
|
||||
} else {
|
||||
half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)];
|
||||
half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*warp_size)];
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
|
||||
KQ2_tmp[k0/warp_size] = KQ2[j*(kqs_padded/2) + k];
|
||||
|
||||
if (use_logit_softcap) {
|
||||
// There is no dedicated tangens hyperbolicus function for half2.
|
||||
KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
|
||||
KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
|
||||
/(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
|
||||
KQ2_tmp[k0/warp_size] = h2exp(KQ2_tmp[k0/warp_size]*make_half2(2.0f, 2.0f));
|
||||
KQ2_tmp[k0/warp_size] = (KQ2_tmp[k0/warp_size] - make_half2(1.0f, 1.0f))
|
||||
/(KQ2_tmp[k0/warp_size] + make_half2(1.0f, 1.0f));
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
|
||||
KQ2_tmp[k0/warp_size] *= logit_softcap_2;
|
||||
}
|
||||
}
|
||||
|
||||
half2 KQ_max_new = KQ_max_h2[j0/nwarps];
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
|
||||
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
|
||||
KQ2_tmp[k0/warp_size] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
|
||||
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/warp_size]);
|
||||
}
|
||||
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
|
||||
KQ_max_new = __half2half2(warp_reduce_max<warp_size>(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
|
||||
const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
|
||||
KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
|
||||
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
|
||||
@@ -283,17 +290,17 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
half2 KQ_rowsum_add = make_half2(0.0f, 0.0f);
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps];
|
||||
KQ2_tmp[k0/WARP_SIZE] = h2exp(diff);
|
||||
const half2 diff = KQ2_tmp[k0/warp_size] - KQ_max_h2[j0/nwarps];
|
||||
KQ2_tmp[k0/warp_size] = h2exp(diff);
|
||||
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
|
||||
*((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask;
|
||||
KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE];
|
||||
KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE];
|
||||
*((uint32_t *) &KQ2_tmp[k0/warp_size]) &= ftz_mask;
|
||||
KQ_rowsum_add += KQ2_tmp[k0/warp_size];
|
||||
KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/warp_size];
|
||||
}
|
||||
KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
|
||||
KQ_rowsum_add = warp_reduce_sum<warp_size>(KQ_rowsum_add);
|
||||
|
||||
// Scale previous KQ_rowsum to account for a potential increase in KQ_max:
|
||||
KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add;
|
||||
@@ -308,7 +315,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
|
||||
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
|
||||
nvcuda::wmma::load_matrix_sync(
|
||||
wmma::load_matrix_sync(
|
||||
KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
|
||||
KQ + j0*(kqar*kqs_padded) + k,
|
||||
kqar*kqs_padded);
|
||||
@@ -320,7 +327,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f);
|
||||
wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], static_cast<half>(0.0f));
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -328,10 +335,10 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
|
||||
|
||||
frag_a_V v_a;
|
||||
nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
|
||||
wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -343,10 +350,10 @@ static __global__ void flash_attn_ext_f16(
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
|
||||
nvcuda::wmma::store_matrix_sync(
|
||||
wmma::store_matrix_sync(
|
||||
KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
|
||||
VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
|
||||
D_padded, nvcuda::wmma::mem_col_major);
|
||||
D_padded, wmma::mem_col_major);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,9 +371,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
|
||||
if (i0 + warp_size > D/2 && i >= D/2) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -398,9 +405,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + WARP_SIZE > D && i >= D) {
|
||||
if (i0 + warp_size > D && i >= D) {
|
||||
break;
|
||||
}
|
||||
float dst_val = VKQ[j_VKQ*D_padded + i];
|
||||
@@ -425,7 +432,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
}
|
||||
|
||||
constexpr int get_max_power_of_2(int x) {
|
||||
@@ -515,6 +522,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
|
||||
if (prec != GGML_PREC_DEFAULT) {
|
||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||
@@ -571,7 +579,8 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
|
||||
constexpr int cols_per_block = 8;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
@@ -592,6 +601,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 16;
|
||||
|
||||
@@ -250,10 +250,18 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
if (fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
@@ -291,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
const bool mma_fast_for_bs1 = fp16_mma_available(cc) && gqa_ratio % 2 == 0 &&
|
||||
K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16 && mask;
|
||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0 && !mma_fast_for_bs1) {
|
||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*warp_size) == 0 && !mma_fast_for_bs1) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
return;
|
||||
@@ -302,7 +310,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
|
||||
if (cc == GGML_CUDA_CC_VOLTA) {
|
||||
if (fp16_mma_available(cc) && !new_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -540,12 +540,12 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return ctx->dev_ptr;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return;
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
@@ -558,6 +558,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
||||
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||
}
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -792,7 +793,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
@@ -838,6 +839,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
|
||||
}
|
||||
}
|
||||
tensor->extra = extra;
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
@@ -2145,6 +2147,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_ABS:
|
||||
ggml_cuda_op_abs(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
ggml_cuda_op_sgn(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
ggml_cuda_op_neg(ctx, dst);
|
||||
break;
|
||||
@@ -2242,6 +2250,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_cuda_op_clamp(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_LOG:
|
||||
ggml_cuda_op_log(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
@@ -2560,7 +2571,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||
*(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
}
|
||||
}
|
||||
@@ -2960,6 +2971,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_ABS:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
@@ -3142,7 +3155,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_SILU_BACK:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
break;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
@@ -3166,6 +3179,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
|
||||
@@ -1,305 +1,213 @@
|
||||
#include "unary.cuh"
|
||||
|
||||
static __global__ void neg_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = -x[i];
|
||||
static __device__ __forceinline__ float op_abs(float x) {
|
||||
return fabsf(x);
|
||||
}
|
||||
|
||||
static __global__ void step_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = x[i] > 0.0f;
|
||||
static __device__ __forceinline__ float op_sgn(float x) {
|
||||
return (x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f)));
|
||||
}
|
||||
|
||||
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
||||
static __device__ __forceinline__ float op_neg(float x) {
|
||||
return -x;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_step(float x) {
|
||||
return x > 0.0f;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_gelu(float x) {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
float xi = x[i];
|
||||
dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
|
||||
static __device__ __forceinline__ float op_gelu_quick(float x) {
|
||||
const float GELU_QUICK_COEF = -1.702f;
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
|
||||
|
||||
return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
|
||||
}
|
||||
|
||||
static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
||||
static __device__ __forceinline__ float op_silu(float x) {
|
||||
return x / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
static __global__ void silu_back_f32(
|
||||
const float * grad, const float * xf, float * dst, const int k) {
|
||||
static __device__ __forceinline__ float op_tanh(float x) {
|
||||
return tanhf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_relu(float x) {
|
||||
return fmaxf(x, 0);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sigmoid(float x) {
|
||||
return 1.0f / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_hardsigmoid(float x) {
|
||||
return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_hardswish(float x) {
|
||||
return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_exp(float x) {
|
||||
return expf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sqrt(float x) {
|
||||
return sqrtf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sin(float x) {
|
||||
return sinf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_cos(float x) {
|
||||
return cosf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_log(float x) {
|
||||
return logf(x);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float xfi = xf[i];
|
||||
const float s = 1.0f / (1.0f + expf(-xfi));
|
||||
dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
|
||||
dst[i] = (T)op((float)x[i]);
|
||||
}
|
||||
|
||||
static __global__ void tanh_f32(const float * x, float * dst, int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = tanhf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = fmaxf(x[i], 0);
|
||||
}
|
||||
|
||||
static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = 1.0f / (1.0f + expf(-x[i]));
|
||||
}
|
||||
|
||||
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __global__ void exp_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = expf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
|
||||
}
|
||||
|
||||
static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = sqrtf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void sin_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = sinf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void cos_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = cosf(x[i]);
|
||||
}
|
||||
|
||||
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
template <float (*op)(float), typename T>
|
||||
static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
|
||||
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
|
||||
step_f32<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
template <float (*op)(float)>
|
||||
void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const void * src0_d = src0->data;
|
||||
void * dst_d = dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
|
||||
} else {
|
||||
unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
}
|
||||
|
||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_abs>(ctx, dst);
|
||||
}
|
||||
|
||||
static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||
gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
|
||||
}
|
||||
|
||||
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
|
||||
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
|
||||
sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
|
||||
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
|
||||
hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
|
||||
exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||
leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
||||
}
|
||||
|
||||
static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
|
||||
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
|
||||
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
|
||||
sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
|
||||
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_sgn>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
ggml_cuda_op_unary<op_neg>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
ggml_cuda_op_unary<op_step>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
ggml_cuda_op_unary<op_gelu>(ctx, dst);
|
||||
}
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
ggml_cuda_op_unary<op_silu>(ctx, dst);
|
||||
}
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_tanh>(ctx, dst);
|
||||
}
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_relu>(ctx, dst);
|
||||
}
|
||||
|
||||
silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_sigmoid>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_hardsigmoid>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_hardswish>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_exp>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_sqr>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_sqrt>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_sin>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_cos>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_log>(ctx, dst);
|
||||
}
|
||||
|
||||
/* silu_back */
|
||||
|
||||
static __device__ __forceinline__ float op_silu_back(float grad, float x) {
|
||||
const float s = 1.0f / (1.0f + expf(-x));
|
||||
return grad * s * (1.0f + x * (1.0f - s));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static __global__ void silu_back_kernel(const T * grad, const T * xf, T * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = (T)op_silu_back((float)grad[i], (float)xf[i]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
silu_back_kernel<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -314,179 +222,58 @@ void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
silu_back_cuda((const half *)src0_d, (const half *)src1_d, (half *)dst_d, ggml_nelements(src0), stream);
|
||||
} else {
|
||||
silu_back_cuda((const float*)src0_d, (const float*)src1_d, (float *)dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
/* leaky relu */
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
static __device__ __forceinline__ float op_leaky_relu(float x, const float negative_slope) {
|
||||
return fmaxf(x, 0) + fminf(x, 0.0f) * negative_slope;
|
||||
}
|
||||
|
||||
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
template <class T>
|
||||
static __global__ void leaky_relu_kernel(const T * x, T * dst, const int k, const float negative_slope) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
dst[i] = (T)op_leaky_relu((float)x[i], negative_slope);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
template <class T>
|
||||
static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||
leaky_relu_kernel<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
const void * src0_d = src0->data;
|
||||
void * dst_d = dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
float negative_slope;
|
||||
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
||||
|
||||
leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
leaky_relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), negative_slope, stream);
|
||||
} else {
|
||||
leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,10 @@
|
||||
#define CUDA_SIN_BLOCK_SIZE 256
|
||||
#define CUDA_COS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -49,3 +53,5 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -39,6 +39,12 @@ endif()
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
if (GGML_HIP_ROCWMMA_FATTN)
|
||||
CHECK_INCLUDE_FILE_CXX("rocwmma/rocwmma.hpp" FOUND_ROCWMMA)
|
||||
if (NOT ${FOUND_ROCWMMA})
|
||||
message(FATAL_ERROR "rocwmma has not been found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (${hip_VERSION} VERSION_LESS 5.5)
|
||||
message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
|
||||
@@ -107,6 +113,10 @@ if (GGML_HIP_NO_VMM)
|
||||
add_compile_definitions(GGML_HIP_NO_VMM)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_ROCWMMA_FATTN)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_FA)
|
||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||
endif()
|
||||
|
||||
@@ -27,12 +27,12 @@ configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
|
||||
|
||||
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
|
||||
if (GGML_METAL_EMBED_LIBRARY)
|
||||
enable_language(ASM)
|
||||
|
||||
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
|
||||
|
||||
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
|
||||
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
|
||||
|
||||
@@ -93,7 +93,7 @@ else()
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
|
||||
DEPENDS ggml-metal.metal ggml-common.h
|
||||
DEPENDS ggml-metal.metal ${METALLIB_COMMON}
|
||||
COMMENT "Compiling Metal kernels"
|
||||
)
|
||||
|
||||
|
||||
@@ -285,4 +285,239 @@ typedef struct {
|
||||
float eps;
|
||||
} ggml_metal_kargs_rms_norm;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
int32_t n_groups;
|
||||
float eps;
|
||||
} ggml_metal_kargs_group_norm;
|
||||
|
||||
typedef struct {
|
||||
int32_t IC;
|
||||
int32_t IL;
|
||||
int32_t K;
|
||||
int32_t s0;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
} ggml_metal_kargs_conv_transpose_1d;
|
||||
|
||||
typedef struct {
|
||||
uint64_t ofs0;
|
||||
uint64_t ofs1;
|
||||
int32_t IW;
|
||||
int32_t IH;
|
||||
int32_t CHW;
|
||||
int32_t s0;
|
||||
int32_t s1;
|
||||
int32_t p0;
|
||||
int32_t p1;
|
||||
int32_t d0;
|
||||
int32_t d1;
|
||||
int32_t N;
|
||||
int32_t KH;
|
||||
int32_t KW;
|
||||
int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources
|
||||
} ggml_metal_kargs_im2col;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne10;
|
||||
int64_t ne11;
|
||||
int64_t ne12;
|
||||
int64_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_sum_rows;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint32_t n_head_log2;
|
||||
} ggml_metal_kargs_soft_max;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int n_past;
|
||||
} ggml_metal_kargs_diag_mask_inf;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
int64_t ne10;
|
||||
int64_t ne11;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
} ggml_metal_kargs_ssm_conv;
|
||||
|
||||
typedef struct {
|
||||
int64_t d_state;
|
||||
int64_t d_inner;
|
||||
int64_t n_seq_tokens;
|
||||
int64_t n_seqs;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
uint64_t nb20;
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb30;
|
||||
uint64_t nb31;
|
||||
uint64_t nb40;
|
||||
uint64_t nb41;
|
||||
uint64_t nb42;
|
||||
uint64_t nb50;
|
||||
uint64_t nb51;
|
||||
uint64_t nb52;
|
||||
} ggml_metal_kargs_ssm_scan;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
int64_t ne10;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
} ggml_metal_kargs_get_rows;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
float sf0;
|
||||
float sf1;
|
||||
float sf2;
|
||||
float sf3;
|
||||
} ggml_metal_kargs_upscale;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_pad;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
int32_t p0;
|
||||
int32_t p1;
|
||||
} ggml_metal_kargs_pad_reflect_1d;
|
||||
|
||||
typedef struct {
|
||||
uint64_t nb1;
|
||||
int dim;
|
||||
int max_period;
|
||||
} ggml_metal_kargs_timestep_embedding;
|
||||
|
||||
typedef struct {
|
||||
float slope;
|
||||
} ggml_metal_kargs_leaky_relu;
|
||||
|
||||
typedef struct {
|
||||
int64_t ncols;
|
||||
int64_t ncols_pad;
|
||||
} ggml_metal_kargs_argsort;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne0;
|
||||
float start;
|
||||
float step;
|
||||
} ggml_metal_kargs_arange;
|
||||
|
||||
typedef struct {
|
||||
int32_t k0;
|
||||
int32_t k1;
|
||||
int32_t s0;
|
||||
int32_t s1;
|
||||
int32_t p0;
|
||||
int32_t p1;
|
||||
int64_t IH;
|
||||
int64_t IW;
|
||||
int64_t OH;
|
||||
int64_t OW;
|
||||
int64_t parallel_elements;
|
||||
} ggml_metal_kargs_pool_2d;
|
||||
|
||||
#endif // GGML_METAL_IMPL
|
||||
|
||||
@@ -467,11 +467,13 @@ struct ggml_backend_metal_context {
|
||||
// for now it is easier to work in a separate file
|
||||
// static NSString * const msl_library_source = @"see metal.metal";
|
||||
|
||||
#if !GGML_METAL_EMBED_LIBRARY
|
||||
// Here to assist with NSBundle Path Hack
|
||||
@interface GGMLMetalClass : NSObject
|
||||
@end
|
||||
@implementation GGMLMetalClass
|
||||
@end
|
||||
#endif
|
||||
|
||||
static void * ggml_metal_host_malloc(size_t n) {
|
||||
void * data = NULL;
|
||||
@@ -520,7 +522,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
id<MTLLibrary> metal_library;
|
||||
id<MTLLibrary> metal_library = nil;
|
||||
|
||||
// load library
|
||||
//
|
||||
@@ -529,19 +531,23 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
{
|
||||
NSBundle * bundle = nil;
|
||||
#ifdef SWIFT_PACKAGE
|
||||
bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
const bool try_metallib = false;
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
|
||||
#else
|
||||
const bool try_metallib = true;
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
@@ -574,7 +580,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
path_lib = default_metallib_path;
|
||||
}
|
||||
|
||||
if (try_metallib && path_lib != nil) {
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
@@ -585,14 +591,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
#else
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * path_source;
|
||||
@@ -613,13 +611,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!metal_library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
@@ -647,10 +647,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
}
|
||||
|
||||
// print MTL GPU family:
|
||||
@@ -1200,7 +1201,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1210,21 +1211,26 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_CONCAT:
|
||||
return true;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return true;
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_LOG:
|
||||
return false; // TODO: implement
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
@@ -1254,10 +1260,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ARANGE:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
@@ -1938,34 +1945,38 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
|
||||
ggml_metal_kargs_sum_rows args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.ne13 =*/ ne13,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -2014,8 +2025,17 @@ static void ggml_metal_encode_node(
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
// TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
|
||||
ggml_metal_kargs_soft_max args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.scale =*/ scale,
|
||||
/*.max_bias =*/ max_bias,
|
||||
/*.m0 =*/ m0,
|
||||
/*.m1 =*/ m1,
|
||||
/*.n_head_log2 =*/ n_head_log2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
if (id_src1) {
|
||||
@@ -2024,14 +2044,7 @@ static void ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:6];
|
||||
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
@@ -2049,13 +2062,16 @@ static void ggml_metal_encode_node(
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
|
||||
}
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_diag_mask_inf args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.n_past =*/ n_past,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
if (ne00%8 == 0) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
@@ -2074,27 +2090,30 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_ssm_conv args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -2145,7 +2164,31 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_ssm_scan args = {
|
||||
/*.d_state =*/ d_state,
|
||||
/*.d_inner =*/ d_inner,
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb20 =*/ nb20,
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb30 =*/ nb30,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb40 =*/ nb40,
|
||||
/*.nb41 =*/ nb41,
|
||||
/*.nb42 =*/ nb42,
|
||||
/*.nb50 =*/ nb50,
|
||||
/*.nb51 =*/ nb51,
|
||||
/*.nb52 =*/ nb52,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
@@ -2154,30 +2197,7 @@ static void ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
|
||||
[encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:6];
|
||||
|
||||
[encoder setBytes:&d_state length:sizeof(d_state) atIndex:7];
|
||||
[encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8];
|
||||
[encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
|
||||
[encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10];
|
||||
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
|
||||
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
|
||||
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
|
||||
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
|
||||
[encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
|
||||
[encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
|
||||
[encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
|
||||
[encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
|
||||
[encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
|
||||
[encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
|
||||
[encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
|
||||
[encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:7];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -3034,19 +3054,22 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("not implemented");
|
||||
}
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_get_rows args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||
} break;
|
||||
@@ -3103,18 +3126,21 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_group_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.n_groups =*/ n_groups,
|
||||
/*.eps =*/ eps,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:9];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
@@ -3272,8 +3298,8 @@ static void ggml_metal_encode_node(
|
||||
|
||||
const int32_t CHW = IC * KH * KW;
|
||||
|
||||
const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
|
||||
const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
|
||||
const uint64_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
|
||||
const uint64_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline;
|
||||
|
||||
@@ -3295,27 +3321,30 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_im2col args = {
|
||||
/*.ofs0 =*/ ofs0,
|
||||
/*.ofs1 =*/ ofs1,
|
||||
/*.IW =*/ IW,
|
||||
/*.IH =*/ IH,
|
||||
/*.CHW =*/ CHW,
|
||||
/*.s0 =*/ s0,
|
||||
/*.s1 =*/ s1,
|
||||
/*.p0 =*/ p0,
|
||||
/*.p1 =*/ p1,
|
||||
/*.d0 =*/ d0,
|
||||
/*.d1 =*/ d1,
|
||||
/*.N =*/ N,
|
||||
/*.KH =*/ KH,
|
||||
/*.KW =*/ KW,
|
||||
/*.KHW =*/ KH * KW,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ofs0 length:sizeof(int32_t) atIndex:2];
|
||||
[encoder setBytes:&ofs1 length:sizeof(int32_t) atIndex:3];
|
||||
[encoder setBytes:&IW length:sizeof(int32_t) atIndex:4];
|
||||
[encoder setBytes:&IH length:sizeof(int32_t) atIndex:5];
|
||||
[encoder setBytes:&CHW length:sizeof(int32_t) atIndex:6];
|
||||
[encoder setBytes:&s0 length:sizeof(int32_t) atIndex:7];
|
||||
[encoder setBytes:&s1 length:sizeof(int32_t) atIndex:8];
|
||||
[encoder setBytes:&p0 length:sizeof(int32_t) atIndex:9];
|
||||
[encoder setBytes:&p1 length:sizeof(int32_t) atIndex:10];
|
||||
[encoder setBytes:&d0 length:sizeof(int32_t) atIndex:11];
|
||||
[encoder setBytes:&d1 length:sizeof(int32_t) atIndex:12];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
if (is_gt_mttpt) {
|
||||
[encoder setBytes:&N length:sizeof(int32_t) atIndex:13];
|
||||
[encoder setBytes:&KH length:sizeof(int32_t) atIndex:14];
|
||||
[encoder setBytes:&KW length:sizeof(int32_t) atIndex:15];
|
||||
|
||||
const uint64_t n_threads = MIN(pipeline.maxTotalThreadsPerThreadgroup, (uint64_t)N);
|
||||
|
||||
const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0);
|
||||
@@ -3355,16 +3384,20 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
ggml_metal_kargs_conv_transpose_1d args = {
|
||||
/*.IC =*/ IC,
|
||||
/*.IL =*/ IL,
|
||||
/*.K =*/ K,
|
||||
/*.s0 =*/ s0,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&IC length:sizeof( int32_t) atIndex:3];
|
||||
[encoder setBytes:&IL length:sizeof( int32_t) atIndex:4];
|
||||
[encoder setBytes:&K length:sizeof( int32_t) atIndex:5];
|
||||
[encoder setBytes:&s0 length:sizeof( int32_t) atIndex:6];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(OL, OC, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -3379,30 +3412,33 @@ static void ggml_metal_encode_node(
|
||||
|
||||
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_upscale args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.sf0 =*/ sf0,
|
||||
/*.sf1 =*/ sf1,
|
||||
/*.sf2 =*/ sf2,
|
||||
/*.sf3 =*/ sf3
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
|
||||
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
|
||||
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
|
||||
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
@@ -3414,26 +3450,29 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_pad args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3448,24 +3487,31 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
|
||||
|
||||
ggml_metal_kargs_pad_reflect_1d args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.p0 =*/ p0,
|
||||
/*.p1 =*/ p1
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
|
||||
[encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
|
||||
[encoder setBytes:&p1 length:sizeof(p1) atIndex:16];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3483,12 +3529,15 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_arange args = {
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.start =*/ start,
|
||||
/*.step =*/ step
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
|
||||
[encoder setBytes:&start length:sizeof(start) atIndex:2];
|
||||
[encoder setBytes:&step length:sizeof(step) atIndex:3];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:1];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -3505,13 +3554,16 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_timestep_embedding args = {
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.dim =*/ dim,
|
||||
/*.max_period =*/ max_period
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
|
||||
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
|
||||
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int nth = MIN(1024, half);
|
||||
|
||||
@@ -3544,12 +3596,15 @@ static void ggml_metal_encode_node(
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_argsort args = {
|
||||
/*.ncols =*/ ne00,
|
||||
/*.ncols_pad =*/ ne00_padded
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
|
||||
@@ -3563,11 +3618,14 @@ static void ggml_metal_encode_node(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_leaky_relu args = {
|
||||
/*.slope =*/ slope
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&slope length:sizeof(slope) atIndex:2];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
@@ -4143,21 +4201,24 @@ static void ggml_metal_encode_node(
|
||||
const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
|
||||
const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
ggml_metal_kargs_pool_2d args_pool_2d = {
|
||||
/* .k0 = */ k0,
|
||||
/* .k1 = */ k1,
|
||||
/* .s0 = */ s0,
|
||||
/* .s1 = */ s1,
|
||||
/* .p0 = */ p0,
|
||||
/* .p1 = */ p1,
|
||||
/* .IH = */ IH,
|
||||
/* .IW = */ IW,
|
||||
/* .OH = */ OH,
|
||||
/* .OW = */ OW,
|
||||
/* .parallel_elements = */ parallel_elements
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&k0 length:sizeof(int32_t) atIndex:2];
|
||||
[encoder setBytes:&k1 length:sizeof(int32_t) atIndex:3];
|
||||
[encoder setBytes:&s0 length:sizeof(int32_t) atIndex:4];
|
||||
[encoder setBytes:&s1 length:sizeof(int32_t) atIndex:5];
|
||||
[encoder setBytes:&p0 length:sizeof(int32_t) atIndex:6];
|
||||
[encoder setBytes:&p1 length:sizeof(int32_t) atIndex:7];
|
||||
[encoder setBytes:&IH length:sizeof(int64_t) atIndex:8];
|
||||
[encoder setBytes:&IW length:sizeof(int64_t) atIndex:9];
|
||||
[encoder setBytes:&OH length:sizeof(int64_t) atIndex:10];
|
||||
[encoder setBytes:&OW length:sizeof(int64_t) atIndex:11];
|
||||
[encoder setBytes:¶llel_elements length:sizeof(int64_t) atIndex:12];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&args_pool_2d length:sizeof(args_pool_2d) atIndex:2];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
|
||||
} break;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -278,7 +278,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
|
||||
cl_int err;
|
||||
|
||||
#ifdef GGML_PROFILE_OPENCL
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
|
||||
#endif
|
||||
|
||||
@@ -524,7 +524,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
return backend_ctx;
|
||||
}
|
||||
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
|
||||
cl_uint base_align_in_bits;
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
|
||||
GGML_ASSERT(base_align_in_bits % 8u == 0);
|
||||
backend_ctx->alignment = base_align_in_bits / 8u;
|
||||
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
|
||||
@@ -1004,17 +1007,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_MUL:
|
||||
return true;
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
@@ -1198,20 +1202,17 @@ struct ggml_backend_opencl_buffer_context {
|
||||
std::string name;
|
||||
};
|
||||
|
||||
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
|
||||
|
||||
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return cl_ptr_base;
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
return (void *) (uintptr_t) backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
|
||||
ggml_cl2_init(buffer->buft->device);
|
||||
@@ -1241,7 +1242,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
tensor->extra = view_extra;
|
||||
} else {
|
||||
{
|
||||
size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
|
||||
size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);
|
||||
|
||||
ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
|
||||
extra->offset = offset;
|
||||
@@ -1251,6 +1252,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
tensor->extra = extra;
|
||||
}
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// The optimized gemm and gemv kernels are used for large matrices without batch.
|
||||
@@ -2572,26 +2574,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_norm;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
@@ -2629,16 +2638,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_rms_norm;
|
||||
@@ -2653,15 +2665,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
sizeof(local_work_size), local_work_size,
|
||||
sizeof(size_t), &sgs, NULL));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
||||
// This is local memory - the size depends on subgroup size.
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
@@ -3022,6 +3039,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
// enqueue kernel with profiling
|
||||
// <--------------------------------------------> //
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
@@ -3763,10 +3781,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const int nb00 = src0 ? src0->nb[0] : 0;
|
||||
const int nb01 = src0 ? src0->nb[1] : 0;
|
||||
const int nb02 = src0 ? src0->nb[2] : 0;
|
||||
const int nb03 = src0 ? src0->nb[3] : 0;
|
||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
|
||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
||||
const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
|
||||
@@ -3778,10 +3796,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
const int ne2 = dst ? dst->ne[2] : 0;
|
||||
const int ne3 = dst ? dst->ne[3] : 0;
|
||||
|
||||
const int nb0 = dst ? dst->nb[0] : 0;
|
||||
const int nb1 = dst ? dst->nb[1] : 0;
|
||||
const int nb2 = dst ? dst->nb[2] : 0;
|
||||
const int nb3 = dst ? dst->nb[3] : 0;
|
||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
||||
|
||||
GGML_ASSERT(ne10 % ne02 == 0);
|
||||
GGML_ASSERT(ne10 >= ne02);
|
||||
|
||||
@@ -506,14 +506,23 @@ kernel void kernel_norm(
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
float eps,
|
||||
local float * sum
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
dst = (global void*)((global char*)dst + offsetd);
|
||||
|
||||
global float * x = (global float *) ((global char *) src0 + get_group_id(0)*nb01);
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
|
||||
// MEAN
|
||||
// parallel sum
|
||||
@@ -533,7 +542,7 @@ kernel void kernel_norm(
|
||||
|
||||
// recenter and VARIANCE
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
global float * y = dst + get_group_id(0)*ne00;
|
||||
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
y[i00] = x[i00] - mean;
|
||||
@@ -566,14 +575,23 @@ kernel void kernel_rms_norm(
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
float eps,
|
||||
local float * sum // Note, the size depends on number of subgroups
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
global float4 * x = (global float4 *) ((global char *) src0 + get_group_id(0)*nb01);
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
global float * x_scalar = (global float *) x;
|
||||
float4 sumf = 0;
|
||||
float all_sum = 0;
|
||||
@@ -607,7 +625,7 @@ kernel void kernel_rms_norm(
|
||||
const float mean = sum[0];
|
||||
const float scale = 1.0f/sqrt(mean + eps);
|
||||
|
||||
global float4 * y = (global float4 *) (dst + get_group_id(0)*ne00);
|
||||
global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
global float * y_scalar = (global float *) y;
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
y[i00] = x[i00] * scale;
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
|
||||
const int qk = QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
|
||||
const int qk = QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
@@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
|
||||
assert(QK8_1 == 32);
|
||||
assert(k % QK8_1 == 0);
|
||||
const int nb = k / QK8_1;
|
||||
@@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK8_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
|
||||
return (i & 0x007fffff) - 0x00400000;
|
||||
}
|
||||
|
||||
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
|
||||
const float * restrict qw) {
|
||||
static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
|
||||
const float * GGML_RESTRICT qw) {
|
||||
float max = 0;
|
||||
float amax = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
@@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
return scale;
|
||||
}
|
||||
|
||||
static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
|
||||
static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
|
||||
float max = 0;
|
||||
float amax = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
@@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
return 1/iscale;
|
||||
}
|
||||
|
||||
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
|
||||
static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
|
||||
int ntry, float alpha) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
@@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
||||
return scale;
|
||||
}
|
||||
|
||||
static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
||||
static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
||||
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
||||
float rmin, float rdelta, int nstep, bool use_mad) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
@@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
|
||||
return scale;
|
||||
}
|
||||
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & 63; *m = q[j + 4] & 63;
|
||||
} else {
|
||||
@@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
||||
|
||||
//========================- 2-bit (de)-quantization
|
||||
|
||||
void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
||||
static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
||||
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
||||
float rmin, float rdelta, int nstep, bool use_mad) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
@@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
||||
return scale;
|
||||
}
|
||||
|
||||
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
|
||||
static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
|
||||
float max = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
max = MAX(max, x[i]);
|
||||
@@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
||||
return sumlx/suml2;
|
||||
}
|
||||
|
||||
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
||||
static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
|
||||
GGML_ASSERT(quant_weights);
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
@@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
||||
for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
|
||||
float sigma2 = sumx2/QK_K;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
||||
const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
|
||||
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
||||
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
||||
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
||||
@@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
//========================= 3-bit (de)-quantization
|
||||
|
||||
void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
||||
|
||||
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * restrict q = x[i].qs;
|
||||
const uint8_t * restrict hm = x[i].hmask;
|
||||
const uint8_t * GGML_RESTRICT q = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
uint8_t m = 1;
|
||||
|
||||
memcpy(aux, x[i].scales, 12);
|
||||
@@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
||||
static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
|
||||
assert(n_per_row % QK_K == 0);
|
||||
const int nb = n_per_row / QK_K;
|
||||
|
||||
@@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 4-bit (de)-quantization
|
||||
|
||||
void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
assert(n_per_row % QK_K == 0);
|
||||
const int64_t nb = n_per_row / QK_K;
|
||||
|
||||
@@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 5-bit (de)-quantization
|
||||
|
||||
void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * restrict qh = y[i].qh;
|
||||
uint8_t * restrict ql = y[i].qs;
|
||||
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
||||
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
||||
memset(qh, 0, QK_K/8);
|
||||
|
||||
uint8_t m1 = 1, m2 = 2;
|
||||
@@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
assert(n_per_row % QK_K == 0);
|
||||
const int64_t nb = n_per_row / QK_K;
|
||||
|
||||
@@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * restrict qh = y[i].qh;
|
||||
uint8_t * restrict ql = y[i].qs;
|
||||
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
||||
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
||||
memset(qh, 0, QK_K/8);
|
||||
|
||||
uint8_t m1 = 1, m2 = 2;
|
||||
@@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 6-bit (de)-quantization
|
||||
|
||||
void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * restrict ql = y[i].ql;
|
||||
uint8_t * restrict qh = y[i].qh;
|
||||
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
||||
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
const uint8_t q1 = L[j + l + 0] & 0xF;
|
||||
@@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * restrict ql = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict sc = x[i].scales;
|
||||
const uint8_t * GGML_RESTRICT ql = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
|
||||
for (int n = 0; n < QK_K; n += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
@@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
assert(n_per_row % QK_K == 0);
|
||||
const int64_t nb = n_per_row / QK_K;
|
||||
|
||||
@@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * restrict ql = y[i].ql;
|
||||
uint8_t * restrict qh = y[i].qh;
|
||||
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
||||
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
const uint8_t q1 = L[j + l + 0] & 0xF;
|
||||
@@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
@@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
@@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
@@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
@@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
@@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
@@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
||||
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
@@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
@@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
@@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
||||
|
||||
void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
|
||||
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
|
||||
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
||||
quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
|
||||
quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
|
||||
|
||||
// ====================== "True" 2-bit (de)-quantization
|
||||
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
||||
|
||||
// ====================== 2.3125 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
||||
|
||||
// ====================== 2.5625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
|
||||
|
||||
// ====================== 3.0625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
||||
|
||||
// ====================== 3.3125 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
||||
|
||||
// ====================== 1.5625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
|
||||
|
||||
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
const int64_t nb = k / QK4_NL;
|
||||
|
||||
@@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
|
||||
}
|
||||
}
|
||||
|
||||
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
||||
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_d2 = FLT_MAX;
|
||||
@@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
||||
static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
||||
|
||||
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
||||
|
||||
@@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
||||
static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
||||
|
||||
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
||||
|
||||
@@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
|
||||
return nrow * nblock * sizeof(block_iq2_xxs);
|
||||
}
|
||||
|
||||
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
|
||||
}
|
||||
}
|
||||
|
||||
static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
|
||||
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_d2 = FLT_MAX;
|
||||
@@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
|
||||
const float * restrict quant_weights) {
|
||||
static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
|
||||
const float * GGML_RESTRICT quant_weights) {
|
||||
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
|
||||
@@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
||||
return nrow * nblock * sizeof(block_iq3_xxs);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
||||
const float * restrict quant_weights,
|
||||
static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
|
||||
const float * GGML_RESTRICT quant_weights,
|
||||
float * scales,
|
||||
float * weight,
|
||||
float * xval,
|
||||
@@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
||||
}
|
||||
|
||||
#define IQ3S_BLOCK_SIZE 32
|
||||
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
||||
@@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
||||
return nrow * nblock * sizeof(block_iq3_s);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq3_s(x, y, 1, k, NULL);
|
||||
}
|
||||
@@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
|
||||
|
||||
// =================================== 1.5 bpw ===================================================
|
||||
|
||||
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
|
||||
static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
||||
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_score = -FLT_MAX;
|
||||
@@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
|
||||
static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
||||
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_score = FLT_MAX;
|
||||
@@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
|
||||
|
||||
#define IQ1S_BLOCK_SIZE 32
|
||||
#define IQ1M_BLOCK_SIZE 16
|
||||
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
||||
static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
||||
float * scales,
|
||||
float * weight,
|
||||
float * sumx,
|
||||
@@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
||||
float weight[IQ1S_BLOCK_SIZE];
|
||||
@@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
|
||||
return nrow * nblock * sizeof(block_iq1_s);
|
||||
}
|
||||
|
||||
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
||||
static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
||||
float * scales,
|
||||
float * weight,
|
||||
float * pairs,
|
||||
@@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
||||
float weight[IQ1M_BLOCK_SIZE];
|
||||
@@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
||||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
|
||||
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
||||
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
||||
float * scales, float * weight, uint8_t * L,
|
||||
const int8_t * values,
|
||||
@@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
||||
int64_t nblock = n_per_row/QK4_NL;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
|
||||
return nrow * nblock * sizeof(block_iq4_nl);
|
||||
}
|
||||
|
||||
//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
||||
//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||
void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
|
||||
GGML_ASSERT(k%QK4_NL == 0);
|
||||
int64_t nblock = k/QK4_NL;
|
||||
uint8_t L[QK4_NL];
|
||||
@@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
||||
return nrow * nblock * sizeof(block_iq4_xs);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
||||
void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq4_xs(x, y, 1, k, NULL);
|
||||
}
|
||||
|
||||
// =============================== 2.5625 bpw
|
||||
|
||||
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
||||
static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
||||
|
||||
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
||||
|
||||
@@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int64_t nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
@@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
||||
return nrow * nblock * sizeof(block_iq2_s);
|
||||
}
|
||||
|
||||
void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq2_s(x, y, 1, k, NULL);
|
||||
}
|
||||
|
||||
@@ -464,7 +464,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
|
||||
// CUDA backend on the server pads everything to 512 due to CUDA limitations.
|
||||
@@ -478,6 +478,7 @@ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, gg
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "wkv6.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "element_wise.hpp"
|
||||
#include "cpy.hpp"
|
||||
#include "gla.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
||||
#include "ggml-common.h"
|
||||
#pragma clang diagnostic pop
|
||||
#include "ggml-impl.h"
|
||||
|
||||
void* ggml_sycl_host_malloc(size_t size);
|
||||
void ggml_sycl_host_free(void* ptr);
|
||||
|
||||
701
ggml/src/ggml-sycl/cpy.cpp
Normal file
701
ggml/src/ggml-sycl/cpy.cpp
Normal file
@@ -0,0 +1,701 @@
|
||||
#include "cpy.hpp"
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#include "dequantize.hpp"
|
||||
|
||||
static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) {
|
||||
return 0;
|
||||
}
|
||||
if (x >= val[n - 1]) {
|
||||
return n - 1;
|
||||
}
|
||||
int ml = 0, mu = n - 1;
|
||||
while (mu - ml > 1) {
|
||||
int mav = (ml + mu) / 2;
|
||||
if (x < val[mav]) {
|
||||
mu = mav;
|
||||
} else {
|
||||
ml = mav;
|
||||
}
|
||||
}
|
||||
return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
|
||||
}
|
||||
|
||||
static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
sycl::half * dsti = (sycl::half *) cdsti;
|
||||
|
||||
*dsti = sycl::vec<float, 1>(*xi).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
|
||||
}
|
||||
|
||||
static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
const sycl::half * xi = (const sycl::half *) cxi;
|
||||
sycl::half * dsti = (sycl::half *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const sycl::half * xi = (const sycl::half *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
||||
const int16_t * xi = (const int16_t *) cxi;
|
||||
int16_t * dsti = (int16_t *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
const int32_t * xi = (const int32_t *) cxi;
|
||||
int32_t * dsti = (int32_t *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i03 = i / (ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
||||
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
|
||||
const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
|
||||
const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
|
||||
|
||||
const int i13 = i / (ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
|
||||
const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
|
||||
const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
|
||||
const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = xi[j];
|
||||
amax = sycl::fmax(amax, sycl::fabs((float) v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = xi[j] * id;
|
||||
|
||||
dsti->qs[j] = sycl::round((float) x0);
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *) (cdsti);
|
||||
|
||||
for (int j = 0; j < QK8_0; j += 2) {
|
||||
dfloat2 dq;
|
||||
dequantize_q8_0(cxi, 0, j, dq);
|
||||
*(cdstf + j) = dq.x();
|
||||
*(cdstf + j + 1) = dq.y();
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -8;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
||||
|
||||
float vmin = FLT_MAX;
|
||||
float vmax = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; ++j) {
|
||||
const float v = xi[j];
|
||||
|
||||
if (v < vmin) {
|
||||
vmin = v;
|
||||
}
|
||||
if (v > vmax) {
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = vmin;
|
||||
|
||||
for (int j = 0; j < QK4_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - vmin) * id;
|
||||
const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_0 * dsti = (block_q5_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK5_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -16;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK5_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
|
||||
const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_1 * dsti = (block_q5_1 *) cdsti;
|
||||
|
||||
float min = xi[0];
|
||||
float max = xi[0];
|
||||
|
||||
for (int j = 1; j < QK5_1; ++j) {
|
||||
const float v = xi[j];
|
||||
min = v < min ? v : min;
|
||||
max = v > max ? v : max;
|
||||
}
|
||||
|
||||
const float d = (max - min) / 31;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = min;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - min) * id;
|
||||
const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_NL; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
float d = vmax / kvalues_iq4nl[0];
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < QK4_NL / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_NL / 2 + j] * id;
|
||||
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
|
||||
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
|
||||
dsti->qs[j] = xi0 | (xi1 << 4);
|
||||
const float v0 = kvalues_iq4nl[xi0];
|
||||
const float v1 = kvalues_iq4nl[xi1];
|
||||
const float w0 = xi[0 + j] * xi[0 + j];
|
||||
const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
|
||||
sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
|
||||
sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
|
||||
}
|
||||
|
||||
dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
|
||||
}
|
||||
|
||||
template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *) (cdsti);
|
||||
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
dfloat2 dq;
|
||||
dequant(cxi, 0, j, dq);
|
||||
*(cdstf + j) = dq.x();
|
||||
*(cdstf + j + qk / 2) = dq.y();
|
||||
}
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i03 = i / (ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
||||
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
|
||||
const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
|
||||
const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
|
||||
|
||||
const int i13 = i / (ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
|
||||
const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
|
||||
const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
|
||||
const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static void cpy_q_f32(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i03 = i / (ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
||||
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
|
||||
const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
|
||||
const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
|
||||
|
||||
const int i13 = i / (ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
|
||||
const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
|
||||
const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
|
||||
const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK5_0 == 0);
|
||||
const int num_blocks = ne / QK5_0;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK5_1 == 0);
|
||||
const int num_blocks = ne / QK5_1;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_NL == 0);
|
||||
const int num_blocks = ne / QK4_NL;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS01;
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
queue_ptr main_stream = ctx.stream();
|
||||
|
||||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
|
||||
ggml_type_name(src1->type));
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
||||
ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q8_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||
ggml_cpy_f32_q5_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||
ggml_cpy_f32_q5_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
||||
nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||
ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
|
||||
ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} catch (const sycl::exception & exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
// TODO: why do we pass dst as src1 here?
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
ggml_sycl_cpy(ctx, dst->src[0], dst);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
|
||||
}
|
||||
11
ggml/src/ggml-sycl/cpy.hpp
Normal file
11
ggml/src/ggml-sycl/cpy.hpp
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef GGML_SYCL_CPY_HPP
|
||||
#define GGML_SYCL_CPY_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
|
||||
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CPY_HPP
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "ggml-sycl/gemm.hpp"
|
||||
#include "ggml-sycl/sycl_hw.hpp"
|
||||
#include "ggml-sycl/getrows.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
static bool g_sycl_loaded = false;
|
||||
int g_ggml_sycl_debug = 0;
|
||||
@@ -323,14 +324,14 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return ctx->dev_ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
static enum ggml_status
|
||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return;
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
@@ -348,6 +349,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
padded_size - original_size).wait()));
|
||||
}
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -729,7 +731,7 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void
|
||||
static enum ggml_status
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
@@ -804,6 +806,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
}
|
||||
}
|
||||
tensor->extra = extra;
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -1283,8 +1286,6 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
/// kernels
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
typedef void (*ggml_sycl_op_mul_mat_t)(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
@@ -1466,193 +1467,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
sycl::half *dsti = (sycl::half *)cdsti;
|
||||
|
||||
*dsti = sycl::vec<float, 1>(*xi)
|
||||
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
|
||||
}
|
||||
|
||||
static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
const sycl::half *xi = (const sycl::half *)cxi;
|
||||
sycl::half *dsti = (sycl::half *)cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const sycl::half *xi = (const sycl::half *)cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
||||
const int16_t *xi = (const int16_t *)cxi;
|
||||
int16_t *dsti = (int16_t *)cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
const int32_t *xi = (const int32_t *)cxi;
|
||||
int32_t *dsti = (int32_t *)cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = xi[j];
|
||||
amax = sycl::fmax(amax, sycl::fabs((float)v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = xi[j]*id;
|
||||
|
||||
dsti->qs[j] = sycl::round((float)x0);
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float)v)) {
|
||||
amax = sycl::fabs((float)v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -8;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0/2; ++j) {
|
||||
const float x0 = xi[0 + j]*id;
|
||||
const float x1 = xi[QK4_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
||||
|
||||
float vmin = FLT_MAX;
|
||||
float vmax = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; ++j) {
|
||||
const float v = xi[j];
|
||||
|
||||
if (v < vmin) vmin = v;
|
||||
if (v > vmax) vmax = v;
|
||||
}
|
||||
|
||||
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = vmin;
|
||||
|
||||
for (int j = 0; j < QK4_1/2; ++j) {
|
||||
const float x0 = (xi[0 + j] - vmin)*id;
|
||||
const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2)) *
|
||||
qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int row = item_ct1.get_group(1);
|
||||
@@ -1901,231 +1715,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00,
|
||||
const int ne01, const int ne02, const int nb00,
|
||||
const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12,
|
||||
const int nb10, const int nb11, const int nb12,
|
||||
const int nb13, queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00,
|
||||
nb01, nb02, nb03, ne10, ne11, ne12,
|
||||
nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
{
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void scale_f32_sycl(const float *x, float *dst, const float scale,
|
||||
const int k, queue_ptr stream) {
|
||||
@@ -3643,58 +3233,6 @@ static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
||||
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp);
|
||||
}
|
||||
|
||||
static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS01;
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
queue_ptr main_stream = ctx.stream();
|
||||
|
||||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
||||
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
// TODO: why do we pass dst as src1 here?
|
||||
ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr);
|
||||
}
|
||||
|
||||
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
|
||||
}
|
||||
@@ -3891,7 +3429,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
||||
ggml_sycl_clamp(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst);
|
||||
ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]);
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
ggml_sycl_dup(ctx, dst);
|
||||
@@ -4327,7 +3865,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -4405,6 +3943,30 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CONCAT:
|
||||
@@ -4420,23 +3982,24 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
return true;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
return true;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
return (op->src[0]->type == GGML_TYPE_F32);
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_SCALE:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
|
||||
@@ -1992,6 +1992,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
|
||||
rm_stdq = 2;
|
||||
uint32_t rm_iq = 2 * rm_kq;
|
||||
|
||||
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
@@ -2006,15 +2007,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f32_f32_len, mul_mat_vec_iq1_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f32_f32_len, mul_mat_vec_iq1_m_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f32_f32_len, mul_mat_vec_iq1_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f32_f32_len, mul_mat_vec_iq1_m_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
|
||||
@@ -2028,15 +2029,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f16_f32_len, mul_mat_vec_iq1_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f16_f32_len, mul_mat_vec_iq1_m_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f16_f32_len, mul_mat_vec_iq1_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f16_f32_len, mul_mat_vec_iq1_m_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
@@ -2051,15 +2052,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true);
|
||||
|
||||
// dequant shaders
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
@@ -7922,11 +7923,12 @@ static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
|
||||
if (tensor->view_src != nullptr) {
|
||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -8450,7 +8452,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -8651,19 +8653,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_RMS_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
#include "dequant_funcs.comp"
|
||||
|
||||
90
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp
Normal file
90
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp
Normal file
@@ -0,0 +1,90 @@
|
||||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 16 * itid;
|
||||
const uint nibble_shift = 4 * (itid & 1);
|
||||
const uint ib32 = itid / 2; // 0..7
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
|
||||
const float db = d * (0.5 + scale) * 0.25;
|
||||
|
||||
const uint qh = data_a[ibi].qh[ib32];
|
||||
const u8vec2 qs16 = unpack8(data_a_packed16[ibi].qs[itid]);
|
||||
const u8vec2 sign16 = unpack8(data_a_packed16[ibi].qs[QUANT_K / 16 + itid]);
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint8_t sign = sign16[l];
|
||||
const uint qs = qs16[l] | ((qh << (8 - nibble_shift - 2 * l)) & 0x300);
|
||||
const uvec2 grid = iq2s_grid[qs];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
temp[j][n] = fma(db, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint blocks_per_wg = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid % 16; // 0...15
|
||||
const uint ix = tid / 16;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
|
||||
calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
87
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp
Normal file
87
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp
Normal file
@@ -0,0 +1,87 @@
|
||||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 16 * itid;
|
||||
const uint nibble_shift = 4 * (itid & 1);
|
||||
const uint ib32 = itid / 2; // 0..7
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
|
||||
const float db = d * (0.5 + scale) * 0.25;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint qs = data_a[ibi].qs[2 * itid + l];
|
||||
const uint sign = qs >> 9;
|
||||
const uint sign7 = bitCount(sign);
|
||||
const vec4 grid0 = vec4(unpack8(iq2xs_grid[qs & 511].x));
|
||||
const vec4 grid1 = vec4(unpack8(iq2xs_grid[qs & 511].y));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
temp[j][n] = fma(db, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint blocks_per_wg = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid % 16; // 0...15
|
||||
const uint ix = tid / 16;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
|
||||
calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
87
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp
Normal file
87
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp
Normal file
@@ -0,0 +1,87 @@
|
||||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 16 * itid;
|
||||
const uint ib32 = itid / 2; // 0..7
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint signscale = pack32(u16vec2(
|
||||
data_a_packed16[ibi].qs[4 * ib32 + 2],
|
||||
data_a_packed16[ibi].qs[4 * ib32 + 3]));
|
||||
const float db = d * 0.25 * (0.5 + (signscale >> 28));
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint qs = data_a[ibi].qs[8 * ib32 + 2 * (itid & 1) + l];
|
||||
const uint sign = bitfieldExtract(signscale, 7 * int(2 * (itid & 1) + l), 7);
|
||||
const uint sign7 = bitCount(sign);
|
||||
const vec4 grid0 = vec4(unpack8(iq2xxs_grid[qs].x));
|
||||
const vec4 grid1 = vec4(unpack8(iq2xxs_grid[qs].y));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
temp[j][n] = fma(db, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint blocks_per_wg = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid % 16; // 0...15
|
||||
const uint ix = tid / 16;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
|
||||
calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
90
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp
Normal file
90
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp
Normal file
@@ -0,0 +1,90 @@
|
||||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 32 * ib32;
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint scale = (data_a[ibi].scales[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const float dscale = d * (1 + 2 * scale);
|
||||
const uint qh = data_a[ibi].qh[ib32];
|
||||
FLOAT_TYPE sum[NUM_COLS];
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
sum[j] = 0.0;
|
||||
}
|
||||
[[unroll]] for (uint l = 0; l < 4; ++l) {
|
||||
const u8vec2 qs = unpack8(data_a_packed16[ibi].qs[4 * ib32 + l]);
|
||||
const uint sign = data_a[ibi].signs[4 * ib32 + l];
|
||||
const vec4 grid0 = vec4(unpack8(iq3s_grid[qs.x | ((qh << (8 - 2*l)) & 0x100)]));
|
||||
const vec4 grid1 = vec4(unpack8(iq3s_grid[qs.y | ((qh << (7 - 2*l)) & 0x100)]));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
sum[j] =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w),
|
||||
sum[j]))))))));
|
||||
}
|
||||
}
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
temp[j][n] = fma(dscale, sum[j], temp[j][n]);
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 8 threads are used to process each block
|
||||
const uint blocks_per_wg = gl_WorkGroupSize.x/8;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid % 8; // 0...7
|
||||
const uint ix = tid / 8;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
|
||||
calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
88
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp
Normal file
88
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp
Normal file
@@ -0,0 +1,88 @@
|
||||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 16 * itid;
|
||||
const uint ib32 = itid / 2; // 0..7
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint signscale = pack32(u16vec2(
|
||||
data_a_packed16[ibi].qs[QUANT_K / 8 + 2 * ib32],
|
||||
data_a_packed16[ibi].qs[QUANT_K / 8 + 2 * ib32 + 1]));
|
||||
const float db = d * 0.5 * (0.5 + (signscale >> 28));
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint qs0 = data_a[ibi].qs[8 * ib32 + 4 * (itid & 1) + 2 * l];
|
||||
const uint qs1 = data_a[ibi].qs[8 * ib32 + 4 * (itid & 1) + 2 * l + 1];
|
||||
const uint sign = bitfieldExtract(signscale, 7 * int(2 * (itid & 1) + l), 7);
|
||||
const uint sign7 = bitCount(sign);
|
||||
const vec4 grid0 = vec4(unpack8(iq3xxs_grid[qs0]));
|
||||
const vec4 grid1 = vec4(unpack8(iq3xxs_grid[qs1]));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
const vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
temp[j][n] = fma(db, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint blocks_per_wg = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid % 16; // 0...15
|
||||
const uint ix = tid / 16;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += blocks_per_wg)
|
||||
calc_superblock(a_offset, b_offset, itid, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
@@ -466,10 +466,13 @@ shared uint16_t iq1s_grid[2048];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq1s_grid_const.length(); i += wgsize.x) {
|
||||
u16vec2 g = unpack16(iq1s_grid_const[i]);
|
||||
iq1s_grid[2*i+0] = g.x;
|
||||
iq1s_grid[2*i+1] = g.y;
|
||||
[[unroll]] for (uint i = 0; i < iq1s_grid_const.length(); i += wgsize.x) {
|
||||
uint idx = i + gl_LocalInvocationIndex.x;
|
||||
if (iq1s_grid_const.length() % wgsize.x == 0 || idx < iq1s_grid_const.length()) {
|
||||
u16vec2 g = unpack16(iq1s_grid_const[idx]);
|
||||
iq1s_grid[2*idx+0] = g.x;
|
||||
iq1s_grid[2*idx+1] = g.y;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
@@ -565,8 +568,10 @@ shared uvec2 iq2xxs_grid[256];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += wgsize.x) {
|
||||
iq2xxs_grid[i] = iq2xxs_grid_const[i];
|
||||
[[unroll]] for (uint i = 0; i < iq2xxs_grid.length(); i += wgsize.x) {
|
||||
if (iq2xxs_grid_const.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2xxs_grid_const.length()) {
|
||||
iq2xxs_grid[i + gl_LocalInvocationIndex.x] = iq2xxs_grid_const[i + gl_LocalInvocationIndex.x];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
@@ -733,8 +738,10 @@ shared uvec2 iq2xs_grid[512];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += wgsize.x) {
|
||||
iq2xs_grid[i] = iq2xs_grid_const[i];
|
||||
[[unroll]] for (uint i = 0; i < iq2xs_grid.length(); i += wgsize.x) {
|
||||
if (iq2xs_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2xs_grid_const.length()) {
|
||||
iq2xs_grid[i + gl_LocalInvocationIndex.x] = iq2xs_grid_const[i + gl_LocalInvocationIndex.x];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
@@ -756,6 +763,14 @@ struct block_iq2_s
|
||||
uint8_t scales[QUANT_K_IQ2_S/32];
|
||||
};
|
||||
|
||||
struct block_iq2_s_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qs[QUANT_K_IQ2_S/8];
|
||||
uint16_t qh[QUANT_K_IQ2_S/64];
|
||||
uint16_t scales[QUANT_K_IQ2_S/64];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_IQ2_S)
|
||||
|
||||
const uvec2 iq2s_grid_const[1024] = {
|
||||
@@ -1023,8 +1038,10 @@ shared uvec2 iq2s_grid[1024];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += wgsize.x) {
|
||||
iq2s_grid[i] = iq2s_grid_const[i];
|
||||
[[unroll]] for (uint i = 0; i < iq2s_grid.length(); i += wgsize.x) {
|
||||
if (iq2s_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq2s_grid_const.length()) {
|
||||
iq2s_grid[i + gl_LocalInvocationIndex.x] = iq2s_grid_const[i + gl_LocalInvocationIndex.x];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
@@ -1032,6 +1049,7 @@ void init_iq_shmem(uvec3 wgsize)
|
||||
#define QUANT_K QUANT_K_IQ2_S
|
||||
#define QUANT_R QUANT_R_IQ2_S
|
||||
#define A_TYPE block_iq2_s
|
||||
#define A_TYPE_PACKED16 block_iq2_s_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_IQ3_XXS 256
|
||||
@@ -1092,8 +1110,10 @@ shared uint32_t iq3xxs_grid[256];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += wgsize.x) {
|
||||
iq3xxs_grid[i] = iq3xxs_grid_const[i];
|
||||
[[unroll]] for (uint i = 0; i < iq3xxs_grid.length(); i += wgsize.x) {
|
||||
if (iq3xxs_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq3xxs_grid.length()) {
|
||||
iq3xxs_grid[i + gl_LocalInvocationIndex.x] = iq3xxs_grid_const[i + gl_LocalInvocationIndex.x];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
@@ -1200,8 +1220,10 @@ shared uint32_t iq3s_grid[512];
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += wgsize.x) {
|
||||
iq3s_grid[i] = iq3s_grid_const[i];
|
||||
[[unroll]] for (uint i = 0; i < iq3s_grid.length(); i += wgsize.x) {
|
||||
if (iq3s_grid.length() % wgsize.x == 0 || i + gl_LocalInvocationIndex.x < iq3s_grid.length()) {
|
||||
iq3s_grid[i + gl_LocalInvocationIndex.x] = iq3s_grid_const[i + gl_LocalInvocationIndex.x];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -402,7 +402,7 @@ void process_shaders() {
|
||||
for (const auto& tname : type_names) {
|
||||
// mul mat vec
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
@@ -565,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
|
||||
#endif
|
||||
|
||||
}
|
||||
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
|
||||
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_I8] = {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user