mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-07 01:24:24 +00:00
Compare commits
15 Commits
b3178
...
sycl-mul-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff0aa3abd1 | ||
|
|
b1ef562bc1 | ||
|
|
17b291a6a5 | ||
|
|
abd894ad96 | ||
|
|
de391e4c80 | ||
|
|
d50f8897a7 | ||
|
|
2075a66a96 | ||
|
|
ba58993152 | ||
|
|
a7854743c5 | ||
|
|
9c77ec1d74 | ||
|
|
a04a953cab | ||
|
|
623494a478 | ||
|
|
37bef89433 | ||
|
|
91c188d6c2 | ||
|
|
84f6de17f6 |
1
.github/labeler.yml
vendored
1
.github/labeler.yml
vendored
@@ -42,7 +42,6 @@ build:
|
||||
- cmake/**
|
||||
- CMakeLists.txt
|
||||
- CMakePresets.json
|
||||
- codecov.yml
|
||||
examples:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/**
|
||||
|
||||
40
.github/workflows/code-coverage.yml
vendored
40
.github/workflows/code-coverage.yml
vendored
@@ -1,40 +0,0 @@
|
||||
name: Code Coverage
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential gcc-8 lcov
|
||||
|
||||
- name: Build
|
||||
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
|
||||
|
||||
- name: Run tests
|
||||
run: CC=gcc-8 make test
|
||||
|
||||
- name: Generate coverage report
|
||||
run: |
|
||||
make coverage
|
||||
make lcov-report
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
with:
|
||||
files: lcov-report/coverage.info
|
||||
14
.github/workflows/server.yml
vendored
14
.github/workflows/server.yml
vendored
@@ -87,8 +87,22 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DLLAMA_OPENMP=OFF ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
|
||||
113
.gitignore
vendored
113
.gitignore
vendored
@@ -1,90 +1,123 @@
|
||||
*.o
|
||||
# Extensions
|
||||
|
||||
*.a
|
||||
*.so
|
||||
*.bat
|
||||
*.bin
|
||||
*.dll
|
||||
*.dot
|
||||
*.etag
|
||||
*.exe
|
||||
*.gcda
|
||||
*.gcno
|
||||
*.gcov
|
||||
*.gguf
|
||||
*.gguf.json
|
||||
*.bin
|
||||
*.exe
|
||||
*.dll
|
||||
*.log
|
||||
*.gcov
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
*.bat
|
||||
*.tmp
|
||||
*.metallib
|
||||
*.etag
|
||||
*.lastModified
|
||||
.DS_Store
|
||||
.build/
|
||||
*.log
|
||||
*.metallib
|
||||
*.o
|
||||
*.so
|
||||
*.tmp
|
||||
|
||||
# IDE / OS
|
||||
|
||||
.cache/
|
||||
.ccls-cache/
|
||||
.direnv/
|
||||
.DS_Store
|
||||
.envrc
|
||||
.idea/
|
||||
.swiftpm
|
||||
.venv
|
||||
.clang-tidy
|
||||
.vs/
|
||||
.vscode/
|
||||
.idea/
|
||||
nppBackup
|
||||
|
||||
ggml-metal-embed.metal
|
||||
|
||||
lcov-report/
|
||||
# Coverage
|
||||
|
||||
gcovr-report/
|
||||
lcov-report/
|
||||
|
||||
# Build Artifacts
|
||||
|
||||
tags
|
||||
.build/
|
||||
build*
|
||||
!build-info.cmake
|
||||
!build-info.cpp.in
|
||||
!build-info.sh
|
||||
!build.zig
|
||||
cmake-build-*
|
||||
/libllama.so
|
||||
/llama-*
|
||||
android-ndk-*
|
||||
arm_neon.h
|
||||
cmake-build-*
|
||||
CMakeSettings.json
|
||||
compile_commands.json
|
||||
ggml-metal-embed.metal
|
||||
llama-batched-swift
|
||||
out/
|
||||
tmp/
|
||||
|
||||
# CI
|
||||
|
||||
!.github/workflows/*.yml
|
||||
|
||||
# Models
|
||||
|
||||
models/*
|
||||
models-mnt
|
||||
!models/.editorconfig
|
||||
!models/ggml-vocab-*.gguf*
|
||||
|
||||
/Pipfile
|
||||
/libllama.so
|
||||
/llama-*
|
||||
llama-batched-swift
|
||||
/common/build-info.cpp
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
CMakeSettings.json
|
||||
|
||||
__pycache__
|
||||
dist
|
||||
# Zig
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
# Logs
|
||||
|
||||
ppl-*.txt
|
||||
qnt-*.txt
|
||||
perf-*.txt
|
||||
|
||||
# Examples
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
examples/server/*.css.hpp
|
||||
examples/server/*.html.hpp
|
||||
examples/server/*.js.hpp
|
||||
examples/server/*.mjs.hpp
|
||||
examples/server/*.css.hpp
|
||||
!build_64.sh
|
||||
!examples/*.bat
|
||||
!examples/*/*.kts
|
||||
!examples/*/*/*.kts
|
||||
!examples/sycl/*.bat
|
||||
!examples/sycl/*.sh
|
||||
|
||||
# Python
|
||||
|
||||
__pycache__
|
||||
.venv
|
||||
/Pipfile
|
||||
dist
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
nppBackup
|
||||
|
||||
# Test binaries
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-backend-ops
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-opt
|
||||
/tests/test-quantize-fns
|
||||
/tests/test-quantize-perf
|
||||
/tests/test-rope
|
||||
/tests/test-sampling
|
||||
/tests/test-tokenizer-0
|
||||
/tests/test-tokenizer-1-spm
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-rope
|
||||
/tests/test-backend-ops
|
||||
/tests/test-tokenizer-1-spm
|
||||
|
||||
# Scripts
|
||||
!/scripts/install-oneapi.bat
|
||||
|
||||
@@ -665,6 +665,7 @@ if (LLAMA_SYCL)
|
||||
#todo: AOT
|
||||
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
|
||||
message(STATUS "SYCL found")
|
||||
|
||||
@@ -679,11 +680,9 @@ if (LLAMA_SYCL)
|
||||
endif()
|
||||
|
||||
add_compile_options(-I./) #include DPCT
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
endif()
|
||||
@@ -693,8 +692,10 @@ if (LLAMA_SYCL)
|
||||
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||
|
||||
if (WIN32)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
else()
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
|
||||
@@ -11,9 +11,21 @@
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "sycl-base",
|
||||
"hidden": true,
|
||||
"generator": "Ninja",
|
||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||
"cacheVariables": {
|
||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||
"CMAKE_CXX_COMPILER": "icx",
|
||||
"LLAMA_SYCL": "ON",
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
||||
|
||||
{
|
||||
@@ -35,15 +47,18 @@
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
||||
]
|
||||
}
|
||||
|
||||
@@ -410,15 +410,9 @@ Output (example):
|
||||
|
||||
4. Install build tools
|
||||
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
||||
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||
|
||||
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
||||
|
||||
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
|
||||
|
||||
- Extract `w64devkit` on your pc.
|
||||
|
||||
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
@@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
# Option 2: Or FP16
|
||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
cmake --build build --config Release -j
|
||||
```
|
||||
@@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
Or, use CMake presets to build:
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
|
||||
cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
```
|
||||
|
||||
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
|
||||
*Notes:*
|
||||
|
||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
|
||||
14
codecov.yml
14
codecov.yml
@@ -1,14 +0,0 @@
|
||||
comment: off
|
||||
|
||||
coverage:
|
||||
status:
|
||||
project:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
||||
patch:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
||||
@@ -6,7 +6,6 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <codecvt>
|
||||
@@ -2657,7 +2656,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
|
||||
@@ -214,7 +214,7 @@ src_func = f"""
|
||||
"""
|
||||
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py = convert_py_pth.read_text()
|
||||
convert_py = convert_py_pth.read_text(encoding="utf-8")
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
lambda m: m.group(1) + src_func + m.group(3),
|
||||
@@ -222,7 +222,7 @@ convert_py = re.sub(
|
||||
flags=re.DOTALL | re.MULTILINE,
|
||||
)
|
||||
|
||||
convert_py_pth.write_text(convert_py)
|
||||
convert_py_pth.write_text(convert_py, encoding="utf-8")
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
|
||||
|
||||
@@ -223,7 +223,11 @@ int main(int argc, char ** argv) {
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
@@ -528,7 +532,12 @@ int main(int argc, char ** argv) {
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
|
||||
@@ -1594,7 +1594,7 @@ struct server_context {
|
||||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
json_value(task.data, "prompt", std::string());
|
||||
prompt = json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
@@ -2038,7 +2038,12 @@ struct server_context {
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
prefix_tokens.push_back(middle_token);
|
||||
}
|
||||
|
||||
prompt_tokens = prefix_tokens;
|
||||
} else {
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
|
||||
@@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
:: build all binary
|
||||
make -j
|
||||
cmake --build . -j
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
cd ..
|
||||
|
||||
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
|
||||
}
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
|
||||
}
|
||||
return row_rounding;
|
||||
}
|
||||
|
||||
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
|
||||
}
|
||||
|
||||
// Round rows to this value for --split-mode row:
|
||||
static int get_mmq_y_host(const int cc, const int mmq_x) {
|
||||
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
|
||||
static int get_mmq_y_host(const int cc) {
|
||||
return cc >= CC_VOLTA ? 128 : 64;
|
||||
}
|
||||
|
||||
//////////////////////
|
||||
|
||||
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstdint>
|
||||
|
||||
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
|
||||
#define MMQ_NWARPS 8
|
||||
|
||||
typedef void (*load_tiles_mmq_t)(
|
||||
const char * __restrict__ x, int * __restrict__ x_qs, half2 * __restrict__ x_dm,
|
||||
@@ -15,7 +16,7 @@ typedef void (*load_tiles_mmq_t)(
|
||||
typedef void (*vec_dot_mmq_t)(
|
||||
const int * __restrict__ x_qs, const half2 * __restrict__ x_dm, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y, float * __restrict__ sum, const int & k0);
|
||||
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1);
|
||||
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);
|
||||
|
||||
struct block_q8_1_mmq {
|
||||
half2 ds[4];
|
||||
@@ -50,21 +51,17 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
|
||||
// get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
static constexpr __device__ int get_mmq_y_device(int mmq_x) {
|
||||
return mmq_x >= 32 ? 128 : 64;
|
||||
}
|
||||
return 128;
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
static constexpr __device__ int get_mmq_y_device(int mmq_x) {
|
||||
return mmq_x >= 32 ? 128 : 64;
|
||||
}
|
||||
return 128;
|
||||
#else
|
||||
static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) {
|
||||
return 64;
|
||||
}
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
#define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
||||
#define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0}
|
||||
@@ -1734,30 +1731,34 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
}
|
||||
|
||||
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
|
||||
static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
|
||||
static __device__ __forceinline__ void mmq_write_back_dp4a(
|
||||
const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j >= ne1) {
|
||||
if (j > j_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
|
||||
const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i >= ne0) {
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
|
||||
dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
|
||||
static __device__ __forceinline__ void mmq_write_back_mma(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
|
||||
static __device__ __forceinline__ void mmq_write_back_mma(
|
||||
const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
|
||||
|
||||
typedef mma_int_C_I16J8 mma_C;
|
||||
|
||||
const int i0 = threadIdx.y*mma_C::I;
|
||||
@@ -1769,19 +1770,19 @@ static __device__ __forceinline__ void mmq_write_back_mma(const float * __restri
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += mma_C::J) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < mma_C::ne; ++l) {
|
||||
const int j = blockIdx.y*mmq_x + j0 + mma_C::get_j(l);
|
||||
const int j = j0 + mma_C::get_j(l);
|
||||
|
||||
if (j >= ne1) {
|
||||
if (j > j_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int i = blockIdx.x*mmq_y + i0 + mma_C::get_i(l);
|
||||
const int i = i0 + mma_C::get_i(l);
|
||||
|
||||
if (need_check && i >= ne0) {
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[j*ne0 + i] = sum[(j0/mma_C::J)*mma_C::ne + l];
|
||||
dst[j*stride + i] = sum[(j0/mma_C::J)*mma_C::ne + l];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1896,32 +1897,16 @@ static bool mmq_need_sum(const ggml_type type_x) {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
||||
#else
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
|
||||
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
||||
|
||||
// Skip unused template specializations for faster compilation:
|
||||
if (mmq_x > get_mmq_x_max_device()) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
|
||||
static __device__ void mul_mat_q_process_tile(
|
||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0,
|
||||
const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qr = ggml_cuda_type_traits<type>::qr;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int mmq_y = get_mmq_y_device(mmq_x);
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
|
||||
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
|
||||
|
||||
@@ -1941,20 +1926,18 @@ static __global__ void mul_mat_q(
|
||||
int * tile_x_sc = (int *) (tile_x_dm + txs.dm);
|
||||
int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)]
|
||||
|
||||
const int blocks_per_row_x = ne00 / qk;
|
||||
const int blocks_per_warp = WARP_SIZE / qi;
|
||||
|
||||
const int & ne1 = ne11;
|
||||
|
||||
const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
|
||||
|
||||
const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
constexpr int blocks_per_warp = WARP_SIZE / qi;
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
|
||||
|
||||
for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
|
||||
const int tile_x_max_i = ne01 - it*mmq_y - 1;
|
||||
const int tile_y_max_j = ne11 - jt*mmq_x - 1;
|
||||
|
||||
load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01);
|
||||
const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
|
||||
for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_warp) {
|
||||
|
||||
load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*it*mmq_y + kb0, tile_x_max_i, stride01);
|
||||
|
||||
#pragma unroll
|
||||
for (int kr = 0; kr < qr; ++kr) {
|
||||
@@ -1977,7 +1960,176 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
}
|
||||
|
||||
write_back(sum, dst, ne0, ne1);
|
||||
if (fixup) {
|
||||
write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
|
||||
} else {
|
||||
write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
||||
#else
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
||||
|
||||
// Skip unused template specializations for faster compilation:
|
||||
if (mmq_x > get_mmq_x_max_device()) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
{
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
|
||||
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
|
||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||
constexpr int blocks_per_warp = WARP_SIZE / qi;
|
||||
|
||||
const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc = GGML_PAD((int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
|
||||
const int64_t kbc_stop = GGML_PAD((int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
|
||||
|
||||
// kb0 == k index when doing the matrix multiplication for an output tile.
|
||||
int kb0_start = kbc % blocks_per_ne00;
|
||||
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
|
||||
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
|
||||
const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile.
|
||||
const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile.
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
|
||||
it, jt, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
kbc -= kbc % blocks_per_ne00;
|
||||
|
||||
kb0_start = 0;
|
||||
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
|
||||
}
|
||||
|
||||
if (kbc >= kbc_stop) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int jt = kbc / (blocks_per_ne00*nty);
|
||||
const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
|
||||
|
||||
constexpr bool fixup = true; // Last index writes it data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
|
||||
it, jt, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) {
|
||||
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int blocks_per_warp = WARP_SIZE / qi;
|
||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
|
||||
|
||||
const int ntx = (ne11 + mmq_x - 1) / mmq_x;
|
||||
const int nty = (ne01 + mmq_y - 1) / mmq_y;
|
||||
|
||||
bool any_fixup = false;
|
||||
|
||||
const int bidx_start = (blockIdx.y*nty + blockIdx.x) * block_num_mmq / (gridDim.y*gridDim.x);
|
||||
const int bidx_stop = (blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq / (gridDim.y*gridDim.x) + 1;
|
||||
|
||||
for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) {
|
||||
const int64_t kbc = GGML_PAD((int64_t) bidx *blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
|
||||
const int64_t kbc_stop = GGML_PAD((int64_t)(bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
|
||||
|
||||
// Skip fixup tile if the MMQ CUDA block never wrote anything to it:
|
||||
if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int jt = kbc_stop / (blocks_per_ne00*nty);
|
||||
const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
|
||||
|
||||
// Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
|
||||
if (it != blockIdx.x || jt != blockIdx.y) {
|
||||
continue;
|
||||
}
|
||||
|
||||
any_fixup = true;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!any_fixup) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y;
|
||||
|
||||
const int i_max = ne01 - blockIdx.x*mmq_y - 1;
|
||||
const int j_max = ne11 - blockIdx.y*mmq_x - 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j > j_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct mmq_args {
|
||||
@@ -1987,124 +2139,151 @@ struct mmq_args {
|
||||
int64_t ne0;
|
||||
};
|
||||
|
||||
constexpr int mmq_get_nwarps(int mmq_x) {
|
||||
return mmq_x >= 32 ? 8 : 4;
|
||||
}
|
||||
|
||||
static int mmq_get_shmem(const ggml_type type, const int mmq_x, const int mmq_y) {
|
||||
const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
|
||||
const int nwarps = mmq_get_nwarps(mmq_x);
|
||||
|
||||
const int shmem_x = txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
|
||||
return shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int));
|
||||
return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
|
||||
}
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps>
|
||||
static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
|
||||
template <ggml_type type, int mmq_x>
|
||||
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int mmq_y = get_mmq_y_host(cc, mmq_x);
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
|
||||
const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const dim3 block_nums(block_num_x, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
||||
const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1);
|
||||
|
||||
const int shmem = mmq_get_shmem(type, mmq_x, mmq_y);
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shmem_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
shmem_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, 1);
|
||||
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
if (!use_stream_k) {
|
||||
if (args.ne01 % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_nums_mmq(nsm, 1, 1);
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
|
||||
|
||||
if (args.ne01 % mmq_y == 0) {
|
||||
const bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
constexpr bool need_check = false;
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
|
||||
(args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
|
||||
} else {
|
||||
const bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
constexpr bool need_check = true;
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
|
||||
(args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
|
||||
(args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
|
||||
}
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
const int mmq_y = get_mmq_y_host(cc, mmq_x_max);
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
|
||||
int mmq_x_best = 0;
|
||||
int nwaves_best = INT_MAX;
|
||||
int nparts_best = INT_MAX;
|
||||
|
||||
for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) {
|
||||
const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm;
|
||||
for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) {
|
||||
const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const int nwaves_xy_tiling = ntiles_x*block_num_y;
|
||||
|
||||
if (nwaves < nwaves_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) {
|
||||
const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling;
|
||||
|
||||
if (nparts < nparts_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) {
|
||||
mmq_x_best = mmq_x;
|
||||
nwaves_best = nwaves;
|
||||
nparts_best = nparts;
|
||||
}
|
||||
}
|
||||
|
||||
switch (mmq_x_best) {
|
||||
case 8:
|
||||
launch_mul_mat_q<type, 8, mmq_get_nwarps( 8)>(args, stream);
|
||||
launch_mul_mat_q<type, 8>(ctx, args, stream);
|
||||
break;
|
||||
case 16:
|
||||
launch_mul_mat_q<type, 16, mmq_get_nwarps( 16)>(args, stream);
|
||||
launch_mul_mat_q<type, 16>(ctx, args, stream);
|
||||
break;
|
||||
case 24:
|
||||
launch_mul_mat_q<type, 24, mmq_get_nwarps( 24)>(args, stream);
|
||||
launch_mul_mat_q<type, 24>(ctx, args, stream);
|
||||
break;
|
||||
case 32:
|
||||
launch_mul_mat_q<type, 32, mmq_get_nwarps( 32)>(args, stream);
|
||||
launch_mul_mat_q<type, 32>(ctx, args, stream);
|
||||
break;
|
||||
case 40:
|
||||
launch_mul_mat_q<type, 40, mmq_get_nwarps( 40)>(args, stream);
|
||||
launch_mul_mat_q<type, 40>(ctx, args, stream);
|
||||
break;
|
||||
case 48:
|
||||
launch_mul_mat_q<type, 48, mmq_get_nwarps( 48)>(args, stream);
|
||||
launch_mul_mat_q<type, 48>(ctx, args, stream);
|
||||
break;
|
||||
case 56:
|
||||
launch_mul_mat_q<type, 56, mmq_get_nwarps( 56)>(args, stream);
|
||||
launch_mul_mat_q<type, 56>(ctx, args, stream);
|
||||
break;
|
||||
case 64:
|
||||
launch_mul_mat_q<type, 64, mmq_get_nwarps( 64)>(args, stream);
|
||||
launch_mul_mat_q<type, 64>(ctx, args, stream);
|
||||
break;
|
||||
case 72:
|
||||
launch_mul_mat_q<type, 72, mmq_get_nwarps( 72)>(args, stream);
|
||||
launch_mul_mat_q<type, 72>(ctx, args, stream);
|
||||
break;
|
||||
case 80:
|
||||
launch_mul_mat_q<type, 80, mmq_get_nwarps( 80)>(args, stream);
|
||||
launch_mul_mat_q<type, 80>(ctx, args, stream);
|
||||
break;
|
||||
case 88:
|
||||
launch_mul_mat_q<type, 88, mmq_get_nwarps( 88)>(args, stream);
|
||||
launch_mul_mat_q<type, 88>(ctx, args, stream);
|
||||
break;
|
||||
case 96:
|
||||
launch_mul_mat_q<type, 96, mmq_get_nwarps( 96)>(args, stream);
|
||||
launch_mul_mat_q<type, 96>(ctx, args, stream);
|
||||
break;
|
||||
case 104:
|
||||
launch_mul_mat_q<type, 104, mmq_get_nwarps(104)>(args, stream);
|
||||
launch_mul_mat_q<type, 104>(ctx, args, stream);
|
||||
break;
|
||||
case 112:
|
||||
launch_mul_mat_q<type, 112, mmq_get_nwarps(112)>(args, stream);
|
||||
launch_mul_mat_q<type, 112>(ctx, args, stream);
|
||||
break;
|
||||
case 120:
|
||||
launch_mul_mat_q<type, 120, mmq_get_nwarps(120)>(args, stream);
|
||||
launch_mul_mat_q<type, 120>(ctx, args, stream);
|
||||
break;
|
||||
case 128:
|
||||
launch_mul_mat_q<type, 128, mmq_get_nwarps(128)>(args, stream);
|
||||
launch_mul_mat_q<type, 128>(ctx, args, stream);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
|
||||
@@ -2114,7 +2293,7 @@ void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
|
||||
}
|
||||
|
||||
#define DECL_MMQ_CASE(type) \
|
||||
template void mul_mat_q_case<type>(const mmq_args & args, cudaStream_t stream) \
|
||||
template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \
|
||||
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
|
||||
|
||||
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
|
||||
}
|
||||
|
||||
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
|
||||
for (size_t i = 0, n = 3; i < n; ++i) {
|
||||
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
|
||||
7051
ggml-sycl.cpp
7051
ggml-sycl.cpp
File diff suppressed because it is too large
Load Diff
@@ -14,5 +14,10 @@
|
||||
#define GGML_SYCL_BACKEND_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "dequantize.hpp"
|
||||
#include "dmmv.hpp"
|
||||
#include "mmq.hpp"
|
||||
#include "mmvq.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
544
ggml-sycl/convert.cpp
Normal file
544
ggml-sycl/convert.cpp
Normal file
@@ -0,0 +1,544 @@
|
||||
#include "convert.hpp"
|
||||
#include "dequantize.hpp"
|
||||
#include "presets.hpp"
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2));
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
dfloat2 v;
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
y[iybs + iqs + 0] = v.x();
|
||||
y[iybs + iqs + y_offset] = v.y();
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_sycl(const void *__restrict__ vx,
|
||||
dst_t *__restrict__ y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q2_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q2_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q3_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q3_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_0(vx, y, nb32, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_1(vx, y, nb32, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q6_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q6_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq1_s(
|
||||
vx, y, item_ct1, iq1s_grid_gpu
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq1_m(
|
||||
vx, y, item_ct1, iq1s_grid_gpu
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xxs(
|
||||
vx, y, item_ct1, iq2xxs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xs(
|
||||
vx, y, item_ct1, iq2xs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_s(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq3_xxs(
|
||||
vx, y, item_ct1, iq3xxs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = k / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq3_s(
|
||||
vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
#if QK_K == 64
|
||||
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq4_xs(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq4_nl(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const src_t * x = (src_t *) vx;
|
||||
|
||||
y[i] = x[i];
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_sycl(const void *__restrict__ vx,
|
||||
dst_t *__restrict__ y, const int k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
convert_unary<src_t>(vx, y, k, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_sycl;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return dequantize_row_q3_K_sycl;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return dequantize_row_q4_K_sycl;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return dequantize_row_q6_K_sycl;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_sycl;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
return dequantize_row_iq1_m_sycl;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
return dequantize_row_iq2_xxs_sycl;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_sycl;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_sycl;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_sycl;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_sycl;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_sycl;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_sycl<float>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_sycl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_sycl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_sycl;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return dequantize_row_q3_K_sycl;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return dequantize_row_q4_K_sycl;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return dequantize_row_q6_K_sycl;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_sycl;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
return dequantize_row_iq1_m_sycl;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
return dequantize_row_iq2_xxs_sycl;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_sycl;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_sycl;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_sycl;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_sycl;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_sycl;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_sycl<sycl::half>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
27
ggml-sycl/convert.hpp
Normal file
27
ggml-sycl/convert.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_CONVERT_HPP
|
||||
#define GGML_SYCL_CONVERT_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
template <typename T>
|
||||
using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
|
||||
int k, dpct::queue_ptr stream);
|
||||
typedef to_t_sycl_t<float> to_fp32_sycl_t;
|
||||
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
|
||||
|
||||
#endif // GGML_SYCL_CONVERT_HPP
|
||||
690
ggml-sycl/dequantize.hpp
Normal file
690
ggml-sycl/dequantize.hpp
Normal file
@@ -0,0 +1,690 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_DEQUANTIZE_HPP
|
||||
#define GGML_SYCL_DEQUANTIZE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
||||
|
||||
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
||||
const int vui = x[ib].qs[iqs];
|
||||
|
||||
v.x() = vui & 0xF;
|
||||
v.y() = vui >> 4;
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
// v = v - {8.0f, 8.0f};
|
||||
// v = v * {d, d};
|
||||
v.s0() = (v.s0() - 8.0f) * d;
|
||||
v.s1() = (v.s1() - 8.0f) * d;
|
||||
|
||||
#else
|
||||
v.x() = (v.x() - 8.0f) * d;
|
||||
v.y() = (v.y() - 8.0f) * d;
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const dfloat d = x[ib].dm[0];
|
||||
const dfloat m = x[ib].dm[1];
|
||||
|
||||
const int vui = x[ib].qs[iqs];
|
||||
|
||||
v.x() = vui & 0xF;
|
||||
v.y() = vui >> 4;
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
// v = v * {d, d};
|
||||
// v = v + {m, m};
|
||||
v.s0() = (v.s0() * d) + m;
|
||||
v.s1() = (v.s1() * d) + m;
|
||||
|
||||
#else
|
||||
v.x() = (v.x() * d) + m;
|
||||
v.y() = (v.y() * d) + m;
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
||||
v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
// v = v - {16.0f, 16.0f};
|
||||
// v = v * {d, d};
|
||||
v.s0() = (v.s0() - 16.0f) * d;
|
||||
v.s1() = (v.s1() - 16.0f) * d;
|
||||
|
||||
#else
|
||||
v.x() = (v.x() - 16.0f) * d;
|
||||
v.y() = (v.y() - 16.0f) * d;
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const dfloat d = x[ib].dm[0];
|
||||
const dfloat m = x[ib].dm[1];
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
||||
v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
// v = v * {d, d};
|
||||
// v = v + {m, m};
|
||||
v.s0() = (v.s0() * d) + m;
|
||||
v.s1() = (v.s1() * d) + m;
|
||||
#else
|
||||
v.x() = (v.x() * d) + m;
|
||||
v.y() = (v.y() * d) + m;
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
||||
v.x() = x[ib].qs[iqs + 0];
|
||||
v.y() = x[ib].qs[iqs + 1];
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
// v = v * {d, d};
|
||||
v.s0() *= d;
|
||||
v.s1() *= d;
|
||||
#else
|
||||
v.x() *= d;
|
||||
v.y() *= d;
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||
|
||||
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
||||
const float d = sycl::vec<sycl::half, 1>(x->d)
|
||||
.convert<float, sycl::rounding_mode::automatic>()[0];
|
||||
const float dm = -8*d;
|
||||
|
||||
const uint8_t * q = x->qs + 4*il;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
||||
y[l+16] = d * (q[l] >> 4) + dm;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||
|
||||
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
||||
const sycl::float2 d =
|
||||
x->dm.convert<float, sycl::rounding_mode::automatic>();
|
||||
|
||||
const uint8_t * q = x->qs + 4*il;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
y[l + 0] = d.x() * (q[l] & 0xF) + d.y();
|
||||
y[l + 16] = d.x() * (q[l] >> 4) + d.y();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//================================== k-quants
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int n = tid/32;
|
||||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
dst_t * y = yy + i*QK_K + 128*n;
|
||||
|
||||
float dall = x[i].dm[0];
|
||||
float dmin = x[i].dm[1];
|
||||
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
#else
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
float dall = x[i].dm[0];
|
||||
float dmin = x[i].dm[1];
|
||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
#if QK_K == 256
|
||||
const int r = item_ct1.get_local_id(2) / 4;
|
||||
const int tid = r/2;
|
||||
const int is0 = r%2;
|
||||
const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
|
||||
const int n = tid / 4;
|
||||
const int j = tid - 4*n;
|
||||
|
||||
uint8_t m = 1 << (4*n + j);
|
||||
int is = 8*n + 2*j + is0;
|
||||
int shift = 2*j;
|
||||
|
||||
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
||||
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
|
||||
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
|
||||
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
|
||||
float d_all = x[i].d;
|
||||
float dl = d_all * (us - 32);
|
||||
|
||||
dst_t * y = yy + i*QK_K + 128*n + 32*j;
|
||||
const uint8_t * q = x[i].qs + 32*n;
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const int im = il/8; // 0...1
|
||||
const int in = il%8; // 0...7
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
if (is == 0) {
|
||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
} else {
|
||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
} else {
|
||||
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 32 threads
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int is = 2*il;
|
||||
const int n = 4;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
||||
|
||||
const float dall = x[i].dm[0];
|
||||
const float dmin = x[i].dm[1];
|
||||
|
||||
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
for (int l = 0; l < n; ++l) {
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const uint8_t * q = x[i].qs;
|
||||
dst_t * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/16; // il is in 0...3
|
||||
const int ir = tid%16; // ir is in 0...15
|
||||
const int is = 2*il; // is is in 0...6
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
|
||||
|
||||
const float dall = x[i].dm[0];
|
||||
const float dmin = x[i].dm[1];
|
||||
|
||||
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
||||
const uint8_t * qh = x[i].qh + 2*ir;
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
|
||||
uint8_t hm = 1 << (2*il);
|
||||
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
||||
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const uint8_t q = x[i].qs[tid];
|
||||
const int im = tid/8; // 0...3
|
||||
const int in = tid%8; // 0...7
|
||||
const int is = tid/16; // 0 or 1
|
||||
const uint8_t h = x[i].qh[in] >> im;
|
||||
const float d = x[i].d;
|
||||
dst_t * y = yy + i*QK_K + tid;
|
||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
#if QK_K == 256
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int ip = tid/32; // ip is 0 or 1
|
||||
const int il = tid - 32*ip; // 0...32
|
||||
const int is = 8*ip + il/16;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 128*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t * ql = x[i].ql + 64*ip + il;
|
||||
const uint8_t qh = x[i].qh[32*ip + il];
|
||||
const int8_t * sc = x[i].scales + is;
|
||||
|
||||
y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
#else
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int ip = tid/16; // 0 or 1
|
||||
const int il = tid - 16*ip; // 0...15
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t ql = x[i].ql[16*ip + il];
|
||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
||||
const int8_t * sc = x[i].scales;
|
||||
|
||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint64_t *iq2xxs_grid_ptr,
|
||||
const uint8_t *ksigns_iq2xs_ptr,
|
||||
const uint8_t *kmask_iq2xs_ptr) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint64_t *iq2xs_grid,
|
||||
const uint8_t *ksigns_iq2xs,
|
||||
const uint8_t *kmask_iq2xs) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * q2 = x[i].qs + 4*ib;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j)
|
||||
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
assert(false);
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint32_t *iq3xxs_grid,
|
||||
const uint8_t *ksigns_iq2xs,
|
||||
const uint8_t *kmask_iq2xs) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * q3 = x[i].qs + 8*ib;
|
||||
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * qs = x[i].qs + 8*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
||||
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint32_t *iq1s_grid_gpu) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
||||
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
||||
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1,
|
||||
const uint32_t *iq1s_grid_gpu) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
||||
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
||||
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
||||
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename dst_t>
|
||||
__dpct_inline__ static void
|
||||
dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = item_ct1.get_group(2);
|
||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
||||
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif // GGML_SYCL_DEQUANTIZE_HPP
|
||||
1022
ggml-sycl/dmmv.cpp
Normal file
1022
ggml-sycl/dmmv.cpp
Normal file
File diff suppressed because it is too large
Load Diff
27
ggml-sycl/dmmv.hpp
Normal file
27
ggml-sycl/dmmv.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_DMMV_HPP
|
||||
#define GGML_SYCL_DMMV_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
|
||||
float *dst_dd_i, const int64_t row_low, const int64_t row_high,
|
||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||
const dpct::queue_ptr &stream);
|
||||
|
||||
#endif // GGML_SYCL_DMMV_HPP
|
||||
@@ -588,266 +588,222 @@ namespace dpct
|
||||
out = prop;
|
||||
}
|
||||
|
||||
/// dpct device extension
|
||||
class device_ext : public sycl::device
|
||||
{
|
||||
typedef std::mutex mutex_type;
|
||||
/// dpct device extension
|
||||
class device_ext : public sycl::device {
|
||||
typedef std::mutex mutex_type;
|
||||
|
||||
public:
|
||||
device_ext() : sycl::device(), _ctx(*this) {}
|
||||
~device_ext()
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
}
|
||||
device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
init_queues();
|
||||
}
|
||||
public:
|
||||
device_ext() : sycl::device() {}
|
||||
~device_ext() {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
}
|
||||
device_ext(const sycl::device &base) : sycl::device(base) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
init_queues();
|
||||
}
|
||||
|
||||
int is_native_atomic_supported() { return 0; }
|
||||
int get_major_version() const
|
||||
{
|
||||
return dpct::get_major_version(*this);
|
||||
}
|
||||
int is_native_atomic_supported() { return 0; }
|
||||
int get_major_version() const { return dpct::get_major_version(*this); }
|
||||
|
||||
int get_minor_version() const
|
||||
{
|
||||
return dpct::get_minor_version(*this);
|
||||
}
|
||||
int get_minor_version() const { return dpct::get_minor_version(*this); }
|
||||
|
||||
int get_max_compute_units() const
|
||||
{
|
||||
return get_device_info().get_max_compute_units();
|
||||
}
|
||||
int get_max_compute_units() const {
|
||||
return get_device_info().get_max_compute_units();
|
||||
}
|
||||
|
||||
/// Return the maximum clock frequency of this device in KHz.
|
||||
int get_max_clock_frequency() const
|
||||
{
|
||||
return get_device_info().get_max_clock_frequency();
|
||||
}
|
||||
/// Return the maximum clock frequency of this device in KHz.
|
||||
int get_max_clock_frequency() const {
|
||||
return get_device_info().get_max_clock_frequency();
|
||||
}
|
||||
|
||||
int get_integrated() const { return get_device_info().get_integrated(); }
|
||||
int get_integrated() const { return get_device_info().get_integrated(); }
|
||||
|
||||
int get_max_sub_group_size() const
|
||||
{
|
||||
return get_device_info().get_max_sub_group_size();
|
||||
}
|
||||
int get_max_sub_group_size() const {
|
||||
return get_device_info().get_max_sub_group_size();
|
||||
}
|
||||
|
||||
int get_max_register_size_per_work_group() const
|
||||
{
|
||||
return get_device_info().get_max_register_size_per_work_group();
|
||||
}
|
||||
int get_max_register_size_per_work_group() const {
|
||||
return get_device_info().get_max_register_size_per_work_group();
|
||||
}
|
||||
|
||||
int get_max_work_group_size() const
|
||||
{
|
||||
return get_device_info().get_max_work_group_size();
|
||||
}
|
||||
int get_max_work_group_size() const {
|
||||
return get_device_info().get_max_work_group_size();
|
||||
}
|
||||
|
||||
int get_mem_base_addr_align() const
|
||||
{
|
||||
return get_info<sycl::info::device::mem_base_addr_align>();
|
||||
}
|
||||
int get_mem_base_addr_align() const {
|
||||
return get_info<sycl::info::device::mem_base_addr_align>();
|
||||
}
|
||||
|
||||
size_t get_global_mem_size() const
|
||||
{
|
||||
return get_device_info().get_global_mem_size();
|
||||
}
|
||||
size_t get_global_mem_size() const {
|
||||
return get_device_info().get_global_mem_size();
|
||||
}
|
||||
|
||||
size_t get_max_mem_alloc_size() const
|
||||
{
|
||||
return get_device_info().get_max_mem_alloc_size();
|
||||
}
|
||||
size_t get_max_mem_alloc_size() const {
|
||||
return get_device_info().get_max_mem_alloc_size();
|
||||
}
|
||||
|
||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
|
||||
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
|
||||
void get_memory_info(size_t &free_memory, size_t &total_memory)
|
||||
{
|
||||
total_memory = get_device_info().get_global_mem_size();
|
||||
const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
|
||||
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
||||
"use total memory as free memory";
|
||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||
/// \param [out] free_memory The number of bytes of free memory on the
|
||||
/// SYCL device. \param [out] total_memory The number of bytes of total
|
||||
/// memory on the SYCL device.
|
||||
void get_memory_info(size_t &free_memory, size_t &total_memory) {
|
||||
total_memory = get_device_info().get_global_mem_size();
|
||||
const char *warning_info =
|
||||
"get_memory_info: [warning] ext_intel_free_memory is not "
|
||||
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
||||
"use total memory as free memory";
|
||||
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
|
||||
if (!has(sycl::aspect::ext_intel_free_memory))
|
||||
{
|
||||
std::cerr << warning_info << std::endl;
|
||||
free_memory = total_memory;
|
||||
}
|
||||
else
|
||||
{
|
||||
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
}
|
||||
if (!has(sycl::aspect::ext_intel_free_memory)) {
|
||||
std::cerr << warning_info << std::endl;
|
||||
free_memory = total_memory;
|
||||
} else {
|
||||
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
}
|
||||
#else
|
||||
std::cerr << warning_info << std::endl;
|
||||
free_memory = total_memory;
|
||||
std::cerr << warning_info << std::endl;
|
||||
free_memory = total_memory;
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#pragma message("Querying the number of bytes of free memory is not supported")
|
||||
#else
|
||||
#warning "Querying the number of bytes of free memory is not supported"
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void get_device_info(device_info &out) const {
|
||||
dpct::get_device_info(out, *this);
|
||||
}
|
||||
|
||||
device_info get_device_info() const {
|
||||
device_info prop;
|
||||
dpct::get_device_info(prop, *this);
|
||||
return prop;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
init_queues();
|
||||
}
|
||||
|
||||
sycl::queue &in_order_queue() { return _q_in_order; }
|
||||
|
||||
sycl::queue &out_of_order_queue() { return _q_out_of_order; }
|
||||
|
||||
sycl::queue &default_queue() { return in_order_queue(); }
|
||||
|
||||
void queues_wait_and_throw() {
|
||||
std::unique_lock<mutex_type> lock(m_mutex);
|
||||
lock.unlock();
|
||||
for (auto &q : _queues) {
|
||||
q.wait_and_throw();
|
||||
}
|
||||
// Guard the destruct of current_queues to make sure the ref count is
|
||||
// safe.
|
||||
lock.lock();
|
||||
}
|
||||
|
||||
void get_device_info(device_info &out) const
|
||||
{
|
||||
dpct::get_device_info(out, *this);
|
||||
}
|
||||
sycl::queue create_queue(bool enable_exception_handler = false) {
|
||||
return create_in_order_queue(enable_exception_handler);
|
||||
}
|
||||
|
||||
device_info get_device_info() const
|
||||
{
|
||||
device_info prop;
|
||||
dpct::get_device_info(prop, *this);
|
||||
return prop;
|
||||
}
|
||||
sycl::queue create_queue(sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
return create_in_order_queue(device, enable_exception_handler);
|
||||
}
|
||||
|
||||
void reset()
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
init_queues();
|
||||
}
|
||||
sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
|
||||
sycl::queue &in_order_queue() { return *_q_in_order; }
|
||||
|
||||
sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
|
||||
|
||||
sycl::queue &default_queue()
|
||||
{
|
||||
return in_order_queue();
|
||||
}
|
||||
|
||||
void queues_wait_and_throw()
|
||||
{
|
||||
std::unique_lock<mutex_type> lock(m_mutex);
|
||||
std::vector<std::shared_ptr<sycl::queue>> current_queues(
|
||||
_queues);
|
||||
lock.unlock();
|
||||
for (const auto &q : current_queues)
|
||||
{
|
||||
q->wait_and_throw();
|
||||
}
|
||||
// Guard the destruct of current_queues to make sure the ref count is safe.
|
||||
lock.lock();
|
||||
}
|
||||
|
||||
sycl::queue *create_queue(bool enable_exception_handler = false)
|
||||
{
|
||||
return create_in_order_queue(enable_exception_handler);
|
||||
}
|
||||
|
||||
sycl::queue *create_queue(sycl::context context, sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
return create_in_order_queue(context, device, enable_exception_handler);
|
||||
}
|
||||
|
||||
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
|
||||
sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
|
||||
sycl::queue create_in_order_queue(sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(context, device, enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(device, enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
|
||||
sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(enable_exception_handler);
|
||||
}
|
||||
sycl::queue create_out_of_order_queue(
|
||||
bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(enable_exception_handler);
|
||||
}
|
||||
|
||||
void destroy_queue(sycl::queue *&queue)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
||||
[=](const std::shared_ptr<sycl::queue> &q) -> bool
|
||||
{
|
||||
return q.get() == queue;
|
||||
}),
|
||||
_queues.end());
|
||||
queue = nullptr;
|
||||
}
|
||||
void set_saved_queue(sycl::queue *q)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_saved_queue = q;
|
||||
}
|
||||
sycl::queue *get_saved_queue() const
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return _saved_queue;
|
||||
}
|
||||
sycl::context get_context() const { return _ctx; }
|
||||
void destroy_queue(sycl::queue queue) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_queues.clear();
|
||||
}
|
||||
void set_saved_queue(sycl::queue q) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_saved_queue = q;
|
||||
}
|
||||
sycl::queue get_saved_queue() const {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return _saved_queue;
|
||||
}
|
||||
|
||||
private:
|
||||
void clear_queues()
|
||||
{
|
||||
_queues.clear();
|
||||
_q_in_order = _q_out_of_order = _saved_queue = nullptr;
|
||||
}
|
||||
private:
|
||||
void clear_queues() { _queues.clear(); }
|
||||
|
||||
void init_queues()
|
||||
{
|
||||
_q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
|
||||
_q_out_of_order = create_queue_impl(true);
|
||||
_saved_queue = &default_queue();
|
||||
}
|
||||
void init_queues() {
|
||||
_q_in_order =
|
||||
create_queue_impl(true, sycl::property::queue::in_order());
|
||||
_q_out_of_order = create_queue_impl(true);
|
||||
_saved_queue = default_queue();
|
||||
}
|
||||
|
||||
/// Caller should acquire resource \p m_mutex before calling this function.
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(bool enable_exception_handler,
|
||||
Properties... properties)
|
||||
{
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler)
|
||||
{
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
_ctx, *this, eh,
|
||||
sycl::property_list(
|
||||
/// Caller should acquire resource \p m_mutex before calling this
|
||||
/// function.
|
||||
template <class... Properties>
|
||||
sycl::queue create_queue_impl(bool enable_exception_handler,
|
||||
Properties... properties) {
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
auto q = sycl::queue(*this, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...)));
|
||||
properties...));
|
||||
_queues.push_back(q);
|
||||
|
||||
return _queues.back().get();
|
||||
}
|
||||
return _queues.back();
|
||||
}
|
||||
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
|
||||
template <class... Properties>
|
||||
sycl::queue create_queue_impl(sycl::device device,
|
||||
bool enable_exception_handler,
|
||||
Properties... properties) {
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
context, device, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...)));
|
||||
|
||||
return _queues.back().get();
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(
|
||||
sycl::queue(device, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...)));
|
||||
|
||||
void get_version(int &major, int &minor) const
|
||||
{
|
||||
detail::get_version(*this, major, minor);
|
||||
}
|
||||
sycl::queue *_q_in_order, *_q_out_of_order;
|
||||
sycl::queue *_saved_queue;
|
||||
sycl::context _ctx;
|
||||
std::vector<std::shared_ptr<sycl::queue>> _queues;
|
||||
mutable mutex_type m_mutex;
|
||||
return _queues.back();
|
||||
}
|
||||
|
||||
void get_version(int &major, int &minor) const {
|
||||
detail::get_version(*this, major, minor);
|
||||
}
|
||||
sycl::queue _q_in_order, _q_out_of_order;
|
||||
sycl::queue _saved_queue;
|
||||
std::vector<sycl::queue> _queues;
|
||||
mutable mutex_type m_mutex;
|
||||
};
|
||||
|
||||
|
||||
/// device manager
|
||||
class dev_mgr
|
||||
{
|
||||
|
||||
3031
ggml-sycl/mmq.cpp
Normal file
3031
ggml-sycl/mmq.cpp
Normal file
File diff suppressed because it is too large
Load Diff
33
ggml-sycl/mmq.hpp
Normal file
33
ggml-sycl/mmq.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_MMQ_HPP
|
||||
#define GGML_SYCL_MMQ_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_mul_mat_q(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor* src0,
|
||||
const ggml_tensor* src1,
|
||||
ggml_tensor* dst,
|
||||
const char* src0_dd_i,
|
||||
const float* src1_ddf_i,
|
||||
const char* src1_ddq_i,
|
||||
float* dst_dd_i,
|
||||
const int64_t row_low,
|
||||
const int64_t row_high,
|
||||
const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size,
|
||||
const dpct::queue_ptr& stream);
|
||||
|
||||
#endif // GGML_SYCL_MMQ_HPP
|
||||
1024
ggml-sycl/mmvq.cpp
Normal file
1024
ggml-sycl/mmvq.cpp
Normal file
File diff suppressed because it is too large
Load Diff
27
ggml-sycl/mmvq.hpp
Normal file
27
ggml-sycl/mmvq.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_MMVQ_HPP
|
||||
#define GGML_SYCL_MMVQ_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
void ggml_sycl_op_mul_mat_vec_q(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
|
||||
float *dst_dd_i, const int64_t row_low, const int64_t row_high,
|
||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||
const dpct::queue_ptr &stream);
|
||||
|
||||
#endif // GGML_SYCL_MMVQ_HPP
|
||||
@@ -18,8 +18,6 @@
|
||||
#define GGML_SYCL_MAX_DEVICES 48
|
||||
#define GGML_SYCL_NAME "SYCL"
|
||||
|
||||
// FIXME: 1024 from cuda
|
||||
#define GROUP_SIZE 1024
|
||||
#define WARP_SIZE 32
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
|
||||
1161
ggml-sycl/vecdotq.hpp
Normal file
1161
ggml-sycl/vecdotq.hpp
Normal file
File diff suppressed because it is too large
Load Diff
221
ggml.c
221
ggml.c
@@ -1753,9 +1753,8 @@ struct ggml_compute_state_shared {
|
||||
int n_threads;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
atomic_int node_task; // active graph node task phase
|
||||
atomic_int n_barrier;
|
||||
atomic_int n_barrier_passed;
|
||||
|
||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||
void* abort_callback_data;
|
||||
@@ -18972,47 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
||||
return n_tasks;
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
|
||||
// wait for other threads to finish
|
||||
const int last_node_n = * node_n;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
static void ggml_barrier(struct ggml_compute_state * state) {
|
||||
if (state->shared->n_threads == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
if (do_yield) {
|
||||
#pragma omp barrier
|
||||
}
|
||||
#else
|
||||
static void ggml_barrier(struct ggml_compute_state * state) {
|
||||
if (state->shared->n_threads == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
atomic_int * n_barrier = &state->shared->n_barrier;
|
||||
atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
|
||||
|
||||
int n_threads = state->shared->n_threads;
|
||||
int passed_old = atomic_load(n_barrier_passed);
|
||||
|
||||
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
|
||||
// last thread
|
||||
atomic_store(n_barrier, 0);
|
||||
atomic_fetch_add(n_barrier_passed, 1);
|
||||
} else {
|
||||
// wait for other threads
|
||||
//while (atomic_load(n_barrier_passed) == passed_old) {
|
||||
//}
|
||||
const int n_spin_before_sleep = 100000;
|
||||
while (true) {
|
||||
for (int i = 0; i < n_spin_before_sleep; i++) {
|
||||
if (atomic_load(n_barrier_passed) != passed_old) {
|
||||
return;
|
||||
}
|
||||
#if defined(__SSE3__)
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
*node_n = atomic_load(&state->shared->node_n);
|
||||
if (*node_n != last_node_n) {
|
||||
break;
|
||||
}
|
||||
|
||||
#if defined(__SSE3__)
|
||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
|
||||
// wait for other threads to finish
|
||||
const int last_task_phase = *task_phase;
|
||||
|
||||
while (true) {
|
||||
if (do_yield) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
*task_phase = atomic_load(&state->shared->node_task);
|
||||
if (*task_phase != last_task_phase) {
|
||||
break;
|
||||
}
|
||||
|
||||
#if defined(__SSE3__)
|
||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
||||
@@ -19020,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
const struct ggml_cgraph * cgraph = state->shared->cgraph;
|
||||
const struct ggml_cplan * cplan = state->shared->cplan;
|
||||
|
||||
const int n_threads = state->shared->n_threads;
|
||||
const int ith = state->ith;
|
||||
const int n_threads = state->shared->n_threads;
|
||||
|
||||
set_numa_thread_affinity(state->ith);
|
||||
set_numa_thread_affinity(ith);
|
||||
|
||||
int node_n = -1;
|
||||
int task_phase = GGML_TASK_TYPE_FINALIZE;
|
||||
struct ggml_compute_params params = {
|
||||
/*.type =*/ GGML_TASK_TYPE_INIT,
|
||||
/*.ith =*/ ith,
|
||||
/*.nth =*/ state->shared->n_threads,
|
||||
/*.wsize =*/ cplan->work_size,
|
||||
/*.wdata =*/ cplan->work_data,
|
||||
};
|
||||
|
||||
while (true) {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
state->shared->node_n += 1;
|
||||
state->ec = GGML_STATUS_ABORTED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
// all other threads are finished and spinning
|
||||
// do finalize and init here so we don't have synchronize again
|
||||
struct ggml_compute_params params = {
|
||||
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
|
||||
/*.ith =*/ 0,
|
||||
/*.nth =*/ 0,
|
||||
/*.wsize =*/ cplan->work_size,
|
||||
/*.wdata =*/ cplan->work_data,
|
||||
};
|
||||
|
||||
if (node_n != -1) {
|
||||
/* FINALIZE */
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
||||
// distribute new work or execute it direct if 1T
|
||||
while (++node_n < cgraph->n_nodes) {
|
||||
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
||||
|
||||
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
||||
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
||||
|
||||
params.nth = n_tasks;
|
||||
|
||||
if (n_tasks == 1) {
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
params.type = GGML_TASK_TYPE_INIT;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
||||
// they do something more efficient than spinning (?)
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.type = GGML_TASK_TYPE_FINALIZE;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
task_phase = GGML_TASK_TYPE_INIT;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_n, node_n);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
} else {
|
||||
ggml_graph_compute_thread_sync_node(&node_n, state, false);
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
||||
}
|
||||
|
||||
// check if we should stop
|
||||
if (node_n >= cgraph->n_nodes) break;
|
||||
|
||||
/* INIT & COMPUTE */
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
||||
|
||||
struct ggml_compute_params params = {
|
||||
/*.type =*/ GGML_TASK_TYPE_INIT,
|
||||
/*.ith =*/ state->ith,
|
||||
/*.nth =*/ n_tasks,
|
||||
/*.wsize =*/ cplan->work_size,
|
||||
/*.wdata =*/ cplan->work_data,
|
||||
};
|
||||
params.nth = n_tasks;
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
if (ith < n_tasks) {
|
||||
params.type = GGML_TASK_TYPE_INIT;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
ggml_barrier(state);
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
task_phase = GGML_TASK_TYPE_COMPUTE;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
}
|
||||
else {
|
||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||
// depending on the workload and the operating system.
|
||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||
// UPD: adding the do_yield flag seems to resolve the issue universally
|
||||
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
|
||||
}
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
/* COMPUTE */
|
||||
if (ith < n_tasks) {
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
task_phase = GGML_TASK_TYPE_FINALIZE;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
}
|
||||
else {
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
||||
ggml_barrier(state);
|
||||
|
||||
/* FINALIZE */
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
if (params.ith == 0) {
|
||||
params.type = GGML_TASK_TYPE_FINALIZE;
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
ggml_barrier(state);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19336,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
|
||||
// update the number of threads from the actual number of threads that we got from OpenMP
|
||||
n_threads = omp_get_num_threads();
|
||||
workers[0].shared->n_threads = n_threads;
|
||||
workers[0].shared->n_active = n_threads;
|
||||
}
|
||||
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
||||
}
|
||||
@@ -19399,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
/*.perf_node_start_cycles =*/ 0,
|
||||
/*.perf_node_start_time_us =*/ 0,
|
||||
/*.n_threads =*/ n_threads,
|
||||
/*.n_active =*/ n_threads,
|
||||
/*.node_n =*/ -1,
|
||||
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
|
||||
/*.n_barrier =*/ 0,
|
||||
/*.n_barrier_passed =*/ 0,
|
||||
/*.abort_callback =*/ NULL,
|
||||
/*.abort_callback_data =*/ NULL,
|
||||
/*.current_chunk; =*/ 0,
|
||||
|
||||
6
ggml.h
6
ggml.h
@@ -312,6 +312,12 @@
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
#define GGML_TENSOR_BINARY_OP_LOCALS01 \
|
||||
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
||||
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
||||
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
||||
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
316
llama.cpp
316
llama.cpp
@@ -2310,16 +2310,17 @@ struct llama_vocab {
|
||||
id special_cls_id = -1;
|
||||
id special_mask_id = -1;
|
||||
|
||||
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
|
||||
id linefeed_id = 13;
|
||||
id special_prefix_id = -1;
|
||||
id special_suffix_id = -1;
|
||||
id special_middle_id = -1;
|
||||
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
|
||||
|
||||
bool add_space_prefix = true;
|
||||
// tokenizer flags
|
||||
bool tokenizer_add_space_prefix = true;
|
||||
bool tokenizer_add_bos = false;
|
||||
bool tokenizer_add_eos = false;
|
||||
bool tokenizer_ignore_merges = false;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
||||
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
||||
@@ -4770,7 +4771,7 @@ static void llm_load_vocab(
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
} // The default value of add_space_prefix is true.
|
||||
} else if (tokenizer_model == "bert") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||||
@@ -4783,13 +4784,13 @@ static void llm_load_vocab(
|
||||
vocab.special_pad_id = 0;
|
||||
vocab.special_cls_id = 101;
|
||||
vocab.special_mask_id = 103;
|
||||
vocab.add_space_prefix = false;
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
} else if (tokenizer_model == "gpt2") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
}
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
@@ -4847,6 +4848,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "llama-v3" ||
|
||||
tokenizer_pre == "llama-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||
vocab.tokenizer_ignore_merges = true;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
} else if (
|
||||
tokenizer_pre == "deepseek-llm") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
|
||||
@@ -4897,6 +4900,14 @@ static void llm_load_vocab(
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
vocab.tokenizer_add_eos = false;
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
vocab.tokenizer_add_eos = false;
|
||||
} else {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
@@ -5041,10 +5052,10 @@ static void llm_load_vocab(
|
||||
bool temp = true;
|
||||
|
||||
if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
|
||||
vocab.special_add_bos = int(temp);
|
||||
vocab.tokenizer_add_bos = temp;
|
||||
}
|
||||
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
|
||||
vocab.special_add_eos = int(temp);
|
||||
vocab.tokenizer_add_eos = temp;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5144,7 +5155,7 @@ static void llm_load_vocab(
|
||||
);
|
||||
|
||||
// set attributes by model/tokenizer name
|
||||
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
||||
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
|
||||
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
||||
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
||||
for (auto id : vocab.cache_special_tokens) {
|
||||
@@ -5269,13 +5280,6 @@ static bool llm_load_tensors(
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
// disable MoE with SYCL until mul_mat_id is updated
|
||||
if (hparams.n_expert > 0) {
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
model.split_mode = split_mode;
|
||||
model.main_gpu = main_gpu;
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
@@ -13158,112 +13162,142 @@ struct llm_bigram_bpe {
|
||||
};
|
||||
|
||||
struct llm_tokenizer_bpe {
|
||||
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
|
||||
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
|
||||
GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
|
||||
switch (vocab.type_pre) {
|
||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
|
||||
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
regex_exprs = {
|
||||
// same as llama3
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
||||
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
||||
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
||||
"\\s+$",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
"\\p{N}+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
||||
"\\s?\\p{L}+",
|
||||
"\\s?\\p{P}+",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
"\\p{N}",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
||||
regex_exprs = {
|
||||
"[\\p{P}\\$\\+<=>\\^~\\|`]+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
"[0-9][0-9][0-9]",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
||||
// TODO: MPT pre-tokenization regexes are unknown
|
||||
// the following are close, but not exact. run the following:
|
||||
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
||||
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
||||
regex_exprs = {
|
||||
"\\s?\\p{L}+",
|
||||
"\\s?\\p{P}+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
||||
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
||||
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
||||
regex_exprs = {
|
||||
"\\p{N}",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||
regex_exprs = {
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
||||
regex_exprs = {
|
||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
"\\p{N}+",
|
||||
"[0-9][0-9][0-9]",
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
|
||||
output.push_back(token_id);
|
||||
}
|
||||
|
||||
bool append_bos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool append_eos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a EOS token to the prompt as specified by the model but the prompt "
|
||||
"also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||
int final_prev_index = -1;
|
||||
bool ignore_merges = false;
|
||||
|
||||
std::vector<std::string> word_collection;
|
||||
switch (vocab.type) {
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
switch (vocab.type_pre) {
|
||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||
ignore_merges = true;
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
|
||||
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// same as llama3
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"[\r\n]",
|
||||
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
||||
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
||||
"\\s+$",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
"\\p{N}+",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"[\r\n]",
|
||||
"\\s?\\p{L}+",
|
||||
"\\s?\\p{P}+",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
"\\p{N}",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
"[0-9][0-9][0-9]",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
||||
// TODO: MPT pre-tokenization regexes are unknown
|
||||
// the following are close, but not exact. run the following:
|
||||
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
||||
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"\\s?\\p{L}+",
|
||||
"\\s?\\p{P}+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
||||
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
||||
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"\\p{N}",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// original regex from tokenizer.json
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
});
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
"\\p{N}+",
|
||||
"[0-9][0-9][0-9]",
|
||||
});
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
const auto word_collection = unicode_regex_split(text, regex_exprs);
|
||||
|
||||
symbols_final.clear();
|
||||
|
||||
@@ -13274,7 +13308,7 @@ struct llm_tokenizer_bpe {
|
||||
int index = 0;
|
||||
size_t offset = 0;
|
||||
|
||||
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
||||
if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
||||
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||
offset = word.size();
|
||||
}
|
||||
@@ -13355,10 +13389,9 @@ struct llm_tokenizer_bpe {
|
||||
for (auto j = str.begin(); j != str.end(); ++j) {
|
||||
std::string byte_str(1, *j);
|
||||
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
||||
if (token_multibyte == vocab.token_to_id.end()) {
|
||||
throw std::runtime_error("ERROR: byte not found in vocab");
|
||||
if (token_multibyte != vocab.token_to_id.end()) {
|
||||
output.push_back(token_multibyte->second);
|
||||
}
|
||||
output.push_back((*token_multibyte).second);
|
||||
}
|
||||
} else {
|
||||
output.push_back((*token).second);
|
||||
@@ -13397,6 +13430,8 @@ private:
|
||||
|
||||
const llama_vocab & vocab;
|
||||
|
||||
std::vector<std::string> regex_exprs;
|
||||
|
||||
std::vector<llm_symbol> symbols;
|
||||
std::vector<llm_symbol> symbols_final;
|
||||
|
||||
@@ -13677,7 +13712,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
|
||||
bool is_prev_special = false;
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
is_prev_special = true;
|
||||
@@ -13687,7 +13722,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
if (vocab.add_space_prefix) {
|
||||
if (vocab.tokenizer_add_space_prefix) {
|
||||
if (!output.size() || is_prev_special) { // prefix with space if first token
|
||||
raw_text = " " + raw_text;
|
||||
}
|
||||
@@ -13705,23 +13740,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
if (add_special && vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
{
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
llm_tokenizer_bpe tokenizer(vocab);
|
||||
|
||||
if (add_special) {
|
||||
tokenizer.append_bos(output);
|
||||
}
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
@@ -13731,23 +13767,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
||||
#endif
|
||||
llm_tokenizer_bpe tokenizer(vocab);
|
||||
tokenizer.tokenize(raw_text, output);
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
tokenizer.append(fragment.token, output);
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_add_eos != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
if (add_special) {
|
||||
tokenizer.append_eos(output);
|
||||
tokenizer.check_double_bos_eos(output);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
@@ -18320,11 +18348,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
|
||||
}
|
||||
|
||||
int32_t llama_add_bos_token(const struct llama_model * model) {
|
||||
return model->vocab.special_add_bos;
|
||||
return model->vocab.tokenizer_add_bos;
|
||||
}
|
||||
|
||||
int32_t llama_add_eos_token(const struct llama_model * model) {
|
||||
return model->vocab.special_add_eos;
|
||||
return model->vocab.tokenizer_add_eos;
|
||||
}
|
||||
|
||||
llama_token llama_token_prefix(const struct llama_model * model) {
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
-r ./requirements-convert-legacy-llama.txt
|
||||
torch~=2.1.1
|
||||
torch~=2.2.1
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
-r ./requirements-convert-legacy-llama.txt
|
||||
torch~=2.1.1
|
||||
torch~=2.2.1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
numpy~=1.24.4
|
||||
numpy~=1.26.4
|
||||
sentencepiece~=0.2.0
|
||||
transformers>=4.40.1,<5.0.0
|
||||
gguf>=0.1.0
|
||||
|
||||
@@ -1,83 +1,143 @@
|
||||
import regex
|
||||
import ctypes
|
||||
import array
|
||||
import unicodedata
|
||||
|
||||
|
||||
class CoodepointFlags (ctypes.Structure):
|
||||
_fields_ = [ # see definition in unicode.h
|
||||
("is_undefined", ctypes.c_uint16, 1),
|
||||
("is_number", ctypes.c_uint16, 1), # regex: \p{N}
|
||||
("is_letter", ctypes.c_uint16, 1), # regex: \p{L}
|
||||
("is_separator", ctypes.c_uint16, 1), # regex: \p{Z}
|
||||
("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M}
|
||||
("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P}
|
||||
("is_symbol", ctypes.c_uint16, 1), # regex: \p{S}
|
||||
("is_control", ctypes.c_uint16, 1), # regex: \p{C}
|
||||
]
|
||||
|
||||
|
||||
assert (ctypes.sizeof(CoodepointFlags) == 2)
|
||||
import requests
|
||||
|
||||
|
||||
MAX_CODEPOINTS = 0x110000
|
||||
|
||||
regex_number = regex.compile(r'\p{N}')
|
||||
regex_letter = regex.compile(r'\p{L}')
|
||||
regex_separator = regex.compile(r'\p{Z}')
|
||||
regex_accent_mark = regex.compile(r'\p{M}')
|
||||
regex_punctuation = regex.compile(r'\p{P}')
|
||||
regex_symbol = regex.compile(r'\p{S}')
|
||||
regex_control = regex.compile(r'\p{C}')
|
||||
regex_whitespace = regex.compile(r'\s')
|
||||
UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
|
||||
|
||||
codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
|
||||
|
||||
# see https://www.unicode.org/L2/L1999/UnicodeData.html
|
||||
def unicode_data_iter():
|
||||
res = requests.get(UNICODE_DATA_URL)
|
||||
res.raise_for_status()
|
||||
data = res.content.decode()
|
||||
|
||||
prev = []
|
||||
|
||||
for line in data.splitlines():
|
||||
# ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
|
||||
line = line.split(";")
|
||||
|
||||
cpt = int(line[0], base=16)
|
||||
assert cpt < MAX_CODEPOINTS
|
||||
|
||||
cpt_lower = int(line[-2] or "0", base=16)
|
||||
assert cpt_lower < MAX_CODEPOINTS
|
||||
|
||||
cpt_upper = int(line[-3] or "0", base=16)
|
||||
assert cpt_upper < MAX_CODEPOINTS
|
||||
|
||||
categ = line[2].strip()
|
||||
assert len(categ) == 2
|
||||
|
||||
bidir = line[4].strip()
|
||||
assert len(categ) == 2
|
||||
|
||||
name = line[1]
|
||||
if name.endswith(", First>"):
|
||||
prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
|
||||
continue
|
||||
if name.endswith(", Last>"):
|
||||
assert prev[1:] == (0, 0, categ, bidir)
|
||||
for c in range(prev[0], cpt):
|
||||
yield (c, cpt_lower, cpt_upper, categ, bidir)
|
||||
|
||||
yield (cpt, cpt_lower, cpt_upper, categ, bidir)
|
||||
|
||||
|
||||
# see definition in unicode.h
|
||||
CODEPOINT_FLAG_UNDEFINED = 0x0001 #
|
||||
CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N}
|
||||
CODEPOINT_FLAG_LETTER = 0x0004 # \p{L}
|
||||
CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z}
|
||||
CODEPOINT_FLAG_MARK = 0x0010 # \p{M}
|
||||
CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P}
|
||||
CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S}
|
||||
CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C}
|
||||
|
||||
UNICODE_CATEGORY_TO_FLAG = {
|
||||
"Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined
|
||||
"Cc": CODEPOINT_FLAG_CONTROL, # Control
|
||||
"Cf": CODEPOINT_FLAG_CONTROL, # Format
|
||||
"Co": CODEPOINT_FLAG_CONTROL, # Private Use
|
||||
"Cs": CODEPOINT_FLAG_CONTROL, # Surrrogate
|
||||
"Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter
|
||||
"Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter
|
||||
"Lo": CODEPOINT_FLAG_LETTER, # Other Letter
|
||||
"Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter
|
||||
"Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter
|
||||
"L&": CODEPOINT_FLAG_LETTER, # Cased Letter
|
||||
"Mc": CODEPOINT_FLAG_MARK, # Spacing Mark
|
||||
"Me": CODEPOINT_FLAG_MARK, # Enclosing Mark
|
||||
"Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark
|
||||
"Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number
|
||||
"Nl": CODEPOINT_FLAG_NUMBER, # Letter Number
|
||||
"No": CODEPOINT_FLAG_NUMBER, # Other Number
|
||||
"Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation
|
||||
"Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation
|
||||
"Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation
|
||||
"Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation
|
||||
"Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation
|
||||
"Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation
|
||||
"Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation
|
||||
"Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol
|
||||
"Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol
|
||||
"Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol
|
||||
"So": CODEPOINT_FLAG_SYMBOL, # Other Symbol
|
||||
"Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator
|
||||
"Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator
|
||||
"Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator
|
||||
}
|
||||
|
||||
|
||||
codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
|
||||
table_whitespace = []
|
||||
table_lowercase = []
|
||||
table_uppercase = []
|
||||
table_nfd = []
|
||||
|
||||
for codepoint in range(MAX_CODEPOINTS):
|
||||
for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
|
||||
# convert codepoint to unicode character
|
||||
char = chr(codepoint)
|
||||
char = chr(cpt)
|
||||
|
||||
# regex categories
|
||||
flags = codepoint_flags[codepoint]
|
||||
flags.is_number = bool(regex_number.match(char))
|
||||
flags.is_letter = bool(regex_letter.match(char))
|
||||
flags.is_separator = bool(regex_separator.match(char))
|
||||
flags.is_accent_mark = bool(regex_accent_mark.match(char))
|
||||
flags.is_punctuation = bool(regex_punctuation.match(char))
|
||||
flags.is_symbol = bool(regex_symbol.match(char))
|
||||
flags.is_control = bool(regex_control.match(char))
|
||||
flags.is_undefined = bytes(flags)[0] == 0
|
||||
assert (not flags.is_undefined)
|
||||
|
||||
# whitespaces
|
||||
if bool(regex_whitespace.match(char)):
|
||||
table_whitespace.append(codepoint)
|
||||
# codepoint category flags
|
||||
codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
|
||||
|
||||
# lowercase conversion
|
||||
lower = ord(char.lower()[0])
|
||||
if codepoint != lower:
|
||||
table_lowercase.append((codepoint, lower))
|
||||
if cpt_lower:
|
||||
table_lowercase.append((cpt, cpt_lower))
|
||||
|
||||
# uppercase conversion
|
||||
upper = ord(char.upper()[0])
|
||||
if codepoint != upper:
|
||||
table_uppercase.append((codepoint, upper))
|
||||
if cpt_upper:
|
||||
table_uppercase.append((cpt, cpt_upper))
|
||||
|
||||
# NFD normalization
|
||||
norm = ord(unicodedata.normalize('NFD', char)[0])
|
||||
if codepoint != norm:
|
||||
table_nfd.append((codepoint, norm))
|
||||
if cpt != norm:
|
||||
table_nfd.append((cpt, norm))
|
||||
|
||||
|
||||
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
table_whitespace.extend(range(0x0009, 0x000D + 1))
|
||||
table_whitespace.extend(range(0x2000, 0x200A + 1))
|
||||
table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
|
||||
|
||||
|
||||
# sort by codepoint
|
||||
table_whitespace.sort()
|
||||
table_lowercase.sort()
|
||||
table_uppercase.sort()
|
||||
table_nfd.sort()
|
||||
|
||||
|
||||
# group ranges with same flags
|
||||
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
||||
for codepoint, flags in enumerate(codepoint_flags):
|
||||
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
||||
if flags != ranges_flags[-1][1]:
|
||||
ranges_flags.append((codepoint, flags))
|
||||
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
||||
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
|
||||
|
||||
|
||||
# group ranges with same nfd
|
||||
@@ -90,8 +150,8 @@ for codepoint, norm in table_nfd:
|
||||
ranges_nfd[-1] = (start, codepoint, norm)
|
||||
|
||||
|
||||
# Generate 'unicode-data.cpp'
|
||||
|
||||
# Generate 'unicode-data.cpp':
|
||||
# python ./scripts//gen-unicode-data.py > unicode-data.cpp
|
||||
|
||||
def out(line=""):
|
||||
print(line, end='\n') # noqa
|
||||
@@ -110,12 +170,12 @@ out("""\
|
||||
|
||||
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
||||
for codepoint, flags in ranges_flags:
|
||||
flags = int.from_bytes(bytes(flags), "little")
|
||||
out("{0x%06X, 0x%04X}," % (codepoint, flags))
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
||||
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
|
||||
for codepoint in table_whitespace:
|
||||
out("0x%06X," % codepoint)
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
||||
|
||||
@@ -43,8 +43,10 @@
|
||||
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
|
||||
// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wpedantic"
|
||||
#pragma GCC diagnostic ignored "-Wignored-attributes"
|
||||
#endif
|
||||
|
||||
#include "sgemm.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
@@ -11,13 +11,15 @@ import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import random
|
||||
import unicodedata
|
||||
|
||||
from typing import Callable, Iterator
|
||||
|
||||
import cffi
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
logger = logging.getLogger("test-tokenizer-random-bpe")
|
||||
|
||||
logger = logging.getLogger("test-tokenizer-random")
|
||||
|
||||
|
||||
class LibLlama:
|
||||
@@ -155,9 +157,14 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||
'Cửa Việt', # llama-3, ignore_merges = true
|
||||
'<s>a', # Phi-3 fail
|
||||
'<unk><|endoftext|><s>', # Phi-3 fail
|
||||
'a\na', # TODO: Bert fail
|
||||
'a </s> b', # rstrip phi-3
|
||||
'a <mask> b', # lstrip jina-v2
|
||||
'a\na', # bert fail
|
||||
'"`', # falcon
|
||||
' \u2e4e', # falcon
|
||||
'a\xa0\xa0\x00b', # jina-v2-es
|
||||
'one <mask>', # jina-v2-es <mask> lstrip=true
|
||||
'a </s> b', # rstrip phi-3
|
||||
'a <mask> b', # lstrip jina-v2
|
||||
'\xa0aC', # deepseek
|
||||
]
|
||||
|
||||
|
||||
@@ -189,17 +196,23 @@ def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
words = rand.choices(all_tokens, k=500)
|
||||
if words[0] == tokenizer.bos_token: # skip spam warning of double BOS
|
||||
if words and words[0] == tokenizer.bos_token: # skip spam warning of double BOS
|
||||
while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS
|
||||
words.pop(0)
|
||||
if tokenizer.add_bos_token: # drop all starting BOS
|
||||
words.pop(0)
|
||||
if words and words[-1] == tokenizer.eos_token: # skip spam warning of double EOS
|
||||
while len(words) > 1 and words[-2] == tokenizer.eos_token: # leave one trailing EOS
|
||||
words.pop(-1)
|
||||
if tokenizer.add_bos_token: # drop all trailing EOS
|
||||
words.pop(-1)
|
||||
yield "".join(words)
|
||||
|
||||
|
||||
def generator_random_chars(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with simple characters"""
|
||||
|
||||
NUM_WORDS = 400
|
||||
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||
CHARS = list(sorted(set("""
|
||||
ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
@@ -213,12 +226,50 @@ def generator_random_chars(iterations=100) -> Iterator[str]:
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
text = []
|
||||
num_words = rand.randint(300, 400)
|
||||
for i in range(num_words):
|
||||
for _ in range(NUM_WORDS):
|
||||
k = rand.randint(1, 7)
|
||||
word = rand.choices(CHARS, k=k)
|
||||
space = rand.choice(WHITESPACES)
|
||||
text.append("".join(word) + space)
|
||||
word.append(rand.choice(WHITESPACES))
|
||||
text.append("".join(word))
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def generator_unicodes() -> Iterator[str]:
|
||||
"""Iterate unicode characters"""
|
||||
|
||||
MAX_CODEPOINTS = 0x30000 # 0x110000
|
||||
|
||||
def _valid(cpt):
|
||||
if cpt >= 0x30000: # unassigned and supplementary
|
||||
return False
|
||||
if 0x00D800 <= cpt <= 0x00F8FF: # Surrogates
|
||||
return False
|
||||
if unicodedata.category(chr(cpt)) == "Cn":
|
||||
return False
|
||||
return True
|
||||
|
||||
characters = [chr(cpt) for cpt in range(1, MAX_CODEPOINTS) if _valid(cpt)]
|
||||
|
||||
yield from characters
|
||||
|
||||
|
||||
def generator_random_unicodes(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with unicode characters"""
|
||||
|
||||
NUM_WORDS = 200
|
||||
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||
|
||||
characters = list(generator_unicodes())
|
||||
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
text = []
|
||||
for _ in range(NUM_WORDS):
|
||||
k = rand.randint(1, 7)
|
||||
word = rand.choices(characters, k=k)
|
||||
word.append(rand.choice(WHITESPACES))
|
||||
text.append("".join(word))
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
@@ -256,25 +307,7 @@ def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[s
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def generator_random_bytes(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random bytes"""
|
||||
|
||||
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
text = []
|
||||
num_words = rand.randint(300, 400)
|
||||
for i in range(num_words):
|
||||
k = rand.randint(1, 8)
|
||||
word = [chr(r) for r in rand.randbytes(k) if r]
|
||||
word.append(rand.choice(WHITESPACES))
|
||||
text.append("".join(word))
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
|
||||
def compare_tokenizers(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
|
||||
|
||||
def find_first_mismatch(ids1: list[int], ids2: list[int]):
|
||||
for i, (a, b) in enumerate(zip(ids1, ids2)):
|
||||
@@ -284,20 +317,34 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g
|
||||
return -1
|
||||
return min(len(ids1), len(ids2))
|
||||
|
||||
t0 = time.perf_counter()
|
||||
t_tokenizer1 = 0
|
||||
t_tokenizer2 = 0
|
||||
t_start = time.perf_counter()
|
||||
num_errors = 10
|
||||
|
||||
logger.info("%s: %s" % (generator.__name__, "ini"))
|
||||
for text in generator:
|
||||
# print(repr(text), hex(ord(text[0])), text.encode())
|
||||
t0 = time.perf_counter()
|
||||
ids1 = func_tokenize1(text)
|
||||
t1 = time.perf_counter()
|
||||
ids2 = func_tokenize2(text)
|
||||
t2 = time.perf_counter()
|
||||
t_tokenizer1 += t1 - t0
|
||||
t_tokenizer2 += t2 - t1
|
||||
if ids1 != ids2:
|
||||
i = find_first_mismatch(ids1, ids2)
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
|
||||
logger.info(" TokenIDs: " + str(ids1))
|
||||
logger.info(" Expected: " + str(ids2))
|
||||
raise Exception()
|
||||
t1 = time.perf_counter()
|
||||
logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
|
||||
logger.error(" TokenIDs: " + str(ids1))
|
||||
logger.error(" Expected: " + str(ids2))
|
||||
# raise Exception()
|
||||
num_errors += 1
|
||||
if num_errors > 10:
|
||||
break
|
||||
|
||||
t_total = time.perf_counter() - t_start
|
||||
logger.info("%s: end, tok1: %.3f tok2: %.3f total: %.3f" % (generator.__name__, t_tokenizer1, t_tokenizer2, t_total))
|
||||
|
||||
|
||||
def main(argv: list[str] = None):
|
||||
@@ -307,7 +354,8 @@ def main(argv: list[str] = None):
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
|
||||
logger.info(f"VOCABFILE: '{args.vocab_file}'")
|
||||
|
||||
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
|
||||
@@ -321,18 +369,22 @@ def main(argv: list[str] = None):
|
||||
ids = func_tokenize2("a")
|
||||
assert 1 <= len(ids) <= 3
|
||||
add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
|
||||
add_eos_token = len(ids) > 1 and tokenizer.eos_token_id == ids[-1]
|
||||
tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
|
||||
tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", add_eos_token)
|
||||
|
||||
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
|
||||
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
|
||||
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_unicodes())
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_unicodes(10_000))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||
compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
|
||||
|
||||
model.free()
|
||||
|
||||
@@ -340,20 +392,40 @@ def main(argv: list[str] = None):
|
||||
if __name__ == "__main__":
|
||||
# main()
|
||||
|
||||
logging.basicConfig(
|
||||
level = logging.DEBUG,
|
||||
format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
|
||||
datefmt = "%Y-%m-%d %H:%M:%S",
|
||||
filename = logger.name + ".log",
|
||||
filemode = "a"
|
||||
)
|
||||
|
||||
path_tokenizers = "./models/tokenizers/"
|
||||
path_vocab_format = "./models/ggml-vocab-%s.gguf"
|
||||
|
||||
# import os
|
||||
# tokenizers = os.listdir(path_tokenizers)
|
||||
tokenizers = [
|
||||
"llama-spm", # SPM
|
||||
"phi-3", # SPM
|
||||
"jina-v2-en", # WPM
|
||||
"bert-bge", # WPM
|
||||
# "llama-spm", # SPM
|
||||
# "phi-3", # SPM
|
||||
# "bert-bge", # WPM
|
||||
# "jina-v2-en", # WPM
|
||||
"gpt-2", # BPE
|
||||
"llama-bpe", # BPE
|
||||
"falcon", # BPE
|
||||
"starcoder", # BPE
|
||||
"jina-v2-es", # BPE
|
||||
"jina-v2-de", # BPE
|
||||
"jina-v2-code", # BPE
|
||||
"smaug-bpe", # BPE
|
||||
"phi-2", # BPE
|
||||
"deepseek-coder", # BPE
|
||||
"deepseek-llm", # BPE
|
||||
]
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa
|
||||
logger.info("=" * 50)
|
||||
logger.info(f"TOKENIZER: '{tokenizer}'")
|
||||
vocab_file = path_vocab_format % tokenizer
|
||||
dir_tokenizer = path_tokenizers + "/" + tokenizer
|
||||
main([vocab_file, dir_tokenizer, "--verbose"])
|
||||
|
||||
1652
unicode-data.cpp
1652
unicode-data.cpp
File diff suppressed because it is too large
Load Diff
29
unicode.cpp
29
unicode.cpp
@@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
@@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
@@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
@@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
@@ -679,10 +681,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
continue;
|
||||
}
|
||||
|
||||
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
|
||||
const auto flags = unicode_cpt_flags(cpts[i]);
|
||||
|
||||
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
|
||||
if (flags.is_whitespace) {
|
||||
//NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
|
||||
//text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
|
||||
text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
|
||||
} else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
|
||||
} else {
|
||||
text_collapsed[i] = (char) 0xD0; // fallback
|
||||
}
|
||||
@@ -766,9 +772,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
|
||||
} else {
|
||||
// no unicode category used, we can use std::wregex directly
|
||||
const std::wstring wtext = unicode_wstring_from_utf8(text);
|
||||
const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
|
||||
|
||||
// std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
|
||||
std::wstring wtext(cpts.begin(), cpts.end());
|
||||
for (size_t i = 0; i < wtext.size(); ++i) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
|
||||
wtext[i] = 0x0B;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("text: %s\n", text.c_str());
|
||||
//printf("regex_expr: %s\n", regex_expr.c_str());
|
||||
bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
|
||||
|
||||
Reference in New Issue
Block a user