mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-15 21:44:05 +00:00
Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0374a67e2 | ||
|
|
ddef99522d | ||
|
|
6681688146 | ||
|
|
bac8bed248 | ||
|
|
b81510a7b7 | ||
|
|
ef797db357 | ||
|
|
67d1ef23c6 | ||
|
|
7b50f7c025 | ||
|
|
c79184d2d1 | ||
|
|
499a8f5a78 | ||
|
|
28657a8229 | ||
|
|
bee28421be | ||
|
|
2b72bedec1 | ||
|
|
c8c4495b8d | ||
|
|
7b63a71a6b | ||
|
|
0c2ee38ab7 | ||
|
|
a70c8a0c4b | ||
|
|
9067487c44 | ||
|
|
d4cdd9c1c3 | ||
|
|
55c2646b45 | ||
|
|
e75ba4c043 | ||
|
|
5d46babdc2 | ||
|
|
e17991c466 | ||
|
|
c46944aa25 | ||
|
|
f3ed38d793 | ||
|
|
55a1c5a5fd | ||
|
|
12a81af45f | ||
|
|
8875523eb3 | ||
|
|
ec68e84c32 | ||
|
|
307e79d33d | ||
|
|
d7f5f4e578 | ||
|
|
c8a4e470f6 | ||
|
|
603e43dc91 | ||
|
|
611ba4b264 | ||
|
|
85841e121d | ||
|
|
68b3cd6514 | ||
|
|
de56944147 | ||
|
|
1b2aaf28ac | ||
|
|
343b6e94b6 | ||
|
|
6a746cf9c4 | ||
|
|
eff5e45443 |
@@ -40,7 +40,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
2
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -42,7 +42,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
11
.github/labeler.yml
vendored
11
.github/labeler.yml
vendored
@@ -1,10 +1,4 @@
|
||||
# https://github.com/actions/labeler
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-kompute.h
|
||||
- ggml/src/ggml-kompute/**
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -93,3 +87,8 @@ Ascend NPU:
|
||||
- ggml/include/ggml-cann.h
|
||||
- ggml/src/ggml-cann/**
|
||||
- docs/backend/CANN.md
|
||||
OpenCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-opencl.h
|
||||
- ggml/src/ggml-opencl/**
|
||||
|
||||
14
.github/workflows/build.yml
vendored
14
.github/workflows/build.yml
vendored
@@ -84,7 +84,8 @@ jobs:
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
@@ -739,9 +740,6 @@ jobs:
|
||||
- build: 'llvm-arm64-opencl-adreno'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
# - build: 'kompute-x64'
|
||||
# arch: 'x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -755,12 +753,6 @@ jobs:
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute-x64' }}
|
||||
run: |
|
||||
git submodule update --init ggml/src/ggml-kompute/kompute
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
@@ -776,7 +768,7 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
|
||||
10
.github/workflows/release.yml
vendored
10
.github/workflows/release.yml
vendored
@@ -49,7 +49,8 @@ jobs:
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
@@ -103,7 +104,8 @@ jobs:
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
@@ -160,6 +162,8 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
@@ -211,6 +215,8 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +0,0 @@
|
||||
[submodule "kompute"]
|
||||
path = ggml/src/ggml-kompute/kompute
|
||||
url = https://github.com/nomic-ai/kompute.git
|
||||
|
||||
@@ -120,7 +120,6 @@ endfunction()
|
||||
|
||||
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
||||
|
||||
@@ -4408,9 +4408,6 @@ class Gemma3NModel(Gemma3Model):
|
||||
]
|
||||
|
||||
def set_vocab(self):
|
||||
with open(self.dir_model / "chat_template.jinja") as f:
|
||||
# quick hack to make sure chat template is added
|
||||
self.gguf_writer.add_chat_template(f.read())
|
||||
super().set_vocab()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
@@ -4781,6 +4778,14 @@ class ARwkv7Model(Rwkv7Model):
|
||||
class MambaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MAMBA
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
# Avoid using AutoConfig for hparams
|
||||
hparams = kwargs.pop("hparams", None)
|
||||
if hparams is None:
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
||||
|
||||
def set_vocab(self):
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
# Round vocab size to next multiple of 8
|
||||
@@ -4855,6 +4860,100 @@ class MambaModel(TextModel):
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Mamba2ForCausalLM")
|
||||
class Mamba2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MAMBA2
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
# Avoid using AutoConfig for hparams
|
||||
# It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
|
||||
hparams = kwargs.pop("hparams", None)
|
||||
if hparams is None:
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
||||
|
||||
def set_vocab(self):
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
# Round vocab size to next multiple of 16
|
||||
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
|
||||
# pad using ceiling division
|
||||
# ref: https://stackoverflow.com/a/17511341/22827863
|
||||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||
self.hparams["vocab_size"] = vocab_size
|
||||
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
self._set_vocab_sentencepiece()
|
||||
elif (self.dir_model / "tokenizer.model.v3").is_file():
|
||||
# mamba-codestral
|
||||
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
|
||||
elif (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
else:
|
||||
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
||||
self._set_vocab_builtin("gpt-neox", vocab_size)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
|
||||
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
|
||||
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
|
||||
head_dim = self.find_hparam(["head_dim"], optional=True) or 64
|
||||
n_group = self.find_hparam(["n_groups"], optional=True) or 1
|
||||
|
||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
||||
|
||||
# Fail early for models which don't have a block expansion factor of 2
|
||||
# TODO: does this really matter?
|
||||
assert d_inner == 2 * d_model
|
||||
assert d_inner % head_dim == 0
|
||||
|
||||
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||
self.gguf_writer.add_ssm_state_size(d_state)
|
||||
self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
|
||||
self.gguf_writer.add_ssm_group_count(n_group)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
if name.startswith("model.backbone") or name.startswith("model.lm_head"):
|
||||
# map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
|
||||
name = name.removeprefix("model.")
|
||||
|
||||
if name.endswith(".dt_bias"):
|
||||
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
||||
data_torch = data_torch.squeeze()
|
||||
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
|
||||
gguf.MODEL_TENSOR.SSM_A,
|
||||
gguf.MODEL_TENSOR.SSM_D,
|
||||
]):
|
||||
# unsqueeze A to use similar shape semantics as Mamba-1
|
||||
# (D is also unsqueezed, but for more straightforward broadcast internally)
|
||||
data_torch = data_torch.reshape((*data_torch.shape, 1))
|
||||
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
|
||||
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
|
||||
n_group = self.hparams.get("n_groups", 1)
|
||||
data_torch = data_torch.reshape((n_group, d_inner // n_group))
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("CohereForCausalLM")
|
||||
class CommandR2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
||||
@@ -6615,12 +6714,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
|
||||
# maybe we should fallback to text model's arch in that case, since not many models have both
|
||||
text_config = hparams.get("text_config", {})
|
||||
vision_config = hparams.get("vision_config", {})
|
||||
arch = hparams["architectures"][0]
|
||||
arch = None
|
||||
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
|
||||
arch = arches[0]
|
||||
elif "ssm_cfg" in hparams:
|
||||
# For non-hf Mamba and Mamba2 models
|
||||
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
|
||||
|
||||
# if "architectures" is found in the sub-config, use that instead
|
||||
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
|
||||
arch = text_config["architectures"][0]
|
||||
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
|
||||
arch = vision_config["architectures"][0]
|
||||
if arch is None:
|
||||
raise ValueError("Failed to detect model architecture")
|
||||
return arch
|
||||
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ Additionally, there the following images, similar to the above:
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
|
||||
|
||||
|
||||
@@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (tokens.empty()) {
|
||||
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
|
||||
@@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
|
||||
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
int ret = llama_decode(ctx, batch);
|
||||
if (ret != 0) {
|
||||
GGML_ABORT("failed to decode, ret = %d\n", ret);
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
|
||||
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
|
||||
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
||||
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/ggml-cann.h
|
||||
include/ggml-cpp.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-opt.h
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
@@ -360,6 +358,13 @@ write_basic_package_version_file(
|
||||
VERSION ${GGML_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
target_compile_definitions(ggml-base PRIVATE
|
||||
GGML_VERSION="${GGML_INSTALL_VERSION}"
|
||||
GGML_COMMIT="${GGML_BUILD_COMMIT}"
|
||||
)
|
||||
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
|
||||
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_KOMPUTE_MAX_DEVICES 16
|
||||
|
||||
struct ggml_vk_device {
|
||||
int index;
|
||||
int type; // same as VkPhysicalDeviceType
|
||||
size_t heapSize;
|
||||
const char * name;
|
||||
const char * vendor;
|
||||
int subgroupSize;
|
||||
uint64_t bufferAlignment;
|
||||
uint64_t maxAlloc;
|
||||
};
|
||||
|
||||
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
|
||||
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
|
||||
bool ggml_vk_has_vulkan(void);
|
||||
bool ggml_vk_has_device(void);
|
||||
struct ggml_vk_device ggml_vk_current_device(void);
|
||||
|
||||
//
|
||||
// backend API
|
||||
//
|
||||
|
||||
// forward declaration
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -314,6 +314,13 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Function type used in fatal error callbacks
|
||||
typedef void (*ggml_abort_callback_t)(const char * error_message);
|
||||
|
||||
// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
|
||||
// Returns the old callback for chaining
|
||||
GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
|
||||
|
||||
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
|
||||
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
|
||||
|
||||
@@ -550,6 +557,8 @@ extern "C" {
|
||||
GGML_GLU_OP_REGLU,
|
||||
GGML_GLU_OP_GEGLU,
|
||||
GGML_GLU_OP_SWIGLU,
|
||||
GGML_GLU_OP_GEGLU_ERF,
|
||||
GGML_GLU_OP_GEGLU_QUICK,
|
||||
|
||||
GGML_GLU_OP_COUNT,
|
||||
};
|
||||
@@ -639,6 +648,9 @@ extern "C" {
|
||||
|
||||
// misc
|
||||
|
||||
GGML_API const char * ggml_version(void);
|
||||
GGML_API const char * ggml_commit(void);
|
||||
|
||||
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
||||
GGML_API int64_t ggml_time_ms(void);
|
||||
GGML_API int64_t ggml_time_us(void);
|
||||
@@ -1137,6 +1149,22 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// A: n columns, r rows,
|
||||
// B: n columns, r rows,
|
||||
GGML_API struct ggml_tensor * ggml_glu_split(
|
||||
@@ -1160,6 +1188,16 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// normalize along rows
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1503,8 +1541,14 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// a [ne0, ne01, ne02, ne03]
|
||||
// mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
|
||||
//
|
||||
// broadcast:
|
||||
// ne02 % ne12 == 0
|
||||
// ne03 % ne13 == 0
|
||||
//
|
||||
// fused soft_max(a*scale + mask*(ALiBi slope))
|
||||
// mask is optional
|
||||
// max_bias = 0.0f for no ALiBi
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1967,11 +2011,17 @@ extern "C" {
|
||||
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd_k, n_batch, n_head, 1]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
|
||||
// q: [n_embd_k, n_batch, n_head, ne3 ]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
||||
//
|
||||
// broadcast:
|
||||
// n_head % n_head_kv == 0
|
||||
// n_head % ne32 == 0
|
||||
// ne3 % ne33 == 0
|
||||
//
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
@@ -2010,7 +2060,8 @@ extern "C" {
|
||||
struct ggml_tensor * dt,
|
||||
struct ggml_tensor * A,
|
||||
struct ggml_tensor * B,
|
||||
struct ggml_tensor * C);
|
||||
struct ggml_tensor * C,
|
||||
struct ggml_tensor * ids);
|
||||
|
||||
// partition into non-overlapping windows with padding if needed
|
||||
// example:
|
||||
|
||||
@@ -365,7 +365,6 @@ ggml_add_backend(BLAS)
|
||||
ggml_add_backend(CANN)
|
||||
ggml_add_backend(CUDA)
|
||||
ggml_add_backend(HIP)
|
||||
ggml_add_backend(Kompute)
|
||||
ggml_add_backend(METAL)
|
||||
ggml_add_backend(MUSA)
|
||||
ggml_add_backend(RPC)
|
||||
|
||||
@@ -61,10 +61,6 @@
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
@@ -189,9 +185,6 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
@@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
ggml_backend_load_best("cann", silent, dir_path);
|
||||
ggml_backend_load_best("cuda", silent, dir_path);
|
||||
ggml_backend_load_best("hip", silent, dir_path);
|
||||
ggml_backend_load_best("kompute", silent, dir_path);
|
||||
ggml_backend_load_best("metal", silent, dir_path);
|
||||
ggml_backend_load_best("rpc", silent, dir_path);
|
||||
ggml_backend_load_best("sycl", silent, dir_path);
|
||||
|
||||
@@ -65,8 +65,9 @@
|
||||
#include <aclnnop/aclnn_eq_tensor.h>
|
||||
#include <aclnnop/aclnn_gt_scalar.h>
|
||||
#include <aclnnop/aclnn_pow.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v2.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <aclnnop/aclnn_zero.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
||||
nb[i] = nb[i - 1] * ne[i - 1];
|
||||
}
|
||||
|
||||
ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
|
||||
aclTensor* zero =
|
||||
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
|
||||
return zero;
|
||||
GGML_UNUSED(n_bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -2654,6 +2656,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
||||
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
|
||||
}
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
ggml_tensor src0_row = *src0;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
src0_row.type = GGML_TYPE_F32;
|
||||
}
|
||||
|
||||
// src0_row [D, M, 1, 1] weight without permute
|
||||
src0_row.ne[2] = 1;
|
||||
src0_row.ne[3] = 1;
|
||||
src0_row.nb[0] = ori_src0_nb[0];
|
||||
src0_row.nb[1] = ori_src0_nb[1];
|
||||
src0_row.nb[2] = ori_src0_nb[1];
|
||||
src0_row.nb[3] = ori_src0_nb[1];
|
||||
|
||||
// src1_row [D, 1, 1, 1] -> input
|
||||
src1_row.ne[1] = 1;
|
||||
src1_row.ne[2] = 1;
|
||||
src1_row.ne[3] = 1;
|
||||
src1_row.nb[2] = nb11;
|
||||
src1_row.nb[3] = nb11;
|
||||
|
||||
// dst_row [M, 1, 1, 1] -> out
|
||||
dst_row.ne[1] = 1;
|
||||
dst_row.ne[2] = 1;
|
||||
dst_row.ne[3] = 1;
|
||||
dst_row.nb[2] = nb1;
|
||||
dst_row.nb[3] = nb1;
|
||||
|
||||
//create weight for one row
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
// expert index
|
||||
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
||||
int64_t i11 = (ne11 == 1 ? 0 : id);
|
||||
int64_t i12 = iid1;
|
||||
|
||||
int64_t i1 = id;
|
||||
int64_t i2 = i12;
|
||||
|
||||
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
||||
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
||||
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
src0_row.data = src0_tmp_ptr;
|
||||
src1_row.data = src1_tmp_ptr;
|
||||
dst_row.data = dst_tmp_ptr;
|
||||
dst_row.src[0] = &src0_row;
|
||||
dst_row.src[1] = &src1_row;
|
||||
|
||||
ggml_cann_mul_mat(ctx, &dst_row);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
|
||||
std::vector<aclTensor*> src0_tensor_vec;
|
||||
std::vector<aclTensor*> src1_tensor_vec;
|
||||
std::vector<aclTensor*> dst_tensor_vec;
|
||||
@@ -2701,9 +2764,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
||||
}
|
||||
|
||||
size_t GROUP_SIZE = 128;
|
||||
// GroupedMatmulV2 required tensor_list.size < 128
|
||||
// GroupedMatmulV3 required tensor_list.size < 128
|
||||
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
||||
// split and call GroupedMatmulV2
|
||||
// split and call GroupedMatmulV3
|
||||
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
||||
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
||||
@@ -2713,7 +2776,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
||||
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
||||
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
||||
|
||||
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|
||||
|
||||
@@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
@@ -2187,7 +2193,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
@@ -2205,6 +2210,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
|
||||
case GGML_OP_FLASH_ATTN_EXT:{
|
||||
// derived from [ggml-cuda.cu]
|
||||
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
||||
@@ -2227,6 +2236,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2172,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
|
||||
@@ -3614,6 +3614,292 @@ static void ggml_compute_forward_swiglu(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_geglu_erf
|
||||
|
||||
static void ggml_compute_forward_geglu_erf_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
float * src0_p = (float *) (src0_d + i1*src0_o);
|
||||
float * src1_p = (float *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_erf_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
GGML_UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_erf_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
|
||||
ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float v = GGML_FP16_TO_FP32(x);
|
||||
GGML_UNUSED(v);
|
||||
assert(!isnan(v));
|
||||
assert(!isinf(v));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_erf(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf_f16(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_geglu_quick
|
||||
|
||||
static void ggml_compute_forward_geglu_quick_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
float * src0_p = (float *) (src0_d + i1*src0_o);
|
||||
float * src1_p = (float *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_quick_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
GGML_UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_quick_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
|
||||
ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float v = GGML_FP16_TO_FP32(x);
|
||||
GGML_UNUSED(v);
|
||||
assert(!isnan(v));
|
||||
assert(!isinf(v));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_quick(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick_f16(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_norm
|
||||
|
||||
static void ggml_compute_forward_norm_f32(
|
||||
@@ -5232,14 +5518,17 @@ static void ggml_compute_forward_soft_max_f32(
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
// TODO: handle transposed/permuted matrices
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
//const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
||||
const int64_t nb11 = src1 ? src1->nb[1] : 1;
|
||||
const int64_t nb12 = src1 ? src1->nb[2] : 1;
|
||||
const int64_t nb13 = src1 ? src1->nb[3] : 1;
|
||||
|
||||
const int64_t ne12 = src1 ? src1->ne[2] : 1;
|
||||
const int64_t ne13 = src1 ? src1->ne[3] : 1;
|
||||
|
||||
// TODO: is this supposed to be ceil instead of floor?
|
||||
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
|
||||
@@ -5249,68 +5538,66 @@ static void ggml_compute_forward_soft_max_f32(
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
||||
float * wp = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||
|
||||
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
// ALiBi
|
||||
const uint32_t h = (i1/ne01)%ne02; // head
|
||||
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
||||
const int64_t i11 = i01;
|
||||
const int64_t i12 = i02%ne12;
|
||||
const int64_t i13 = i03%ne13;
|
||||
|
||||
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
// ALiBi
|
||||
const uint32_t h = i02; // head
|
||||
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
|
||||
|
||||
// broadcast the mask across rows
|
||||
ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
|
||||
float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
|
||||
float * sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
float * dp = (float *)((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
ggml_vec_cpy_f32 (nc, wp, sp);
|
||||
ggml_vec_scale_f32(nc, wp, scale);
|
||||
if (mp_f32) {
|
||||
if (use_f16) {
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
|
||||
// broadcast the mask across rows
|
||||
ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
|
||||
float * mp_f32 = src1 ? (float *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
|
||||
|
||||
ggml_vec_cpy_f32 (ne00, wp, sp);
|
||||
ggml_vec_scale_f32(ne00, wp, scale);
|
||||
if (mp_f32) {
|
||||
if (use_f16) {
|
||||
for (int i = 0; i < ne00; ++i) {
|
||||
wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < ne00; ++i) {
|
||||
wp[i] += slope*mp_f32[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
wp[i] += slope*mp_f32[i];
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < ne00; ++i) {
|
||||
//printf("p[%d] = %f\n", i, p[i]);
|
||||
assert(!isnan(wp[i]));
|
||||
}
|
||||
#endif
|
||||
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(ne00, &max, wp);
|
||||
|
||||
ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max);
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(ne00, dp, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < ne00; ++i) {
|
||||
assert(!isnan(dp[i]));
|
||||
assert(!isinf(dp[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
//printf("p[%d] = %f\n", i, p[i]);
|
||||
assert(!isnan(wp[i]));
|
||||
}
|
||||
#endif
|
||||
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(nc, &max, wp);
|
||||
|
||||
ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(nc, dp, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
assert(!isnan(dp[i]));
|
||||
assert(!isinf(dp[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7766,7 +8053,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
ggml_type const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type;
|
||||
ggml_type const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type;
|
||||
ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits_cpu(k_vec_dot_type)->from_float;
|
||||
ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu(k->type)->vec_dot;
|
||||
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
||||
@@ -7798,7 +8085,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
memset(VKQ32, 0, DV*sizeof(float));
|
||||
}
|
||||
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL;
|
||||
|
||||
// k indices
|
||||
const int ik3 = iq3 / rk3;
|
||||
@@ -8336,120 +8623,210 @@ void ggml_compute_forward_ssm_conv(
|
||||
static void ggml_compute_forward_ssm_scan_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // s
|
||||
const ggml_tensor * src1 = dst->src[1]; // x
|
||||
const ggml_tensor * src2 = dst->src[2]; // dt
|
||||
const ggml_tensor * src3 = dst->src[3]; // A
|
||||
const ggml_tensor * src4 = dst->src[4]; // B
|
||||
const ggml_tensor * src5 = dst->src[5]; // C
|
||||
const ggml_tensor * src0 = dst->src[0]; // s {d_state, dim, n_head, n_seqs+}
|
||||
const ggml_tensor * src1 = dst->src[1]; // x {dim, n_head, n_seq_tokens, n_seqs}
|
||||
const ggml_tensor * src2 = dst->src[2]; // dt {n_head, n_seq_tokens, n_seqs}
|
||||
const ggml_tensor * src3 = dst->src[3]; // A {d_state, n_head} or {1, n_head}
|
||||
const ggml_tensor * src4 = dst->src[4]; // B {d_state, n_group, n_seq_tokens, n_seqs}
|
||||
const ggml_tensor * src5 = dst->src[5]; // C {d_state, n_group, n_seq_tokens, n_seqs}
|
||||
const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // dim
|
||||
const int64_t nh = src1->ne[1]; // n_head
|
||||
const int64_t ng = src4->ne[1];
|
||||
const int64_t nt = src1->ne[2]; // number of tokens per sequence
|
||||
const int64_t ns = src1->ne[3]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||
// can't use ggml_nbytes because src1 is not necessarily contiguous
|
||||
const int64_t s_off = ggml_nelements(src1) * ggml_element_size(src1);
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + nc*nr*nh*ns == ggml_nelements(dst));
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||
// required for the dot product between s and C
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
||||
// required for per-sequence offsets for states
|
||||
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||
GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
|
||||
GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
|
||||
// allows optimizing the modulo since n_group should be a power of 2
|
||||
GGML_ASSERT((ng & -ng) == ng);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
// heads per thread
|
||||
const int dh = (nh + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
const int ir = ir1 - ir0;
|
||||
// head range for this thread
|
||||
const int ih0 = dh*ith;
|
||||
const int ih1 = MIN(ih0 + dh, nh);
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
|
||||
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
||||
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
|
||||
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
|
||||
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
|
||||
const int32_t * ids = (const int32_t *) src6->data;
|
||||
|
||||
// use the output as the source for the next token-wise iterations
|
||||
if (i2 > 0) { s0 = s; }
|
||||
for (int i3 = 0; i3 < ns; ++i3) {
|
||||
const float * s0 = (const float *) ((const char *) src0->data + ids[i3]*(src0->nb[3])); // {d_state, dim, nh, ns}
|
||||
float * s = ( float *) (( char *) dst->data + i3*(src0->nb[3]) + s_off); // {d_state, dim, nh, ns}
|
||||
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
||||
float x_dt = x[i1] * dt_soft_plus;
|
||||
svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
|
||||
svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
|
||||
svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
|
||||
for (int i2 = 0; i2 < nt; ++i2) {
|
||||
const float * x = (const float *) ((const char *) src1->data + i2*(src1->nb[2]) + i3*(src1->nb[3])); // {dim, nh, nt, ns}
|
||||
const float * dt = (const float *) ((const char *) src2->data + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {nh, nt, ns}
|
||||
const float * A = (const float *) ((const char *) src3->data); // {d_state, nh} or {1, nh}
|
||||
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[2]) + i3*(src4->nb[3])); // {d_state, ng, nt, ns}
|
||||
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[2]) + i3*(src5->nb[3])); // {d_state, ng, nt, ns}
|
||||
float * y = ( float *) (( char *) dst->data + i2*(nh*nr*sizeof(float)) + i3*(nt*nh*nr*sizeof(float))); // {dim, nh, nt, ns}
|
||||
|
||||
for (int64_t k = 0; k < nc; k += svcntw()) {
|
||||
svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
|
||||
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]);
|
||||
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]);
|
||||
svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);
|
||||
if (src3->ne[0] == 1) {
|
||||
// Mamba-2 has a scalar decay factor per head; dA can be outside the state-wise loop
|
||||
|
||||
svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
|
||||
t1 = exp_ps_sve(svptrue_b32(), t1);
|
||||
svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
|
||||
// n_head
|
||||
for (int h = ih0; h < ih1; ++h) {
|
||||
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
|
||||
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
|
||||
const float dA = expf(dt_soft_plus * A[h]);
|
||||
|
||||
vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
|
||||
r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
|
||||
// dim
|
||||
for (int i1 = 0; i1 < nr; ++i1) {
|
||||
const int ii = i1 + h*nr;
|
||||
const float x_dt = x[ii] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int ggml_f32_epr = svcntw();
|
||||
const int ggml_f32_step = 1 * ggml_f32_epr;
|
||||
|
||||
GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
|
||||
}
|
||||
y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
|
||||
}
|
||||
}
|
||||
}
|
||||
const int np = (nc & ~(ggml_f32_step - 1));
|
||||
|
||||
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
|
||||
|
||||
GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA);
|
||||
GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt);
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
// TODO: maybe unroll more?
|
||||
for (int j = 0; j < 1; j++) {
|
||||
GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
|
||||
GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
|
||||
GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
|
||||
|
||||
t0 = GGML_F32_VEC_MUL(t0, adA);
|
||||
t1 = GGML_F32_VEC_MUL(t1, axdt);
|
||||
|
||||
t0 = GGML_F32_VEC_ADD(t0, t1);
|
||||
|
||||
sum = GGML_F32_VEC_FMA(sum, t0, t2);
|
||||
|
||||
GGML_F32_VEC_STORE(s + i + j*ggml_f32_epr + ii*nc, t0);
|
||||
}
|
||||
}
|
||||
|
||||
sumf = GGML_F32xt_REDUCE_ONE(sum);
|
||||
#else
|
||||
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
|
||||
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
||||
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
|
||||
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
|
||||
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
|
||||
const int np = (nc & ~(GGML_F32_STEP - 1));
|
||||
|
||||
// use the output as the source for the next token-wise iterations
|
||||
if (i2 > 0) { s0 = s; }
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
||||
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
||||
float x_dt = x[i1] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
// d_state
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
int i = i0 + i1*nc;
|
||||
// state = prev_state * dA + dB * x
|
||||
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||
// y = rowwise_dotprod(state, C)
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA);
|
||||
GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt);
|
||||
|
||||
GGML_F32_VEC ax[GGML_F32_ARR];
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
GGML_F32_VEC az[GGML_F32_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
||||
for (int j = 0; j < GGML_F32_ARR; j++) {
|
||||
ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
|
||||
ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
|
||||
az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
|
||||
|
||||
ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
|
||||
ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
|
||||
|
||||
ax[j] = GGML_F32_VEC_ADD(ax[j], ay[j]);
|
||||
|
||||
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], az[j]);
|
||||
|
||||
GGML_F32_VEC_STORE(s + i + j*GGML_F32_EPR + ii*nc, ax[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F32_VEC_REDUCE(sumf, sum);
|
||||
#endif
|
||||
#else
|
||||
const int np = 0;
|
||||
#endif
|
||||
// d_state
|
||||
for (int i0 = np; i0 < nc; ++i0) {
|
||||
const int i = i0 + ii*nc;
|
||||
const int ig = i0 + (h & (ng - 1))*nc;
|
||||
// state = prev_state * dA + dB * x
|
||||
const float state = (s0[i] * dA) + (B[ig] * x_dt);
|
||||
// y = rowwise_dotprod(state, C)
|
||||
sumf += state * C[ig];
|
||||
s[i] = state;
|
||||
}
|
||||
y[ii] = sumf;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Mamba-1 has an element-wise decay factor for the states
|
||||
|
||||
// n_head
|
||||
for (int h = ih0; h < ih1; ++h) {
|
||||
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
|
||||
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
|
||||
|
||||
// dim
|
||||
for (int i1 = 0; i1 < nr; ++i1) {
|
||||
const int ii = i1 + h*nr;
|
||||
const float x_dt = x[ii] * dt_soft_plus;
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
|
||||
svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
|
||||
svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
|
||||
|
||||
// d_state
|
||||
// TODO: what happens when (d_state % svcntw()) != 0?
|
||||
for (int64_t k = 0; k < nc; k += svcntw()) {
|
||||
svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
|
||||
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
|
||||
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
|
||||
svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
|
||||
|
||||
svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
|
||||
t1 = exp_ps_sve(svptrue_b32(), t1);
|
||||
svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
|
||||
|
||||
vs0 = GGML_F32_VEC_FMA(t2, vs0, t1);
|
||||
r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
|
||||
|
||||
GGML_F32_VEC_STORE(&s[ii*nc + k], vs0);
|
||||
}
|
||||
y[ii] = GGML_F32xt_REDUCE_ONE(r1_vector);
|
||||
#else
|
||||
float sumf = 0.0f;
|
||||
// NOTE: can't really use GGML_SIMD here because d_state is usually 16
|
||||
// and also because expf is used within the loop.
|
||||
// d_state
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
const int i = i0 + ii*nc;
|
||||
const int ig = i0 + (h & (ng - 1))*nc;
|
||||
// state = prev_state * dA + dB * x
|
||||
const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
|
||||
// y = rowwise_dotprod(state, C)
|
||||
sumf += state * C[ig];
|
||||
s[i] = state;
|
||||
}
|
||||
y[ii] = sumf;
|
||||
#endif
|
||||
}
|
||||
y[i1] = sumf;
|
||||
}
|
||||
}
|
||||
// use the output as the source when it's not the first token-wise iteration
|
||||
s0 = s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_ssm_scan(
|
||||
@@ -8688,6 +9065,14 @@ void ggml_compute_forward_glu(
|
||||
{
|
||||
ggml_compute_forward_swiglu(params, dst);
|
||||
} break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf(params, dst);
|
||||
} break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
|
||||
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
|
||||
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
|
||||
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
|
||||
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
|
||||
@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
|
||||
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
||||
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
|
||||
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
||||
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
|
||||
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
||||
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
|
||||
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
||||
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
|
||||
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
||||
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
|
||||
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
||||
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
|
||||
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
|
||||
}
|
||||
// leftovers
|
||||
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
|
||||
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
for (int i = np; i < np2; i += ggml_f32_epr) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np2 < n) {
|
||||
|
||||
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
||||
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
|
||||
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i, ay1);
|
||||
|
||||
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
|
||||
ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
||||
|
||||
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
|
||||
ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
|
||||
|
||||
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
|
||||
ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
|
||||
|
||||
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
|
||||
ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
|
||||
|
||||
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
|
||||
ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
|
||||
|
||||
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
|
||||
ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
|
||||
|
||||
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
|
||||
ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
|
||||
}
|
||||
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
||||
for (int i = np; i < np2; i += ggml_f32_epr) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
|
||||
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i, ay1);
|
||||
}
|
||||
@@ -959,6 +959,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = x[i];
|
||||
y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
|
||||
float gi = GGML_CPU_FP16_TO_FP32(g[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_GELU_QUICK_FP16
|
||||
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
||||
uint16_t t;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
||||
memcpy(&t, &fp16, sizeof(uint16_t));
|
||||
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
|
||||
}
|
||||
}
|
||||
#else
|
||||
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
||||
const uint16_t * i16 = (const uint16_t *) x;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float v = GGML_CPU_FP16_TO_FP32(g[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
|
||||
#ifndef GGML_USE_ACCELERATE
|
||||
ggml_float sum = 0.0;
|
||||
|
||||
@@ -175,6 +175,20 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
||||
do { \
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
|
||||
const int id = ggml_cuda_get_device(); \
|
||||
if (!shared_memory_limit_raised[id]) { \
|
||||
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
|
||||
shared_memory_limit_raised[id] = true; \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) do {} while (0)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
#else
|
||||
|
||||
@@ -123,13 +123,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
|
||||
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
} else {
|
||||
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
@@ -175,13 +169,7 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32<true>), smpbo);
|
||||
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
} else {
|
||||
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
|
||||
@@ -32,7 +32,9 @@ typedef void (* fattn_kernel_t)(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -851,7 +853,8 @@ void launch_fattn(
|
||||
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0,
|
||||
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
|
||||
@@ -1223,7 +1223,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -1288,7 +1290,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio));
|
||||
const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
|
||||
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
|
||||
(const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
|
||||
float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2);
|
||||
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
|
||||
@@ -1327,7 +1330,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio));
|
||||
const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
|
||||
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
|
||||
(const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
|
||||
float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2);
|
||||
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
|
||||
@@ -1348,8 +1352,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
@@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -64,7 +66,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
@@ -288,8 +290,8 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
@@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -58,8 +60,8 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
@@ -76,7 +78,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
|
||||
@@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -68,7 +70,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
K += nb12*(blockIdx.z / gqa_ratio);
|
||||
V += nb22*(blockIdx.z / gqa_ratio);
|
||||
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
|
||||
|
||||
const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
@@ -342,8 +344,8 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
|
||||
@@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -51,8 +53,8 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
@@ -79,7 +81,8 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
Q += nb02* blockIdx.z + nb01*ic0;
|
||||
K += nb12*(blockIdx.z / gqa_ratio);
|
||||
V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
|
||||
|
||||
|
||||
@@ -46,7 +46,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
@@ -94,11 +96,11 @@ static __global__ void flash_attn_ext_f16(
|
||||
constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
|
||||
const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0;
|
||||
const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2);
|
||||
const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
|
||||
const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
|
||||
const half2 * mask2 = (const half2 *) maskh;
|
||||
|
||||
const int stride_Q = nb01 / sizeof(float);
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
@@ -440,7 +442,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
|
||||
@@ -2314,6 +2314,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
ggml_cuda_op_swiglu(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
ggml_cuda_op_geglu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
ggml_cuda_op_geglu_quick(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3116,6 +3122,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -3321,12 +3329,26 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
case GGML_OP_SSM_SCAN: {
|
||||
if (op->src[3]->ne[0] == 1) {
|
||||
// Mamba2
|
||||
// (kernel only supports d_state == 128 && d_head % 16 == 0)
|
||||
return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0;
|
||||
} else {
|
||||
// Mamba
|
||||
// (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
|
||||
return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
|
||||
}
|
||||
}
|
||||
case GGML_OP_SSM_CONV: {
|
||||
// assumes d_inner % threads == 0
|
||||
return op->src[0]->ne[1] % 128 == 0;
|
||||
}
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX_BACK: {
|
||||
@@ -3375,6 +3397,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
|
||||
// the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3016,14 +3016,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc);
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, false>), nbytes_shared);
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, true>), nbytes_shared);
|
||||
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "ggml.h"
|
||||
#include "softmax.cuh"
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ float t2f32(T val) {
|
||||
@@ -13,6 +14,29 @@ __device__ float __forceinline__ t2f32<half>(half val) {
|
||||
return __half2float(val);
|
||||
}
|
||||
|
||||
struct soft_max_params {
|
||||
|
||||
int64_t nheads;
|
||||
uint32_t n_head_log2;
|
||||
int64_t ncols;
|
||||
int64_t nrows_x;
|
||||
int64_t nrows_y;
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
int64_t nb11;
|
||||
int64_t nb12;
|
||||
int64_t nb13;
|
||||
|
||||
int64_t ne12;
|
||||
int64_t ne13;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
};
|
||||
|
||||
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
|
||||
// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
|
||||
#ifdef __clang__
|
||||
@@ -21,16 +45,24 @@ __device__ float __forceinline__ t2f32<half>(half val) {
|
||||
#endif // __clang__
|
||||
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
||||
static __global__ void soft_max_f32(
|
||||
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
||||
const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
const float * x, const T * mask, float * dst, const soft_max_params p) {
|
||||
const int ncols = ncols_template == 0 ? p.ncols : ncols_template;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int rowx = blockIdx.x;
|
||||
const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
|
||||
|
||||
const int64_t i03 = blockIdx.z;
|
||||
const int64_t i02 = blockIdx.y;
|
||||
const int64_t i01 = blockIdx.x;
|
||||
|
||||
//TODO: noncontigous inputs/outputs
|
||||
const int rowx = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
|
||||
|
||||
const int64_t i11 = i01;
|
||||
const int64_t i12 = i02 % p.ne12;
|
||||
const int64_t i13 = i03 % p.ne13;
|
||||
|
||||
x += int64_t(rowx)*ncols;
|
||||
mask += int64_t(rowy)*ncols * (mask != nullptr);
|
||||
mask += (i11*p.nb11 + i12*p.nb12 + i13*p.nb13) / sizeof(T) * (mask != nullptr);
|
||||
dst += int64_t(rowx)*ncols;
|
||||
|
||||
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
||||
@@ -38,7 +70,7 @@ static __global__ void soft_max_f32(
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
|
||||
const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);
|
||||
|
||||
extern __shared__ float data_soft_max_f32[];
|
||||
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
||||
@@ -55,7 +87,7 @@ static __global__ void soft_max_f32(
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
|
||||
const float val = x[col]*p.scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = max(max_val, val);
|
||||
@@ -150,64 +182,58 @@ static __global__ void soft_max_back_f32(
|
||||
}
|
||||
}
|
||||
|
||||
template<int... Ns, typename T>
|
||||
static void launch_soft_max_kernels(const float * x, const T * mask, float * dst,
|
||||
const soft_max_params & p, cudaStream_t stream, dim3 block_dims, dim3 block_nums, size_t nbytes_shared)
|
||||
{
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
auto launch_kernel = [=](auto I) -> bool {
|
||||
constexpr int ncols = decltype(I)::value;
|
||||
constexpr int block = (ncols > 1024 ? 1024 : ncols);
|
||||
|
||||
if (p.ncols == ncols) {
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, ncols, block, T>), smpbo);
|
||||
soft_max_f32<true, ncols, block><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, p);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// unary fold over launch_kernel
|
||||
if ((launch_kernel(std::integral_constant<int, Ns>{}) || ...)) {
|
||||
return;
|
||||
}
|
||||
|
||||
//default case
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, 0, 0, T>), smpbo);
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, dst, p);
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
|
||||
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
const int64_t ncols_x = params.ncols;
|
||||
|
||||
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
const dim3 block_nums(nrows_x, 1, 1);
|
||||
const dim3 block_nums(params.ne01, params.ne02, params.ne03);
|
||||
const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
||||
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
const uint32_t n_head = nrows_x/nrows_y;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
}
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, dst, params, stream, block_dims, block_nums, nbytes_shared);
|
||||
} else {
|
||||
const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
|
||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, params);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,10 +261,11 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src0->ne[1];
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
@@ -247,10 +274,44 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
||||
|
||||
const int64_t nb11 = src1 ? src1->nb[1] : 1;
|
||||
const int64_t nb12 = src1 ? src1->nb[2] : 1;
|
||||
const int64_t nb13 = src1 ? src1->nb[3] : 1;
|
||||
|
||||
const int64_t ne12 = src1 ? src1->ne[2] : 1;
|
||||
const int64_t ne13 = src1 ? src1->ne[3] : 1;
|
||||
|
||||
const uint32_t n_head = src0->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
|
||||
soft_max_params params = {};
|
||||
params.nheads = src0->ne[2];
|
||||
params.n_head_log2 = n_head_log2;
|
||||
params.ncols = ne00;
|
||||
params.nrows_x = nrows_x;
|
||||
params.nrows_y = nrows_y;
|
||||
params.ne00 = src0->ne[0];
|
||||
params.ne01 = src0->ne[1];
|
||||
params.ne02 = src0->ne[2];
|
||||
params.ne03 = src0->ne[3];
|
||||
params.nb11 = nb11;
|
||||
params.nb12 = nb12;
|
||||
params.nb13 = nb13;
|
||||
params.ne12 = ne12;
|
||||
params.ne13 = ne13;
|
||||
params.scale = scale;
|
||||
params.max_bias = max_bias;
|
||||
params.m0 = m0;
|
||||
params.m1 = m1;
|
||||
|
||||
if (use_f16) {
|
||||
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, params, stream);
|
||||
} else {
|
||||
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, params, stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,16 +4,15 @@ template <size_t splitD, size_t N>
|
||||
__global__ void __launch_bounds__(splitD, 2)
|
||||
ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
|
||||
const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2,
|
||||
const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * __restrict__ dst, const int64_t L) {
|
||||
GGML_UNUSED(src1_nb0);
|
||||
GGML_UNUSED(src2_nb0);
|
||||
const int32_t * __restrict__ src6, float * __restrict__ dst,
|
||||
const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
|
||||
const int64_t s_off, const int64_t d_inner, const int64_t L) {
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int bidx = blockIdx.x; // split along B
|
||||
const int bidy = blockIdx.y; // split along D
|
||||
const int bidx = blockIdx.x; // split along B (sequences)
|
||||
const int bidy = blockIdx.y; // split along D (d_inner)
|
||||
const int tid = threadIdx.x;
|
||||
const int wid = tid / 32;
|
||||
const int wtid = tid % 32;
|
||||
@@ -24,23 +23,23 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
float * smem_A = smem;
|
||||
float * smem_s0 = smem_A + splitD * stride_sA;
|
||||
|
||||
const float * s0_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * s0_block = (const float *) ((const char *) src0 + src6[bidx] * src0_nb3 + bidy * splitD * src0_nb2);
|
||||
const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb3) + bidy * splitD * sizeof(float));
|
||||
const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1);
|
||||
const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb2));
|
||||
const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb2));
|
||||
float * y_block = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb3));
|
||||
const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb3));
|
||||
float * y_block = (float *) ((char *) dst + (bidx * d_inner * L * sizeof(float)) + bidy * splitD * sizeof(float));
|
||||
float * s_block = (float *) ((char *) dst + s_off + bidx * src0_nb3 + bidy * splitD * src0_nb2);
|
||||
|
||||
const int stride_s0 = src0_nb1 / sizeof(float);
|
||||
const int stride_x = src1_nb1 / sizeof(float);
|
||||
const int stride_s0 = src0_nb2 / sizeof(float);
|
||||
const int stride_x = src1_nb2 / sizeof(float);
|
||||
const int stride_dt = src2_nb1 / sizeof(float);
|
||||
const int stride_A = src3_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb1 / sizeof(float);
|
||||
const int stride_C = src5_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb2 / sizeof(float);
|
||||
const int stride_C = src5_nb2 / sizeof(float);
|
||||
const int stride_s = stride_s0;
|
||||
const int stride_y = stride_x;
|
||||
const int stride_y = d_inner;
|
||||
|
||||
// can N not be 16? for example 32?
|
||||
if (N == 16) {
|
||||
@@ -84,24 +83,156 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
}
|
||||
}
|
||||
|
||||
// assumes as many threads as d_state
|
||||
template <int splitH, int d_state>
|
||||
__global__ void __launch_bounds__(d_state, 1)
|
||||
ssm_scan_f32_group(
|
||||
const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
|
||||
const int32_t * __restrict__ src6, float * __restrict__ dst,
|
||||
const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
|
||||
const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) {
|
||||
|
||||
const int head_idx = (blockIdx.x * splitH) / d_head;
|
||||
const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float);
|
||||
const int seq_idx = blockIdx.y;
|
||||
|
||||
const int group_off = (head_idx & (n_group - 1)) * d_state * sizeof(float);
|
||||
|
||||
const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
const float * x_block = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float));
|
||||
const float * dt_block = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
|
||||
const float * A_block = (const float *) ((const char *) src3 + head_idx * src3_nb1);
|
||||
const float * B_block = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
|
||||
const float * C_block = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
|
||||
float * y_block = dst + (seq_idx * n_tok * n_head * d_head) + blockIdx.x * splitH;
|
||||
float * s_block = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
|
||||
// strides across n_seq_tokens
|
||||
const int stride_x = src1_nb2 / sizeof(float);
|
||||
const int stride_dt = src2_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb2 / sizeof(float);
|
||||
const int stride_C = src5_nb2 / sizeof(float);
|
||||
const int stride_y = n_head * d_head;
|
||||
|
||||
float state[splitH];
|
||||
// for the parallel accumulation
|
||||
__shared__ float stateC[splitH * d_state];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
state[j] = s0_block[j * d_state + threadIdx.x];
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < n_tok; i++) {
|
||||
// TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements
|
||||
// TODO: only calculate B and C once per head group
|
||||
// NOTE: dt_soft_plus, dA and x_dt have the same value across threads here.
|
||||
float dt_soft_plus = dt_block[i * stride_dt];
|
||||
if (dt_soft_plus <= 20.0f) {
|
||||
dt_soft_plus = log1pf(expf(dt_soft_plus));
|
||||
}
|
||||
const float dA = expf(dt_soft_plus * A_block[0]);
|
||||
const float B = B_block[i * stride_B + threadIdx.x];
|
||||
const float C = C_block[i * stride_C + threadIdx.x];
|
||||
|
||||
// across d_head
|
||||
#pragma unroll
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
const float x_dt = x_block[i * stride_x + j] * dt_soft_plus;
|
||||
|
||||
state[j] = (state[j] * dA) + (B * x_dt);
|
||||
|
||||
stateC[j * d_state + threadIdx.x] = state[j] * C;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// parallel accumulation for stateC
|
||||
// TODO: simplify
|
||||
{
|
||||
static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2");
|
||||
static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2");
|
||||
|
||||
// reduce until w matches the warp size
|
||||
// TODO: does this work even when the physical warp size is 64?
|
||||
#pragma unroll
|
||||
for (int w = d_state; w > WARP_SIZE; w >>= 1) {
|
||||
// (assuming there are d_state threads)
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) {
|
||||
// TODO: check for bank conflicts
|
||||
const int k = (threadIdx.x % (w >> 1)) + (d_state * (threadIdx.x / (w >> 1))) + j * d_state * (d_state / (w >> 1));
|
||||
stateC[k] += stateC[k + (w >> 1)];
|
||||
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
static_assert(splitH >= d_state / WARP_SIZE);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) {
|
||||
float y = stateC[(threadIdx.x % WARP_SIZE) + d_state * (threadIdx.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)];
|
||||
y = warp_reduce_sum(y);
|
||||
|
||||
// store the above accumulations
|
||||
if (threadIdx.x % WARP_SIZE == 0) {
|
||||
const int k = threadIdx.x / WARP_SIZE + j * (d_state / WARP_SIZE);
|
||||
y_block[i * stride_y + k] = y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write back the state
|
||||
#pragma unroll
|
||||
for (int j = 0; j < splitH; j++) {
|
||||
s_block[j * d_state + threadIdx.x] = state[j];
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3,
|
||||
const float * src4, const float * src5, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * dst, const int64_t N, const int64_t D, const int64_t L, const int64_t B,
|
||||
const float * src4, const float * src5, const int32_t * src6, float * dst,
|
||||
const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3, const int src2_nb1,
|
||||
const int src2_nb2, const int src3_nb1, const int src4_nb2, const int src4_nb3, const int src5_nb2,
|
||||
const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
|
||||
const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
|
||||
cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// todo: consider D cannot be divided,does this situation exist?
|
||||
GGML_ASSERT(D % threads == 0);
|
||||
const dim3 blocks(B, (D + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (N + 1) * 2) * sizeof(float);
|
||||
if (N == 16) {
|
||||
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0,
|
||||
src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, L);
|
||||
// NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
|
||||
if (src3_nb1 == sizeof(float)) {
|
||||
// Mamba-2
|
||||
if (d_state == 128) {
|
||||
GGML_ASSERT(d_state % threads == 0);
|
||||
// NOTE: can be any power of two between 4 and 64
|
||||
const int splitH = 16;
|
||||
GGML_ASSERT(head_dim % splitH == 0);
|
||||
const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
|
||||
ssm_scan_f32_group<16, 128><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
} else {
|
||||
GGML_ABORT("doesn't support d_state!=128.");
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("doesn't support N!=16.");
|
||||
// Mamba-1
|
||||
GGML_ASSERT(n_head % threads == 0);
|
||||
GGML_ASSERT(head_dim == 1);
|
||||
GGML_ASSERT(n_group == 1);
|
||||
const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
|
||||
if (d_state == 16) {
|
||||
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
} else {
|
||||
GGML_ABORT("doesn't support d_state!=16.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,30 +243,25 @@ void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src3 = dst->src[3]; // A
|
||||
const struct ggml_tensor * src4 = dst->src[4]; // B
|
||||
const struct ggml_tensor * src5 = dst->src[5]; // C
|
||||
|
||||
// const int64_t d_state = src0->ne[0];
|
||||
// const int64_t d_inner = src0->ne[1];
|
||||
// const int64_t l = src1->ne[1];
|
||||
// const int64_t b = src0->ne[2];
|
||||
const struct ggml_tensor * src6 = dst->src[6]; // ids
|
||||
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||
const int64_t nr = src0->ne[1]; // head_dim or 1
|
||||
const int64_t nh = src1->ne[1]; // n_head
|
||||
const int64_t ng = src4->ne[1]; // n_group
|
||||
const int64_t n_t = src1->ne[2]; // number of tokens per sequence
|
||||
const int64_t n_s = src1->ne[3]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||
const int64_t s_off = ggml_nelements(src1) * sizeof(float);
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + nc*nr*nh*n_s == ggml_nelements(dst));
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||
// required for the dot product between s and C
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
|
||||
// required for per-sequence offsets for states
|
||||
GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||
GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float));
|
||||
GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
@@ -143,13 +269,16 @@ void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const float * src3_d = (const float *) src3->data;
|
||||
const float * src4_d = (const float *) src4->data;
|
||||
const float * src5_d = (const float *) src5->data;
|
||||
const int32_t * src6_d = (const int32_t *) src6->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src6->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0],
|
||||
src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1],
|
||||
src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream);
|
||||
ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src6_d, dst_d,
|
||||
src0->nb[2], src0->nb[3], src1->nb[2], src1->nb[3], src2->nb[1], src2->nb[2],
|
||||
src3->nb[1], src4->nb[2], src4->nb[3], src5->nb[2], src5->nb[3],
|
||||
s_off, nc, nr, nh, ng, n_t, n_s, stream);
|
||||
}
|
||||
|
||||
@@ -285,6 +285,14 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_silu>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_gelu_erf>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_gelu_quick>(ctx, dst);
|
||||
}
|
||||
|
||||
/* silu_back */
|
||||
|
||||
static __device__ __forceinline__ float op_silu_back(float grad, float x) {
|
||||
|
||||
@@ -64,3 +64,7 @@ void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
|
||||
target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q4_k.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f32.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_norm_f16.comp
|
||||
kompute-shaders/op_rope_norm_f32.comp
|
||||
kompute-shaders/op_rope_neox_f16.comp
|
||||
kompute-shaders/op_rope_neox_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q4_k.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f32.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_norm_f16.h
|
||||
shaderop_rope_norm_f32.h
|
||||
shaderop_rope_neox_f16.h
|
||||
shaderop_rope_neox_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
File diff suppressed because it is too large
Load Diff
Submodule ggml/src/ggml-kompute/kompute deleted from 4565194ed7
@@ -1,112 +0,0 @@
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
|
||||
#extension GL_EXT_control_flow_attributes: enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_1 32
|
||||
|
||||
#define GELU_COEF_A 0.044715
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
|
||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
||||
|
||||
#define sizeof_block_q4_0 0x12
|
||||
struct block_q4_0 {
|
||||
float16_t d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.f * xb.d;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_1 0x14
|
||||
struct block_q4_1 {
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb.m;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_k 144
|
||||
struct block_q4_k {
|
||||
float16_t d;
|
||||
float16_t dmin;
|
||||
uint8_t scales[K_SCALE_SIZE];
|
||||
uint8_t qs[QK_K/2];
|
||||
};
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
float16_t d; // super-block scale
|
||||
};
|
||||
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||
const float16_t d_all = xb.d;
|
||||
|
||||
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||
const float16_t dl = float16_t(d_all * sc * coef);
|
||||
mat4 reg;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
|
||||
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
|
||||
#define QK8_0 32
|
||||
// struct block_q8_0 {
|
||||
// float16_t d; // delta
|
||||
// int8_t qs[QK8_0]; // quants
|
||||
// };
|
||||
#define sizeof_block_q8_0 34
|
||||
@@ -1,58 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
|
||||
} pcs;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
int offs = 0; // TMP (see above)
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
uint row;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint n_past;
|
||||
int ne00;
|
||||
int ne01;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i02 = gl_WorkGroupID.z;
|
||||
const uint i01 = gl_WorkGroupID.y;
|
||||
const uint i00 = gl_WorkGroupID.x;
|
||||
|
||||
const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
|
||||
|
||||
if (i00 > pcs.n_past + i01) {
|
||||
out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
|
||||
} else {
|
||||
out_[index + pcs.outOff] = in_[index + pcs.inOff];
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
int z = 0;
|
||||
for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
|
||||
const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
|
||||
const mat4 result = dequantize_block(inIndex, ind%NL);
|
||||
for (uint j = 0; j < 4; ++j) {
|
||||
for (uint k = 0; k < 4; ++k) {
|
||||
const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
|
||||
out_[outIndex] = result[j][k];
|
||||
++z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||
block_q4_0 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||
fres.qs[it] = inA[index+2+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||
return dequantize_q4_0(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,39 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
fres.m = u8BufToFloat16(inA, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = inA[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||
return dequantize_q4_1(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,44 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 16
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q6_k get_unaligned_block_q6_k(uint index) {
|
||||
block_q6_k fres;
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
|
||||
fres.ql[it] = inA[index + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
|
||||
fres.qh[it] = inA[index + QK_K/2 + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
|
||||
fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
|
||||
}
|
||||
fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q6_k block = get_unaligned_block_q6_k(index);
|
||||
return dequantize_q6_k(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
||||
@@ -1,69 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint rb = gl_WorkGroupID.y*N_F16_F32;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
|
||||
|
||||
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
|
||||
|
||||
for (uint row = 0; row < N_F16_F32; ++row) {
|
||||
uint r1 = rb + row;
|
||||
if (r1 >= pcs.ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// device subgroup size
|
||||
layout (local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
}
|
||||
pcs;
|
||||
|
||||
|
||||
void main() {
|
||||
uvec3 gid = gl_WorkGroupID;
|
||||
|
||||
uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
|
||||
uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
|
||||
|
||||
const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
|
||||
const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
float sum = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sum += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sum);
|
||||
if (subgroupElect()) {
|
||||
out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
@@ -1,35 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
@@ -1,140 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define N_DST 4
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_k
|
||||
|
||||
layout(local_size_x = 4) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint16_t kmask1 = uint16_t(0x3f3f);
|
||||
const uint16_t kmask2 = uint16_t(0x0f0f);
|
||||
const uint16_t kmask3 = uint16_t(0xc0c0);
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/8; // 0...3
|
||||
const uint it = gl_SubgroupInvocationID%8; // 0...7
|
||||
const uint iq = it/4; // 0 or 1
|
||||
const uint ir = it%4; // 0...3
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = r0 * N_DST;
|
||||
const uint ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
|
||||
|
||||
const uint xblk = offset0 + pcs.inAOff;
|
||||
const uint y = (offset1 / 4) + pcs.inBOff;
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
|
||||
float all_sum = 0.f;
|
||||
|
||||
uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 4) {
|
||||
const uint blk_idx = ib + xblk;
|
||||
|
||||
float sumy[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
|
||||
yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
|
||||
yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
|
||||
yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
|
||||
|
||||
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
|
||||
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
|
||||
uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
|
||||
uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
|
||||
uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
|
||||
|
||||
uint16_t sc16[4];
|
||||
sc16[0] = sc_0 & kmask1;
|
||||
sc16[1] = sc_2 & kmask1;
|
||||
sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
|
||||
sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
|
||||
|
||||
float acc1[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
float acc2[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
|
||||
uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
|
||||
acc1[0] += yl[i+0] * (q1 & 0x000F);
|
||||
acc1[1] += yl[i+1] * (q1 & 0x0F00);
|
||||
acc1[2] += yl[i+8] * (q1 & 0x00F0);
|
||||
acc1[3] += yl[i+9] * (q1 & 0xF000);
|
||||
acc2[0] += yh[i+0] * (q2 & 0x000F);
|
||||
acc2[1] += yh[i+1] * (q2 & 0x0F00);
|
||||
acc2[2] += yh[i+8] * (q2 & 0x00F0);
|
||||
acc2[3] += yh[i+9] * (q2 & 0xF000);
|
||||
}
|
||||
|
||||
uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
|
||||
uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
|
||||
uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
|
||||
uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
|
||||
uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
|
||||
uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
|
||||
uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
|
||||
uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
|
||||
|
||||
float dall = float(inA[blk_idx + row_idx].d);
|
||||
float dmin = float(inA[blk_idx + row_idx].dmin);
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
|
||||
(acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
|
||||
(acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
|
||||
dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
|
||||
}
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,106 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y_id = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint8_t kmask1 = uint8_t(0x03);
|
||||
const uint8_t kmask2 = uint8_t(0x0C);
|
||||
const uint8_t kmask3 = uint8_t(0x30);
|
||||
const uint8_t kmask4 = uint8_t(0xC0);
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// bits of invocation ID for gl_SubgroupSize=32:
|
||||
// x x x x x
|
||||
// 4 3 2 1 0
|
||||
// ( tid ) ix
|
||||
// ip ( il )
|
||||
|
||||
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
|
||||
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
|
||||
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
|
||||
const uint ip = tid/8; // first or second half of block (0 or 1)
|
||||
const uint il = tid%8; // each half has 8 parts, one per scale
|
||||
const uint n = 4; // 4 scales at a time (and 4 sums)
|
||||
const uint l0 = n*il; // offset into half-block, 0..28
|
||||
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
const uint y_offset = 128*ip + l0;
|
||||
const uint q_offset_l = 64*ip + l0;
|
||||
const uint q_offset_h = 32*ip + l0;
|
||||
|
||||
for (uint i = ix; i < nb; i += block_stride) {
|
||||
|
||||
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
|
||||
const uint qlIndex = q_offset_l;
|
||||
const uint q2Index = qlIndex + QK_K/8;
|
||||
const uint qhIndex = q_offset_h;
|
||||
const uint y = yy + i * QK_K + y_offset;
|
||||
|
||||
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
for (uint l = 0; l < n; ++l) {
|
||||
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
|
||||
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
|
||||
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
|
||||
|
||||
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
|
||||
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
|
||||
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
|
||||
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
|
||||
}
|
||||
|
||||
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
|
||||
}
|
||||
|
||||
const float tot = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
@@ -1,73 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
#define SIZE_OF_D 2
|
||||
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const int nr = N_DST;
|
||||
const int nsg = N_SIMDGROUP;
|
||||
const int nw = N_SIMDWIDTH;
|
||||
|
||||
const int nb = pcs.ne00/QK8_0;
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
|
||||
|
||||
float yl[NB_Q8_0];
|
||||
float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID.x/4;
|
||||
const uint il = gl_SubgroupInvocationID.x%4;
|
||||
|
||||
uint yb = y + ix * QK8_0 + NB_Q8_0*il;
|
||||
|
||||
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
||||
for (uint ib = ix; ib < nb; ib += nw/4) {
|
||||
for (int i = 0; i < NB_Q8_0; ++i) {
|
||||
yl[i] = inB[yb + i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
||||
const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
|
||||
sumq += qs_iq * yl[iq];
|
||||
}
|
||||
const float16_t d = u8BufToFloat16(inA, x + block_offset);
|
||||
sumf[row] += sumq*d;
|
||||
}
|
||||
|
||||
yb += NB_Q8_0 * nw;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect() && first_row + row < pcs.ne01) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
// pointers to src0 rows
|
||||
uint ax[N_ROWS];
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
|
||||
ax[row] = offset0 + pcs.inAOff;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/2;
|
||||
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
|
||||
|
||||
uint yb = y + ix * BLOCKS_IN_QUANT + il;
|
||||
|
||||
//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
|
||||
// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
|
||||
// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 16) {
|
||||
for (int row = 0; row < N_ROWS; row++) {
|
||||
sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
|
||||
}
|
||||
|
||||
yb += BLOCKS_IN_QUANT * 16;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (first_row + row < pcs.ne01 && subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne10;
|
||||
int ne12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
@@ -1,84 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
// MEAN
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float mean = sum[0];
|
||||
|
||||
// recenter
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] - mean;
|
||||
}
|
||||
|
||||
// VARIANCE
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float variance = sum[0];
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] *= scale;
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
|
||||
}
|
||||
}
|
||||
@@ -1,53 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 512) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
|
||||
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
||||
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] * scale;
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+pcs.n_dims/2]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+pcs.n_dims/2];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+1]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+1];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = y / (1.0 + exp(-y));
|
||||
}
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
|
||||
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint n_head_log2;
|
||||
int mask;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
|
||||
const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
|
||||
const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
|
||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (pcs.max_bias > 0.0f) {
|
||||
int64_t h = i02;
|
||||
|
||||
float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
|
||||
int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, float(exp));
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float localMax = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
|
||||
}
|
||||
float max_ = subgroupMax(localMax);
|
||||
|
||||
// parallel sum
|
||||
float localSum = 0.0f;
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
|
||||
localSum += exp_psrc0;
|
||||
out_[pdst + i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = subgroupAdd(localSum);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
out_[pdst + i00] /= sum;
|
||||
}
|
||||
}
|
||||
@@ -1,71 +0,0 @@
|
||||
#include "common.comp"
|
||||
|
||||
#define GGML_ROPE_TYPE_NEOX 2
|
||||
|
||||
// TODO: use a local size of 32 or more (Metal uses 1024)
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint inCOff;
|
||||
uint outOff;
|
||||
int n_dims;
|
||||
int mode;
|
||||
int n_ctx_orig;
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
bool has_freq_factors;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const float i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
|
||||
out float cos_theta, out float sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
cos_theta = cos(theta) * mscale;
|
||||
sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
||||
return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
void rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
|
||||
}
|
||||
@@ -71,7 +71,9 @@ else()
|
||||
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
|
||||
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
|
||||
set(XC_FLAGS -fno-fast-math -fno-inline -g)
|
||||
# note: adding -g causes segmentation fault during compile
|
||||
#set(XC_FLAGS -fno-fast-math -fno-inline -g)
|
||||
set(XC_FLAGS -fno-fast-math -fno-inline)
|
||||
else()
|
||||
set(XC_FLAGS -O3)
|
||||
endif()
|
||||
@@ -90,7 +92,7 @@ else()
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
|
||||
xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
|
||||
DEPENDS ggml-metal.metal ${METALLIB_COMMON}
|
||||
|
||||
@@ -229,7 +229,11 @@ typedef struct {
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb23;
|
||||
int32_t ne32;
|
||||
int32_t ne33;
|
||||
uint64_t nb31;
|
||||
uint64_t nb32;
|
||||
uint64_t nb33;
|
||||
int32_t ne1;
|
||||
int32_t ne2;
|
||||
float scale;
|
||||
@@ -461,9 +465,21 @@ typedef struct {
|
||||
} ggml_metal_kargs_sum_rows;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int32_t ne00;
|
||||
int32_t ne01;
|
||||
int32_t ne02;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int32_t ne11;
|
||||
int32_t ne12;
|
||||
int32_t ne13;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
@@ -499,26 +515,25 @@ typedef struct {
|
||||
typedef struct {
|
||||
int64_t d_state;
|
||||
int64_t d_inner;
|
||||
int64_t n_head;
|
||||
int64_t n_group;
|
||||
int64_t n_seq_tokens;
|
||||
int64_t n_seqs;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb10;
|
||||
uint64_t nb03;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
uint64_t nb20;
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb30;
|
||||
uint64_t nb31;
|
||||
uint64_t nb40;
|
||||
uint64_t nb41;
|
||||
uint64_t nb42;
|
||||
uint64_t nb50;
|
||||
uint64_t nb43;
|
||||
uint64_t nb51;
|
||||
uint64_t nb52;
|
||||
uint64_t nb53;
|
||||
} ggml_metal_kargs_ssm_scan;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -217,6 +217,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
|
||||
GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
|
||||
GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP,
|
||||
GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,
|
||||
GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
|
||||
@@ -529,6 +530,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_REGLU,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU,
|
||||
GGML_METAL_KERNEL_TYPE_SWIGLU,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU_ERF,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
GGML_METAL_KERNEL_TYPE_MEAN,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
|
||||
@@ -1196,6 +1199,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction);
|
||||
@@ -1508,6 +1512,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REGLU, reglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU, geglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU, swiglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_ERF, geglu_erf, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_QUICK, geglu_quick, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
|
||||
@@ -1691,6 +1697,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -1725,7 +1733,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
|
||||
return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
|
||||
@@ -2454,6 +2462,12 @@ static bool ggml_metal_encode_node(
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline;
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_ERF].pipeline;
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_QUICK].pipeline;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -2644,10 +2658,7 @@ static bool ggml_metal_encode_node(
|
||||
memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale));
|
||||
memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
|
||||
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src0->ne[1];
|
||||
|
||||
const uint32_t n_head = nrows_x/nrows_y;
|
||||
const uint32_t n_head = src0->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
@@ -2707,6 +2718,18 @@ static bool ggml_metal_encode_node(
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.ne13 =*/ ne13,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.scale =*/ scale,
|
||||
/*.max_bias =*/ max_bias,
|
||||
/*.m0 =*/ m0,
|
||||
@@ -2726,7 +2749,7 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
{
|
||||
@@ -2800,71 +2823,91 @@ static bool ggml_metal_encode_node(
|
||||
struct ggml_tensor * src3 = node->src[3];
|
||||
struct ggml_tensor * src4 = node->src[4];
|
||||
struct ggml_tensor * src5 = node->src[5];
|
||||
struct ggml_tensor * src6 = node->src[6];
|
||||
|
||||
GGML_ASSERT(src3);
|
||||
GGML_ASSERT(src4);
|
||||
GGML_ASSERT(src5);
|
||||
GGML_ASSERT(src6);
|
||||
|
||||
size_t offs_src3 = 0;
|
||||
size_t offs_src4 = 0;
|
||||
size_t offs_src5 = 0;
|
||||
size_t offs_src6 = 0;
|
||||
|
||||
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
|
||||
id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
|
||||
id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
|
||||
id<MTLBuffer> id_src6 = src6 ? ggml_metal_get_buffer(src6, &offs_src6) : nil;
|
||||
|
||||
const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30);
|
||||
const int64_t ne30 = src3->ne[0];
|
||||
const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31);
|
||||
|
||||
const uint64_t nb30 = src3->nb[0];
|
||||
const uint64_t nb30 = src3->nb[0]; GGML_UNUSED(nb30);
|
||||
const uint64_t nb31 = src3->nb[1];
|
||||
|
||||
const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40);
|
||||
const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41);
|
||||
const int64_t ne41 = src4->ne[1];
|
||||
const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42);
|
||||
const int64_t ne43 = src4->ne[3]; GGML_UNUSED(ne43);
|
||||
|
||||
const uint64_t nb40 = src4->nb[0];
|
||||
const uint64_t nb40 = src4->nb[0]; GGML_UNUSED(nb40);
|
||||
const uint64_t nb41 = src4->nb[1];
|
||||
const uint64_t nb42 = src4->nb[2];
|
||||
const uint64_t nb43 = src4->nb[3];
|
||||
|
||||
const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50);
|
||||
const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51);
|
||||
const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52);
|
||||
const int64_t ne53 = src5->ne[3]; GGML_UNUSED(ne53);
|
||||
|
||||
const uint64_t nb50 = src5->nb[0];
|
||||
const uint64_t nb50 = src5->nb[0]; GGML_UNUSED(nb50);
|
||||
const uint64_t nb51 = src5->nb[1];
|
||||
const uint64_t nb52 = src5->nb[2];
|
||||
const uint64_t nb53 = src5->nb[3];
|
||||
|
||||
const int64_t ne60 = src6->ne[0]; GGML_UNUSED(ne60);
|
||||
|
||||
const uint64_t nb60 = src6->nb[0]; GGML_UNUSED(nb60);
|
||||
|
||||
const int64_t d_state = ne00;
|
||||
const int64_t d_inner = ne01;
|
||||
const int64_t n_seq_tokens = ne11;
|
||||
const int64_t n_seqs = ne02;
|
||||
const int64_t n_head = ne02;
|
||||
const int64_t n_group = ne41;
|
||||
const int64_t n_seq_tokens = ne12;
|
||||
const int64_t n_seqs = ne13;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
if (ne30 == 1) {
|
||||
// Mamba-2
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP].pipeline;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
|
||||
}
|
||||
|
||||
ggml_metal_kargs_ssm_scan args = {
|
||||
/*.d_state =*/ d_state,
|
||||
/*.d_inner =*/ d_inner,
|
||||
/*.d_state =*/ d_state,
|
||||
/*.d_inner =*/ d_inner,
|
||||
/*.n_head =*/ n_head,
|
||||
/*.n_group =*/ n_group,
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb20 =*/ nb20,
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb30 =*/ nb30,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb40 =*/ nb40,
|
||||
/*.nb41 =*/ nb41,
|
||||
/*.nb42 =*/ nb42,
|
||||
/*.nb50 =*/ nb50,
|
||||
/*.nb51 =*/ nb51,
|
||||
/*.nb52 =*/ nb52,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb41 =*/ nb41,
|
||||
/*.nb42 =*/ nb42,
|
||||
/*.nb43 =*/ nb43,
|
||||
/*.nb51 =*/ nb51,
|
||||
/*.nb52 =*/ nb52,
|
||||
/*.nb53 =*/ nb53,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
@@ -2874,10 +2917,17 @@ static bool ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
[encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
|
||||
[encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:6];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:7];
|
||||
[encoder setBuffer:id_src6 offset:offs_src6 atIndex:6];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:7];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:8];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
if (ne30 == 1) {
|
||||
// Mamba-2
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} else {
|
||||
GGML_ASSERT(d_inner == 1);
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
{
|
||||
@@ -4979,7 +5029,11 @@ static bool ggml_metal_encode_node(
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb23 =*/ nb23,
|
||||
/*.ne32 =*/ ne32,
|
||||
/*.ne33 =*/ ne33,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb32 =*/ nb32,
|
||||
/*.nb33 =*/ nb33,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.scale =*/ scale,
|
||||
|
||||
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
|
||||
}
|
||||
|
||||
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
|
||||
}
|
||||
|
||||
void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
|
||||
}
|
||||
|
||||
void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
@@ -1258,6 +1261,50 @@ kernel void kernel_swiglu(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f+erf_approx<float>(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool norm>
|
||||
kernel void kernel_sum_rows(
|
||||
constant ggml_metal_kargs_sum_rows & args,
|
||||
@@ -1320,24 +1367,28 @@ kernel void kernel_soft_max(
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_soft_max & args,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = (tgpig) / (args.ne02*args.ne01);
|
||||
const int64_t i02 = (tgpig - i03*args.ne02*args.ne01) / args.ne01;
|
||||
const int64_t i01 = (tgpig - i03*args.ne02*args.ne01 - i02*args.ne01);
|
||||
uint3 tptg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x;
|
||||
|
||||
device const float * psrc0 = (device const float *) src0 + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00);
|
||||
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*args.ne00 : nullptr;
|
||||
device float * pdst = (device float *) dst + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00);
|
||||
const int32_t i13 = i03%args.ne13;
|
||||
const int32_t i12 = i02%args.ne12;
|
||||
const int32_t i11 = i01;
|
||||
|
||||
device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
|
||||
device float * pdst = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (args.max_bias > 0.0f) {
|
||||
const int64_t h = i02;
|
||||
const int32_t h = i02;
|
||||
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
@@ -1348,13 +1399,13 @@ kernel void kernel_soft_max(
|
||||
// parallel max
|
||||
float lmax = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f));
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
float max_val = simd_max(lmax);
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = -INFINITY;
|
||||
}
|
||||
@@ -1373,7 +1424,7 @@ kernel void kernel_soft_max(
|
||||
|
||||
// parallel sum
|
||||
float lsum = 0.0f;
|
||||
for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
|
||||
lsum += exp_psrc0;
|
||||
pdst[i00] = exp_psrc0;
|
||||
@@ -1385,7 +1436,7 @@ kernel void kernel_soft_max(
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
@@ -1404,7 +1455,7 @@ kernel void kernel_soft_max(
|
||||
|
||||
const float inv_sum = 1.0f/sum;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
pdst[i00] *= inv_sum;
|
||||
}
|
||||
}
|
||||
@@ -1416,23 +1467,27 @@ kernel void kernel_soft_max_4(
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_soft_max & args,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = (tgpig) / (args.ne02*args.ne01);
|
||||
const int64_t i02 = (tgpig - i03*args.ne02*args.ne01) / args.ne01;
|
||||
const int64_t i01 = (tgpig - i03*args.ne02*args.ne01 - i02*args.ne01);
|
||||
uint3 tptg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x;
|
||||
|
||||
device const float4 * psrc4 = (device const float4 *) src0 + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4;
|
||||
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*args.ne00/4 : nullptr;
|
||||
device float4 * pdst4 = (device float4 *) dst + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4;
|
||||
const int32_t i13 = i03%args.ne13;
|
||||
const int32_t i12 = i02%args.ne12;
|
||||
const int32_t i11 = i01;
|
||||
|
||||
device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
|
||||
device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
if (args.max_bias > 0.0f) {
|
||||
const int64_t h = i02;
|
||||
const int32_t h = i02;
|
||||
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
@@ -1443,14 +1498,14 @@ kernel void kernel_soft_max_4(
|
||||
// parallel max
|
||||
float4 lmax4 = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
|
||||
}
|
||||
|
||||
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
||||
|
||||
float max_val = simd_max(lmax);
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = -INFINITY;
|
||||
}
|
||||
@@ -1469,7 +1524,7 @@ kernel void kernel_soft_max_4(
|
||||
|
||||
// parallel sum
|
||||
float4 lsum4 = 0.0f;
|
||||
for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
|
||||
lsum4 += exp_psrc4;
|
||||
pdst4[i00] = exp_psrc4;
|
||||
@@ -1483,7 +1538,7 @@ kernel void kernel_soft_max_4(
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
@@ -1502,7 +1557,7 @@ kernel void kernel_soft_max_4(
|
||||
|
||||
const float inv_sum = 1.0f/sum;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
pdst4[i00] *= inv_sum;
|
||||
}
|
||||
}
|
||||
@@ -1588,7 +1643,7 @@ kernel void kernel_ssm_conv_f32(
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-1 part
|
||||
kernel void kernel_ssm_scan_f32(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
@@ -1596,46 +1651,119 @@ kernel void kernel_ssm_scan_f32(
|
||||
device const void * src3,
|
||||
device const void * src4,
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i3 = tgpig.y;
|
||||
const int64_t i1 = 0;
|
||||
const int64_t ir = tgpig.x; // current head
|
||||
const int64_t i3 = tgpig.y; // current seq
|
||||
|
||||
const uint64_t nb00 = sizeof(float);
|
||||
const uint64_t nb10 = sizeof(float);
|
||||
const uint64_t nb20 = sizeof(float);
|
||||
|
||||
const int64_t nc = args.d_state;
|
||||
// const int64_t nr = args.d_inner;
|
||||
const int64_t nr = args.d_inner;
|
||||
const int64_t nh = args.n_head;
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
// const int64_t n_s = args.n_seqs;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb01 + i3*args.nb02);
|
||||
device const float * x = (device const float *) ((device const char *) src1 + ir*args.nb10 + i2*args.nb11 + i3*args.nb12);
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*args.nb20 + i2*args.nb21 + i3*args.nb22);
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31);
|
||||
device const float * B = (device const float *) ((device const char *) src4 + i2*args.nb41 + i3*args.nb42);
|
||||
device const float * C = (device const float *) ((device const char *) src5 + i2*args.nb51 + i3*args.nb52);
|
||||
device float * y = (device float *) ((device char *) dst + ir*args.nb10 + i2*args.nb11 + i3*args.nb12); // TODO: do not use src1 strides
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb01 + i3*args.nb02 + args.nb13);
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {d_state, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
|
||||
if (i2 > 0) {
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// i1 == 0
|
||||
float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
float x_dt = x[0] * dt_soft_plus;
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
int64_t i = i0;
|
||||
float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
|
||||
// TODO: optimize (e.g. by parallelizing over d_state)
|
||||
kernel void kernel_ssm_scan_f32_group(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device const void * src2,
|
||||
device const void * src3,
|
||||
device const void * src4,
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i1 = tgpig.x;
|
||||
const int64_t ir = tgpig.y; // current head
|
||||
const int64_t i3 = tgpig.z; // current seq
|
||||
|
||||
const uint64_t nb00 = sizeof(float);
|
||||
const uint64_t nb10 = sizeof(float);
|
||||
const uint64_t nb20 = sizeof(float);
|
||||
|
||||
const int64_t nc = args.d_state;
|
||||
const int64_t nr = args.d_inner;
|
||||
const int64_t nh = args.n_head;
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
const float dA = exp(dt_soft_plus * A[0]);
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * dA) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3776,7 +3904,7 @@ kernel void kernel_flash_attn_ext(
|
||||
// load the mask in shared memory
|
||||
#pragma unroll(Q)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31);
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33);
|
||||
|
||||
const float m = pm[ic + tiisg];
|
||||
|
||||
@@ -4262,7 +4390,7 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
const bool has_mask = mask != q;
|
||||
|
||||
// pointer to the mask
|
||||
device const half * pm = (device const half *) (mask + iq1*args.nb31);
|
||||
device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -398,12 +398,13 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
||||
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm;
|
||||
cl_kernel kernel_group_norm;
|
||||
@@ -736,6 +737,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
@@ -753,12 +756,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
backend_ctx->program_glu =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -2187,7 +2194,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
||||
// dependencies.
|
||||
sync_with_other_backends(backend);
|
||||
|
||||
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -2222,6 +2229,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CONT:
|
||||
@@ -2256,6 +2269,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
@@ -2271,6 +2285,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
@@ -3199,7 +3215,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
||||
|
||||
// Open file and dump.
|
||||
char fname[512];
|
||||
sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
|
||||
snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
|
||||
FILE * f = fopen(fname, "w");
|
||||
if (!f) {
|
||||
printf("Failed to open %s\n", fname);
|
||||
@@ -3858,6 +3874,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
int n = ggml_nelements(dst);
|
||||
|
||||
if (n % 4 == 0) {
|
||||
kernel = backend_ctx->kernel_gelu_erf_4;
|
||||
n /= 4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_gelu_erf;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -4453,7 +4507,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
||||
const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
||||
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
|
||||
cl_kernel kernel = nullptr;
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
@@ -4484,18 +4539,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne00_src = src0->ne[0];
|
||||
const int ne01_src = src0->ne[1];
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const int ne10_dst = dst->ne[0];
|
||||
const int ne11_dst = dst->ne[1];
|
||||
const int ne12_dst = dst->ne[2];
|
||||
const int ne13_dst = dst->ne[3];
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
const float sf0 = (float)dst->ne[0] / src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1] / src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2] / src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
||||
float sf0 = (float)ne0 / ne00;
|
||||
float sf1 = (float)ne1 / ne01;
|
||||
float sf2 = (float)ne2 / ne02;
|
||||
float sf3 = (float)ne3 / ne03;
|
||||
|
||||
float pixel_offset = 0.5f;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
@@ -4507,29 +4566,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
|
||||
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||
sf0 = (float)(ne0 - 1) / (ne00 - 1);
|
||||
sf1 = (float)(ne1 - 1) / (ne01 - 1);
|
||||
pixel_offset = 0.0f;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
|
||||
}
|
||||
|
||||
|
||||
size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
|
||||
if (dst_total_elements == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -5745,19 +5811,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
|
||||
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_long nb01 = src0->nb[1];
|
||||
const cl_long nb02 = src0->nb[2];
|
||||
const cl_long nb03 = src0->nb[3];
|
||||
|
||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
||||
const int ne13 = src1 ? src1->ne[3] : 0;
|
||||
|
||||
const cl_long nb11 = src1 ? src1->nb[1] : 0;
|
||||
const cl_long nb12 = src1 ? src1->nb[2] : 0;
|
||||
const cl_long nb13 = src1 ? src1->nb[3] : 0;
|
||||
|
||||
const cl_long nb1 = dst->nb[1];
|
||||
const cl_long nb2 = dst->nb[2];
|
||||
const cl_long nb3 = dst->nb[3];
|
||||
|
||||
float scale, max_bias;
|
||||
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
||||
|
||||
const int nrows_x = ggml_nrows(src0);
|
||||
const int nrows_y = src0->ne[1];
|
||||
|
||||
const int n_head = nrows_x/nrows_y;
|
||||
const int n_head = src0->ne[2];
|
||||
const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
@@ -5802,13 +5880,22 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
@@ -6215,6 +6302,20 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
kernel = backend_ctx->kernel_swiglu_f16;
|
||||
}
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_geglu_erf;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_geglu_erf_f16;
|
||||
}
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_geglu_quick;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_geglu_quick_f16;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported glu op");
|
||||
}
|
||||
@@ -6329,6 +6430,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_gelu;
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_gelu_erf;
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
#define SQRT_2_INV 0.70710678118654752440084436210484f
|
||||
|
||||
kernel void kernel_gelu(
|
||||
global float * src0,
|
||||
@@ -35,6 +36,32 @@ kernel void kernel_gelu_4(
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
float x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
float4 x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
#define SQRT_2_INV 0.70710678118654752440084436210484f
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu
|
||||
@@ -199,3 +201,137 @@ kernel void kernel_swiglu_f16(
|
||||
dst_row[i0] = silu*x1;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu_erf
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_geglu_erf(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const half x0 = src0_row[i0];
|
||||
const half x1 = src1_row[i0];
|
||||
|
||||
const half gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu_quick
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_geglu_quick(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const half x0 = src0_row[i0];
|
||||
const half x1 = src1_row[i0];
|
||||
|
||||
const half gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,32 +22,45 @@
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_soft_max_4_f16(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global half * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float scale,
|
||||
float max_bias,
|
||||
float m0,
|
||||
float m1,
|
||||
int n_head_log2
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
src1 = (global half *)((global char *)src1 + offset1);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
global half4 * pmask = (global char *)src1 != (global char *)src0 ? (global half4 *)(src1 + i01*ne00) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
int i13 = i03%ne13;
|
||||
int i12 = i02%ne12;
|
||||
int i11 = i01;
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -22,32 +22,45 @@
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_soft_max_4(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global float * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float scale,
|
||||
float max_bias,
|
||||
float m0,
|
||||
float m1,
|
||||
int n_head_log2
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
int i13 = i03%ne13;
|
||||
int i12 = i02%ne12;
|
||||
int i11 = i01;
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -22,32 +22,45 @@
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_soft_max_f16(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global half * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float scale,
|
||||
float max_bias,
|
||||
float m0,
|
||||
float m1,
|
||||
int n_head_log2
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
src1 = (global half *)((global char *)src1 + offset1);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
global half * pmask = (global char *)src1 != (global char *)src0 ? src1 + i01*ne00 : 0;
|
||||
global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
int i13 = i03%ne13;
|
||||
int i12 = i02%ne12;
|
||||
int i11 = i01;
|
||||
|
||||
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global half * pmask = src1 != src0 ? (global half *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -22,32 +22,45 @@
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_soft_max(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global float * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float scale,
|
||||
float max_bias,
|
||||
float m0,
|
||||
float m1,
|
||||
int n_head_log2
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0;
|
||||
global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
int i13 = i03%ne13;
|
||||
int i12 = i02%ne12;
|
||||
int i11 = i01;
|
||||
|
||||
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global float * pmask = src1 != src0 ? (global float *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -60,7 +60,8 @@ kernel void kernel_upscale_bilinear(
|
||||
float sf0,
|
||||
float sf1,
|
||||
float sf2,
|
||||
float sf3
|
||||
float sf3,
|
||||
float pixel_offset
|
||||
) {
|
||||
global const char * src_base = (global const char *)p_src0 + off_src0;
|
||||
global float * dst_base = (global float *)((global char *)p_dst + off_dst);
|
||||
@@ -80,8 +81,6 @@ kernel void kernel_upscale_bilinear(
|
||||
int i02_src = (int)(i12_dst / sf2);
|
||||
int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float pixel_offset = 0.5f;
|
||||
|
||||
float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
|
||||
long y0_src = (long)floor(y_src_f);
|
||||
long y1_src = y0_src + 1;
|
||||
|
||||
@@ -383,6 +383,24 @@ static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint6
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gated_op_fused_geglu_erf(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
const int64_t j0 = (i / n) * o0 + (i % n);
|
||||
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
|
||||
dst[i] = op_gelu_erf(x[j0]) * g[j1];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
const int64_t j0 = (i / n) * o0 + (i % n);
|
||||
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
|
||||
dst[i] = op_gelu_quick(x[j0]) * g[j1];
|
||||
}
|
||||
}
|
||||
|
||||
namespace ggml_sycl_detail {
|
||||
static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int n_elements, const int ne10, const int ne11,
|
||||
@@ -978,6 +996,28 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(main_stream,
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(main_stream,
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
@@ -1118,3 +1158,13 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_swiglu(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_geglu_erf(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_geglu_quick(ctx, dst);
|
||||
}
|
||||
|
||||
@@ -80,5 +80,7 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_ELEMENTWISE_HPP
|
||||
|
||||
@@ -83,7 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
|
||||
info.devices[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
|
||||
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
|
||||
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
||||
}
|
||||
|
||||
@@ -3687,6 +3687,12 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
ggml_sycl_swiglu(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
ggml_sycl_geglu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
ggml_sycl_geglu_quick(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -4232,6 +4238,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -4285,6 +4293,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
@@ -4395,9 +4409,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
// TODO: support batching
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
|
||||
@@ -224,6 +224,21 @@ enum vk_device_architecture {
|
||||
INTEL_XE2,
|
||||
};
|
||||
|
||||
// HSK x HSV
|
||||
enum FaHeadSizes {
|
||||
FA_HEAD_SIZE_64,
|
||||
FA_HEAD_SIZE_80,
|
||||
FA_HEAD_SIZE_96,
|
||||
FA_HEAD_SIZE_112,
|
||||
FA_HEAD_SIZE_128,
|
||||
FA_HEAD_SIZE_192,
|
||||
FA_HEAD_SIZE_192_128,
|
||||
FA_HEAD_SIZE_256,
|
||||
FA_HEAD_SIZE_576_512,
|
||||
FA_HEAD_SIZE_UNSUPPORTED,
|
||||
FA_HEAD_SIZE_COUNT = FA_HEAD_SIZE_UNSUPPORTED,
|
||||
};
|
||||
|
||||
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
|
||||
vk::PhysicalDeviceProperties props = device.getProperties();
|
||||
|
||||
@@ -431,6 +446,7 @@ struct vk_device_struct {
|
||||
|
||||
// [src/dst 0=fp32,1=fp16]
|
||||
vk_pipeline pipeline_gelu[2];
|
||||
vk_pipeline pipeline_gelu_erf[2];
|
||||
vk_pipeline pipeline_gelu_quick[2];
|
||||
vk_pipeline pipeline_silu[2];
|
||||
vk_pipeline pipeline_relu[2];
|
||||
@@ -440,6 +456,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_geglu[2];
|
||||
vk_pipeline pipeline_reglu[2];
|
||||
vk_pipeline pipeline_swiglu[2];
|
||||
vk_pipeline pipeline_geglu_erf[2];
|
||||
vk_pipeline pipeline_geglu_quick[2];
|
||||
|
||||
vk_pipeline pipeline_leaky_relu_f32;
|
||||
vk_pipeline pipeline_silu_back_f32;
|
||||
@@ -466,26 +484,11 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
||||
|
||||
// [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D64_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D80_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D96_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D112_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_cm2[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
|
||||
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D80_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D96_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D112_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D128_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D256_cm1[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_cm1[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
|
||||
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
|
||||
vk_pipeline pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
|
||||
|
||||
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
||||
|
||||
@@ -632,6 +635,8 @@ struct vk_flash_attn_push_constants {
|
||||
uint32_t nev2;
|
||||
uint32_t nev3;
|
||||
uint32_t nem1;
|
||||
uint32_t nem2;
|
||||
uint32_t nem3;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
@@ -642,14 +647,12 @@ struct vk_flash_attn_push_constants {
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t nb31;
|
||||
|
||||
float scale;
|
||||
float max_bias;
|
||||
float logit_softcap;
|
||||
|
||||
uint32_t mask;
|
||||
uint32_t n_head_log2;
|
||||
uint32_t mask_n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
|
||||
@@ -657,6 +660,7 @@ struct vk_flash_attn_push_constants {
|
||||
uint32_t split_kv;
|
||||
uint32_t k_num;
|
||||
};
|
||||
static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
|
||||
|
||||
struct vk_op_push_constants {
|
||||
uint32_t KX;
|
||||
@@ -755,6 +759,14 @@ struct vk_op_rope_push_constants {
|
||||
struct vk_op_soft_max_push_constants {
|
||||
uint32_t KX;
|
||||
uint32_t KY;
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
uint32_t ne02;
|
||||
uint32_t ne12;
|
||||
uint32_t ne13;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
@@ -993,7 +1005,7 @@ struct ggml_backend_vk_context {
|
||||
|
||||
// number of additional consecutive nodes that are being fused with the
|
||||
// node currently being processed
|
||||
uint32_t num_additional_fused_ops {};
|
||||
int num_additional_fused_ops {};
|
||||
};
|
||||
|
||||
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
||||
@@ -1689,6 +1701,35 @@ enum FaCodePath {
|
||||
FA_COOPMAT2,
|
||||
};
|
||||
|
||||
static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) {
|
||||
if (hsk != 192 && hsk != 576 && hsk != hsv) {
|
||||
return FA_HEAD_SIZE_UNSUPPORTED;
|
||||
}
|
||||
switch (hsk) {
|
||||
case 64: return FA_HEAD_SIZE_64;
|
||||
case 80: return FA_HEAD_SIZE_80;
|
||||
case 96: return FA_HEAD_SIZE_96;
|
||||
case 112: return FA_HEAD_SIZE_112;
|
||||
case 128: return FA_HEAD_SIZE_128;
|
||||
case 192:
|
||||
if (hsv == 192) {
|
||||
return FA_HEAD_SIZE_192;
|
||||
} else if (hsv == 128) {
|
||||
return FA_HEAD_SIZE_192_128;
|
||||
} else {
|
||||
return FA_HEAD_SIZE_UNSUPPORTED;
|
||||
}
|
||||
case 256: return FA_HEAD_SIZE_256;
|
||||
case 576:
|
||||
if (hsv == 512) {
|
||||
return FA_HEAD_SIZE_576_512;
|
||||
} else {
|
||||
return FA_HEAD_SIZE_UNSUPPORTED;
|
||||
}
|
||||
default: return FA_HEAD_SIZE_UNSUPPORTED;
|
||||
}
|
||||
}
|
||||
|
||||
// number of rows/cols for flash attention shader
|
||||
static constexpr uint32_t flash_attention_num_small_rows = 32;
|
||||
static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
|
||||
@@ -1709,8 +1750,9 @@ static uint32_t get_fa_num_small_rows(FaCodePath path) {
|
||||
}
|
||||
}
|
||||
|
||||
static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
|
||||
static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) {
|
||||
GGML_UNUSED(clamp);
|
||||
GGML_UNUSED(hsv);
|
||||
|
||||
if (path == FA_SCALAR) {
|
||||
if (small_rows) {
|
||||
@@ -1734,7 +1776,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
|
||||
}
|
||||
|
||||
// small cols to reduce register count
|
||||
if (ggml_is_quantized(type) || D == 256) {
|
||||
if (ggml_is_quantized(type) || hsk >= 256) {
|
||||
return {64, 32};
|
||||
}
|
||||
return {64, 64};
|
||||
@@ -2027,19 +2069,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||
};
|
||||
|
||||
auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
|
||||
return {fa_rows_cols(path, D, clamp, type, small_rows)[0], 1, 1};
|
||||
auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
|
||||
return {fa_rows_cols(path, hsk, hsv, clamp, type, small_rows)[0], 1, 1};
|
||||
};
|
||||
|
||||
auto const &fa_spec_constants = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
|
||||
auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
|
||||
// For large number of rows, 128 invocations seems to work best.
|
||||
// For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
|
||||
// can't use 256 for D==80.
|
||||
// For scalar, use 128 (arbitrary)
|
||||
// The same D_split value is used for both HSK and HSV, so just base it on the union of the LSBs.
|
||||
const uint32_t D = (hsk|hsv);
|
||||
uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1)
|
||||
? scalar_flash_attention_workgroup_size
|
||||
: ((small_rows && (D % 32) == 0) ? 256 : 128);
|
||||
auto rows_cols = fa_rows_cols(path, D, clamp, type, small_rows);
|
||||
auto rows_cols = fa_rows_cols(path, hsk, hsv, clamp, type, small_rows);
|
||||
|
||||
// D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
|
||||
// D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
|
||||
@@ -2048,26 +2092,29 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
// mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
|
||||
GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
|
||||
return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split};
|
||||
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
|
||||
};
|
||||
|
||||
#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, D) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, HSK, HSV, HEAD_SIZES) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
|
||||
#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256)
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64, 64, 64) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80, 80, 80) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96, 96, 96) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112, 112, 112) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128, 128, 128) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 192, 192) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 128, 192_128) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256, 256, 256) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 576, 512, 576_512)
|
||||
|
||||
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
|
||||
@@ -2761,6 +2808,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
CREATE_UNARY(gelu)
|
||||
CREATE_UNARY(gelu_erf)
|
||||
CREATE_UNARY(gelu_quick)
|
||||
CREATE_UNARY(silu)
|
||||
CREATE_UNARY(relu)
|
||||
@@ -2775,6 +2823,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_GLU(geglu)
|
||||
CREATE_GLU(reglu)
|
||||
CREATE_GLU(swiglu)
|
||||
CREATE_GLU(geglu_erf)
|
||||
CREATE_GLU(geglu_quick)
|
||||
#undef CREATE_GLU
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -3677,7 +3727,6 @@ static void ggml_vk_instance_init() {
|
||||
|
||||
}
|
||||
|
||||
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
||||
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||
|
||||
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
||||
@@ -5964,28 +6013,74 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
|
||||
ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
|
||||
} else {
|
||||
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
|
||||
// Split based on number of ids, to fit in shared memory
|
||||
const uint32_t nei0 = (uint32_t)src2->ne[0];
|
||||
const uint32_t nei1 = (uint32_t)src2->ne[1];
|
||||
|
||||
GGML_ASSERT(nei0 <= 4096);
|
||||
const uint32_t split_size = std::min(nei1, 4096u / nei0);
|
||||
|
||||
ggml_tensor src1_copy = *src1;
|
||||
ggml_tensor src2_copy = *src2;
|
||||
ggml_tensor dst_copy = *dst;
|
||||
|
||||
for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
|
||||
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
|
||||
|
||||
src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
|
||||
src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
|
||||
dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
|
||||
|
||||
src1_copy.ne[2] = n_tokens;
|
||||
src2_copy.ne[1] = n_tokens;
|
||||
dst_copy.ne[2] = n_tokens;
|
||||
|
||||
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t D, bool f32acc) {
|
||||
static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv) {
|
||||
// Needs to be kept up to date on shader changes
|
||||
GGML_UNUSED(hsv);
|
||||
const uint32_t wg_size = scalar_flash_attention_workgroup_size;
|
||||
const uint32_t Br = scalar_flash_attention_num_large_rows;
|
||||
const uint32_t Bc = scalar_flash_attention_Bc;
|
||||
|
||||
const uint32_t tmpsh = wg_size * sizeof(float);
|
||||
const uint32_t tmpshv4 = wg_size * 4 * sizeof(float);
|
||||
|
||||
const uint32_t masksh = Bc * Br * sizeof(float);
|
||||
|
||||
const uint32_t Qf = Br * (hsk / 4 + 2) * 4 * sizeof(float);
|
||||
|
||||
const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf;
|
||||
const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", total_size=" << total_size << ", supported=" << supported);
|
||||
|
||||
return supported;
|
||||
}
|
||||
|
||||
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv, bool f32acc) {
|
||||
// Needs to be kept up to date on shader changes
|
||||
GGML_UNUSED(hsv);
|
||||
const uint32_t wg_size = scalar_flash_attention_workgroup_size;
|
||||
const uint32_t Br = coopmat1_flash_attention_num_large_rows;
|
||||
const uint32_t Bc = scalar_flash_attention_Bc;
|
||||
|
||||
const uint32_t acctype = f32acc ? 4 : 2;
|
||||
const uint32_t f16vec4 = 8;
|
||||
|
||||
const uint32_t tmpsh = wg_size * sizeof(float);
|
||||
const uint32_t tmpshv4 = wg_size * 4 * acctype;
|
||||
|
||||
const uint32_t Qf = Br * (D / 4 + 2) * f16vec4;
|
||||
const uint32_t Qf = Br * (hsk / 4 + 2) * f16vec4;
|
||||
|
||||
const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
|
||||
const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
|
||||
const uint32_t sfsh = Bc * sfshstride * acctype;
|
||||
|
||||
const uint32_t kshstride = D / 4 + 2;
|
||||
const uint32_t kshstride = hsk / 4 + 2;
|
||||
const uint32_t ksh = Bc * kshstride * f16vec4;
|
||||
|
||||
const uint32_t slope = Br * sizeof(float);
|
||||
@@ -5993,7 +6088,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
|
||||
const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope;
|
||||
const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(D=" << D << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
|
||||
VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
|
||||
|
||||
return supported;
|
||||
}
|
||||
@@ -6015,13 +6110,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const uint32_t nem1 = mask ? mask->ne[1] : 0;
|
||||
const uint32_t nbm1 = mask ? mask->nb[1] : 0;
|
||||
const uint32_t nem2 = mask ? mask->ne[2] : 0;
|
||||
const uint32_t nem3 = mask ? mask->ne[3] : 0;
|
||||
|
||||
const uint32_t D = neq0;
|
||||
const uint32_t HSK = nek0;
|
||||
const uint32_t HSV = nev0;
|
||||
uint32_t N = neq1;
|
||||
const uint32_t KV = nek1;
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne0 == HSV);
|
||||
GGML_ASSERT(ne2 == N);
|
||||
|
||||
// input tensor rows must be contiguous
|
||||
@@ -6029,12 +6126,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
GGML_ASSERT(nbk0 == ggml_type_size(k->type));
|
||||
GGML_ASSERT(nbv0 == ggml_type_size(v->type));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev0 == D);
|
||||
GGML_ASSERT(neq0 == HSK);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nev0 == D);
|
||||
|
||||
GGML_ASSERT(nev1 == nek1);
|
||||
|
||||
@@ -6055,7 +6149,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
|
||||
(dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
|
||||
|
||||
const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, D, dst->op_params[3] == GGML_PREC_F32);
|
||||
const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, HSK, HSV, dst->op_params[3] == GGML_PREC_F32);
|
||||
|
||||
if (!coopmat_shape_supported || !coopmat_shmem_supported) {
|
||||
path = FA_SCALAR;
|
||||
@@ -6085,7 +6179,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
}
|
||||
|
||||
if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
|
||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
|
||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) {
|
||||
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
||||
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
||||
// and change addressing calculations to index Q's dimension 2.
|
||||
@@ -6108,47 +6202,25 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
path = FA_SCALAR;
|
||||
}
|
||||
|
||||
// with large hsk/hsv, scalar path may need to use small_rows to fit in shared memory
|
||||
if (path == FA_SCALAR &&
|
||||
!ggml_vk_flash_attn_scalar_shmem_support(ctx->device, HSK, HSV)) {
|
||||
small_rows = true;
|
||||
}
|
||||
|
||||
bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
|
||||
|
||||
FaHeadSizes head_sizes = fa_get_head_sizes(k->ne[0], v->ne[0]);
|
||||
|
||||
switch (path) {
|
||||
case FA_SCALAR:
|
||||
switch (D) {
|
||||
case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
|
||||
case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
|
||||
case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break;
|
||||
case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break;
|
||||
case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break;
|
||||
case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break;
|
||||
default:
|
||||
GGML_ASSERT(!"unsupported D value");
|
||||
return;
|
||||
}
|
||||
pipelines = &ctx->device->pipeline_flash_attn_f32_f16[k->type][head_sizes][f32acc][small_rows][0];
|
||||
break;
|
||||
case FA_COOPMAT1:
|
||||
switch (D) {
|
||||
case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm1[k->type][f32acc][small_rows][0]; break;
|
||||
default:
|
||||
GGML_ASSERT(!"unsupported D value");
|
||||
return;
|
||||
}
|
||||
pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm1[k->type][head_sizes][f32acc][small_rows][0];
|
||||
break;
|
||||
case FA_COOPMAT2:
|
||||
switch (D) {
|
||||
case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm2[k->type][f32acc][small_rows][0]; break;
|
||||
default:
|
||||
GGML_ASSERT(!"unsupported D value");
|
||||
return;
|
||||
}
|
||||
pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm2[k->type][head_sizes][f32acc][small_rows][0];
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(0);
|
||||
@@ -6178,7 +6250,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
// Try to use split_k when KV is large enough to be worth the overhead
|
||||
if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
|
||||
// Try to run two workgroups per SM.
|
||||
split_k = ctx->device->shader_core_count * 2 / workgroups_y;
|
||||
split_k = shader_core_count * 2 / (workgroups_y * workgroups_z);
|
||||
if (split_k > 1) {
|
||||
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
|
||||
// of "align", so recompute split_k based on that.
|
||||
@@ -6188,9 +6260,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
}
|
||||
}
|
||||
|
||||
// Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows).
|
||||
const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
|
||||
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
|
||||
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
|
||||
if (split_k_size > ctx->device->max_memory_allocation_size) {
|
||||
GGML_ABORT("Requested preallocation size is too large");
|
||||
}
|
||||
@@ -6277,18 +6349,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t mask_n_head_log2 = ((mask != nullptr) << 16) | n_head_log2;
|
||||
|
||||
const vk_flash_attn_push_constants pc = { N, KV,
|
||||
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
|
||||
(uint32_t)neq2, (uint32_t)neq3,
|
||||
(uint32_t)nek2, (uint32_t)nek3,
|
||||
(uint32_t)nev2, (uint32_t)nev3,
|
||||
nem1,
|
||||
nem1, nem2, nem3,
|
||||
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
|
||||
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
|
||||
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
|
||||
nbm1,
|
||||
scale, max_bias, logit_softcap,
|
||||
mask != nullptr, n_head_log2, m0, m1,
|
||||
mask_n_head_log2, m0, m1,
|
||||
gqa_ratio, split_kv, split_k };
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -6309,13 +6382,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
|
||||
const std::array<uint32_t, 4> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
pc2, { (uint32_t)ne1, 1, 1 });
|
||||
pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
|
||||
} else {
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
@@ -6481,6 +6554,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_GELU:
|
||||
return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_RELU:
|
||||
@@ -6507,6 +6582,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16];
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16];
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -7639,7 +7718,13 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
|
||||
const uint32_t nrows_y = (uint32_t)src0->ne[1];
|
||||
|
||||
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||
const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u;
|
||||
const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u;
|
||||
const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u;
|
||||
const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u;
|
||||
const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u;
|
||||
|
||||
const uint32_t n_head_kv = src0->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
@@ -7648,6 +7733,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
|
||||
ncols,
|
||||
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
|
||||
ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
scale, max_bias,
|
||||
m0, m1,
|
||||
n_head_log2,
|
||||
@@ -8827,6 +8915,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
switch (ggml_get_unary_op(node)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
@@ -8841,6 +8930,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -9072,6 +9163,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
switch (ggml_get_unary_op(node)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
@@ -9087,6 +9179,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
break;
|
||||
default:
|
||||
@@ -9289,6 +9383,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
@@ -9304,6 +9399,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
buf = tensor->buffer;
|
||||
break;
|
||||
default:
|
||||
@@ -10095,6 +10192,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
@@ -10113,6 +10211,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
@@ -10127,9 +10227,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
|
||||
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
|
||||
return false;
|
||||
if (op->op == GGML_OP_MUL_MAT_ID) {
|
||||
if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
|
||||
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
|
||||
return false;
|
||||
}
|
||||
// Check against size of shared memory variable
|
||||
if (op->src[2]->ne[0] > 4096) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_F32:
|
||||
@@ -10187,19 +10293,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
bool coopmat2 = device->coopmat2;
|
||||
switch (op->src[0]->ne[0]) {
|
||||
case 64:
|
||||
case 80:
|
||||
case 96:
|
||||
case 112:
|
||||
case 128:
|
||||
case 256:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
// different head sizes of K and V are not supported yet
|
||||
FaHeadSizes head_sizes = fa_get_head_sizes(op->src[1]->ne[0], op->src[2]->ne[0]);
|
||||
if (head_sizes == FA_HEAD_SIZE_UNSUPPORTED) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
@@ -10279,6 +10374,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
@@ -10369,6 +10470,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
case GGML_OP_ARGSORT:
|
||||
@@ -10835,6 +10937,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
|
||||
@@ -11,7 +11,8 @@
|
||||
#include "types.comp"
|
||||
#include "flash_attn_base.comp"
|
||||
|
||||
const uint32_t D_per_thread = D / D_split;
|
||||
const uint32_t HSK_per_thread = HSK / D_split;
|
||||
const uint32_t HSV_per_thread = HSV / D_split;
|
||||
|
||||
const uint32_t cols_per_iter = WorkGroupSize / D_split;
|
||||
const uint32_t cols_per_thread = Bc / cols_per_iter;
|
||||
@@ -29,7 +30,7 @@ layout (binding = 3) readonly buffer M {float16_t data_m[];};
|
||||
// Rows index by Q's dimension 2, and the first N rows are valid.
|
||||
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
|
||||
{
|
||||
uint32_t offset = (iq2 + r) * D + c;
|
||||
uint32_t offset = (iq2 + r) * HSV + c;
|
||||
data_o[o_offset + offset] = D_TYPE(elem);
|
||||
return elem;
|
||||
}
|
||||
@@ -38,7 +39,7 @@ shared FLOAT_TYPE tmpsh[WorkGroupSize];
|
||||
shared vec4 tmpshv4[WorkGroupSize];
|
||||
|
||||
shared float masksh[Bc][Br];
|
||||
shared vec4 Qf[Br][D / 4];
|
||||
shared vec4 Qf[Br][HSK / 4];
|
||||
|
||||
void main() {
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
@@ -53,18 +54,18 @@ void main() {
|
||||
|
||||
uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (D / 4);
|
||||
uint32_t r = (idx + tid) / (D / 4);
|
||||
if (r < Br && d < D / 4 &&
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (HSK / 4);
|
||||
uint32_t r = (idx + tid) / (HSK / 4);
|
||||
if (r < Br && d < HSK / 4 &&
|
||||
i * Br + r < N) {
|
||||
Qf[r][d] = vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d]) * p.scale;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
|
||||
vec4 Of[Br][D_per_thread / 4];
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
vec4 Of[Br][HSV_per_thread / 4];
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
Of[r][d] = vec4(0.0);
|
||||
}
|
||||
@@ -99,6 +100,10 @@ void main() {
|
||||
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
|
||||
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
|
||||
#endif
|
||||
uint32_t m_offset = 0;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint32_t j = start_j; j < end_j; ++j) {
|
||||
@@ -112,7 +117,7 @@ void main() {
|
||||
|
||||
|
||||
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSK_per_thread / 4; ++d) {
|
||||
#if BLOCK_SIZE > 1
|
||||
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
|
||||
uint ib = coord / BLOCK_SIZE;
|
||||
@@ -144,13 +149,13 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
if (p.mask != 0) {
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t c = (idx + tid) % Bc;
|
||||
uint32_t r = (idx + tid) / Bc;
|
||||
if (idx + tid < Bc * Br) {
|
||||
masksh[c][r] = float(data_m[(i * Br + r) * m_stride + (j * Bc + c)]);
|
||||
masksh[c][r] = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
@@ -191,14 +196,14 @@ void main() {
|
||||
Lf[r] = eMf[r]*Lf[r] + rowsumf[r];
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
Of[r][d] = eMf[r] * Of[r][d];
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
#if BLOCK_SIZE > 1
|
||||
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
|
||||
uint ib = coord / BLOCK_SIZE;
|
||||
@@ -255,7 +260,7 @@ void main() {
|
||||
Lf[r] = tmpsh[d_tid];
|
||||
barrier();
|
||||
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
|
||||
Of[r][d] = eMf * Of[r][d];
|
||||
tmpshv4[tid] = Of[r][d];
|
||||
@@ -277,11 +282,11 @@ void main() {
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
uint32_t o_offset = D * p.ne1 * split_k_index;
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (r < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
|
||||
}
|
||||
@@ -289,7 +294,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (r < N) {
|
||||
perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
|
||||
@@ -305,18 +310,18 @@ void main() {
|
||||
Lfrcp[r] = 1.0 / Lf[r];
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
Of[r][d] *= Lfrcp[r];
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
if (p.gqa_ratio > 1) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (r < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
|
||||
}
|
||||
@@ -326,9 +331,9 @@ void main() {
|
||||
} else {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
if (i * Br + r < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
data_o[o_offset + iq2 * D + (i * Br + r) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
|
||||
data_o[o_offset + iq2 * HSV + (i * Br + r) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
|
||||
layout (constant_id = 1) const uint32_t Br = 1;
|
||||
layout (constant_id = 2) const uint32_t Bc = 32;
|
||||
layout (constant_id = 3) const uint32_t D = 32;
|
||||
layout (constant_id = 4) const uint32_t Clamp = 0;
|
||||
layout (constant_id = 5) const uint32_t D_split = 16;
|
||||
|
||||
layout (constant_id = 3) const uint32_t HSK = 32;
|
||||
layout (constant_id = 4) const uint32_t HSV = 32;
|
||||
layout (constant_id = 5) const uint32_t Clamp = 0;
|
||||
layout (constant_id = 6) const uint32_t D_split = 16;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t N;
|
||||
@@ -24,6 +24,8 @@ layout (push_constant) uniform parameter {
|
||||
uint32_t nev2;
|
||||
uint32_t nev3;
|
||||
uint32_t nem1;
|
||||
uint32_t nem2;
|
||||
uint32_t nem3;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
@@ -34,14 +36,12 @@ layout (push_constant) uniform parameter {
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t nb31;
|
||||
|
||||
float scale;
|
||||
float max_bias;
|
||||
float logit_softcap;
|
||||
|
||||
uint32_t mask;
|
||||
uint32_t n_head_log2;
|
||||
uint32_t mask_n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
|
||||
@@ -50,6 +50,9 @@ layout (push_constant) uniform parameter {
|
||||
uint32_t k_num;
|
||||
} p;
|
||||
|
||||
#define MASK_ENABLE_BIT (1<<16)
|
||||
#define N_LOG2_MASK 0xFFFF
|
||||
|
||||
layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
|
||||
|
||||
#if defined(A_TYPE_PACKED16)
|
||||
@@ -100,8 +103,10 @@ ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const i
|
||||
{
|
||||
const uint32_t h = iq2 + (r % p.gqa_ratio);
|
||||
|
||||
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
|
||||
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
|
||||
uint32_t n_head_log2 = p.mask_n_head_log2 & N_LOG2_MASK;
|
||||
|
||||
const ACC_TYPE base = ACC_TYPE(h < n_head_log2 ? p.m0 : p.m1);
|
||||
const int exph = int(h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1);
|
||||
|
||||
return ACC_TYPE(pow(base, ACC_TYPE(exph)));
|
||||
}
|
||||
|
||||
@@ -13,7 +13,9 @@
|
||||
#include "types.comp"
|
||||
#include "flash_attn_base.comp"
|
||||
|
||||
const uint32_t D_per_thread = D / D_split;
|
||||
const uint32_t HSK_per_thread = HSK / D_split;
|
||||
const uint32_t HSV_per_thread = HSV / D_split;
|
||||
|
||||
const uint32_t row_split = 4;
|
||||
const uint32_t rows_per_thread = Br / row_split;
|
||||
const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split;
|
||||
@@ -32,7 +34,7 @@ layout (binding = 3) readonly buffer M {float16_t data_m[];};
|
||||
// Rows index by Q's dimension 2, and the first N rows are valid.
|
||||
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
|
||||
{
|
||||
uint32_t offset = (iq2 + r) * D + c;
|
||||
uint32_t offset = (iq2 + r) * HSV + c;
|
||||
data_o[o_offset + offset] = D_TYPE(elem);
|
||||
return elem;
|
||||
}
|
||||
@@ -44,14 +46,14 @@ const uint32_t MatBc = 16;
|
||||
shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
|
||||
shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
|
||||
|
||||
const uint32_t qstride = D / 4 + 2; // in units of f16vec4
|
||||
const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4
|
||||
shared f16vec4 Qf[Br * qstride];
|
||||
|
||||
// Avoid padding for D==256 to make it fit in 48KB shmem.
|
||||
const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
|
||||
// Avoid padding for hsk==256 to make it fit in 48KB shmem.
|
||||
const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
|
||||
shared ACC_TYPE sfsh[Bc * sfshstride];
|
||||
|
||||
const uint32_t kshstride = D / 4 + 2; // in units of f16vec4
|
||||
const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4
|
||||
shared f16vec4 ksh[Bc * kshstride];
|
||||
|
||||
shared float slope[Br];
|
||||
@@ -74,18 +76,18 @@ void main() {
|
||||
|
||||
uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (D / 4);
|
||||
uint32_t r = (idx + tid) / (D / 4);
|
||||
if (r < Br && d < D / 4 &&
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (HSK / 4);
|
||||
uint32_t r = (idx + tid) / (HSK / 4);
|
||||
if (r < Br && d < HSK / 4 &&
|
||||
i * Br + r < N) {
|
||||
Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
|
||||
ACC_TYPEV4 Of[rows_per_thread][D_per_thread / 4];
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
ACC_TYPEV4 Of[rows_per_thread][HSV_per_thread / 4];
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Of[r][d] = ACC_TYPEV4(0.0);
|
||||
}
|
||||
@@ -123,14 +125,18 @@ void main() {
|
||||
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
|
||||
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
|
||||
#endif
|
||||
uint32_t m_offset = 0;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint32_t j = start_j; j < end_j; ++j) {
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * D / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (D / 4);
|
||||
uint32_t c = (idx + tid) / (D / 4);
|
||||
if (c < Bc && d < D / 4) {
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t d = (idx + tid) % (HSK / 4);
|
||||
uint32_t c = (idx + tid) / (HSK / 4);
|
||||
if (c < Bc && d < HSK / 4) {
|
||||
#if BLOCK_SIZE > 1
|
||||
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
|
||||
uint ib = coord / BLOCK_SIZE;
|
||||
@@ -145,14 +151,14 @@ void main() {
|
||||
}
|
||||
barrier();
|
||||
|
||||
// K * Q^T -> S^T: Bc x D * D x Br -> Bc x Br
|
||||
// Bc split across workgroup (four subgroups), loop over D in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
|
||||
// K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br
|
||||
// Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
|
||||
// This is written transposed in order to allow for N being 8 if implementations need it
|
||||
coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
|
||||
coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
|
||||
coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
|
||||
|
||||
for (uint32_t d = 0; d < D / 16; ++d) {
|
||||
for (uint32_t d = 0; d < HSK / 16; ++d) {
|
||||
coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
|
||||
@@ -176,12 +182,12 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (p.mask != 0) {
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t c = (idx + tid) % Bc;
|
||||
uint32_t r = (idx + tid) / Bc;
|
||||
if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
|
||||
sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[(i * Br + r) * m_stride + (j * Bc + c)]));
|
||||
sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]));
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
@@ -202,7 +208,7 @@ void main() {
|
||||
eMf[r] = exp(Moldf - Mf[r]);
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Of[r][d] = float16_t(eMf[r]) * Of[r][d];
|
||||
}
|
||||
@@ -217,7 +223,7 @@ void main() {
|
||||
Pf[r] = exp(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride] - Mf[r]);
|
||||
Lf[r] += Pf[r];
|
||||
}
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
#if BLOCK_SIZE > 1
|
||||
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
|
||||
uint ib = coord / BLOCK_SIZE;
|
||||
@@ -280,7 +286,7 @@ void main() {
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
|
||||
Of[r][d] = float16_t(eMf[r]) * Of[r][d];
|
||||
tmpshv4[tid] = Of[r][d];
|
||||
@@ -300,11 +306,11 @@ void main() {
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
uint32_t o_offset = D * p.ne1 * split_k_index;
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (tile_row(r) < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
|
||||
}
|
||||
@@ -312,7 +318,7 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (tile_row(r) < N) {
|
||||
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
|
||||
@@ -328,18 +334,18 @@ void main() {
|
||||
Lfrcp[r] = 1.0 / Lf[r];
|
||||
}
|
||||
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Of[r][d] *= float16_t(Lfrcp[r]);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
if (p.gqa_ratio > 1) {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (tile_row(r) < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
|
||||
}
|
||||
@@ -349,9 +355,9 @@ void main() {
|
||||
} else {
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
if (i * Br + tile_row(r) < N) {
|
||||
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
|
||||
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
|
||||
data_o[o_offset + iq2 * D + (i * Br + tile_row(r)) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
|
||||
data_o[o_offset + iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,8 +61,8 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
||||
// Rows index by Q's dimension 2, and the first N rows are valid.
|
||||
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
|
||||
{
|
||||
if (r < N && c < D) {
|
||||
uint32_t offset = (iq2 + r) * D + c;
|
||||
if (r < N && c < HSV) {
|
||||
uint32_t offset = (iq2 + r) * HSV + c;
|
||||
data_o[o_offset + offset] = D_TYPE(elem);
|
||||
}
|
||||
return elem;
|
||||
@@ -86,9 +86,9 @@ void main() {
|
||||
tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
|
||||
#endif
|
||||
|
||||
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D);
|
||||
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
|
||||
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
|
||||
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, HSK);
|
||||
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, HSK);
|
||||
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, HSV);
|
||||
|
||||
// hint to the compiler that strides are aligned for the aligned variant of the shader
|
||||
if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
|
||||
@@ -104,16 +104,16 @@ void main() {
|
||||
tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
|
||||
tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
|
||||
|
||||
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Q;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
|
||||
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseAccumulator> Q;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA> Qf16;
|
||||
|
||||
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
|
||||
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D));
|
||||
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK));
|
||||
|
||||
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA>(Q);
|
||||
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA>(Q);
|
||||
Qf16 *= float16_t(p.scale);
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
|
||||
|
||||
@@ -130,15 +130,20 @@ void main() {
|
||||
coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2);
|
||||
}
|
||||
|
||||
uint32_t m_offset = 0;
|
||||
if (p.nem2 != 1 || p.nem3 != 1) {
|
||||
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/;
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint32_t j = start_j; j < end_j; ++j) {
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, D, Bc, gl_MatrixUseB> K_T;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, HSK, Bc, gl_MatrixUseB> K_T;
|
||||
|
||||
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC);
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC);
|
||||
S = coopMatMulAdd(Qf16, K_T, S);
|
||||
|
||||
if (p.logit_softcap != 0.0f) {
|
||||
@@ -148,14 +153,14 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
if (p.mask != 0) {
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
||||
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
|
||||
coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
|
||||
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
}
|
||||
@@ -203,42 +208,42 @@ void main() {
|
||||
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
|
||||
rowsum = coopMatMulAdd(P_A, One, rowsum);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Bc, D, gl_MatrixUseB> V;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV, gl_MatrixUseB> V;
|
||||
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC);
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC);
|
||||
|
||||
L = eM*L + rowsum;
|
||||
|
||||
// This is the "diagonal" matrix in the paper, but since we do componentwise
|
||||
// multiply rather than matrix multiply it has the diagonal element smeared
|
||||
// across the row
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> eMdiag;
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> eMdiag;
|
||||
|
||||
// resize eM by using smear/reduce
|
||||
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
||||
|
||||
// multiply with fp16 accumulation, then add to O.
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
|
||||
PV = coopMatMulAdd(P_A, V, PV);
|
||||
|
||||
O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(PV);
|
||||
O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(PV);
|
||||
}
|
||||
|
||||
// If there is split_k, then the split_k resolve shader does the final
|
||||
// division by L. Store the intermediate O value and per-row m and L values.
|
||||
if (p.k_num > 1) {
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
|
||||
|
||||
uint32_t o_offset = D * p.ne1 * split_k_index;
|
||||
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
||||
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
||||
|
||||
o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
|
||||
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
||||
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
|
||||
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
|
||||
return;
|
||||
}
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Ldiag;
|
||||
|
||||
// resize L by using smear/reduce
|
||||
coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
||||
@@ -250,18 +255,18 @@ void main() {
|
||||
|
||||
O = Ldiag*O;
|
||||
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
||||
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
|
||||
if (p.gqa_ratio > 1) {
|
||||
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
||||
} else {
|
||||
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, HSV);
|
||||
|
||||
// permute dimensions
|
||||
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
|
||||
|
||||
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, D), tensorViewPermute);
|
||||
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ layout (binding = 1) writeonly buffer D {float data_d[];};
|
||||
layout (push_constant) uniform parameter {
|
||||
uint D;
|
||||
uint N;
|
||||
uint ne3;
|
||||
uint k_num;
|
||||
} p;
|
||||
|
||||
@@ -19,13 +20,14 @@ void main() {
|
||||
// Each workgroup handles a row
|
||||
const uint n = gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint iq3 = gl_WorkGroupID.z;
|
||||
|
||||
uint D = p.D;
|
||||
uint N = p.N;
|
||||
uint k_num = p.k_num;
|
||||
|
||||
uint l_offset = D * N * k_num + n;
|
||||
uint m_offset = D * N * k_num + N + n;
|
||||
uint l_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + n;
|
||||
uint m_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + N + n;
|
||||
uint lm_stride = N * 2;
|
||||
|
||||
// Compute the max m value for the row
|
||||
@@ -49,11 +51,11 @@ void main() {
|
||||
for (uint d = tid; d < D; d += BLOCK_SIZE) {
|
||||
float O = 0.0;
|
||||
[[unroll]] for (uint k = 0; k < k_num; ++k) {
|
||||
uint o_offset = D * N * k + D * n + d;
|
||||
uint o_offset = D * N * (k + iq3 * k_num) + D * n + d;
|
||||
float m = data_a[m_offset + k * lm_stride];
|
||||
O += exp(m - m_max) * data_a[o_offset];
|
||||
}
|
||||
O *= L;
|
||||
data_d[D * n + d] = O;
|
||||
data_d[iq3 * D * N + D * n + d] = O;
|
||||
}
|
||||
}
|
||||
|
||||
27
ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp
Normal file
27
ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp
Normal file
@@ -0,0 +1,27 @@
|
||||
#version 450
|
||||
|
||||
#include "glu_head.comp"
|
||||
|
||||
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
||||
// ref: https://www.johndcook.com/blog/python_erf/
|
||||
const float p_erf = 0.3275911f;
|
||||
const float a1_erf = 0.254829592f;
|
||||
const float a2_erf = -0.284496736f;
|
||||
const float a3_erf = 1.421413741f;
|
||||
const float a4_erf = -1.453152027f;
|
||||
const float a5_erf = 1.061405429f;
|
||||
|
||||
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
|
||||
float op(float a, float b) {
|
||||
const float a_div_sqr2 = a * SQRT_2_INV;
|
||||
const float sign_x = sign(a_div_sqr2);
|
||||
const float x = abs(a_div_sqr2);
|
||||
const float t = 1.0f / (1.0f + p_erf * x);
|
||||
const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
||||
const float erf_approx = sign_x * y;
|
||||
|
||||
return 0.5f * a * (1.0f + erf_approx) * b;
|
||||
}
|
||||
|
||||
#include "glu_main.comp"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user