mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-16 22:14:07 +00:00
Compare commits
38 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4ec6291a24 | ||
|
|
a12363bbf0 | ||
|
|
a86f52b285 | ||
|
|
b284197df4 | ||
|
|
221c0e0c58 | ||
|
|
07a19e27a2 | ||
|
|
18f3b5ff9e | ||
|
|
7233358d29 | ||
|
|
6c88b3bb25 | ||
|
|
14c28dfc50 | ||
|
|
8c988fa41d | ||
|
|
acd6cb1c41 | ||
|
|
84712b6043 | ||
|
|
d4d1522b20 | ||
|
|
d1aa0cc5d1 | ||
|
|
c8ade30036 | ||
|
|
e28c0b80c2 | ||
|
|
8e6f8bc875 | ||
|
|
adef81781a | ||
|
|
48b86c4fdb | ||
|
|
38d3af1b73 | ||
|
|
6c9ee3b17e | ||
|
|
cd465d823c | ||
|
|
922042601b | ||
|
|
2ba1333b35 | ||
|
|
c2e058f1b4 | ||
|
|
c82d48ec23 | ||
|
|
b4efd77f8a | ||
|
|
2be60cbc27 | ||
|
|
b526ad2668 | ||
|
|
938b785764 | ||
|
|
36c153248f | ||
|
|
a979ca22db | ||
|
|
90083283ec | ||
|
|
d4b91ea7b2 | ||
|
|
83f5872404 | ||
|
|
f0d4d176df | ||
|
|
b17230917c |
@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true # OnePerLine
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
BreakBeforeBraces: Custom # Attach
|
||||
BraceWrapping:
|
||||
@@ -70,15 +70,18 @@ ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h>'
|
||||
- Regex: '".*"'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
- Regex: '^<.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 4
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentAccessModifiers: false
|
||||
|
||||
2
.github/workflows/close-issue.yml
vendored
2
.github/workflows/close-issue.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
|
||||
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
|
||||
@@ -9,3 +9,4 @@
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/gguf.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
|
||||
@@ -270,7 +270,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
|
||||
## Obtaining and quantizing models
|
||||
@@ -436,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
## [`llama-perplexity`](tools/perplexity)
|
||||
|
||||
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
|
||||
#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
|
||||
|
||||
- <details open>
|
||||
<summary>Measure the perplexity over a text file</summary>
|
||||
@@ -459,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
</details>
|
||||
|
||||
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
|
||||
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
||||
[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
||||
|
||||
## [`llama-bench`](tools/llama-bench)
|
||||
|
||||
|
||||
@@ -1612,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.antiprompt.emplace_back(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-sp", "--special"},
|
||||
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
||||
@@ -2655,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.i_chunk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--show-statistics"},
|
||||
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.show_statistics = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--parse-special"},
|
||||
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
||||
|
||||
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
|
||||
bool has_suffix = string_ends_with(str, suffix);
|
||||
if (has_suffix) {
|
||||
str = str.substr(0, str.size() - suffix.size());
|
||||
}
|
||||
return has_suffix;
|
||||
}
|
||||
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
||||
if (!str.empty() && !stop.empty()) {
|
||||
const char text_last_char = str.back();
|
||||
|
||||
@@ -432,9 +432,10 @@ struct common_params {
|
||||
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||
int32_t i_chunk = 0; // start processing from this chunk
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool show_statistics = false; // show imatrix statistics per tensor
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
@@ -534,6 +535,7 @@ static bool string_starts_with(const std::string & str,
|
||||
|
||||
// While we wait for C++20's std::string::ends_with...
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
|
||||
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
|
||||
@@ -6486,7 +6486,7 @@ class JaisModel(TextModel):
|
||||
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
||||
|
||||
|
||||
@ModelBase.register("Glm4ForCausalLM")
|
||||
@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
|
||||
class Glm4Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GLM4
|
||||
|
||||
@@ -6508,7 +6508,8 @@ class Glm4Model(TextModel):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
rope_dim = self.hparams["head_dim"]
|
||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
@@ -6516,6 +6517,13 @@ class Glm4Model(TextModel):
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("model.visual."): # ignore visual part of Glm4v
|
||||
return []
|
||||
elif name.startswith("model.language_model."):
|
||||
name = name.replace("language_model.", "") # for Glm4v
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
class ChatGLMModel(TextModel):
|
||||
|
||||
@@ -305,9 +305,8 @@ On Linux it is possible to use unified memory architecture (UMA) to share main m
|
||||
|
||||
## Vulkan
|
||||
|
||||
**Windows**
|
||||
|
||||
### w64devkit
|
||||
### For Windows Users:
|
||||
**w64devkit**
|
||||
|
||||
Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
|
||||
|
||||
@@ -334,7 +333,7 @@ cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
### Git Bash MINGW64
|
||||
**Git Bash MINGW64**
|
||||
|
||||
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
|
||||
|
||||
@@ -357,7 +356,8 @@ Now you can load the model in conversation mode using `Vulkan`
|
||||
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
|
||||
```
|
||||
|
||||
### MSYS2
|
||||
**MSYS2**
|
||||
|
||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||
```sh
|
||||
pacman -S git \
|
||||
@@ -373,9 +373,9 @@ cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
**With docker**:
|
||||
### For Docker users:
|
||||
|
||||
You don't need to install Vulkan SDK. It will be installed inside the container.
|
||||
You don't need to install the Vulkan SDK. It will be installed inside the container.
|
||||
|
||||
```sh
|
||||
# Build the image
|
||||
@@ -385,32 +385,29 @@ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
**Without docker**:
|
||||
### For Linux users:
|
||||
|
||||
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
|
||||
First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
|
||||
|
||||
For example, on Ubuntu 22.04 (jammy), use the command below:
|
||||
> [!IMPORTANT]
|
||||
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
|
||||
|
||||
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
|
||||
```bash
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
apt update -y
|
||||
apt-get install -y vulkan-sdk
|
||||
# To verify the installation, use the command below:
|
||||
vulkaninfo
|
||||
```
|
||||
|
||||
Alternatively your package manager might be able to provide the appropriate libraries.
|
||||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
|
||||
Then, assuming you have `cd` into your llama.cpp folder and there are no errors with running `vulkaninfo`, you can proceed to build llama.cpp using the CMake commands below:
|
||||
```bash
|
||||
cmake -B build -DGGML_VULKAN=1
|
||||
cmake --build build --config Release
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
```
|
||||
|
||||
Finally, after finishing your build, you should be able to do something like this:
|
||||
```bash
|
||||
# Test the output binary
|
||||
# "-ngl 99" should offload all of the layers to GPU for most (if not all) models.
|
||||
./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -ngl 99
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
|
||||
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||
aclTensor* acl_weight_tensor;
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
if (weightToNZ && is_matmul_weight(weight)) {
|
||||
int64_t acl_stride[2] = {1, transpose_ne[1]};
|
||||
|
||||
// Reverse ne.
|
||||
std::reverse(transpose_ne, transpose_ne + n_dims);
|
||||
|
||||
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
|
||||
|
||||
acl_weight_tensor = aclCreateTensor(
|
||||
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
|
||||
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
|
||||
} else {
|
||||
acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
||||
}
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
#include <unordered_set>
|
||||
#include <functional>
|
||||
#include <aclnnop/aclnn_abs.h>
|
||||
#include <aclnnop/aclnn_neg.h>
|
||||
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
|
||||
*/
|
||||
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Check whether a tensor is a weight tensor for matrix multiplication.
|
||||
*
|
||||
* @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
|
||||
* typically within neural network layers. The function maintains a static set of canonical weight
|
||||
* naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
|
||||
* tensors even with hierarchical naming patterns.
|
||||
*
|
||||
* @param tensor Pointer to the target ggml_tensor object (const-qualified).
|
||||
*/
|
||||
static bool is_matmul_weight(const ggml_tensor* tensor) {
|
||||
std::string name = ggml_get_name(tensor);
|
||||
static const std::unordered_set<std::string> weight_suffixes{
|
||||
"output.weight",
|
||||
"attn_q.weight",
|
||||
"attn_k.weight",
|
||||
"attn_v.weight",
|
||||
"attn_output.weight",
|
||||
"ffn_gate.weight",
|
||||
"ffn_up.weight",
|
||||
"ffn_down.weight"
|
||||
};
|
||||
|
||||
for (const auto& suffix : weight_suffixes) {
|
||||
if (name.find(suffix) != std::string::npos) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a element-wise operation to two input tensors using the CANN
|
||||
* backend.
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include <acl/acl.h>
|
||||
#include <stdarg.h>
|
||||
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
|
||||
aclDataType dataType, aclTensor **tensor)
|
||||
{
|
||||
uint64_t size = 1;
|
||||
for (auto i : shape) {
|
||||
size *= i;
|
||||
}
|
||||
|
||||
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
|
||||
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
|
||||
|
||||
size *= sizeof(int16_t);
|
||||
|
||||
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
|
||||
std::vector<int64_t> strides(shape.size(), 1);
|
||||
for (int64_t i = shape.size() - 2; i >= 0; i--) {
|
||||
strides[i] = shape[i + 1] * strides[i + 1];
|
||||
}
|
||||
|
||||
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
|
||||
shape.data(), shape.size(), *deviceAddr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
|
||||
aclrtStream stream;
|
||||
ACL_CHECK(aclrtCreateStream(&stream));
|
||||
|
||||
std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
|
||||
void *weightTransposedDeviceAddr = nullptr;
|
||||
aclTensor *weightTransposed = nullptr;
|
||||
CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
|
||||
ggml_cann_type_mapping(tensor->type), &weightTransposed);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
void *workspaceAddr = nullptr;
|
||||
|
||||
// TransMatmulWeight
|
||||
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
|
||||
std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
|
||||
if (workspaceSize > 0) {
|
||||
ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
workspaceAddrPtrTrans.reset(workspaceAddr);
|
||||
}
|
||||
ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
|
||||
|
||||
size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
|
||||
|
||||
aclrtMemcpy((char *)tensor->data + offset, size,
|
||||
weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
||||
aclrtFree(weightTransposedDeviceAddr);
|
||||
}
|
||||
|
||||
// TODO: need handle tensor which has paddings.
|
||||
/**
|
||||
* @brief Set tensor data in a CANN buffer.
|
||||
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
|
||||
// For acl, synchronous functions use this default stream.
|
||||
// Why aclrtSynchronizeDevice?
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE));
|
||||
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
weight_format_to_nz(tensor, data, offset);
|
||||
}
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
||||
|
||||
@@ -494,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.9.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.11.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
|
||||
@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
__m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
||||
__m128 tmp = max4;
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
|
||||
const float max_scalar = ((v4f32)max4)[0];
|
||||
|
||||
// Quantize these floats
|
||||
|
||||
@@ -22,9 +22,94 @@
|
||||
|
||||
#include "kai_common.h"
|
||||
|
||||
#include "simd-mappings.h"
|
||||
|
||||
#include "kernels.h"
|
||||
|
||||
#define NELEMS(x) sizeof(x) / sizeof(*x)
|
||||
|
||||
static const size_t INT4_PER_BYTE = 2;
|
||||
static const size_t INT4_BITS = 4;
|
||||
static const int Q4_0_ZERO_POINT = 8;
|
||||
const size_t INT4_PER_UINT16 = 4;
|
||||
|
||||
static void dequantize_row_qsi4c32pscalef16(
|
||||
const void *packed_data,
|
||||
int32_t row_idx,
|
||||
int64_t nc,
|
||||
float *out,
|
||||
size_t nr_pack,
|
||||
size_t packed_row_stride,
|
||||
size_t kr,
|
||||
size_t bl,
|
||||
size_t num_bytes_multiplier
|
||||
) {
|
||||
size_t group_idx = row_idx / nr_pack;
|
||||
size_t row_in_group = row_idx % nr_pack;
|
||||
const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
|
||||
size_t num_blocks = nc / bl;
|
||||
const uint8_t *block_ptr = packed_group;
|
||||
|
||||
for (size_t b = 0; b < num_blocks; ++b) {
|
||||
uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier));
|
||||
float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
|
||||
|
||||
const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier;
|
||||
size_t num_segments = bl / kr;
|
||||
size_t num_bytes_per_segment = kr / INT4_PER_BYTE;
|
||||
|
||||
for (size_t s = 0; s < num_segments; ++s) {
|
||||
const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment;
|
||||
const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment;
|
||||
for (size_t k = 0; k < num_bytes_per_segment; ++k) {
|
||||
uint8_t byte = qbytes[k] ^ 0x88;
|
||||
int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT;
|
||||
int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT;
|
||||
out[b * bl + s * num_bytes_per_segment + k] = x0 * scale;
|
||||
out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale;
|
||||
}
|
||||
}
|
||||
block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_row_qsi4c32ps1s0scalef16(
|
||||
const void *packed_data,
|
||||
int32_t row_idx,
|
||||
int64_t k,
|
||||
float *out,
|
||||
size_t nr,
|
||||
size_t packed_row_stride,
|
||||
size_t kr,
|
||||
size_t bl,
|
||||
size_t num_bytes_multiplier
|
||||
) {
|
||||
const size_t num_blocks = k / bl;
|
||||
const size_t bl4 = bl / INT4_PER_UINT16;
|
||||
|
||||
size_t group_idx = row_idx / nr;
|
||||
size_t row_in_group = row_idx % nr;
|
||||
|
||||
const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
|
||||
const uint16_t *qdata = (const uint16_t *)packed_group;
|
||||
const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier));
|
||||
|
||||
for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
|
||||
uint16_t scale_f16 = scales[row_in_group + block_idx * nr];
|
||||
float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
|
||||
|
||||
for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) {
|
||||
uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group];
|
||||
|
||||
for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) {
|
||||
int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT;
|
||||
out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_UNUSED(kr);
|
||||
}
|
||||
|
||||
static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
#if defined(__ARM_FEATURE_SME)
|
||||
{
|
||||
@@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
/* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
@@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .packed_stride = */ NULL,
|
||||
/* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .to_float = */ NULL,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
@@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
@@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
@@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
@@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
|
||||
@@ -71,12 +71,15 @@ struct rhs_packing_info {
|
||||
std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
|
||||
std::function<size_t(size_t n, size_t k)>
|
||||
> packed_size;
|
||||
size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl);
|
||||
std::variant<
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
|
||||
const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
|
||||
const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
|
||||
> pack_func;
|
||||
void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride,
|
||||
size_t kr, size_t bl, size_t num_bytes_multiplier);
|
||||
};
|
||||
|
||||
struct ggml_kleidiai_kernels {
|
||||
|
||||
@@ -40,6 +40,17 @@ struct ggml_kleidiai_context {
|
||||
ggml_kleidiai_kernels * kernels;
|
||||
} static ctx = { CPU_FEATURE_NONE, NULL };
|
||||
|
||||
static const char* cpu_feature_to_string(cpu_feature f) {
|
||||
switch (f) {
|
||||
case CPU_FEATURE_NONE: return "NONE";
|
||||
case CPU_FEATURE_DOTPROD: return "DOTPROD";
|
||||
case CPU_FEATURE_I8MM: return "I8MM";
|
||||
case CPU_FEATURE_SVE: return "SVE";
|
||||
case CPU_FEATURE_SME: return "SME";
|
||||
default: return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
static void init_kleidiai_context(void) {
|
||||
|
||||
ggml_critical_section_start();
|
||||
@@ -62,6 +73,11 @@ static void init_kleidiai_context(void) {
|
||||
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
}
|
||||
ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
||||
#ifndef NDEBUG
|
||||
if (ctx.kernels) {
|
||||
GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
@@ -102,6 +118,9 @@ static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint1
|
||||
|
||||
class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
|
||||
if (op->op != GGML_OP_MUL_MAT) {
|
||||
return false;
|
||||
}
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
|
||||
GGML_ASSERT(kernels);
|
||||
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
@@ -135,6 +154,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_kv_cache(params, dst);
|
||||
}
|
||||
} else if (dst->op == GGML_OP_GET_ROWS) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_get_rows(params, dst);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -270,6 +293,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
}
|
||||
|
||||
bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
@@ -342,8 +367,49 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
|
||||
kernel_info * kernel = &ctx.kernels->gemm;
|
||||
|
||||
const int64_t nc = ne00;
|
||||
const int64_t nr = ggml_nelements(src1);
|
||||
|
||||
const size_t block_rows = kernel->get_nr();
|
||||
const size_t kr = kernel->get_kr();
|
||||
|
||||
const size_t num_bytes_multiplier = sizeof(uint16_t);
|
||||
const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
const int ir0 = dr * ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int64_t i = ir0; i < ir1; ++i) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
int64_t row_idx = ((const int32_t *)src1->data)[i];
|
||||
GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
|
||||
|
||||
float *out = (float *)((char *)dst->data + i * nb1);
|
||||
rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
|
||||
GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
const size_t n = tensor->ne[1];
|
||||
const size_t k = tensor->ne[0];
|
||||
@@ -351,17 +417,12 @@ public:
|
||||
size_t kr = ctx.kernels->gemm.get_kr();
|
||||
size_t sr = ctx.kernels->gemm.get_sr();
|
||||
|
||||
#ifndef NDEBUG
|
||||
const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
|
||||
GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
|
||||
#endif
|
||||
struct kai_rhs_pack_qs4cxs1s0_param params;
|
||||
params.lhs_zero_point = 1;
|
||||
params.rhs_zero_point = 8;
|
||||
variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, ¶ms);
|
||||
|
||||
return 0;
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
};
|
||||
@@ -375,8 +436,8 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
|
||||
static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
||||
@@ -418,18 +479,35 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
|
||||
const size_t n = tensor->ne[1];
|
||||
const size_t k = tensor->ne[0];
|
||||
const size_t nr = ctx.kernels->gemm.get_nr();
|
||||
const size_t kr = ctx.kernels->gemm.get_kr();
|
||||
|
||||
return variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::kleidiai {
|
||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT &&
|
||||
if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
|
||||
op->src[0]->type == GGML_TYPE_Q4_0 &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
|
||||
if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type == GGML_TYPE_F32 &&
|
||||
if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) &&
|
||||
ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
|
||||
return true;
|
||||
}
|
||||
@@ -438,7 +516,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
}
|
||||
|
||||
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT) {
|
||||
if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
@@ -469,7 +547,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ nullptr,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
|
||||
@@ -102,12 +102,12 @@ if (CUDAToolkit_FOUND)
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
|
||||
else ()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static)
|
||||
endif()
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
|
||||
@@ -6,24 +6,33 @@
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i >= k) {
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ib = i/qk; // block index
|
||||
const int64_t iqs = (i%qk)/qr; // quant index
|
||||
const int64_t iybs = i - i%qk; // y block start index
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
|
||||
|
||||
const int64_t ib = ibx0 + i00/qk; // block index
|
||||
const int64_t iqs = (i00%qk)/qr; // quant index
|
||||
const int64_t iybs = i00 - i00%qk; // y block start index
|
||||
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
dfloat2 v;
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
y[iybs + iqs + 0] = v.x;
|
||||
y[iybs + iqs + y_offset] = v.y;
|
||||
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
||||
y[iy0 + 0] = float(v.x);
|
||||
y[iy0 + y_offset] = float(v.y);
|
||||
}
|
||||
|
||||
template <bool need_check>
|
||||
@@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
static void dequantize_block_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
|
||||
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
@@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
@@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float, nv_bfloat16>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, nv_bfloat16>;
|
||||
default:
|
||||
@@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16, float>;
|
||||
default:
|
||||
|
||||
@@ -2,24 +2,13 @@
|
||||
|
||||
#include "ggml-common.h"
|
||||
|
||||
static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) {
|
||||
*dst = *src;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) {
|
||||
*dst = __float2half(*src);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) {
|
||||
*dst = *src;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) {
|
||||
*dst = *src;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) {
|
||||
*dst = *src;
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
|
||||
if constexpr (std::is_same_v<src_t, dst_t>) {
|
||||
*dst = *src;
|
||||
} else {
|
||||
*dst = float(*src);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
@@ -230,22 +219,7 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
convert_f32_f32((const float *)cxi, (float *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
||||
convert_f32_f16((const float *)cxi, (half *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
|
||||
convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
convert_f16_f16((const half *)cxi, (half *)cdsti);
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
convert_f16_f32((const half *)cxi, (float *)cdsti);
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
||||
}
|
||||
|
||||
@@ -8,10 +8,10 @@
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
@@ -139,43 +139,14 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f32_cuda(
|
||||
template<typename src_t, typename dst_t>
|
||||
static void ggml_cpy_flt_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_bf16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
@@ -307,16 +278,6 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
@@ -372,11 +333,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
@@ -403,9 +364,17 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
@@ -430,11 +399,11 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
return nullptr;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||
return (void*) cpy_flt<cpy_1_flt<float, float>>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
|
||||
return (void*) cpy_flt<cpy_1_flt<float, nv_bfloat16>>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
return (void*) cpy_flt<cpy_1_flt<float, half>>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
@@ -458,9 +427,17 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
return (void*) cpy_flt<cpy_1_flt<half, half>>;
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
return (void*) cpy_flt<cpy_1_flt<half, nv_bfloat16>>;
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||
return (void*) cpy_flt<cpy_1_flt<half, float>>;
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, half>>;
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, nv_bfloat16>>;
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, float>>;
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
@@ -23,33 +23,13 @@ typedef void (* fattn_kernel_t)(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3);
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33);
|
||||
|
||||
typedef half (*vec_dot_KQ_f16_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
@@ -745,33 +725,58 @@ void launch_fattn(
|
||||
size_t nb23 = V ? V->nb[3] : nb13;
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
K_data = (char *) K_f16.ptr;
|
||||
|
||||
const size_t bs = ggml_blck_size(K->type);
|
||||
const size_t ts = ggml_type_size(K->type);
|
||||
|
||||
nb11 = nb11*bs*sizeof(half)/ts;
|
||||
nb12 = nb12*bs*sizeof(half)/ts;
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
if (ggml_is_contiguously_allocated(K)) {
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
|
||||
nb11 = nb11*bs*sizeof(half)/ts;
|
||||
nb12 = nb12*bs*sizeof(half)/ts;
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
} else {
|
||||
GGML_ASSERT(K->nb[0] == ts);
|
||||
to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(K->type);
|
||||
const int64_t s01 = nb11 / ts;
|
||||
const int64_t s02 = nb12 / ts;
|
||||
const int64_t s03 = nb13 / ts;
|
||||
to_fp16(K_data, K_f16.ptr, K->ne[0], K->ne[1], K->ne[2], K->ne[3], s01, s02, s03, main_stream);
|
||||
|
||||
nb11 = K->ne[0] * sizeof(half);
|
||||
nb12 = K->ne[1] * nb11;
|
||||
nb13 = K->ne[2] * nb12;
|
||||
}
|
||||
K_data = (char *) K_f16.ptr;
|
||||
}
|
||||
|
||||
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
V_data = (char *) V_f16.ptr;
|
||||
|
||||
const size_t bs = ggml_blck_size(V->type);
|
||||
const size_t ts = ggml_type_size(V->type);
|
||||
|
||||
nb21 = nb21*bs*sizeof(half)/ts;
|
||||
nb22 = nb22*bs*sizeof(half)/ts;
|
||||
nb23 = nb23*bs*sizeof(half)/ts;
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
if (ggml_is_contiguously_allocated(V)) {
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
V_data = (char *) V_f16.ptr;
|
||||
|
||||
nb21 = nb21*bs*sizeof(half)/ts;
|
||||
nb22 = nb22*bs*sizeof(half)/ts;
|
||||
nb23 = nb23*bs*sizeof(half)/ts;
|
||||
} else {
|
||||
GGML_ASSERT(V->nb[0] == ts);
|
||||
to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(V->type);
|
||||
const int64_t s01 = nb21 / ts;
|
||||
const int64_t s02 = nb22 / ts;
|
||||
const int64_t s03 = nb23 / ts;
|
||||
to_fp16(V_data, V_f16.ptr, V->ne[0], V->ne[1], V->ne[2], V->ne[3], s01, s02, s03, main_stream);
|
||||
|
||||
nb21 = V->ne[0] * sizeof(half);
|
||||
nb22 = V->ne[1] * nb21;
|
||||
nb23 = V->ne[2] * nb22;
|
||||
}
|
||||
V_data = (char *) V_f16.ptr;
|
||||
}
|
||||
|
||||
int parallel_blocks = 1;
|
||||
@@ -867,14 +872,11 @@ void launch_fattn(
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
!stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
||||
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
nb11, nb12, nb13,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
||||
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
|
||||
@@ -408,7 +408,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const int stride_K,
|
||||
const int stride_V,
|
||||
const int stride_mask,
|
||||
const int jt,
|
||||
half2 * const __restrict__ tile_Q,
|
||||
half2 * const __restrict__ tile_K,
|
||||
half2 * const __restrict__ tile_V,
|
||||
@@ -455,7 +454,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
cp_async_wait_all();
|
||||
__syncthreads();
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
|
||||
(V_h2 + int64_t(k_VKQ_0)*stride_V, tile_V, nbatch_V2, stride_V);
|
||||
} else {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
if (ncols2 > 1 || mask_h2) {
|
||||
@@ -471,7 +470,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
if (nstages <= 1) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + k_VKQ_0*stride_K + k0_start, tile_K, k0_diff, stride_K);
|
||||
(K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K);
|
||||
if (use_cp_async) {
|
||||
cp_async_wait_all();
|
||||
}
|
||||
@@ -715,7 +714,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
(mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask);
|
||||
}
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
(K_h2 + int64_t(k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -732,7 +731,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
if (nstages <= 1 && i0_start < reusable_cutoff) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
|
||||
(V_h2 + int64_t(k_VKQ_0)*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
|
||||
if (use_cp_async) {
|
||||
cp_async_wait_all();
|
||||
}
|
||||
@@ -771,8 +770,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
|
||||
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
|
||||
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
|
||||
@@ -920,7 +918,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
(mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask);
|
||||
}
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
(K_h2 + int64_t(kb0_start)*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
}
|
||||
|
||||
// Iterate over ne11 == previous tokens:
|
||||
@@ -928,13 +926,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
constexpr bool last_iter = false;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
|
||||
}
|
||||
{ // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
|
||||
constexpr bool last_iter = true;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
|
||||
}
|
||||
|
||||
// With multi-stage loading there is no __syncthreads at the end of the iter,
|
||||
@@ -1214,33 +1212,13 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -1359,8 +1337,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -127,7 +107,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -221,7 +201,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,8 +280,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -66,8 +46,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -135,7 +114,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
|
||||
const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
|
||||
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
|
||||
}
|
||||
@@ -231,8 +210,9 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(tmp);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,7 +292,6 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -191,13 +171,16 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -244,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum((float)sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
@@ -300,14 +283,18 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -351,8 +338,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -59,8 +39,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -198,13 +177,16 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
float VKQ[ncols] = {0.0f};
|
||||
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -246,7 +228,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
@@ -297,13 +279,17 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
break;
|
||||
}
|
||||
|
||||
const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid);
|
||||
const float V_ki = dequantize_1_v(V + k*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_ki*KQ[j*D + k];
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -348,7 +334,6 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -37,33 +37,13 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
@@ -197,7 +177,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
|
||||
frag_a_K K_a;
|
||||
wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
wmma::load_matrix_sync(K_a, K_h + int64_t(k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
|
||||
@@ -344,7 +324,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
|
||||
|
||||
frag_a_V v_a;
|
||||
wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
wmma::load_matrix_sync(v_a, V_h + int64_t(k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
|
||||
@@ -451,7 +431,6 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
}
|
||||
|
||||
@@ -280,22 +280,12 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
if (fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
if (!fast_fp16_available(cc)) {
|
||||
if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <float.h>
|
||||
#include <initializer_list>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
@@ -2765,6 +2766,39 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
|
||||
const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
|
||||
const ggml_tensor *mul = cgraph->nodes[node_idx+1];
|
||||
|
||||
GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
|
||||
|
||||
//rms norm only supports F32
|
||||
if (mul->src[0]->type != GGML_TYPE_F32 ||
|
||||
mul->src[1]->type != GGML_TYPE_F32 ||
|
||||
mul->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//if rms norm is the B operand, then we don't handle broadcast
|
||||
if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//rms_norm kernel assumes contigous rows
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
// flag used to determine whether it is an integrated_gpu
|
||||
@@ -2774,6 +2808,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
||||
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
||||
if (!use_cuda_graph || cuda_graph_update_required) {
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2781,6 +2816,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
continue;
|
||||
}
|
||||
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion && ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
@@ -3242,13 +3283,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
||||
if ((src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_F16) &&
|
||||
(src1_type == GGML_TYPE_F32 || src1_type == GGML_TYPE_BF16 || src1_type == GGML_TYPE_F16)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
||||
@@ -3284,12 +3321,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
||||
return true;
|
||||
}
|
||||
@@ -3370,7 +3401,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return op->src[0]->ne[1] % 128 == 0;
|
||||
}
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
return true;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
|
||||
@@ -10,7 +10,7 @@ static __global__ void im2col_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ksize = OW * (KH > 1 ? KW : 1);
|
||||
const int64_t ksize = OW * KH;
|
||||
const int64_t kx = i / ksize;
|
||||
const int64_t kd = kx * ksize;
|
||||
const int64_t ky = (i - kd) / OW;
|
||||
|
||||
@@ -104,10 +104,12 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
}
|
||||
}
|
||||
|
||||
template <int block_size>
|
||||
template <int block_size, bool do_multiply = false>
|
||||
static __global__ void rms_norm_f32(
|
||||
const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps) {
|
||||
const int64_t stride_sample, const float eps, const float * mul = nullptr, const int64_t mul_stride_row = 0,
|
||||
const int64_t mul_stride_channel = 0, const int64_t mul_stride_sample = 0, const int mul_ncols = 0,
|
||||
const int mul_nrows = 0, const int mul_nchannels = 0, const int mul_nsamples = 0) {
|
||||
const int nrows = gridDim.x;
|
||||
const int nchannels = gridDim.y;
|
||||
|
||||
@@ -119,6 +121,13 @@ static __global__ void rms_norm_f32(
|
||||
x += sample*stride_sample + channel*stride_channel + row*stride_row;
|
||||
dst += ((sample*nchannels + channel)*nrows + row)*ncols;
|
||||
|
||||
if constexpr (do_multiply) {
|
||||
const int mul_row = row % mul_nrows;
|
||||
const int mul_channel = channel % mul_nchannels;
|
||||
const int mul_sample = sample % mul_nsamples;
|
||||
mul += mul_sample*mul_stride_sample + mul_channel*mul_stride_channel + mul_row*mul_stride_row;
|
||||
}
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
@@ -145,7 +154,12 @@ static __global__ void rms_norm_f32(
|
||||
const float scale = rsqrtf(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col] = scale * x[col];
|
||||
if constexpr (do_multiply) {
|
||||
const int mul_col = col % mul_ncols;
|
||||
dst[col] = scale * x[col] * mul[mul_col];
|
||||
} else {
|
||||
dst[col] = scale * x[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,10 +324,30 @@ static void rms_norm_f32_cuda(
|
||||
const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
rms_norm_f32<WARP_SIZE, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_mul_f32_cuda(
|
||||
const float * x, const float * mul, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
||||
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
|
||||
const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
|
||||
const int mul_ncols, const int mul_nrows, const int mul_nchannels, const int mul_nsamples,
|
||||
const float eps, cudaStream_t stream) {
|
||||
const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
if (mul == nullptr) {
|
||||
rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream);
|
||||
return;
|
||||
}
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_f32<WARP_SIZE, true><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -407,6 +441,59 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
rms_norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * mul_tensor) {
|
||||
const ggml_tensor * rms_norm_src = (ggml_tensor *) dst->src[0];
|
||||
float eps = 0.0f;
|
||||
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const float * src0_d = (const float *) rms_norm_src->data;
|
||||
const float * mul_d = nullptr;
|
||||
const ggml_tensor * mul_src = nullptr;
|
||||
|
||||
if (mul_tensor->src[0] == dst) {
|
||||
mul_d = (float *) mul_tensor->src[1]->data;
|
||||
mul_src = mul_tensor->src[1];
|
||||
} else if(mul_tensor->src[1] == dst) {
|
||||
mul_d = (float *) mul_tensor->src[0]->data;
|
||||
mul_src = mul_tensor->src[0];
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
float * dst_d = (float *) mul_tensor->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(rms_norm_src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
const int64_t ne00 = rms_norm_src->ne[0];
|
||||
const int64_t ne01 = rms_norm_src->ne[1];
|
||||
const int64_t ne02 = rms_norm_src->ne[2];
|
||||
const int64_t ne03 = rms_norm_src->ne[3];
|
||||
|
||||
const size_t ts0 = ggml_type_size(rms_norm_src->type);
|
||||
GGML_ASSERT(rms_norm_src->nb[0] == ts0);
|
||||
const int64_t s01 = rms_norm_src->nb[1] / ts0;
|
||||
const int64_t s02 = rms_norm_src->nb[2] / ts0;
|
||||
const int64_t s03 = rms_norm_src->nb[3] / ts0;
|
||||
|
||||
const size_t ts_mul = ggml_type_size(mul_src->type);
|
||||
GGML_ASSERT(mul_src->nb[0] == ts_mul);
|
||||
const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
|
||||
const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
|
||||
const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
|
||||
|
||||
const int mul_ncols = mul_src->ne[0];
|
||||
const int mul_nrows = mul_src->ne[1];
|
||||
const int mul_nchannels = mul_src->ne[2];
|
||||
const int mul_nsamples = mul_src->ne[3];
|
||||
|
||||
rms_norm_mul_f32_cuda(src0_d, mul_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, mul_s01, mul_s02, mul_s03, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples, eps, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0]; // gradients
|
||||
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
|
||||
|
||||
@@ -6,6 +6,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
||||
|
||||
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * mul_tensor);
|
||||
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -4,24 +4,8 @@
|
||||
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
|
||||
GGML_UNUSED(src_f);
|
||||
GGML_UNUSED(dst_f);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
|
||||
convert_f32_f16(src_f, dst_h);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
|
||||
convert_f32_bf16(src_f, dst_b);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
|
||||
convert_f32_f32(src_f, dst_f);
|
||||
__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
|
||||
convert_flt(src_f, dst_f);
|
||||
}
|
||||
|
||||
// Generic quantized set_rows kernel template
|
||||
|
||||
@@ -105,6 +105,8 @@ set(GGML_OPENCL_KERNELS
|
||||
pad
|
||||
repeat
|
||||
mul_mat_f16_f32
|
||||
conv2d
|
||||
conv2d_f16_f32
|
||||
)
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
|
||||
@@ -390,6 +390,9 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_tanh;
|
||||
cl_program program_upscale;
|
||||
cl_program program_concat;
|
||||
cl_program program_conv_2d_f16;
|
||||
cl_program program_conv_2d_f32;
|
||||
cl_program program_conv_2d_f16_f32;
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
@@ -441,6 +444,9 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_upscale_bilinear;
|
||||
cl_kernel kernel_concat_f32_contiguous;
|
||||
cl_kernel kernel_concat_f32_non_contiguous;
|
||||
cl_kernel kernel_conv_2d_f16;
|
||||
cl_kernel kernel_conv_2d_f32;
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
@@ -1478,6 +1484,47 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// conv2d
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "conv2d.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f16_f32 {
|
||||
#include "conv2d_f16_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("conv2d.cl");
|
||||
const std::string kernel_src_f16_f32 = read_file("conv2d_f16_f32.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_conv_2d_f16 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), (std::string(compile_opts) + " -DUSE_FP16=1").c_str());
|
||||
CL_CHECK((backend_ctx->kernel_conv_2d_f16 = clCreateKernel(backend_ctx->program_conv_2d_f16, "kernel_conv_2d", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
backend_ctx->program_conv_2d_f32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_conv_2d_f32 = clCreateKernel(backend_ctx->program_conv_2d_f32, "kernel_conv_2d", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: conv2d kernel source not found or empty. This op will not be available.\n");
|
||||
backend_ctx->program_conv_2d_f16 = nullptr;
|
||||
backend_ctx->kernel_conv_2d_f16 = nullptr;
|
||||
backend_ctx->program_conv_2d_f32 = nullptr;
|
||||
backend_ctx->kernel_conv_2d_f32 = nullptr;
|
||||
}
|
||||
if (!kernel_src_f16_f32.empty()) {
|
||||
backend_ctx->program_conv_2d_f16_f32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16_f32.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_conv_2d_f16_f32 = clCreateKernel(backend_ctx->program_conv_2d_f16_f32, "kernel_conv_2d", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: conv2d_f16_f32 kernel source not found or empty. This op will not be available.\n");
|
||||
backend_ctx->program_conv_2d_f16_f32 = nullptr;
|
||||
backend_ctx->kernel_conv_2d_f16_f32 = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -2361,6 +2408,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
op->src[0]->ne[3] == 1 && op->ne[3] == 1;
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_2D:
|
||||
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
|
||||
(op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
||||
(op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
|
||||
case GGML_OP_CONCAT:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -4998,6 +5049,82 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const cl_uint Cout = ne03; const cl_uint Cin = ne02; const cl_uint N = ne13;
|
||||
const cl_uint KW = ne00; const cl_uint KH = ne01; const cl_uint W = ne10; const cl_uint H = ne11; const cl_uint OW = ne0; const cl_uint OH = ne1;
|
||||
|
||||
const cl_uint s0 = dst->op_params[0]; const cl_uint s1 = dst->op_params[1];
|
||||
const cl_uint p0 = dst->op_params[2]; const cl_uint p1 = dst->op_params[3];
|
||||
const cl_uint d0 = dst->op_params[4]; const cl_uint d1 = dst->op_params[5];
|
||||
|
||||
const cl_uint cl_nb01 = nb01/ggml_type_size(src0->type); const cl_uint cl_nb02 = nb02/ggml_type_size(src0->type); const cl_uint cl_nb03 = nb03/ggml_type_size(src0->type);
|
||||
const cl_uint cl_nb11 = nb11/ggml_type_size(src1->type); const cl_uint cl_nb12 = nb12/ggml_type_size(src1->type); const cl_uint cl_nb13 = nb13/ggml_type_size(src1->type);
|
||||
const cl_uint cl_nb1 = nb1/ggml_type_size(dst->type); const cl_uint cl_nb2 = nb2/ggml_type_size(dst->type); const cl_uint cl_nb3 = nb3/ggml_type_size(dst->type);
|
||||
|
||||
const int64_t NPQ = (int64_t)N * OW * OH;
|
||||
|
||||
const uint32_t BS_K = 64;
|
||||
const uint32_t BS_NPQ = 64;
|
||||
const uint32_t BS_CRS = 16;
|
||||
const uint32_t VEC_SIZE = 4;
|
||||
|
||||
const uint32_t TS_K = 4;
|
||||
const uint32_t TS_NPQ = 8;
|
||||
|
||||
const uint32_t WG_K = BS_K / TS_K;
|
||||
const uint32_t WG_NPQ = BS_NPQ / TS_NPQ;
|
||||
|
||||
auto splitWork = [](uint32_t work_size, uint32_t block_size) { return (block_size + work_size - 1) / block_size; };
|
||||
const uint32_t NB_K = splitWork(Cout, BS_K);
|
||||
const uint32_t NB_NPQ = splitWork(NPQ, BS_NPQ);
|
||||
|
||||
cl_kernel kernel;
|
||||
size_t shmem_size;
|
||||
|
||||
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_conv_2d_f16;
|
||||
shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_half4));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_conv_2d_f32;
|
||||
shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_float) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_conv_2d_f16_f32;
|
||||
shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported data type combination for conv2d");
|
||||
}
|
||||
|
||||
cl_uint idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra1->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, shmem_size, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cout)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cin)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KH)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &W)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OH));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb01)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb02)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb11)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb12)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb2)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb3));
|
||||
|
||||
size_t global_work_size[] = { (size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 };
|
||||
size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 };
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -6752,6 +6879,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
ggml_cl_upscale(backend, tensor->src[0], tensor);
|
||||
return true;
|
||||
case GGML_OP_CONV_2D:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_conv_2d;
|
||||
break;
|
||||
case GGML_OP_CONCAT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
185
ggml/src/ggml-opencl/kernels/conv2d.cl
Normal file
185
ggml/src/ggml-opencl/kernels/conv2d.cl
Normal file
@@ -0,0 +1,185 @@
|
||||
#ifdef USE_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define T_FLOAT half
|
||||
#define T_FLOAT4 half4
|
||||
#define VSTORE_T_FLOAT4(data, offset, p) vstore_half4_rte(data, offset, p)
|
||||
#else
|
||||
#define T_FLOAT float
|
||||
#define T_FLOAT4 float4
|
||||
#define VSTORE_T_FLOAT4(data, offset, p) vstore4(data, offset, p)
|
||||
#endif
|
||||
|
||||
#if defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
#define REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
#define T_ACCUM float4
|
||||
#define VEC_SIZE 4
|
||||
|
||||
#define BS_K 64
|
||||
#define BS_NPQ 64
|
||||
#define BS_CRS 16
|
||||
|
||||
#define TS_K 4
|
||||
#define TS_NPQ 8
|
||||
|
||||
#define WG_K (BS_K / TS_K)
|
||||
#define WG_NPQ (BS_NPQ / TS_NPQ)
|
||||
|
||||
#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
|
||||
#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)
|
||||
|
||||
static inline uint splitWork(uint work_size, uint block_size){
|
||||
return (work_size + block_size - 1) / block_size;
|
||||
}
|
||||
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
kernel void kernel_conv_2d(
|
||||
global void* p_knl,
|
||||
ulong off_knl,
|
||||
global void* p_src,
|
||||
ulong off_src,
|
||||
global void* p_dst,
|
||||
ulong off_dst,
|
||||
local void* shared,
|
||||
uint Cout, uint Cin, uint N,
|
||||
uint KW, uint KH, uint W, uint H, uint OW, uint OH,
|
||||
uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
|
||||
uint nb01, uint nb02, uint nb03,
|
||||
uint nb11, uint nb12, uint nb13,
|
||||
uint nb1, uint nb2, uint nb3
|
||||
) {
|
||||
global T_FLOAT* knl_data = (global T_FLOAT*) ((global char*)p_knl + off_knl);
|
||||
global T_FLOAT* src_data = (global T_FLOAT*) ((global char*)p_src + off_src);
|
||||
global T_FLOAT* dst_data = (global T_FLOAT*) ((global char*)p_dst + off_dst);
|
||||
|
||||
const uint K = Cout;
|
||||
const uint CRS = Cin*KH*KW;
|
||||
const uint NPQ = N*OH*OW;
|
||||
|
||||
const uint lid_k = get_local_id(0);
|
||||
const uint lid_npq = get_local_id(1);
|
||||
const uint tid = lid_npq * WG_K + lid_k;
|
||||
|
||||
const uint B_idx_K = get_group_id(0);
|
||||
const uint B_idx_NPQ = get_group_id(1);
|
||||
|
||||
const uint offset_k = B_idx_K * BS_K;
|
||||
const uint offset_npq = B_idx_NPQ * BS_NPQ;
|
||||
|
||||
local T_FLOAT* Ash = (local T_FLOAT*)shared;
|
||||
local T_FLOAT4* Bsh = (local T_FLOAT4*) &Ash[BS_K * BS_CRS];
|
||||
|
||||
T_ACCUM regC[TS_K][TS_NPQ_VEC];
|
||||
for (int i = 0; i < TS_K; ++i) {
|
||||
for (int j = 0; j < TS_NPQ_VEC; ++j) {
|
||||
regC[i][j] = (T_ACCUM)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
const uint NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
|
||||
const uint offset_crs = B_idx_CRS * BS_CRS;
|
||||
|
||||
for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
|
||||
const uint k_l = i / BS_CRS;
|
||||
const uint crs_l = i % BS_CRS;
|
||||
const uint k_g = offset_k + k_l;
|
||||
const uint crs_g = offset_crs + crs_l;
|
||||
|
||||
if (k_g < K && crs_g < CRS) {
|
||||
const uint Cin_idx = crs_g / (KW*KH);
|
||||
const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
|
||||
const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
|
||||
const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
|
||||
Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
|
||||
} else {
|
||||
Ash[k_l * BS_CRS + crs_l] = (T_FLOAT)0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
|
||||
const uint crs_l = i / BS_NPQ_VEC;
|
||||
const uint npq_l_vec = i % BS_NPQ_VEC;
|
||||
const uint crs_g = offset_crs + crs_l;
|
||||
|
||||
T_FLOAT4 val = (T_FLOAT4)(0.0f);
|
||||
if (crs_g < CRS) {
|
||||
const uint Cin_idx = crs_g / (KW * KH);
|
||||
const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
|
||||
const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
|
||||
for (int v = 0; v < VEC_SIZE; ++v) {
|
||||
const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
|
||||
if (npq_g < NPQ) {
|
||||
const uint N_idx = npq_g / (OH * OW);
|
||||
const uint pq_idx = npq_g % (OH * OW);
|
||||
const uint OH_idx = pq_idx / OW;
|
||||
const uint OW_idx = pq_idx % OW;
|
||||
const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
|
||||
const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);
|
||||
|
||||
if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
|
||||
const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
|
||||
((T_FLOAT*)&val)[v] = src_data[src_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#pragma unroll
|
||||
for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
|
||||
T_FLOAT regA[TS_K];
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
|
||||
}
|
||||
|
||||
for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
|
||||
T_FLOAT4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), convert_float4(regB), regC[k_l_reg][npq_l_vec_reg]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
|
||||
if (k_g >= K) continue;
|
||||
|
||||
for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
|
||||
const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;
|
||||
|
||||
const uint N_idx = npq_g_base / (OH * OW);
|
||||
const uint pq_idx = npq_g_base % (OH * OW);
|
||||
const uint OH_idx = pq_idx / OW;
|
||||
const uint OW_idx = pq_idx % OW;
|
||||
|
||||
if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
|
||||
const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
|
||||
VSTORE_T_FLOAT4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
|
||||
} else {
|
||||
T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
|
||||
for (int v = 0; v < VEC_SIZE; ++v) {
|
||||
const uint npq_g = npq_g_base + v;
|
||||
if (npq_g < NPQ) {
|
||||
const uint N_idx_s = npq_g / (OH*OW);
|
||||
const uint pq_idx_s = npq_g % (OH*OW);
|
||||
const uint OH_idx_s = pq_idx_s / OW;
|
||||
const uint OW_idx_s = pq_idx_s % OW;
|
||||
const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
|
||||
dst_data[dst_idx_s] = (T_FLOAT)(((float*)&res)[v]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
176
ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl
Normal file
176
ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl
Normal file
@@ -0,0 +1,176 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#if defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
#define REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
#define T_ACCUM float4
|
||||
#define VEC_SIZE 4
|
||||
|
||||
#define BS_K 64
|
||||
#define BS_NPQ 64
|
||||
#define BS_CRS 16
|
||||
|
||||
#define TS_K 4
|
||||
#define TS_NPQ 8
|
||||
|
||||
#define WG_K (BS_K / TS_K)
|
||||
#define WG_NPQ (BS_NPQ / TS_NPQ)
|
||||
|
||||
#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
|
||||
#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)
|
||||
|
||||
static inline uint splitWork(uint work_size, uint block_size){
|
||||
return (work_size + block_size - 1) / block_size;
|
||||
}
|
||||
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
kernel void kernel_conv_2d(
|
||||
global void* p_knl,
|
||||
ulong off_knl,
|
||||
global void* p_src,
|
||||
ulong off_src,
|
||||
global void* p_dst,
|
||||
ulong off_dst,
|
||||
local void* shared,
|
||||
uint Cout, uint Cin, uint N,
|
||||
uint KW, uint KH, uint W, uint H, uint OW, uint OH,
|
||||
uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
|
||||
uint nb01, uint nb02, uint nb03,
|
||||
uint nb11, uint nb12, uint nb13,
|
||||
uint nb1, uint nb2, uint nb3
|
||||
) {
|
||||
global half* knl_data = (global half*) ((global char*)p_knl + off_knl);
|
||||
global float* src_data = (global float*) ((global char*)p_src + off_src);
|
||||
global float* dst_data = (global float*) ((global char*)p_dst + off_dst);
|
||||
|
||||
const uint K = Cout;
|
||||
const uint CRS = Cin*KH*KW;
|
||||
const uint NPQ = N*OH*OW;
|
||||
|
||||
const uint lid_k = get_local_id(0);
|
||||
const uint lid_npq = get_local_id(1);
|
||||
const uint tid = lid_npq * WG_K + lid_k;
|
||||
|
||||
const uint B_idx_K = get_group_id(0);
|
||||
const uint B_idx_NPQ = get_group_id(1);
|
||||
|
||||
const uint offset_k = B_idx_K * BS_K;
|
||||
const uint offset_npq = B_idx_NPQ * BS_NPQ;
|
||||
|
||||
local half* Ash = (local half*)shared;
|
||||
local float4* Bsh = (local float4*) &Ash[BS_K * BS_CRS];
|
||||
|
||||
T_ACCUM regC[TS_K][TS_NPQ_VEC];
|
||||
for (int i = 0; i < TS_K; ++i) {
|
||||
for (int j = 0; j < TS_NPQ_VEC; ++j) {
|
||||
regC[i][j] = (T_ACCUM)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
const uint NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
|
||||
const uint offset_crs = B_idx_CRS * BS_CRS;
|
||||
|
||||
for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
|
||||
const uint k_l = i / BS_CRS;
|
||||
const uint crs_l = i % BS_CRS;
|
||||
const uint k_g = offset_k + k_l;
|
||||
const uint crs_g = offset_crs + crs_l;
|
||||
|
||||
if (k_g < K && crs_g < CRS) {
|
||||
const uint Cin_idx = crs_g / (KW*KH);
|
||||
const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
|
||||
const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
|
||||
const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
|
||||
Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
|
||||
} else {
|
||||
Ash[k_l * BS_CRS + crs_l] = (half)0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
|
||||
const uint crs_l = i / BS_NPQ_VEC;
|
||||
const uint npq_l_vec = i % BS_NPQ_VEC;
|
||||
const uint crs_g = offset_crs + crs_l;
|
||||
|
||||
float4 val = (float4)(0.0f);
|
||||
if (crs_g < CRS) {
|
||||
const uint Cin_idx = crs_g / (KW * KH);
|
||||
const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
|
||||
const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
|
||||
for (int v = 0; v < VEC_SIZE; ++v) {
|
||||
const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
|
||||
if (npq_g < NPQ) {
|
||||
const uint N_idx = npq_g / (OH * OW);
|
||||
const uint pq_idx = npq_g % (OH * OW);
|
||||
const uint OH_idx = pq_idx / OW;
|
||||
const uint OW_idx = pq_idx % OW;
|
||||
const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
|
||||
const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);
|
||||
|
||||
if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
|
||||
const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
|
||||
((float*)&val)[v] = src_data[src_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#pragma unroll
|
||||
for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
|
||||
half regA[TS_K];
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
|
||||
}
|
||||
|
||||
for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
|
||||
float4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), regB, regC[k_l_reg][npq_l_vec_reg]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
|
||||
const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
|
||||
if (k_g >= K) continue;
|
||||
|
||||
for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
|
||||
const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;
|
||||
|
||||
const uint N_idx = npq_g_base / (OH * OW);
|
||||
const uint pq_idx = npq_g_base % (OH * OW);
|
||||
const uint OH_idx = pq_idx / OW;
|
||||
const uint OW_idx = pq_idx % OW;
|
||||
|
||||
if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
|
||||
const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
|
||||
vstore4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
|
||||
} else {
|
||||
T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
|
||||
for (int v = 0; v < VEC_SIZE; ++v) {
|
||||
const uint npq_g = npq_g_base + v;
|
||||
if (npq_g < NPQ) {
|
||||
const uint N_idx_s = npq_g / (OH*OW);
|
||||
const uint pq_idx_s = npq_g % (OH*OW);
|
||||
const uint OH_idx_s = pq_idx_s / OW;
|
||||
const uint OW_idx_s = pq_idx_s % OW;
|
||||
const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
|
||||
dst_data[dst_idx_s] = ((float*)&res)[v];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -31,7 +31,7 @@ kernel void kernel_im2col_f16(
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long ksize = OW * KH;
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
|
||||
@@ -31,7 +31,7 @@ kernel void kernel_im2col_f32(
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long ksize = OW * KH;
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
|
||||
@@ -3531,7 +3531,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
||||
|
||||
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
||||
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
||||
|
||||
@@ -26,7 +26,7 @@ static void im2col_kernel(const float * x, T * dst, int64_t batch_offset, int64_
|
||||
|
||||
// make each work-item deal with more elements since sycl global range can not exceed max int
|
||||
for (int64_t i = global_id; i < pelements; i += (work_group_size * item_ct1.get_group_range(2))) {
|
||||
const int64_t ksize = OW * (KH > 1 ? KW : 1);
|
||||
const int64_t ksize = OW * KH;
|
||||
const int64_t kx = i / ksize;
|
||||
const int64_t kd = kx * ksize;
|
||||
const int64_t ky = (i - kd) / OW;
|
||||
|
||||
@@ -328,6 +328,7 @@ struct vk_device_struct {
|
||||
uint64_t max_memory_allocation_size;
|
||||
uint64_t suballocation_block_size;
|
||||
bool fp16;
|
||||
bool bf16;
|
||||
bool pipeline_robustness;
|
||||
vk::Device device;
|
||||
uint32_t vendor_id;
|
||||
@@ -482,6 +483,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_conv2d_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
||||
|
||||
@@ -875,6 +877,38 @@ struct vk_op_rwkv_wkv7_push_constants {
|
||||
uint32_t H;
|
||||
};
|
||||
|
||||
struct vk_op_conv2d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t N;
|
||||
|
||||
uint32_t KW;
|
||||
uint32_t KH;
|
||||
uint32_t W;
|
||||
uint32_t H;
|
||||
uint32_t OW;
|
||||
uint32_t OH;
|
||||
|
||||
uint32_t s0;
|
||||
uint32_t s1;
|
||||
uint32_t p0;
|
||||
uint32_t p1;
|
||||
uint32_t d0;
|
||||
uint32_t d1;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
|
||||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
};
|
||||
|
||||
struct vk_op_conv2d_dw_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t batches;
|
||||
@@ -974,18 +1008,45 @@ private:
|
||||
#endif // GGML_VULKAN_MEMORY_DEBUG
|
||||
|
||||
class vk_perf_logger {
|
||||
public:
|
||||
public:
|
||||
void print_timings() {
|
||||
if (timings.empty()) {
|
||||
return;
|
||||
}
|
||||
uint64_t total_all_op_times = 0;
|
||||
std::cerr << "----------------\nVulkan Timings:" << std::endl;
|
||||
for (const auto& t : timings) {
|
||||
uint64_t total = 0;
|
||||
for (const auto& time : t.second) {
|
||||
total += time;
|
||||
for (const auto & t : timings) {
|
||||
uint64_t total_op_times = 0;
|
||||
for (const auto & time : t.second) {
|
||||
total_op_times += time;
|
||||
}
|
||||
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
|
||||
std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0)
|
||||
<< " us";
|
||||
|
||||
// If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
|
||||
auto it = flops.find(t.first);
|
||||
if (it != flops.end() && (it->second).size() == t.second.size()) {
|
||||
uint64_t total_op_flops = 0;
|
||||
for (const auto & elem : it->second) {
|
||||
total_op_flops += elem;
|
||||
}
|
||||
std::cerr << " ("
|
||||
<< (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) /
|
||||
(double(total_op_times) / (1000.0 * 1000.0 * 1000.0))
|
||||
<< " GFLOPS/s)";
|
||||
}
|
||||
|
||||
total_all_op_times += total_op_times;
|
||||
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
if (timings.size() > 0) {
|
||||
std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl;
|
||||
}
|
||||
|
||||
timings.clear();
|
||||
flops.clear();
|
||||
}
|
||||
|
||||
void log_timing(const ggml_tensor * node, uint64_t time) {
|
||||
@@ -994,22 +1055,45 @@ public:
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
|
||||
const uint64_t m = node->src[0]->ne[1];
|
||||
const uint64_t n = node->src[1]->ne[1];
|
||||
const uint64_t k = node->src[1]->ne[0];
|
||||
std::string name = ggml_op_name(node->op);
|
||||
const uint64_t m = node->src[0]->ne[1];
|
||||
const uint64_t n = node->src[1]->ne[1];
|
||||
const uint64_t k = node->src[1]->ne[0];
|
||||
std::string name = ggml_op_name(node->op);
|
||||
if (n == 1) {
|
||||
name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
|
||||
} else {
|
||||
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
|
||||
}
|
||||
timings[name].push_back(time);
|
||||
flops[name].push_back(m * n * (k + (k - 1)));
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_CONV_2D) {
|
||||
std::string name = ggml_op_name(node->op);
|
||||
ggml_tensor * knl = node->src[0];
|
||||
uint64_t OW = node->ne[0];
|
||||
uint64_t OH = node->ne[1];
|
||||
uint64_t N = node->ne[3];
|
||||
uint64_t Cout = node->ne[2];
|
||||
uint64_t KW = knl->ne[0];
|
||||
uint64_t KH = knl->ne[1];
|
||||
uint64_t Cin = knl->ne[2];
|
||||
// KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
|
||||
uint64_t size_M = Cout;
|
||||
uint64_t size_K = Cin * KW * KH;
|
||||
uint64_t size_N = N * OW * OH;
|
||||
uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
|
||||
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
|
||||
", N=N*OW*OH=" + std::to_string(size_N);
|
||||
flops[name].push_back(n_flops);
|
||||
timings[name].push_back(time);
|
||||
return;
|
||||
}
|
||||
timings[ggml_op_name(node->op)].push_back(time);
|
||||
}
|
||||
private:
|
||||
private:
|
||||
std::map<std::string, std::vector<uint64_t>> timings;
|
||||
std::map<std::string, std::vector<uint64_t>> flops;
|
||||
};
|
||||
|
||||
struct ggml_backend_vk_context {
|
||||
@@ -2112,6 +2196,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
compile_count++;
|
||||
}
|
||||
|
||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
|
||||
parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||
};
|
||||
@@ -2961,6 +3046,42 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
// conv2d
|
||||
uint32_t conv2d_WG_SIZE = 256;
|
||||
uint32_t conv2d_BS_K = 128;
|
||||
uint32_t conv2d_BS_CRS = 16;
|
||||
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
|
||||
if (device->subgroup_shuffle &&
|
||||
device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
|
||||
use_collectives = 1;
|
||||
conv2d_BS_CRS = std::min(
|
||||
device->subgroup_size,
|
||||
conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
|
||||
}
|
||||
uint32_t conv2d_BS_NPQ = 128;
|
||||
uint32_t conv2d_TS_K = 8;
|
||||
uint32_t conv2d_shmem_req =
|
||||
(conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
|
||||
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
|
||||
conv2d_BS_CRS = 8;
|
||||
if (use_collectives) {
|
||||
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
|
||||
}
|
||||
}
|
||||
|
||||
if (use_collectives) {
|
||||
ggml_vk_create_pipeline(
|
||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(
|
||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
|
||||
false);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -3273,6 +3394,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
#if defined(VK_KHR_shader_bfloat16)
|
||||
device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type;
|
||||
#else
|
||||
device->bf16 = false;
|
||||
#endif
|
||||
|
||||
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
|
||||
|
||||
if (device->subgroup_size_control) {
|
||||
@@ -3615,6 +3742,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
bool coopmat_support = false;
|
||||
bool coopmat2_support = false;
|
||||
bool integer_dot_product = false;
|
||||
bool bfloat16_support = false;
|
||||
|
||||
for (auto properties : ext_props) {
|
||||
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
||||
@@ -3635,6 +3763,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
|
||||
integer_dot_product = true;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
|
||||
bfloat16_support = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -3701,10 +3834,25 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features;
|
||||
}
|
||||
|
||||
#if defined(VK_KHR_shader_bfloat16)
|
||||
VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {};
|
||||
bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR;
|
||||
if (bfloat16_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features;
|
||||
last_struct = (VkBaseOutStructure *)&bfloat16_features;
|
||||
}
|
||||
#endif
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
|
||||
|
||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
#if defined(VK_KHR_shader_bfloat16)
|
||||
bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type;
|
||||
#else
|
||||
bool bf16 = false;
|
||||
#endif
|
||||
|
||||
uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
|
||||
const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
|
||||
const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
@@ -3722,8 +3870,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||
|
||||
std::string device_name = props2.properties.deviceName.data();
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
|
||||
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
|
||||
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, bf16, subgroup_size,
|
||||
props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str());
|
||||
|
||||
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
||||
@@ -6809,6 +6957,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
return ctx->device->pipeline_conv2d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_is_contiguous(src1)) {
|
||||
@@ -7131,6 +7285,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
const uint32_t OW = dst->ne[0];
|
||||
elements = { N * OC * OH * OW, 1, 1};
|
||||
} break;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
// src0 - kernel: [KW, KH, Cin, Cout]
|
||||
// src1 - input: [W, H, Cin, N]
|
||||
// dst - result: [OW, OH, Cout, N]
|
||||
|
||||
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
|
||||
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
||||
};
|
||||
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
|
||||
int64_t W = src1->ne[0];
|
||||
int64_t H = src1->ne[1];
|
||||
int64_t KW = src0->ne[0];
|
||||
int64_t KH = src0->ne[1];
|
||||
int64_t Cout = src0->ne[3];
|
||||
int64_t N = src1->ne[3];
|
||||
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
|
||||
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
|
||||
int64_t NPQ = N * OW * OH;
|
||||
|
||||
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
|
||||
elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
|
||||
}
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_DIV:
|
||||
@@ -7997,6 +8176,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
||||
vk_op_conv2d_push_constants p{};
|
||||
p.Cout = static_cast<uint32_t>(ne03);
|
||||
p.Cin = static_cast<uint32_t>(ne02);
|
||||
p.N = static_cast<uint32_t>(ne13);
|
||||
|
||||
p.KW = static_cast<uint32_t>(ne00);
|
||||
p.KH = static_cast<uint32_t>(ne01);
|
||||
p.W = static_cast<uint32_t>(ne10);
|
||||
p.H = static_cast<uint32_t>(ne11);
|
||||
p.OW = static_cast<uint32_t>(ne0);
|
||||
p.OH = static_cast<uint32_t>(ne1);
|
||||
|
||||
p.s0 = static_cast<uint32_t>(dst->op_params[0]);
|
||||
p.s1 = static_cast<uint32_t>(dst->op_params[1]);
|
||||
p.p0 = static_cast<uint32_t>(dst->op_params[2]);
|
||||
p.p1 = static_cast<uint32_t>(dst->op_params[3]);
|
||||
p.d0 = static_cast<uint32_t>(dst->op_params[4]);
|
||||
p.d1 = static_cast<uint32_t>(dst->op_params[5]);
|
||||
|
||||
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
||||
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
||||
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
|
||||
|
||||
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
||||
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
|
||||
p.nb13 = static_cast<uint32_t>(nb13 / nb10);
|
||||
|
||||
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
||||
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
|
||||
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
|
||||
|
||||
GGML_ASSERT(ne03 == ne2);
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
vk_op_conv2d_dw_push_constants p{};
|
||||
p.ne = ggml_nelements(dst);
|
||||
@@ -9059,6 +9287,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
@@ -9126,6 +9355,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
{
|
||||
@@ -9332,6 +9562,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_2D:
|
||||
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
@@ -9462,6 +9696,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
@@ -10013,7 +10248,7 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st
|
||||
}
|
||||
// if rms_norm is the B operand, then we don't handle broadcast
|
||||
if (rms_norm == mul->src[1] &&
|
||||
mul->src[0]->ne[1] != rms_norm->ne[1]) {
|
||||
!ggml_are_same_shape(mul->src[0], rms_norm)) {
|
||||
return false;
|
||||
}
|
||||
// rms_norm shader assumes contiguous rows
|
||||
@@ -10043,6 +10278,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
|
||||
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
|
||||
total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
|
||||
} else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D) {
|
||||
// Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode.
|
||||
auto CRS_size =
|
||||
cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[0]->ne[2];
|
||||
auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3];
|
||||
total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type);
|
||||
}
|
||||
i += ctx->num_additional_fused_ops;
|
||||
ctx->num_additional_fused_ops = 0;
|
||||
@@ -10619,6 +10860,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return true;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
// Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE;
|
||||
// Channel-contiguous format is not supported yet.
|
||||
return (op->src[0]->type == GGML_TYPE_F32 &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
op->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]) &&
|
||||
ggml_is_contiguous(op)) && !is_Apple;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -11177,6 +11432,14 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
const int32_t p1 = tensor->op_params[6];
|
||||
|
||||
tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
|
||||
} else if (tensor->op == GGML_OP_CONV_2D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
|
||||
|
||||
265
ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
Normal file
265
ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
Normal file
@@ -0,0 +1,265 @@
|
||||
#version 450
|
||||
|
||||
#ifdef USE_COLLECTIVES
|
||||
# extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#endif
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
// Make spec constant
|
||||
#define SHMEM_PAD 0
|
||||
|
||||
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
|
||||
layout(binding = 0) readonly buffer A {
|
||||
A_TYPE knl_data[];
|
||||
}; // src0 - kernel: [KW, KH, Cin, Cout]
|
||||
|
||||
layout(binding = 1) readonly buffer B {
|
||||
B_TYPE src_data[];
|
||||
}; // src1 - input: [W, H, Cin, N] -- channel_first format
|
||||
|
||||
layout(binding = 2) writeonly buffer D {
|
||||
D_TYPE dst_data[];
|
||||
}; // dst - result: [OW, OH, Cout, N]
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
// I/O channels, batch size
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t N;
|
||||
|
||||
// Tensor spatial sizes: kernel, input, output
|
||||
uint32_t KW;
|
||||
uint32_t KH;
|
||||
uint32_t W;
|
||||
uint32_t H;
|
||||
uint32_t OW;
|
||||
uint32_t OH;
|
||||
|
||||
// Parameters: stride, padding, dilation - 0=y, 1=x
|
||||
uint32_t s0;
|
||||
uint32_t s1;
|
||||
uint32_t p0;
|
||||
uint32_t p1;
|
||||
uint32_t d0;
|
||||
uint32_t d1;
|
||||
|
||||
// Strides in elements
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
|
||||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
}
|
||||
|
||||
p;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
// Blocktile sizes
|
||||
layout(constant_id = 1) const uint BS_K = 128;
|
||||
layout(constant_id = 2) const uint BS_CRS = 16;
|
||||
layout(constant_id = 3) const uint BS_NPQ = 128;
|
||||
// Thread-tile sizes
|
||||
layout(constant_id = 4) const uint TS_K = 8;
|
||||
layout(constant_id = 5) const uint use_collectives = 1;
|
||||
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||
|
||||
uint splitWork(uint work_size, uint block_size) {
|
||||
return (block_size + work_size - 1) / block_size;
|
||||
}
|
||||
|
||||
uint32_t K = p.Cout;
|
||||
uint32_t CRS = p.Cin * p.KH * p.KW;
|
||||
uint32_t NPQ = p.N * p.OH * p.OW;
|
||||
|
||||
uint32_t n_elems_out = K * NPQ;
|
||||
|
||||
// Number of blocktiles per input
|
||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
|
||||
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
|
||||
|
||||
const uint32_t Ash_numel = BS_K * BS_CRS;
|
||||
const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
|
||||
|
||||
const uint32_t Ash_len = BS_K * Ash_stride;
|
||||
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||
|
||||
shared float Ash[Ash_len]; // K x CRS
|
||||
shared float Bsh[Bsh_len]; // CRS x NPQ
|
||||
|
||||
// Threadtile sizes
|
||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||
|
||||
// Number of threadtiles per blocktile
|
||||
const uint32_t NT_K = BS_K / TS_K;
|
||||
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
|
||||
|
||||
float regA[TS_K];
|
||||
float regB[TS_NPQ];
|
||||
float regC[TS_K][TS_NPQ];
|
||||
|
||||
/*
|
||||
Compute
|
||||
KxCRS @ CRSxNPQ = K x NPQ
|
||||
K=Cout
|
||||
C=Cin
|
||||
R,S=KH,KW
|
||||
P,Q=OH,OW
|
||||
*/
|
||||
|
||||
uint32_t B_idx_K = gl_WorkGroupID.x;
|
||||
uint32_t B_idx_NPQ = gl_WorkGroupID.y;
|
||||
|
||||
uint32_t T_y = tid / NT_NPQ;
|
||||
uint32_t T_x = tid % NT_NPQ;
|
||||
|
||||
uint32_t Ar = tid / BS_CRS;
|
||||
uint32_t Ac = tid % BS_CRS;
|
||||
const uint32_t ArpWg = WG_SIZE / BS_CRS;
|
||||
|
||||
uint32_t Br = tid / BS_NPQ;
|
||||
uint32_t Bc = tid % BS_NPQ;
|
||||
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
|
||||
|
||||
void main() {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = 0.0;
|
||||
}
|
||||
}
|
||||
/* Advance block in CRS dim */
|
||||
for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
|
||||
uint32_t CRS_idx_a;
|
||||
uint32_t Cin_idx_a;
|
||||
uint32_t KH_idx_a;
|
||||
uint32_t KW_idx_a;
|
||||
|
||||
#ifdef USE_COLLECTIVES
|
||||
uint32_t cached_CRS_idx;
|
||||
uint32_t cached_Cin_idx;
|
||||
uint32_t cached_KH_idx;
|
||||
uint32_t cached_KW_idx;
|
||||
if (use_collectives == 1) {
|
||||
cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
|
||||
cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH);
|
||||
uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
|
||||
cached_KH_idx = cached_CRS_remainder / p.KW;
|
||||
cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW;
|
||||
|
||||
CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
|
||||
Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac);
|
||||
KH_idx_a = subgroupShuffle(cached_KH_idx, Ac);
|
||||
KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
|
||||
} else {
|
||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||
KH_idx_a = CRS_remainder / p.KW;
|
||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||
}
|
||||
#else
|
||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
||||
CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||
KH_idx_a = CRS_remainder / p.KW;
|
||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||
#endif
|
||||
|
||||
/* Load kernel to A_block: (BS_K x BS_CRS)*/
|
||||
for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) {
|
||||
uint32_t B_ly = r_offset + Ar;
|
||||
uint32_t B_lx = Ac;
|
||||
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
|
||||
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
|
||||
float val = knl_data[knl_idx];
|
||||
if (K_idx >= K || CRS_idx_a >= CRS) {
|
||||
val = 0.0;
|
||||
}
|
||||
Ash[B_ly * Ash_stride + B_lx] = val;
|
||||
}
|
||||
/* Load input to B_block: (BS_CRS x BS_NPQ) */
|
||||
for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
||||
uint32_t B_ly = r_offset + Br; /* Row index of B block */
|
||||
uint32_t B_lx = Bc;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
|
||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
||||
uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
|
||||
uint32_t OH_idx = NPQ_remainder / p.OW;
|
||||
uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW;
|
||||
|
||||
uint32_t CRS_idx_b;
|
||||
uint32_t Cin_idx_b;
|
||||
uint32_t KH_idx_b;
|
||||
uint32_t KW_idx_b;
|
||||
#ifdef USE_COLLECTIVES
|
||||
if (use_collectives == 1) {
|
||||
CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br);
|
||||
Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br);
|
||||
KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br);
|
||||
KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
|
||||
} else {
|
||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||
KH_idx_b = CRS_remainder / p.KW;
|
||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||
}
|
||||
#else
|
||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||
KH_idx_b = CRS_remainder / p.KW;
|
||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||
#endif
|
||||
|
||||
uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1;
|
||||
uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0;
|
||||
uint32_t src_idx =
|
||||
min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
|
||||
float val = src_data[src_idx];
|
||||
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
|
||||
val = 0.0;
|
||||
}
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = val;
|
||||
}
|
||||
barrier();
|
||||
for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
||||
}
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
||||
}
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
/* Save C* */
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
||||
uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -40,12 +40,10 @@ void main() {
|
||||
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
|
||||
const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
|
||||
const int oh_s1 = int(oh) * p.s1;
|
||||
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
||||
const uint ksize = p.OW * p.KH;
|
||||
|
||||
const uint base_linear_idx = gidx * NUM_ITER;
|
||||
|
||||
const uint max_ky = ksize / p.OW;
|
||||
|
||||
uint current_kx = base_linear_idx / ksize;
|
||||
const uint rem = base_linear_idx - (current_kx * ksize);
|
||||
uint current_ky = rem / p.OW;
|
||||
@@ -76,7 +74,7 @@ void main() {
|
||||
|
||||
if (++current_ix == p.OW) {
|
||||
current_ix = 0;
|
||||
if (++current_ky == max_ky) {
|
||||
if (++current_ky == p.KH) {
|
||||
current_ky = 0;
|
||||
current_kx++;
|
||||
}
|
||||
|
||||
@@ -50,8 +50,14 @@ void main() {
|
||||
const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
|
||||
|
||||
if (do_multiply) {
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));
|
||||
if (ncols > p.ne10) {
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)]));
|
||||
}
|
||||
} else {
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
|
||||
|
||||
@@ -655,6 +655,8 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
||||
|
||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||
|
||||
@@ -765,8 +767,8 @@ void write_output_files() {
|
||||
len += "};\n";
|
||||
}
|
||||
}
|
||||
fprintf(src, data.c_str());
|
||||
fprintf(src, len.c_str());
|
||||
fputs(data.c_str(), src);
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
|
||||
@@ -233,6 +233,11 @@ class Keys:
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
|
||||
class IMatrix:
|
||||
CHUNK_COUNT = "imatrix.chunk_count"
|
||||
CHUNK_SIZE = "imatrix.chunk_size"
|
||||
DATASETS = "imatrix.datasets"
|
||||
|
||||
class Clip:
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
@@ -282,6 +287,7 @@ class Keys:
|
||||
class GGUFType:
|
||||
MODEL = "model"
|
||||
ADAPTER = "adapter"
|
||||
IMATRIX = "imatrix"
|
||||
MMPROJ = "mmproj" # dummy, unused for now
|
||||
|
||||
|
||||
|
||||
@@ -144,6 +144,10 @@ class Metadata:
|
||||
# Quick hack to fix the Norway problem
|
||||
# https://hitchdev.com/strictyaml/why/implicit-typing-removed/
|
||||
yaml_content = yaml_content.replace("- no\n", "- \"no\"\n")
|
||||
# yaml should use 2 spaces insted of tab
|
||||
# this issue has came up with the Qwen/Qwen3-235B-A22B-Instruct-2507 model card
|
||||
# (I've also sent a pr tp fix the modelcard too)
|
||||
yaml_content = yaml_content.replace("\t", " ")
|
||||
|
||||
if yaml_content:
|
||||
data = yaml.safe_load(yaml_content)
|
||||
|
||||
@@ -1 +1 @@
|
||||
d62df60a07ba3deeb85e5cfc9b1ee07645ff35e2
|
||||
3323219cd3cc050e5c7133cd4fc1e50d1f590faf
|
||||
|
||||
@@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Write key type
|
||||
const int32_t r_type_i = (int32_t)r_l[il]->type;
|
||||
@@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Write value type
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
@@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t mem_size = size;
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Write value type
|
||||
@@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of key
|
||||
int32_t r_type_i_ref;
|
||||
@@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of value
|
||||
int32_t s_type_i_ref;
|
||||
io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
|
||||
if (s_type_i != s_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
|
||||
return false;
|
||||
@@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
} else {
|
||||
// For each layer, read the values for each cell (transposed)
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Read type of value
|
||||
|
||||
@@ -1544,7 +1544,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 12: type = LLM_TYPE_190M; break;
|
||||
case 12:
|
||||
switch (hparams.n_embd) {
|
||||
case 768: type = LLM_TYPE_190M; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
case 24:
|
||||
switch (hparams.n_embd) {
|
||||
case 1024: type = LLM_TYPE_450M; break;
|
||||
@@ -1557,7 +1561,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case 3584: type = LLM_TYPE_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
|
||||
case 32:
|
||||
switch (hparams.n_embd) {
|
||||
case 2560: type = LLM_TYPE_2_9B; break;
|
||||
case 4096: type = LLM_TYPE_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
case 61:
|
||||
switch (hparams.n_embd) {
|
||||
case 4096: type = LLM_TYPE_14B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
} break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
||||
@@ -2641,6 +2641,7 @@ struct test_rms_norm_mul_add : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float eps;
|
||||
const bool broadcast;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
@@ -2650,18 +2651,21 @@ struct test_rms_norm_mul_add : public test_case {
|
||||
bool run_whole_graph() override { return true; }
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, eps);
|
||||
return VARS_TO_STR4(type, ne, eps, broadcast);
|
||||
}
|
||||
|
||||
test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
||||
float eps = 1e-6f)
|
||||
: type(type), ne(ne), eps(eps) {}
|
||||
float eps = 1e-6f, bool broadcast = false)
|
||||
: type(type), ne(ne), eps(eps), broadcast(broadcast) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
std::array<int64_t, 4> broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4};
|
||||
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? broadcast_dims.data() : ne.data());
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * c = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
|
||||
ggml_set_param(a);
|
||||
ggml_set_name(a, "a");
|
||||
ggml_set_param(b);
|
||||
@@ -3699,6 +3703,93 @@ struct test_im2col : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// CONV_2D
|
||||
struct test_conv_2d : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
const int stride0;
|
||||
const int stride1;
|
||||
const int padding0;
|
||||
const int padding1;
|
||||
const int dilation0;
|
||||
const int dilation1;
|
||||
// Whether the inputs are contiguous in the channel dim or the width dim
|
||||
const bool cwhn;
|
||||
|
||||
// If true, the direct CONV_2D will be used in the graph, otherwise it
|
||||
// uses ggml_conv_2d:
|
||||
// * if the program is called with -o CONV_2D_DIRECT_IMPL, the
|
||||
// CONV_2D graph will be built, while
|
||||
// * if the program is called with -o CONV_2D_INDIRECT_IMPL, the
|
||||
// IM2COL -> MUL_MM graph will be built.
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR9(ne_input, ne_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
|
||||
}
|
||||
|
||||
uint64_t op_flops(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
// Just counting matmul costs:
|
||||
// KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops
|
||||
|
||||
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
|
||||
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
||||
};
|
||||
|
||||
int64_t W = ne_input[0];
|
||||
int64_t H = ne_input[1];
|
||||
int64_t KW = ne_kernel[0];
|
||||
int64_t KH = ne_kernel[1];
|
||||
int64_t Cin = ne_kernel[2];
|
||||
int64_t Cout = ne_kernel[3];
|
||||
int64_t N = ne_input[3];
|
||||
int64_t OH = calc_conv_output_size(H, KH, stride0, padding0, dilation0);
|
||||
int64_t OW = calc_conv_output_size(W, KW, stride0, padding0, dilation0);
|
||||
|
||||
int64_t K = Cout;
|
||||
int64_t CRS = Cin * KH * KW;
|
||||
int64_t NPQ = N * OH * OW;
|
||||
|
||||
return K * NPQ * (2 * CRS - 1);
|
||||
}
|
||||
|
||||
test_conv_2d(std::array<int64_t, 4> ne_input = { 64, 64, 16, 1 },
|
||||
std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, int stride0 = 1, int stride1 = 1, int padding0 = 0,
|
||||
int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
|
||||
ne_input(ne_input),
|
||||
ne_kernel(ne_kernel),
|
||||
stride0(stride0),
|
||||
stride1(stride1),
|
||||
padding0(padding0),
|
||||
padding1(padding1),
|
||||
dilation0(dilation0),
|
||||
dilation1(dilation1),
|
||||
cwhn(cwhn) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
||||
ggml_set_name(input, "input");
|
||||
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
||||
ggml_set_name(kernel, "kernel");
|
||||
|
||||
if (cwhn) {
|
||||
// change memory layout to channel-most-contiguous (CWHN),
|
||||
// then permute it back so NE matches the original input
|
||||
input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
|
||||
input = ggml_permute(ctx, input, 2, 0, 1, 3);
|
||||
kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
|
||||
kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
|
||||
}
|
||||
|
||||
ggml_tensor * out =
|
||||
ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
|
||||
ggml_set_name(out, "out");
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONV_2D_DW
|
||||
struct test_conv_2d_dw : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
@@ -4275,26 +4366,32 @@ struct test_flash_attn_ext : public test_case {
|
||||
const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV));
|
||||
const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV));
|
||||
|
||||
auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * {
|
||||
auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, bool is_view) -> ggml_tensor * {
|
||||
int64_t ne[4] = {ne0, ne1, ne2, ne3};
|
||||
int64_t ne_perm[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
ne_perm[permute[i]] = ne[i];
|
||||
}
|
||||
ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
ggml_tensor * t;
|
||||
if (is_view) {
|
||||
ggml_tensor * t0 = ggml_new_tensor_4d(ctx, type, ne_perm[0], 2*ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
t = ggml_view_4d(ctx, t0, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3], t0->nb[1], t0->nb[2], t0->nb[3], 0);
|
||||
} else {
|
||||
t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
}
|
||||
if (permute != std::array<int32_t, 4>{0, 1, 2, 3}) {
|
||||
t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]);
|
||||
}
|
||||
return t;
|
||||
};
|
||||
|
||||
ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr23[0], nr23[1]);
|
||||
ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr23[0], nr23[1], false);
|
||||
ggml_set_name(q, "q");
|
||||
|
||||
ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1]);
|
||||
ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1], true); // the K tensor is usually a view of the K cache
|
||||
ggml_set_name(k, "k");
|
||||
|
||||
ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1]);
|
||||
ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache
|
||||
ggml_set_name(v, "v");
|
||||
|
||||
ggml_tensor * m = nullptr;
|
||||
@@ -5006,6 +5103,81 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true));
|
||||
|
||||
// Conv_2D test cases
|
||||
#ifdef DETAILED_TESTS
|
||||
// Probably we do not have enough time to execute these in the pipeline.
|
||||
uint32_t iwh_idx = 0;
|
||||
uint32_t kwh_idx = 1;
|
||||
uint32_t Cout_idx = 2;
|
||||
uint32_t Cin_idx = 3;
|
||||
uint32_t B_idx = 4;
|
||||
|
||||
std::vector<std::array<int, 5>> cases = {
|
||||
//{IWH, KWH, Cout, Cin, B}
|
||||
// K=CRS=NPQ=4096 conv_2d matmul performance
|
||||
{19, 4, 4096, 256, 16},
|
||||
// K=128, CRS=128, NPQ=4096
|
||||
{ 19, 4, 128, 8, 16},
|
||||
// K=130, CRS=128, NPQ=4096
|
||||
{ 19, 4, 130, 8, 16},
|
||||
// Edge case: K x CRS is small
|
||||
{ 19, 2, 4, 4, 16},
|
||||
// A ConvNet's first layer
|
||||
{ 224, 3, 8, 3, 1 },
|
||||
// A ConvNet's first layer with 2x2 convolution, and 1 channel
|
||||
{ 224, 2, 8, 1, 1 },
|
||||
// A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
|
||||
{ 224, 2, 8, 1, 8 },
|
||||
// A middle layer of a ConvNet
|
||||
{ 58, 3, 64, 32, 1 },
|
||||
// A middle layer of a ConvNet, several images in the batch
|
||||
{ 58, 3, 64, 32, 8 },
|
||||
// A deep layer of a ConvNet, several images in the batch
|
||||
{ 16, 3, 256, 128, 8 }
|
||||
};
|
||||
|
||||
for (auto act_case : cases) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
|
||||
}
|
||||
#endif
|
||||
|
||||
// CONV_2D:
|
||||
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
||||
};
|
||||
|
||||
//uint32_t s0 = 3;
|
||||
uint32_t s1 = 5;
|
||||
uint32_t p0 = 5;
|
||||
//uint32_t p1 = 2;
|
||||
uint32_t d0 = 2;
|
||||
uint32_t d1 = 4;
|
||||
|
||||
for (uint32_t s0 : { 1, 3 }) {
|
||||
for (uint32_t p1 : { 2, 5 }) {
|
||||
for (uint32_t Cin : { 1, 25 }) {
|
||||
for (uint32_t Cout : { 1, 12 }) {
|
||||
for (uint32_t KH : { 1, 2, 3, 11 }) {
|
||||
for (uint32_t KW : { 1, 2, 3, 11 }) {
|
||||
for (uint32_t H : { 1, 133 }) {
|
||||
for (uint32_t W : { 1, 141 }) {
|
||||
if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
|
||||
calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ W, H, Cin, 2 }, { KW, KH, Cin, Cout }, s0, s1, p0, p1, d0, d1, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sycl backend will limit task global_range < MAX_INT
|
||||
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
||||
@@ -5192,6 +5364,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
|
||||
@@ -5610,6 +5783,43 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
// Conv2d: K=CRS=NPQ=4096 matmul performance
|
||||
uint32_t iwh_idx = 0;
|
||||
uint32_t kwh_idx = 1;
|
||||
uint32_t Cout_idx = 2;
|
||||
uint32_t Cin_idx = 3;
|
||||
uint32_t B_idx = 4;
|
||||
std::vector<std::array<int, 5>> cases = {
|
||||
//{IWH, KWH, Cout, Cin, B}
|
||||
// K=CRS=NPQ=4096 conv2d matmul performance
|
||||
{19, 4, 4096, 256, 16},
|
||||
// K=128, CRS=128, NPQ=4096
|
||||
{ 19, 4, 128, 8, 16},
|
||||
// K=130, CRS=128, NPQ=4096
|
||||
{ 19, 4, 130, 8, 16},
|
||||
// Edge case: K x CRS is small
|
||||
{ 19, 2, 4, 4, 16},
|
||||
// A ConvNet's first layer
|
||||
{ 224, 3, 8, 3, 1 },
|
||||
// A ConvNet's first layer with 2x2 convolution, and 1 channel
|
||||
{ 224, 2, 8, 1, 1 },
|
||||
// A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
|
||||
{ 224, 2, 8, 1, 8 },
|
||||
// A middle layer of a ConvNet
|
||||
{ 58, 3, 64, 32, 1 },
|
||||
// A middle layer of a ConvNet, several images in the batch
|
||||
{ 58, 3, 64, 32, 8 },
|
||||
// A deep layer of a ConvNet, several images in the batch
|
||||
{ 16, 3, 512, 128, 8 },
|
||||
};
|
||||
|
||||
for (auto act_case : cases) {
|
||||
// Direct CONV_2D
|
||||
test_cases.emplace_back(new test_conv_2d(
|
||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
|
||||
@@ -1,33 +1,92 @@
|
||||
# llama.cpp/tools/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
|
||||
More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
|
||||
More information is available in <https://github.com/ggml-org/llama.cpp/pull/4861>.
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
./llama-imatrix \
|
||||
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
|
||||
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
|
||||
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
|
||||
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
|
||||
[--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
|
||||
[--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
|
||||
[--show-statistics] [...]
|
||||
```
|
||||
|
||||
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
|
||||
Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as e.g. `wiki.train.raw`) are mandatory.
|
||||
The parameters in square brackets are optional and have the following meaning:
|
||||
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
|
||||
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||
* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
||||
|
||||
* `-h | --help` shows usage information and exits.
|
||||
* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||
* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
|
||||
* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
||||
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
|
||||
* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||
* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||
* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
|
||||
* `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers.
|
||||
* `--chunk | --from-chunk` to skip the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data.
|
||||
* `--chunks` maximum number of chunks to process. Default is -1 for all available chunks.
|
||||
* `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up the processing and do not care about perplexity.
|
||||
* `--show-statistics` displays imatrix file's statistics.
|
||||
|
||||
For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||
For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument.
|
||||
|
||||
## Example
|
||||
Recent versions of `llama-imatrix` store data in GGUF format by default. For the legacy format, use an extension other than `.gguf` when saving the output file. More information is available in <https://github.com/ggml-org/llama.cpp/pull/9400>.
|
||||
|
||||
## Examples
|
||||
|
||||
```bash
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
# generate importance matrix using default filename (imatrix.gguf), offloading 99 layers to GPU
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -ngl 99
|
||||
|
||||
# use the imatrix to perform a Q4_K_M quantization
|
||||
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
./llama-quantize --imatrix imatrix.gguf ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
```
|
||||
|
||||
```bash
|
||||
# generate and save the imatrix using legacy format
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
|
||||
```
|
||||
|
||||
```bash
|
||||
# covert legacy (binary) imatrix format to new (GGUF) format
|
||||
./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
|
||||
```
|
||||
|
||||
```bash
|
||||
# combine existing imatrices
|
||||
./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
|
||||
```
|
||||
|
||||
```bash
|
||||
# skip first 5 chunks, save intermediates every 20 chunks and snapshots every 50, parsing special tokens
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --chunk 5 --output-frequency 20 --save-frequency 50 --parse-special
|
||||
```
|
||||
|
||||
```bash
|
||||
# analyse imatrix file and display summary statistics instead of running inference
|
||||
./llama-imatrix --in-file imatrix.gguf --show-statistics
|
||||
```
|
||||
|
||||
`--show-statistics` will display the following statistics:
|
||||
|
||||
#### Per tensor
|
||||
|
||||
* Σ(Act²): sum of all squared activations (the importance scores)
|
||||
* Min & Max: minimum and maximum squared activations values
|
||||
* μ & σ: Squared activations' mean and standard deviation
|
||||
* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference
|
||||
* N: number of squared activations
|
||||
* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy measurement) $S = -\sum_{i=1}^N p_i \log_2 p_i$
|
||||
* E (norm): Normalized entropy. $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{log_2 N}$. These two metrics can be used to determine how well a prompt "exercises" the model's capabilities
|
||||
* ZD Score: z-score distribution as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415)
|
||||
* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations.
|
||||
|
||||
#### Per layer
|
||||
|
||||
Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated.
|
||||
|
||||
#### Important note on the computed Statistics
|
||||
|
||||
When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**.
|
||||
Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -785,14 +785,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = common_sampler_last(smpl);
|
||||
for (auto token : antiprompt_token) {
|
||||
if (token == last_token) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
// avoid calling common_sampler_last() if last_output is empty
|
||||
if (!last_output.empty()) {
|
||||
llama_token last_token = common_sampler_last(smpl);
|
||||
for (auto token : antiprompt_token) {
|
||||
if (token == last_token) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -367,8 +367,8 @@ struct clip_ctx {
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
|
||||
ggml_backend_t backend;
|
||||
ggml_backend_t backend_cpu;
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
|
||||
int max_nodes = 8192;
|
||||
@@ -384,9 +384,18 @@ struct clip_ctx {
|
||||
if (!backend_cpu) {
|
||||
throw std::runtime_error("failed to initialize CPU backend");
|
||||
}
|
||||
backend = ctx_params.use_gpu
|
||||
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
|
||||
: nullptr;
|
||||
if (ctx_params.use_gpu) {
|
||||
auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
|
||||
if (backend_name != nullptr) {
|
||||
backend = ggml_backend_init_by_name(backend_name, nullptr);
|
||||
if (!backend) {
|
||||
LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
|
||||
}
|
||||
}
|
||||
if (!backend) {
|
||||
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
if (backend) {
|
||||
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <cctype>
|
||||
@@ -68,6 +70,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
|
||||
|
||||
// TODO: share with imatrix.cpp
|
||||
static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets";
|
||||
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
|
||||
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size";
|
||||
|
||||
static bool striequals(const char * a, const char * b) {
|
||||
while (*a && *b) {
|
||||
if (std::tolower(*a) != std::tolower(*b)) {
|
||||
@@ -84,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
for (auto ch : ftype_str_in) {
|
||||
ftype_str.push_back(std::toupper(ch));
|
||||
}
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
for (const auto & it : QUANT_OPTIONS) {
|
||||
if (striequals(it.name.c_str(), ftype_str.c_str())) {
|
||||
ftype = it.ftype;
|
||||
ftype_str_out = it.name;
|
||||
@@ -93,7 +100,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
}
|
||||
try {
|
||||
int ftype_int = std::stoi(ftype_str);
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
for (const auto & it : QUANT_OPTIONS) {
|
||||
if (it.ftype == ftype_int) {
|
||||
ftype = it.ftype;
|
||||
ftype_str_out = it.name;
|
||||
@@ -129,7 +136,7 @@ static void usage(const char * executable) {
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
for (const auto & it : QUANT_OPTIONS) {
|
||||
if (it.name != "COPY") {
|
||||
printf(" %2d or ", it.ftype);
|
||||
} else {
|
||||
@@ -140,7 +147,7 @@ static void usage(const char * executable) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
|
||||
if (!in) {
|
||||
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
|
||||
@@ -180,7 +187,9 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
|
||||
exit(1);
|
||||
}
|
||||
if (ncall > 0) {
|
||||
for (auto& v : e) v /= ncall;
|
||||
for (auto & v : e) {
|
||||
v /= ncall;
|
||||
}
|
||||
}
|
||||
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
@@ -188,7 +197,7 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
|
||||
}
|
||||
}
|
||||
|
||||
// latest imatrix version contains the dataset filename at the end of the file
|
||||
// latest legacy imatrix version contains the dataset filename at the end of the file
|
||||
int m_last_call = 0;
|
||||
if (in.peek() != EOF) {
|
||||
in.read((char *)&m_last_call, sizeof(m_last_call));
|
||||
@@ -196,15 +205,130 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
|
||||
in.read((char *)&dataset_len, sizeof(dataset_len));
|
||||
std::vector<char> dataset_as_vec(dataset_len);
|
||||
in.read(dataset_as_vec.data(), dataset_len);
|
||||
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
|
||||
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
|
||||
imatrix_datasets.resize(1);
|
||||
imatrix_datasets[0].assign(dataset_as_vec.begin(), dataset_as_vec.end());
|
||||
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_datasets[0].c_str());
|
||||
}
|
||||
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
|
||||
return m_last_call;
|
||||
}
|
||||
|
||||
static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
|
||||
struct ggml_context * ctx = nullptr;
|
||||
struct gguf_init_params meta_gguf_params = {
|
||||
/* .no_alloc = */ false, // the data is needed
|
||||
/* .ctx = */ &ctx,
|
||||
};
|
||||
struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
|
||||
return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
|
||||
}
|
||||
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
|
||||
if (n_entries < 1) {
|
||||
fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const int dataset_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
|
||||
const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
|
||||
const int chunk_size_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
|
||||
if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
|
||||
fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
|
||||
|
||||
const std::string sums_suffix{ ".in_sum2" };
|
||||
const std::string counts_suffix{ ".counts" };
|
||||
|
||||
// Using an ordered map to get a deterministic iteration order.
|
||||
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string name = cur->name;
|
||||
|
||||
if (name.empty()) { continue; }
|
||||
|
||||
if (string_remove_suffix(name, sums_suffix)) {
|
||||
// in_sum2
|
||||
sums_counts_for[std::move(name)].first = cur;
|
||||
} else if (string_remove_suffix(name, counts_suffix)) {
|
||||
// counts
|
||||
sums_counts_for[std::move(name)].second = cur;
|
||||
} else {
|
||||
// ignore other tensors
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & sc : sums_counts_for) {
|
||||
const std::string & name = sc.first;
|
||||
const struct ggml_tensor * sums = sc.second.first;
|
||||
const struct ggml_tensor * counts = sc.second.second;
|
||||
|
||||
if (!sums || !counts) {
|
||||
fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const int64_t ne0 = sums->ne[0];
|
||||
const int64_t ne1 = sums->ne[1];
|
||||
|
||||
auto & e = imatrix_data[name];
|
||||
e.resize(ggml_nelements(sums));
|
||||
float max_count = 0.0f;
|
||||
for (int64_t j = 0; j < ne1; ++j) {
|
||||
const float count = ((const float *) counts->data)[j];
|
||||
if (count > 0.0f) {
|
||||
for (int64_t i = 0; i < ne0; ++i) {
|
||||
e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
|
||||
}
|
||||
} else {
|
||||
// Partial imatrix data, this tensor never got any input during calibration
|
||||
for (int64_t i = 0; i < ne0; ++i) {
|
||||
e[j*ne0 + i] = 1;
|
||||
}
|
||||
}
|
||||
if (count > max_count) {
|
||||
max_count = count;
|
||||
}
|
||||
}
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
|
||||
|
||||
int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
|
||||
imatrix_datasets.reserve(n_datasets);
|
||||
for (int64_t i = 0; i < n_datasets; ++i) {
|
||||
imatrix_datasets.push_back(gguf_get_val_str(ctx_gguf, dataset_idx));
|
||||
}
|
||||
printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str());
|
||||
for (size_t i = 1; i < imatrix_datasets.size(); ++i) {
|
||||
printf(", '%s'", imatrix_datasets[i].c_str());
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
|
||||
return m_last_chunk;
|
||||
}
|
||||
|
||||
static int prepare_imatrix(const std::string & imatrix_file,
|
||||
std::string & imatrix_dataset,
|
||||
std::vector<std::string> & imatrix_dataset,
|
||||
const std::vector<std::string> & included_weights,
|
||||
const std::vector<std::string> & excluded_weights,
|
||||
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
@@ -216,18 +340,21 @@ static int prepare_imatrix(const std::string & imatrix_file,
|
||||
return m_last_call;
|
||||
}
|
||||
if (!excluded_weights.empty()) {
|
||||
for (auto& name : excluded_weights) {
|
||||
for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
|
||||
for (const auto & name : excluded_weights) {
|
||||
for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
|
||||
auto pos = it->first.find(name);
|
||||
if (pos != std::string::npos) it = imatrix_data.erase(it);
|
||||
else ++it;
|
||||
if (pos != std::string::npos) {
|
||||
it = imatrix_data.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!included_weights.empty()) {
|
||||
std::unordered_map<std::string, std::vector<float>> tmp;
|
||||
for (auto& name : included_weights) {
|
||||
for (auto& e : imatrix_data) {
|
||||
for (const auto & name : included_weights) {
|
||||
for (auto & e : imatrix_data) {
|
||||
auto pos = e.first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
tmp.emplace(std::move(e));
|
||||
@@ -396,9 +523,9 @@ int main(int argc, char ** argv) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
std::string imatrix_dataset;
|
||||
std::vector<std::string> imatrix_datasets;
|
||||
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
|
||||
if (!imatrix_data.empty()) {
|
||||
params.imatrix = &imatrix_data;
|
||||
{
|
||||
@@ -409,11 +536,12 @@ int main(int argc, char ** argv) {
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
if (!imatrix_dataset.empty()) {
|
||||
if (!imatrix_datasets.empty()) {
|
||||
llama_model_kv_override kvo;
|
||||
// TODO: list multiple datasets when there are more than one
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
||||
strncpy(kvo.val_str, imatrix_datasets[0].c_str(), 127);
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
@@ -575,6 +575,8 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
`parse_special`: (Optional) Boolean indicating if special tokens should be tokenized. When `false` special tokens are treated as plaintext. Default: `true`
|
||||
|
||||
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
|
||||
|
||||
**Response:**
|
||||
|
||||
@@ -253,6 +253,7 @@ struct server_task {
|
||||
defaults.sampling = params_base.sampling;
|
||||
defaults.speculative = params_base.speculative;
|
||||
defaults.n_keep = params_base.n_keep;
|
||||
defaults.antiprompt = params_base.antiprompt;
|
||||
|
||||
// enabling this will output extra debug information in the HTTP responses from the server
|
||||
params.verbose = params_base.verbosity > 9;
|
||||
@@ -490,6 +491,10 @@ struct server_task {
|
||||
}
|
||||
}
|
||||
}
|
||||
// set reverse prompt from cli args if not set in the request
|
||||
if (params.antiprompt.empty()) {
|
||||
params.antiprompt = defaults.antiprompt;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -4516,9 +4521,10 @@ int main(int argc, char ** argv) {
|
||||
json tokens_response = json::array();
|
||||
if (body.count("content") != 0) {
|
||||
const bool add_special = json_value(body, "add_special", false);
|
||||
const bool parse_special = json_value(body, "parse_special", true);
|
||||
const bool with_pieces = json_value(body, "with_pieces", false);
|
||||
|
||||
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
|
||||
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);
|
||||
|
||||
if (with_pieces) {
|
||||
for (const auto& token : tokens) {
|
||||
|
||||
Reference in New Issue
Block a user