mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-08 18:14:07 +00:00
Compare commits
33 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5eae934883 | ||
|
|
05c0380f2a | ||
|
|
8c3fdf44ec | ||
|
|
f6da8cb86a | ||
|
|
8a2234ea0c | ||
|
|
3de008208b | ||
|
|
69db8a52e6 | ||
|
|
c466abe158 | ||
|
|
0a2a3841e8 | ||
|
|
9961d244f2 | ||
|
|
25f1045f07 | ||
|
|
97669e4073 | ||
|
|
2f853687b3 | ||
|
|
ef2af57ddf | ||
|
|
5d804a4938 | ||
|
|
d4d8dbe383 | ||
|
|
35a42edac8 | ||
|
|
fec7911f8f | ||
|
|
078ce23ea7 | ||
|
|
a0c2b207c5 | ||
|
|
4b20d8b7e3 | ||
|
|
02c1813517 | ||
|
|
77dee9de97 | ||
|
|
4795c91c32 | ||
|
|
b66df9d9c9 | ||
|
|
b9382c3877 | ||
|
|
3dc7397a27 | ||
|
|
e92d53b29e | ||
|
|
0d161f021a | ||
|
|
4efd5a8316 | ||
|
|
274966226f | ||
|
|
9777032dcc | ||
|
|
7d3c9f2b21 |
@@ -22,7 +22,7 @@ AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BinPackArguments: false
|
||||
BinPackArguments: true
|
||||
BinPackParameters: false # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
BreakBeforeBraces: Custom # Attach
|
||||
|
||||
@@ -137,6 +137,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
||||
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
|
||||
20
ci/run.sh
20
ci/run.sh
@@ -386,10 +386,10 @@ function gg_run_open_llama_7b_v2 {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -520,8 +520,8 @@ function gg_run_pythia_1_4b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -651,10 +651,10 @@ function gg_run_pythia_2_8b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
||||
@@ -1548,11 +1548,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-fa", "--flash-attn"}, "FA",
|
||||
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "on" || value == "enabled") {
|
||||
if (value == "on" || value == "enabled" || value == "1") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||
} else if (value == "off" || value == "disabled") {
|
||||
} else if (value == "off" || value == "disabled" || value == "0") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
} else if (value == "auto") {
|
||||
} else if (value == "auto" || value == "-1") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
||||
@@ -2962,13 +2962,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.endpoint_metrics = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
||||
add_opt(common_arg(
|
||||
{"--slots"},
|
||||
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.endpoint_slots = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
||||
add_opt(common_arg(
|
||||
{"--props"},
|
||||
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
||||
@@ -2976,6 +2969,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.endpoint_props = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
||||
add_opt(common_arg(
|
||||
{"--slots"},
|
||||
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.endpoint_slots = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
||||
add_opt(common_arg(
|
||||
{"--no-slots"},
|
||||
"disables slots monitoring endpoint",
|
||||
|
||||
@@ -444,7 +444,7 @@ struct common_params {
|
||||
|
||||
// "advanced" endpoints are disabled by default for better security
|
||||
bool webui = true;
|
||||
bool endpoint_slots = false;
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
|
||||
@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||
return &gsmpl->cur_p;
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||
auto * res = &gsmpl->cur_p;
|
||||
|
||||
if (do_sort && !res->sorted) {
|
||||
// remember the selected token before sorting
|
||||
const llama_token id = res->data[res->selected].id;
|
||||
|
||||
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.p > b.p;
|
||||
});
|
||||
|
||||
// restore the selected token after sorting
|
||||
for (size_t i = 0; i < res->size; ++i) {
|
||||
if (res->data[i].id == id) {
|
||||
res->selected = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
res->sorted = true;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||
|
||||
@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||
// helpers
|
||||
|
||||
// access the internal list of current candidate tokens
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
|
||||
// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
|
||||
// the .sorted flag of the result indicates whether the returned candidates are sorted
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
|
||||
|
||||
// get the last accepted token
|
||||
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
||||
|
||||
@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl);
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
|
||||
@@ -302,10 +302,6 @@ class ModelBase:
|
||||
# data = data_torch.squeeze().numpy()
|
||||
data = data_torch.numpy()
|
||||
|
||||
# if data ends up empty, it means data_torch was a scalar tensor -> restore
|
||||
if len(data.shape) == 0:
|
||||
data = data_torch.numpy()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@@ -314,3 +314,7 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
|
||||
|
||||
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
|
||||
|
||||
### GGML_CANN_DISABLE_ACL_GRAPH
|
||||
|
||||
When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
|
||||
This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
|
||||
|
||||
6
examples/model-conversion/scripts/utils/curl-embedding-server.sh
Executable file
6
examples/model-conversion/scripts/utils/curl-embedding-server.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/embedding \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"input": "Hello world today"}' \
|
||||
--silent
|
||||
@@ -244,7 +244,7 @@ int main(int argc, char ** argv) {
|
||||
// stochastic verification
|
||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||
|
||||
auto & dist_tgt = *common_sampler_get_candidates(smpl);
|
||||
auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
|
||||
|
||||
float p_tgt = 0.0f;
|
||||
float p_dft = 0.0f;
|
||||
@@ -493,7 +493,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
|
||||
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
|
||||
|
||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
|
||||
@@ -129,7 +129,9 @@ endif()
|
||||
option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
|
||||
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
|
||||
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
||||
|
||||
@@ -307,6 +307,9 @@ extern "C" {
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
|
||||
// Split graph without allocating it
|
||||
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
|
||||
// Allocate and compute graph on the backend scheduler
|
||||
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
|
||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
|
||||
@@ -651,7 +651,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_COPIES
|
||||
@@ -902,7 +902,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
|
||||
}
|
||||
|
||||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
// reset splits
|
||||
sched->n_splits = 0;
|
||||
sched->n_graph_inputs = 0;
|
||||
@@ -1687,6 +1687,8 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
@@ -70,6 +70,8 @@
|
||||
#include <aclnnop/aclnn_zero.h>
|
||||
#include <aclnnop/aclnn_index_copy.h>
|
||||
#include <aclnnop/aclnn_index_select.h>
|
||||
#include <aclnnop/aclnn_clamp.h>
|
||||
#include <aclnnop/aclnn_threshold.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -964,8 +966,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
}
|
||||
aclTensor* acl_gamma = get_f32_cache_acl_tensor(
|
||||
ctx,
|
||||
&ctx.f32_one_cache,
|
||||
ctx.f32_one_cache_element,
|
||||
&ctx.rms_norm_one_tensor_cache.cache,
|
||||
ctx.rms_norm_one_tensor_cache.size,
|
||||
src->ne,
|
||||
acl_gamma_nb,
|
||||
1, // dims
|
||||
@@ -980,8 +982,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
}
|
||||
aclTensor* acl_rstd = get_f32_cache_acl_tensor(
|
||||
ctx,
|
||||
&ctx.f32_zero_cache,
|
||||
ctx.f32_zero_cache_element,
|
||||
&ctx.rms_norm_zero_tensor_cache.cache,
|
||||
ctx.rms_norm_zero_tensor_cache.size,
|
||||
src->ne,
|
||||
acl_rstd_nb,
|
||||
GGML_MAX_DIMS,
|
||||
@@ -1423,21 +1425,25 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
||||
* @param start Starting exponent offset.
|
||||
* @param stop Stopping exponent offset (exclusive).
|
||||
* @param step Step size for the exponent increment.
|
||||
* @param dtype Data type for slope tensor.
|
||||
*/
|
||||
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
|
||||
float m, int64_t size, float start, float stop, float step){
|
||||
int64_t ne[] = {size};
|
||||
size_t nb[] = {sizeof(uint16_t)};
|
||||
float m, int64_t size, float start, float stop, float step, ggml_type dtype){
|
||||
aclDataType acl_type = ggml_cann_type_mapping(dtype);
|
||||
size_t type_size = ggml_type_size(dtype);
|
||||
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
|
||||
int64_t ne[] = {size};
|
||||
size_t nb[] = {type_size};
|
||||
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
|
||||
void* arange_buffer = arange_allocator.get();
|
||||
|
||||
aclTensor* arange_tensor = ggml_cann_create_tensor(
|
||||
arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
|
||||
arange_buffer, acl_type, type_size, ne, nb, 1);
|
||||
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
|
||||
|
||||
aclTensor* slope_tensor = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
|
||||
slope_buffer, acl_type, type_size, ne, nb, 1);
|
||||
|
||||
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
|
||||
|
||||
@@ -1468,10 +1474,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
|
||||
* @param n_head Total number of attention heads.
|
||||
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
|
||||
* @param max_bias Maximum bias value for slope computation.
|
||||
* @param dtype Data type for slope tensor.
|
||||
*
|
||||
*/
|
||||
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
||||
void* slope_buffer, float max_bias) {
|
||||
void* slope_buffer, float max_bias, ggml_type dtype) {
|
||||
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
||||
|
||||
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
|
||||
@@ -1488,7 +1495,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
||||
float step = 1;
|
||||
float count = n_head_log2;
|
||||
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
|
||||
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
|
||||
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
|
||||
if (n_head_log2 < n_head) {
|
||||
// arange2
|
||||
start = 2 * (n_head_log2 - n_head_log2) + 1;
|
||||
@@ -1497,7 +1504,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(
|
||||
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
|
||||
m1, count, start, end + 1, step);
|
||||
m1, count, start, end + 1, step, dtype);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1534,7 +1541,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
||||
ggml_cann_pool_alloc bias_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
|
||||
bias_buffer = bias_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
// broadcast for mask, slop and dst;
|
||||
@@ -1760,10 +1767,10 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
case GGML_TYPE_F16: {
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(float_t);
|
||||
src_trans_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
@@ -1807,14 +1814,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
dequant_ne = weight_ne;
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
dequant_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float));
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
|
||||
@@ -1823,11 +1830,11 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
aclTensor* dequant_tensor = ggml_cann_create_tensor(
|
||||
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
|
||||
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
|
||||
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
|
||||
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
dequant_nb[0] = sizeof(float);
|
||||
dequant_ne = src0->ne;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
@@ -2248,46 +2255,35 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
|
||||
* 5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
|
||||
* 6. Expand sin/cos values by repeat or repeat_interleave depending
|
||||
* on whether @param is_neox is enabled.
|
||||
* 7. Store the computed values into persistent buffers
|
||||
* (ctx.rope_sin_ptr / ctx.rope_cos_ptr).
|
||||
*
|
||||
* @param ctx The CANN backend context, holding memory pool,
|
||||
* stream, and persistent buffers for rope init/cache.
|
||||
* @param dst The destination ggml_tensor whose computation
|
||||
* depends on the cached RoPE values (usually Qcur/Kcur).
|
||||
* @param theta_scale Scalar exponent base for computing theta scale values.
|
||||
* @param freq_scale Frequency scaling factor, applied to theta scale.
|
||||
* @param attn_factor Attention scaling factor, applied to sin/cos.
|
||||
* @param is_neox Whether to use Neox-style repeat strategy
|
||||
* (dim expansion vs repeat_interleave).
|
||||
* @param ctx The CANN backend context, holding memory pool,
|
||||
* stream, and persistent buffers for rope init/cache.
|
||||
* @param dst The destination ggml_tensor whose computation
|
||||
* depends on the RoPE values (usually Qcur/Kcur).
|
||||
* @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values.
|
||||
* @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values.
|
||||
* @param theta_scale Scalar exponent base for computing theta scale values.
|
||||
* @param freq_scale Frequency scaling factor, applied to theta scale.
|
||||
* @param attn_factor Attention scaling factor, applied to sin/cos.
|
||||
* @param is_neox Whether to use Neox-style repeat strategy
|
||||
* (dim expansion vs repeat_interleave).
|
||||
*/
|
||||
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
void* sin_tensor_buffer, void* cos_tensor_buffer,
|
||||
float* corr_dims, float ext_factor,
|
||||
float theta_scale, float freq_scale,
|
||||
float attn_factor, bool is_neox) {
|
||||
// int sin/cos cache, cache has different repeat method depond on
|
||||
// @param.is_neox
|
||||
bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0);
|
||||
bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0);
|
||||
|
||||
// used for accuracy testing
|
||||
bool is_attention = is_q || is_k;
|
||||
|
||||
// just compute in first layer in attention
|
||||
bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0);
|
||||
if(is_attention && !is_fisrt_layer) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src1 = dst->src[1]; // position
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
int64_t theta_scale_length = ne00 / 2;
|
||||
int64_t theta_scale_length = src0->ne[0] / 2;
|
||||
int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
|
||||
size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
|
||||
theta_scale_length * sizeof(float_t)};
|
||||
size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
|
||||
theta_scale_length * sizeof(float)};
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
int64_t position_length = src1->ne[0];
|
||||
@@ -2297,65 +2293,115 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
|
||||
int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
|
||||
size_t theta_nb[GGML_MAX_DIMS];
|
||||
theta_nb[0] = sizeof(float_t);
|
||||
theta_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
|
||||
}
|
||||
|
||||
// init theta scale, just one time
|
||||
if(ctx.rope_init_ptr == nullptr || !is_attention) {
|
||||
// theta_scale arange, [0,1,...,ne00/2 - 1]
|
||||
if(ctx.rope_init_ptr != nullptr){
|
||||
ACL_CHECK(aclrtFree(ctx.rope_init_ptr));
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&ctx.rope_init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
// theta_scale arange, [0,1,...,ne00/2 - 1]
|
||||
aclTensor* acl_theta_scale_tensor = nullptr;
|
||||
// cache theta scale
|
||||
if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
|
||||
// theta_scale and freq_scale should not change during the current token inference process,
|
||||
// so we can directly use == here instead of comparing the absolute difference.
|
||||
ctx.rope_cache.theta_scale != theta_scale ||
|
||||
ctx.rope_cache.freq_scale != freq_scale) {
|
||||
|
||||
aclTensor* acl_theta_scale_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ctx.rope_cache.theta_scale_length = theta_scale_length;
|
||||
ctx.rope_cache.theta_scale = theta_scale;
|
||||
ctx.rope_cache.freq_scale = freq_scale;
|
||||
|
||||
if (ctx.rope_cache.theta_scale_cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
|
||||
acl_theta_scale_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
|
||||
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
|
||||
float start = 0;
|
||||
float step = 1;
|
||||
float stop = ne00 / 2;
|
||||
float n_elements = ne00 / 2;
|
||||
float stop = theta_scale_length;
|
||||
float n_elements = theta_scale_length;
|
||||
aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
|
||||
|
||||
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
|
||||
aclTensor* acl_yarn_ramp_tensor = nullptr;
|
||||
if (ext_factor != 0) {
|
||||
// -rope_yarn_ramp
|
||||
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
||||
// return MIN(1, MAX(0, y)) - 1;
|
||||
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
|
||||
void* yarn_ramp_buffer = yarn_ramp_allocator.get();
|
||||
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
|
||||
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
float zero_value = 0, one_value = 1;
|
||||
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
|
||||
aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
|
||||
aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
|
||||
aclScalar* one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
|
||||
aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
|
||||
aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor, zero, zero);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor, one);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor, one, one);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, ext_factor_sc);
|
||||
|
||||
// theta_interp = freq_scale * theta_extrap;
|
||||
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
|
||||
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
|
||||
//
|
||||
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
|
||||
// cache freq_scale + (freq_scale - 1) * ramp_mix
|
||||
float freq_scale_1 = freq_scale - 1;
|
||||
aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
|
||||
aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one);
|
||||
|
||||
ggml_cann_release_resources(ctx, low, zero, one, denom_safe, ext_factor_sc, freq_scale_sc, freq_scale_1_sc);
|
||||
}
|
||||
|
||||
// power
|
||||
aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
|
||||
acl_theta_scale_tensor);
|
||||
|
||||
// freq_scale
|
||||
if (freq_scale != 1) {
|
||||
if (ext_factor != 0) {
|
||||
aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor);
|
||||
} else if (freq_scale != 1) {
|
||||
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
|
||||
}
|
||||
|
||||
// freq_factors
|
||||
if (src2) {
|
||||
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
|
||||
src2->data, ggml_cann_type_mapping(src2->type),
|
||||
ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
|
||||
ggml_cann_release_resources(ctx, acl_freq_factors_tensor);
|
||||
}
|
||||
// release
|
||||
ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale);
|
||||
}
|
||||
|
||||
// init sin_repeat && cos_repeat, one token just init in 0 layer
|
||||
if(position_length > ctx.max_prompt_length) {
|
||||
ctx.max_prompt_length = position_length;
|
||||
int64_t repeat_theta_length = theta_scale_length * ctx.max_prompt_length * 2;
|
||||
if(ctx.rope_sin_ptr != nullptr) {
|
||||
ACL_CHECK(aclrtFree(ctx.rope_sin_ptr));
|
||||
ACL_CHECK(aclrtFree(ctx.rope_cos_ptr));
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&ctx.rope_sin_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
ACL_CHECK(aclrtMalloc(&ctx.rope_cos_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
}
|
||||
|
||||
aclTensor* acl_theta_scale_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale);
|
||||
} else {
|
||||
// use cache
|
||||
acl_theta_scale_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
|
||||
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
}
|
||||
|
||||
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
|
||||
// freq_factors
|
||||
if (src2) {
|
||||
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
|
||||
void* freq_fac_res_ptr = freq_fac_res_allocator.get();
|
||||
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
|
||||
src2->data, ggml_cann_type_mapping(src2->type),
|
||||
ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor(
|
||||
freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
|
||||
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
||||
aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
|
||||
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
|
||||
ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
|
||||
}
|
||||
|
||||
// position
|
||||
aclTensor* acl_position_tensor = ggml_cann_create_tensor(
|
||||
@@ -2365,49 +2411,53 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
// power * position
|
||||
int64_t theta_length = theta_scale_length * position_length;
|
||||
ggml_cann_pool_alloc theta_allocator(ctx.pool(),
|
||||
theta_length * sizeof(float_t));
|
||||
theta_length * sizeof(float));
|
||||
void* theta_buffer = theta_allocator.get();
|
||||
|
||||
aclTensor* acl_theta_tensor =
|
||||
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float),
|
||||
theta_ne, theta_nb, GGML_MAX_DIMS);
|
||||
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
|
||||
acl_theta_tensor);
|
||||
|
||||
// sin/cos
|
||||
ggml_cann_pool_alloc sin_allocator(ctx.pool(),
|
||||
theta_length * sizeof(float_t));
|
||||
theta_length * sizeof(float));
|
||||
void* sin_buffer = sin_allocator.get();
|
||||
aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
|
||||
sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
|
||||
sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
|
||||
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
|
||||
|
||||
ggml_cann_pool_alloc cos_allocator(ctx.pool(),
|
||||
theta_length * sizeof(float_t));
|
||||
theta_length * sizeof(float));
|
||||
void* cos_buffer = cos_allocator.get();
|
||||
aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
|
||||
cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
|
||||
cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
|
||||
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
|
||||
|
||||
if (ext_factor != 0) {
|
||||
attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
}
|
||||
|
||||
// attn_factor
|
||||
if (attn_factor != 1) {
|
||||
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
|
||||
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
|
||||
}
|
||||
|
||||
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
|
||||
int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
|
||||
size_t sin_reshape_nb[GGML_MAX_DIMS];
|
||||
sin_reshape_nb[0] = sizeof(float_t);
|
||||
sin_reshape_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_sin_repeat_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
|
||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_cos_repeat_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
|
||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||
|
||||
// repeat
|
||||
@@ -2449,6 +2499,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only test with LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
@@ -2470,8 +2521,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: n_dims <= ne0
|
||||
GGML_ASSERT(n_dims == ne0);
|
||||
GGML_ASSERT(n_dims % 2 == 0);
|
||||
// TODO: ext_factor != 0
|
||||
GGML_ASSERT(ext_factor == 0);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
@@ -2481,20 +2530,28 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
||||
|
||||
// sin/cos tensor length.
|
||||
int64_t repeat_theta_length = src0->ne[0] * src1->ne[0];
|
||||
ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
|
||||
ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
|
||||
void *sin_tensor_buffer = sin_tensor_allocator.get();
|
||||
void *cos_tensor_buffer = cos_tensor_allocator.get();
|
||||
|
||||
// init ctx.rope_cos/rope_sin cache
|
||||
aclnn_cache_init(ctx, dst, theta_scale, freq_scale, attn_factor, is_neox);
|
||||
aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
|
||||
theta_scale, freq_scale, attn_factor, is_neox);
|
||||
|
||||
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
|
||||
size_t sin_reshape_nb[GGML_MAX_DIMS];
|
||||
sin_reshape_nb[0] = sizeof(float_t);
|
||||
sin_reshape_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_sin_reshape_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
|
||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_cos_reshape_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
|
||||
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
|
||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
@@ -2509,7 +2566,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
void* minus_one_scale_buffer = nullptr;
|
||||
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||
ggml_cann_pool_alloc minus_one_scale_allocator(
|
||||
ctx.pool(), sizeof(float_t) * src0->ne[0]);
|
||||
ctx.pool(), sizeof(float) * src0->ne[0]);
|
||||
if (!is_neox) {
|
||||
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
|
||||
input_roll_buffer = roll_allocator.get();
|
||||
@@ -2539,13 +2596,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
minus_one_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_values(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
|
||||
int64_t dim = 3;
|
||||
int64_t* index = new int64_t[src0->ne[0]];
|
||||
for (int i = 0; i < src0->ne[0]; i++) {
|
||||
@@ -2573,22 +2630,22 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
minus_one_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_values(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
|
||||
// -1 * first half
|
||||
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
|
||||
size_t first_half_nb[GGML_MAX_DIMS];
|
||||
first_half_nb[0] = sizeof(float_t);
|
||||
first_half_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
|
||||
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
|
||||
minus_one_scale_buffer, ACL_FLOAT, sizeof(float), first_half_ne,
|
||||
first_half_nb, GGML_MAX_DIMS);
|
||||
bool inplace = true;
|
||||
float scale = -1;
|
||||
@@ -2628,28 +2685,28 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: ne0 != n_dims in mode2
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
size_t input_fp32_nb[GGML_MAX_DIMS];
|
||||
input_fp32_nb[0] = sizeof(float_t);
|
||||
input_fp32_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc fp32_allocator1(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
||||
void* input_fp32_buffer1 = fp32_allocator1.get();
|
||||
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
ggml_cann_pool_alloc fp32_allocator2(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
||||
void* input_fp32_buffer2 = fp32_allocator2.get();
|
||||
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
|
||||
ggml_cann_pool_alloc fp32_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
||||
output_fp32_buffer = fp32_allocator.get();
|
||||
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
|
||||
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
|
||||
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
|
||||
@@ -2746,8 +2803,6 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
|
||||
aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
|
||||
int64_t dilationVal[] = {1};
|
||||
aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
|
||||
bool transposed = true;
|
||||
int64_t groups = 1;
|
||||
int8_t cubeMathType = 0;
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
@@ -2755,7 +2810,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
|
||||
#endif
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
|
||||
padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
|
||||
padding, dilation, true, padding, 1, acl_dst, cubeMathType);
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
|
||||
}
|
||||
@@ -2864,174 +2919,49 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
*/
|
||||
static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
//dst [M, K, N, 1]
|
||||
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
|
||||
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
|
||||
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1]
|
||||
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
|
||||
ggml_tensor * ids = dst->src[2]; //ids [K, N]
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
GGML_ASSERT(src1->ne[3] == 1);
|
||||
GGML_ASSERT(dst->ne[3] == 1);
|
||||
|
||||
// copy index from npu to cpu
|
||||
int64_t n_as = ne02; // A
|
||||
int64_t n_ids = ids->ne[0]; // K
|
||||
int64_t batch = src1->ne[2];
|
||||
GGML_ASSERT(batch == ids->ne[1]);
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
|
||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
|
||||
ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
|
||||
void* export_ptr = export_allocator.get();
|
||||
for (int64_t i = 0; i < batch; i++) {
|
||||
aclTensor *select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
|
||||
aclTensor *export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
|
||||
|
||||
char * src0_original = (char *) src0->data;
|
||||
char * src1_original = (char *) src1->data;
|
||||
char * dst_original = (char *) dst->data;
|
||||
size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
|
||||
|
||||
// src0 is F16, src1 is F32, dst is F32
|
||||
ggml_cann_pool_alloc src0_cast_allocator;
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
|
||||
void* src0_cast_buf = src0_cast_allocator.get();
|
||||
|
||||
size_t cast_nb[GGML_MAX_DIMS];
|
||||
cast_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
|
||||
int64_t select_export_ne[] = {src0->ne[0], src0->ne[1], ids->ne[0]};
|
||||
size_t select_export_nb[3];
|
||||
select_export_nb[0] = src0->nb[0];
|
||||
for (int k = 1;k < 3; k++) {
|
||||
select_export_nb[k] = select_export_nb[k-1] * select_export_ne[k-1];
|
||||
}
|
||||
|
||||
aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
|
||||
ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
|
||||
ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
|
||||
aclTensor *select_export = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_export_ne, select_export_nb, 3);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight, 0, select_index, select_export);
|
||||
|
||||
src0_original = (char *) src0_cast_buf;
|
||||
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
|
||||
int64_t select_transpose_ne[] = {select_export_ne[1], select_export_ne[0], select_export_ne[2]};
|
||||
size_t select_transpose_nb[] = {select_export_nb[1], select_export_nb[0], select_export_nb[2]};
|
||||
aclTensor *select_export_transpose = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_transpose_ne, select_transpose_nb, 3);
|
||||
|
||||
int64_t active_tensor_ne[] = {src1->ne[0], 1, src1->ne[1]};
|
||||
size_t active_tensor_nb[] = {src1->nb[0], src1->nb[1], src1->nb[1]};
|
||||
aclTensor *active_tensor = ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
|
||||
|
||||
int64_t dst_ne[] = {dst->ne[0], 1, dst->ne[1]};
|
||||
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[1]};
|
||||
aclTensor *acl_dst = ggml_cann_create_tensor(dst, dst_ne,dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor, select_export_transpose, acl_dst, 2);
|
||||
|
||||
ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst, select_export_transpose);
|
||||
}
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
ggml_tensor src0_row = *src0;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
src0_row.type = GGML_TYPE_F32;
|
||||
}
|
||||
|
||||
// src0_row [D, M, 1, 1] weight without permute
|
||||
src0_row.ne[2] = 1;
|
||||
src0_row.ne[3] = 1;
|
||||
src0_row.nb[0] = ori_src0_nb[0];
|
||||
src0_row.nb[1] = ori_src0_nb[1];
|
||||
src0_row.nb[2] = ori_src0_nb[1];
|
||||
src0_row.nb[3] = ori_src0_nb[1];
|
||||
|
||||
// src1_row [D, 1, 1, 1] -> input
|
||||
src1_row.ne[1] = 1;
|
||||
src1_row.ne[2] = 1;
|
||||
src1_row.ne[3] = 1;
|
||||
src1_row.nb[2] = nb11;
|
||||
src1_row.nb[3] = nb11;
|
||||
|
||||
// dst_row [M, 1, 1, 1] -> out
|
||||
dst_row.ne[1] = 1;
|
||||
dst_row.ne[2] = 1;
|
||||
dst_row.ne[3] = 1;
|
||||
dst_row.nb[2] = nb1;
|
||||
dst_row.nb[3] = nb1;
|
||||
|
||||
//create weight for one row
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
// expert index
|
||||
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
||||
int64_t i11 = (ne11 == 1 ? 0 : id);
|
||||
int64_t i12 = iid1;
|
||||
|
||||
int64_t i1 = id;
|
||||
int64_t i2 = i12;
|
||||
|
||||
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
||||
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
||||
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
src0_row.data = src0_tmp_ptr;
|
||||
src1_row.data = src1_tmp_ptr;
|
||||
dst_row.data = dst_tmp_ptr;
|
||||
dst_row.src[0] = &src0_row;
|
||||
dst_row.src[1] = &src1_row;
|
||||
|
||||
ggml_cann_mul_mat(ctx, &dst_row);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
|
||||
std::vector<aclTensor*> src0_tensor_vec;
|
||||
std::vector<aclTensor*> src1_tensor_vec;
|
||||
std::vector<aclTensor*> dst_tensor_vec;
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
// src0_row [M, D] -> weight && permute
|
||||
int64_t src0_ne[2] = {ne01, ne00};
|
||||
size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
|
||||
// src1_row [D, 1] -> input
|
||||
int64_t src1_ne[2] = {ne10, 1};
|
||||
size_t src1_nb[2] = {nb10, nb11};
|
||||
// dst_row [M, 1] -> out
|
||||
int64_t dst_ne[2] = {ne0, 1};
|
||||
size_t dst_nb[2] = {nb0, nb1};
|
||||
|
||||
// expert index
|
||||
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
||||
int64_t i11 = (ne11 == 1 ? 0 : id);
|
||||
int64_t i12 = iid1;
|
||||
|
||||
int64_t i1 = id;
|
||||
int64_t i2 = i12;
|
||||
|
||||
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
||||
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
||||
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
src0_ne, src0_nb, 2);
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
src1_ne, src1_nb, 2);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
dst_ne, dst_nb, 2);
|
||||
|
||||
src0_tensor_vec.push_back(acl_src0);
|
||||
src1_tensor_vec.push_back(acl_src1);
|
||||
dst_tensor_vec.push_back(acl_dst);
|
||||
}
|
||||
}
|
||||
|
||||
size_t GROUP_SIZE = 128;
|
||||
// GroupedMatmulV3 required tensor_list.size < 128
|
||||
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
||||
// split and call GroupedMatmulV3
|
||||
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
||||
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
|
||||
|
||||
aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
|
||||
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
||||
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
||||
|
||||
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -3342,7 +3272,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
const int64_t n_heads = src0->ne[2];
|
||||
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
|
||||
void* slope_buffer = slope_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
|
||||
|
||||
int64_t slope_ne[] = {1, 1, n_heads, 1};
|
||||
size_t slope_nb[GGML_MAX_DIMS];
|
||||
|
||||
@@ -360,6 +360,30 @@ struct ggml_cann_graph {
|
||||
};
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
struct ggml_cann_rope_cache {
|
||||
~ggml_cann_rope_cache() {
|
||||
if(theta_scale_cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(theta_scale_cache));
|
||||
}
|
||||
}
|
||||
|
||||
void* theta_scale_cache = nullptr;
|
||||
int64_t theta_scale_length = 0;
|
||||
float theta_scale = 0.0f;
|
||||
float freq_scale = 0.0f;
|
||||
};
|
||||
|
||||
struct ggml_cann_tensor_cache {
|
||||
~ggml_cann_tensor_cache() {
|
||||
if(cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(cache));
|
||||
}
|
||||
}
|
||||
|
||||
void* cache = nullptr;
|
||||
int64_t size = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Context for managing CANN backend operations.
|
||||
*/
|
||||
@@ -371,19 +395,15 @@ struct ggml_backend_cann_context {
|
||||
#ifdef USE_ACL_GRAPH
|
||||
/// Cached CANN ACL graph used for executing the current ggml computation graph.
|
||||
std::unique_ptr<ggml_cann_graph> cann_graph;
|
||||
bool acl_graph_mode = true;
|
||||
#endif
|
||||
cann_task_queue task_queue;
|
||||
bool async_mode;
|
||||
// Rope Cache
|
||||
void* rope_init_ptr = nullptr;
|
||||
void* rope_sin_ptr = nullptr;
|
||||
void* rope_cos_ptr = nullptr;
|
||||
int64_t max_prompt_length = 0;
|
||||
ggml_cann_rope_cache rope_cache;
|
||||
// Constant Pool
|
||||
void* f32_zero_cache = nullptr;
|
||||
void* f32_one_cache = nullptr;
|
||||
int64_t f32_zero_cache_element = 0;
|
||||
int64_t f32_one_cache_element = 0;
|
||||
ggml_cann_tensor_cache rms_norm_one_tensor_cache;
|
||||
ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
@@ -399,6 +419,13 @@ struct ggml_backend_cann_context {
|
||||
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
||||
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
||||
device, async_mode ? "ON" : "OFF");
|
||||
#ifdef USE_ACL_GRAPH
|
||||
acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
|
||||
GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
|
||||
__func__, device,
|
||||
acl_graph_mode ? "GRAPH" : "EAGER",
|
||||
acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -415,21 +442,6 @@ struct ggml_backend_cann_context {
|
||||
ACL_CHECK(aclrtDestroyStream(streams[i]));
|
||||
}
|
||||
}
|
||||
if(rope_init_ptr != nullptr) {
|
||||
ACL_CHECK(aclrtFree(rope_init_ptr));
|
||||
}
|
||||
if(rope_sin_ptr != nullptr) {
|
||||
ACL_CHECK(aclrtFree(rope_sin_ptr));
|
||||
}
|
||||
if(rope_cos_ptr != nullptr) {
|
||||
ACL_CHECK(aclrtFree(rope_cos_ptr));
|
||||
}
|
||||
if(f32_zero_cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(f32_zero_cache));
|
||||
}
|
||||
if(f32_one_cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(f32_one_cache));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -2247,10 +2247,15 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
release_nz_workspace();
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
bool use_cann_graph = true;
|
||||
bool cann_graph_update_required = false;
|
||||
|
||||
if (!cann_ctx->acl_graph_mode) {
|
||||
use_cann_graph = false;
|
||||
}
|
||||
|
||||
if (use_cann_graph) {
|
||||
if (cann_ctx->cann_graph == nullptr) {
|
||||
cann_ctx->cann_graph.reset(new ggml_cann_graph());
|
||||
@@ -2400,16 +2405,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float ext_factor = 0.0f;
|
||||
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int mode = ((const int32_t *) op->op_params)[2];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
@@ -2418,10 +2417,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
if (mode & GGML_ROPE_TYPE_VISION) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
if(!ggml_is_contiguous(op->src[0])){
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_UPSCALE: {
|
||||
@@ -2483,12 +2483,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
|
||||
return (op->src[0]->ne[0] - 1) <= 255;
|
||||
case GGML_OP_SCALE:
|
||||
float bias;
|
||||
memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
|
||||
@@ -2522,13 +2524,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] % 16 != 0) {
|
||||
// TODO: padding to support
|
||||
return false;
|
||||
|
||||
@@ -433,15 +433,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/arch/riscv/quants.c
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_RVV)
|
||||
if (GGML_XTHEADVECTOR)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
|
||||
elseif (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
||||
set(MARCH_STR "rv64gc")
|
||||
if (GGML_RV_ZFH)
|
||||
string(APPEND MARCH_STR "_zfh")
|
||||
endif()
|
||||
if (GGML_XTHEADVECTOR)
|
||||
string(APPEND MARCH_STR "_xtheadvector")
|
||||
elseif (GGML_RVV)
|
||||
string(APPEND MARCH_STR "_v")
|
||||
if (GGML_RV_ZVFH)
|
||||
string(APPEND MARCH_STR "_zvfh")
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_RV_ZICBOP)
|
||||
string(APPEND MARCH_STR "_zicbop")
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
||||
message(STATUS "s390x detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
||||
|
||||
@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int tmp, tmp2, sumi;
|
||||
float ftmp, ft2;
|
||||
const uint8_t * restrict q40;
|
||||
const uint8_t * restrict q41;
|
||||
const uint8_t * restrict q42;
|
||||
const uint8_t * restrict q43;
|
||||
const int8_t * restrict q80;
|
||||
const int8_t * restrict q81;
|
||||
const int8_t * restrict q82;
|
||||
const int8_t * restrict q83;
|
||||
int s0, s1, s2, s3;
|
||||
|
||||
__asm__ __volatile__(
|
||||
"vsetivli zero, 12, e8, m1\n\t"
|
||||
"vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"li %[s1], 8\n\t"
|
||||
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
||||
"vle32.v v1, (%[s6b])\n\t"
|
||||
"vslide1down.vx v1, v1, zero\n\t"
|
||||
"vmv.v.x v16, zero\n\t"
|
||||
"vslidedown.vi v2, v1, 2\n\t"
|
||||
"vmv1r.v v3, v2\n\t"
|
||||
"vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
|
||||
"vsetivli zero, 2, e32, m1\n\t"
|
||||
"vsetivli zero, 2, e32, m1, ta, ma\n\t"
|
||||
"vmv.v.i v4, 4\n\t"
|
||||
"vand.vx v8, v1, %[kmask1]\n\t"
|
||||
"vslide1up.vx v5, v4, zero\n\t" // {0, 4}
|
||||
"vsrl.vi v6, v1, 6\n\t"
|
||||
"vsrl.vv v7, v2, v5\n\t"
|
||||
"vsse32.v v8, (%[utmp]), %[s1]\n\t"
|
||||
"vand.vx v0, v6, %[kmask3]\n\t"
|
||||
"vand.vx v2, v7, %[kmask2]\n\t"
|
||||
"vsll.vi v6, v0, 4\n\t"
|
||||
"li %[t2], 8\n\t"
|
||||
"addi %[t1], %[utmp], 4\n\t"
|
||||
"addi %[s0], %[utmp], 4\n\t"
|
||||
"vor.vv v1, v6, v2\n\t"
|
||||
"vsse32.v v8, (%[utmp]), %[t2]\n\t"
|
||||
"vsse32.v v1, (%[t1]), %[t2]\n\t"
|
||||
"vsetivli zero, 8, e16, m1\n\t"
|
||||
"vsse32.v v1, (%[s0]), %[s1]\n\t"
|
||||
"vsetivli zero, 8, e16, m1, ta, ma\n\t"
|
||||
"vle32.v v2, (%[bsums])\n\t"
|
||||
"vnsrl.wi v0, v2, 0\n\t"
|
||||
"vnsrl.wi v1, v2, 16\n\t"
|
||||
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
"vle8.v v3, (%[mins])\n\t"
|
||||
"vzext.vf2 v4, v3\n\t"
|
||||
"vwmul.vv v6, v4, v2\n\t"
|
||||
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
||||
"vredsum.vs v0, v6, v16\n\t"
|
||||
"vredsum.vs v0, v7, v0\n\t"
|
||||
"vfcvt.f.x.v v0, v0\n\t"
|
||||
"vfmv.f.s %[ftmp], v0\n\t"
|
||||
"vsetivli zero, 16, e8, m1, ta, ma\n\t"
|
||||
"vle8.v v0, (%[xs])\n\t"
|
||||
"fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
|
||||
"addi %[q40], %[xs], 64\n\t"
|
||||
"addi %[q41], %[xs], 16\n\t"
|
||||
"addi %[q42], %[xs], 32\n\t"
|
||||
"addi %[q43], %[xs], 48\n\t"
|
||||
"addi %[q80], %[ys], 64\n\t"
|
||||
"vle8.v v1, (%[q41])\n\t"
|
||||
"vle8.v v2, (%[q42])\n\t"
|
||||
"addi %[q81], %[ys], 16\n\t"
|
||||
"addi %[q41], %[q41], 64\n\t"
|
||||
"addi %[q82], %[ys], 32\n\t"
|
||||
"vle8.v v3, (%[q43])\n\t"
|
||||
"vle8.v v8, (%[ys])\n\t"
|
||||
"addi %[q42], %[q42], 64\n\t"
|
||||
"addi %[q83], %[ys], 48\n\t"
|
||||
"addi %[q43], %[q43], 64\n\t"
|
||||
"vsrl.vi v4, v0, 4\n\t"
|
||||
"vle8.v v9, (%[q81])\n\t"
|
||||
"vle8.v v10, (%[q82])\n\t"
|
||||
"vand.vi v0, v0, 0xF\n\t"
|
||||
"addi %[q81], %[q81], 64\n\t"
|
||||
"vsrl.vi v5, v1, 4\n\t"
|
||||
"addi %[q82], %[q82], 64\n\t"
|
||||
"vle8.v v11, (%[q83])\n\t"
|
||||
"vle8.v v12, (%[q80])\n\t"
|
||||
"vand.vi v1, v1, 0xF\n\t"
|
||||
"addi %[q83], %[q83], 64\n\t"
|
||||
"vsrl.vi v6, v2, 4\n\t"
|
||||
"addi %[q80], %[q80], 64\n\t"
|
||||
"vle8.v v13, (%[q81])\n\t"
|
||||
"vle8.v v14, (%[q82])\n\t"
|
||||
"vand.vi v2, v2, 0xF\n\t"
|
||||
"addi %[q81], %[q81], 64\n\t"
|
||||
"vsrl.vi v7, v3, 4\n\t"
|
||||
"addi %[q82], %[q82], 64\n\t"
|
||||
"vwmul.vv v16, v0, v8\n\t"
|
||||
"vle8.v v15, (%[q83])\n\t"
|
||||
"vle8.v v0, (%[q40])\n\t"
|
||||
"vand.vi v3, v3, 0xF\n\t"
|
||||
"addi %[q83], %[q83], 64\n\t"
|
||||
"vwmul.vv v24, v2, v12\n\t"
|
||||
"vwmul.vv v20, v4, v10\n\t"
|
||||
"vwmul.vv v28, v6, v14\n\t"
|
||||
"vwmacc.vv v16, v1, v9\n\t"
|
||||
"vle8.v v1, (%[q41])\n\t"
|
||||
"vle8.v v2, (%[q42])\n\t"
|
||||
"vwmacc.vv v24, v3, v13\n\t"
|
||||
"vwmacc.vv v20, v5, v11\n\t"
|
||||
"vwmacc.vv v28, v7, v15\n\t"
|
||||
"addi %[q40], %[q80], 64\n\t"
|
||||
"addi %[q41], %[q81], 64\n\t"
|
||||
"vle8.v v3, (%[q43])\n\t"
|
||||
"vle8.v v8, (%[q80])\n\t"
|
||||
"addi %[q42], %[q82], 64\n\t"
|
||||
"addi %[q43], %[q83], 64\n\t"
|
||||
"vsrl.vi v4, v0, 4\n\t"
|
||||
"vle8.v v9, (%[q81])\n\t"
|
||||
"vle8.v v10, (%[q82])\n\t"
|
||||
"vand.vi v0, v0, 0xF\n\t"
|
||||
"vsrl.vi v5, v1, 4\n\t"
|
||||
"vsrl.vi v7, v3, 4\n\t"
|
||||
"vand.vi v3, v3, 0xF\n\t"
|
||||
"vle8.v v11, (%[q83])\n\t"
|
||||
"vle8.v v12, (%[q40])\n\t"
|
||||
"vand.vi v1, v1, 0xF\n\t"
|
||||
"vsrl.vi v6, v2, 4\n\t"
|
||||
"vand.vi v2, v2, 0xF\n\t"
|
||||
"vwmul.vv v18, v0, v8\n\t"
|
||||
"vle8.v v13, (%[q41])\n\t"
|
||||
"vle8.v v14, (%[q42])\n\t"
|
||||
"vwmul.vv v26, v2, v12\n\t"
|
||||
"vwmul.vv v22, v4, v10\n\t"
|
||||
"vwmul.vv v30, v6, v14\n\t"
|
||||
"vwmacc.vv v18, v1, v9\n\t"
|
||||
"vle8.v v15, (%[q43])\n\t"
|
||||
"vwmacc.vv v26, v3, v13\n\t"
|
||||
"vwmacc.vv v22, v5, v11\n\t"
|
||||
"vwmacc.vv v30, v7, v15\n\t"
|
||||
"vmv.v.x v0, zero\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vredsum.vs v0, v6, v0\n\t"
|
||||
"vmv.x.s %[sumi], v0"
|
||||
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
|
||||
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
||||
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
|
||||
"vsetivli zero, 16, e16, m2, ta, ma\n\t"
|
||||
"vwredsum.vs v4, v16, v0\n\t"
|
||||
"lbu %[s0], 0(%[scale])\n\t"
|
||||
"vwredsum.vs v5, v20, v0\n\t"
|
||||
"lbu %[s1], 1(%[scale])\n\t"
|
||||
"vwredsum.vs v6, v24, v0\n\t"
|
||||
"lbu %[s2], 2(%[scale])\n\t"
|
||||
"vwredsum.vs v7, v28, v0\n\t"
|
||||
"lbu %[s3], 3(%[scale])\n\t"
|
||||
"vwredsum.vs v8, v18, v0\n\t"
|
||||
"lbu %[q40], 4(%[scale])\n\t"
|
||||
"vwredsum.vs v9, v22, v0\n\t"
|
||||
"lbu %[q41], 5(%[scale])\n\t"
|
||||
"vwredsum.vs v10, v26, v0\n\t"
|
||||
"lbu %[q42], 6(%[scale])\n\t"
|
||||
"vwredsum.vs v11, v30, v0\n\t"
|
||||
"lbu %[q43], 7(%[scale])\n\t"
|
||||
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
||||
"vmul.vx v0, v4, %[s0]\n\t"
|
||||
"vmul.vx v1, v8, %[q40]\n\t"
|
||||
"vmacc.vx v0, %[s1], v5\n\t"
|
||||
"vmacc.vx v1, %[q41], v9\n\t"
|
||||
"vmacc.vx v0, %[s2], v6\n\t"
|
||||
"vmacc.vx v1, %[q42], v10\n\t"
|
||||
"vmacc.vx v0, %[s3], v7\n\t"
|
||||
"vmacc.vx v1, %[q43], v11\n\t"
|
||||
"vfcvt.f.x.v v0, v0\n\t"
|
||||
"vfcvt.f.x.v v1, v1\n\t"
|
||||
"vfmv.f.s %[ft2], v0\n\t"
|
||||
"vfmv.f.s %[ftmp], v1\n\t"
|
||||
"fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
|
||||
"fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
|
||||
: [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
|
||||
, [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
|
||||
, [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
|
||||
, [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
|
||||
: [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
|
||||
, [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
||||
, [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
|
||||
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
sumf -= dmin * sumi;
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
sumi = 0;
|
||||
const uint8_t * scale = scales;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
int vl128 = 128, vl64 = 64, vl32 = 32;
|
||||
__asm__ __volatile__(
|
||||
"vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"vle8.v v8, (%[q8])\n\t"
|
||||
"vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"vle8.v v0, (%[q4])\n\t"
|
||||
"vsrl.vi v4, v0, 4\n\t"
|
||||
"vand.vi v0, v0, 0xF\n\t"
|
||||
"vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"vwmul.vv v28, v6, v14\n\t"
|
||||
"vwmul.vv v20, v4, v10\n\t"
|
||||
"vwmul.vv v24, v2, v12\n\t"
|
||||
"vwmul.vv v16, v0, v8\n\t"
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"vle8.v v2, (%[scale])\n\t"
|
||||
"vmv.v.x v0, zero\n\t"
|
||||
"vzext.vf4 v1, v2\n\t"
|
||||
"vsetvli zero, %[vl32], e16, m4\n\t"
|
||||
"vwredsum.vs v6, v24, v0\n\t"
|
||||
"vwredsum.vs v7, v28, v0\n\t"
|
||||
"vwredsum.vs v4, v16, v0\n\t"
|
||||
"vwredsum.vs v5, v20, v0\n\t"
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"vslideup.vi v6, v7, 1\n\t"
|
||||
"vslideup.vi v4, v5, 1\n\t"
|
||||
"vslideup.vi v4, v6, 2\n\t"
|
||||
"vmul.vv v8, v4, v1\n\t"
|
||||
"vredsum.vs v0, v8, v0\n\t"
|
||||
"vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
|
||||
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
|
||||
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
q4 += 64; q8 += 128; scale += 4;
|
||||
}
|
||||
|
||||
sumf += d * sumi;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
case 128:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
__builtin_prefetch(&x[i + 1].d, 0, 1);
|
||||
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * restrict q6 = x[i].ql;
|
||||
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
const int8_t * restrict scale = x[i].scales;
|
||||
|
||||
int sum_t = 0;
|
||||
int t0;
|
||||
int q6h;
|
||||
float ftmp;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"addi %[q6h], %[q6], 32\n\t"
|
||||
"ld t0, 0(%[scale])\n\t"
|
||||
"addi %[scale], %[scale], 8\n\t"
|
||||
"slli t6, t0, 1 * 8\n\t"
|
||||
"lb zero, 0(%[q6])\n\t"
|
||||
"slli t5, t0, 2 * 8\n\t"
|
||||
"slli t4, t0, 3 * 8\n\t"
|
||||
"lb zero, 0(%[q6h])\n\t"
|
||||
"slli t3, t0, 4 * 8\n\t"
|
||||
"slli t2, t0, 5 * 8\n\t"
|
||||
"lb zero, 0(%[qh])\n\t"
|
||||
"lb zero, 31(%[q6h])\n\t"
|
||||
"slli t1, t0, 6 * 8\n\t"
|
||||
"srai a7, t0, 56\n\t"
|
||||
"vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"vle8.v v8, (%[q6])\n\t"
|
||||
"srai t6, t6, 56\n\t"
|
||||
"srai t5, t5, 56\n\t"
|
||||
"srai t4, t4, 56\n\t"
|
||||
"srai t3, t3, 56\n\t"
|
||||
"vle8.v v10, (%[q6h])\n\t"
|
||||
"addi %[q6], %[q6], 64\n\t"
|
||||
"slli t0, t0, 7 * 8\n\t"
|
||||
"srai t2, t2, 56\n\t"
|
||||
"srai t1, t1, 56\n\t"
|
||||
"srai t0, t0, 56\n\t"
|
||||
"vle8.v v4, (%[qh])\n\t"
|
||||
"vsrl.vi v12, v8, 4\n\t"
|
||||
"vsrl.vi v14, v10, 4\n\t"
|
||||
"lb zero, 0(%[q8])\n\t"
|
||||
"vand.vi v8, v8, 0xF\n\t"
|
||||
"vand.vi v10, v10, 0xF\n\t"
|
||||
"lb zero, 32(%[q8])\n\t"
|
||||
"vsll.vi v0, v4, 4\n\t"
|
||||
"vsll.vi v2, v4, 2\n\t"
|
||||
"lb zero, 64(%[q8])\n\t"
|
||||
"vsrl.vi v6, v4, 2\n\t"
|
||||
"vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"vle8.v v8, (%[q6])\n\t"
|
||||
"vsrl.vi v12, v8, 4\n\t"
|
||||
"vand.vi v8, v8, 0xF\n\t"
|
||||
"vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"vand.vx v0, v0, %[mask]\n\t"
|
||||
"lb zero, 96(%[q8])\n\t"
|
||||
"vand.vx v2, v2, %[mask]\n\t"
|
||||
"vand.vx v4, v4, %[mask]\n\t"
|
||||
"vand.vx v6, v6, %[mask]\n\t"
|
||||
"vor.vv v8, v8, v0\n\t"
|
||||
"lb zero, 127(%[q8])\n\t"
|
||||
"vor.vv v10, v10, v2\n\t"
|
||||
"vor.vv v12, v12, v4\n\t"
|
||||
"vor.vv v14, v14, v6\n\t"
|
||||
"vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"vle8.v v0, (%[q8])\n\t"
|
||||
"vsub.vx v8, v8, %[vl32]\n\t"
|
||||
"vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
"vwredsum.vs v13, v28, v0\n\t"
|
||||
"vwredsum.vs v14, v30, v0\n\t"
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"vslideup.vi v10, v9, 1\n\t"
|
||||
"vslideup.vi v8, v7, 1\n\t"
|
||||
"vslideup.vi v11, v12, 1\n\t"
|
||||
"vslideup.vi v13, v14, 1\n\t"
|
||||
"vslideup.vi v10, v8, 2\n\t"
|
||||
"vslideup.vi v11, v13, 2\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vle8.v v2, (%[scale])\n\t"
|
||||
"vsext.vf4 v4, v2\n\t"
|
||||
"vmul.vv v2, v4, v10\n\t"
|
||||
"vredsum.vs v0, v2, v0\n\t"
|
||||
"vmv.x.s %[t0], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[t0]"
|
||||
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
|
||||
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
|
||||
"vmul.vx v0, v10, t0\n\t"
|
||||
"vmul.vx v1, v9, t1\n\t"
|
||||
"vmacc.vx v0, t2, v8\n\t"
|
||||
"vmacc.vx v1, t3, v7\n\t"
|
||||
"vmacc.vx v0, t4, v11\n\t"
|
||||
"vmacc.vx v1, t5, v12\n\t"
|
||||
"vmacc.vx v0, t6, v13\n\t"
|
||||
"vmacc.vx v1, a7, v14\n\t"
|
||||
"vadd.vv v0, v0, v1\n\t"
|
||||
"vfcvt.f.x.v v0, v0\n\t"
|
||||
"vfmv.f.s %[ftmp], v0\n\t"
|
||||
"fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
|
||||
: [q6] "+&r" (q6), [q6h] "=&r" (q6h)
|
||||
, [scale] "+&r" (scale)
|
||||
, [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
|
||||
: [qh] "r" (qh), [q8] "r" (q8)
|
||||
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
, [mask] "r" (0x30)
|
||||
, [mask] "r" (0x30), [d] "f" (d)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
, "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
|
||||
, "a6", "a5", "a4", "a3"
|
||||
);
|
||||
q6 += 64; qh += 32; q8 += 128; scale += 8;
|
||||
qh += 32; q8 += 128;
|
||||
}
|
||||
|
||||
sumf += d * sum_t;
|
||||
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -3221,6 +3221,13 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
||||
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
||||
}
|
||||
#elif defined(__riscv_zvfh)
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
||||
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
|
||||
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
|
||||
|
||||
@@ -215,6 +215,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
|
||||
|
||||
// F16 SVE
|
||||
#define DEFAULT_PG32 svptrue_b32()
|
||||
#define DEFAULT_PG16 svptrue_b16()
|
||||
|
||||
#define GGML_F32Cxt svfloat16_t
|
||||
#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
|
||||
#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
|
||||
#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
|
||||
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
|
||||
|
||||
#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
|
||||
#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
|
||||
#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
|
||||
#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
|
||||
|
||||
#define GGML_F16x_VEC GGML_F32Cxt
|
||||
#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
|
||||
#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
|
||||
#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
|
||||
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
|
||||
#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
|
||||
#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
|
||||
#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
|
||||
#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
|
||||
|
||||
#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
|
||||
#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
|
||||
#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
|
||||
{ \
|
||||
sum1 = svadd_f16_x(pg16, sum1, sum2); \
|
||||
sum3 = svadd_f16_x(pg16, sum3, sum4); \
|
||||
sum1 = svadd_f16_x(pg16, sum1, sum3); \
|
||||
__fp16 sum_f16 = svaddv_f16(pg16, sum1); \
|
||||
(res) = (ggml_float) sum_f16; \
|
||||
}
|
||||
#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
|
||||
// F16 NEON
|
||||
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
|
||||
@@ -85,15 +85,21 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
// reduce sum1,sum2 to sum1
|
||||
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m8(n - i);
|
||||
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
|
||||
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
||||
vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
|
||||
vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
|
||||
int vl = __riscv_vsetvlmax_e32m8();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
vfloat32m8_t vsum;
|
||||
vfloat32m8_t ax;
|
||||
vfloat32m8_t ay;
|
||||
vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
|
||||
for (int i = 0; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m8(n - i);
|
||||
ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
|
||||
ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
|
||||
vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
|
||||
}
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
|
||||
vl = __riscv_vsetvlmax_e32m8();
|
||||
vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
||||
#else
|
||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||
|
||||
@@ -207,38 +213,125 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
||||
|
||||
ggml_float sumf = 0.0;
|
||||
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8; //get vector length
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F16_VEC_REDUCE(sumf, sum);
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
// if you hit this, you are likely running outside the FP range
|
||||
assert(!isnan(sumf) && !isinf(sumf));
|
||||
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
int vl = __riscv_vsetvlmax_e32m2();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
vfloat32m2_t vsum;
|
||||
vfloat16m1_t ax;
|
||||
vfloat16m1_t ay;
|
||||
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
|
||||
for (int i = 0; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e16m1(n - i);
|
||||
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
|
||||
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
|
||||
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
|
||||
}
|
||||
vl = __riscv_vsetvlmax_e32m1();
|
||||
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
|
||||
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
|
||||
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
#endif // __riscv_zvfh
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
|
||||
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F16_VEC_REDUCE(sumf, sum);
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
// if you hit this, you are likely running outside the FP range
|
||||
assert(!isnan(sumf) && !isinf(sumf));
|
||||
#endif
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
#endif
|
||||
#endif // GGML_SIMD
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
@@ -257,6 +350,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
||||
for (; i + 3 < n; i += 4) {
|
||||
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
||||
@@ -281,10 +380,24 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
|
||||
for (; i + 3 < n; i += 4) {
|
||||
_mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
||||
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
|
||||
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
|
||||
__riscv_vse32_v_f32m2(&y[i], vy, vl);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = ggml_silu_f32(x[i]) * g[i];
|
||||
@@ -328,6 +441,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
|
||||
#endif
|
||||
sum += (ggml_float)_mm_cvtss_f32(val);
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
const int vlen = svcntw();
|
||||
for (; i < n; i += vlen) {
|
||||
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
||||
svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
|
||||
svdup_n_f32_x(pg, max)));
|
||||
svst1_f32(pg, y + i, val);
|
||||
sum += (ggml_float)svaddv_f32(pg, val);
|
||||
}
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
for (; i + 3 < n; i += 4) {
|
||||
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
||||
|
||||
@@ -119,45 +119,149 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
||||
}
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
|
||||
const int np = (n & ~(ggml_f16_step - 1));
|
||||
|
||||
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
|
||||
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
||||
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
||||
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
|
||||
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
|
||||
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
|
||||
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
|
||||
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
|
||||
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
|
||||
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
|
||||
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
|
||||
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
|
||||
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
|
||||
}
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
|
||||
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
|
||||
}
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
|
||||
sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
|
||||
sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
|
||||
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
|
||||
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
|
||||
|
||||
sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
// reduce sum0..sum3 to sum0
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
@@ -293,35 +397,112 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
||||
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
|
||||
}
|
||||
}
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
GGML_F16x_VEC_STORE(y + k, ry, 0);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
hy = svmad_f16_x(pg, hx, vx, hy);
|
||||
svst1_f16(pg, (__fp16 *)(y + np2), hy);
|
||||
}
|
||||
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
@@ -517,33 +698,59 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
|
||||
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 2 * ggml_f16_epr;
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
const int np = (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t ay1, ay2;
|
||||
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_MUL(ay1, vx);
|
||||
GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_MUL(ay2, vx);
|
||||
GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
|
||||
}
|
||||
}
|
||||
// leftovers
|
||||
// maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
|
||||
if (np < n) {
|
||||
svbool_t pg = svwhilelt_b16(np, n);
|
||||
svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
|
||||
svfloat16_t out = svmul_f16_m(pg, hy, vx);
|
||||
svst1_f16(pg, (__fp16 *)(y + np), out);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#endif
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
@@ -795,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__aarch64__)
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
|
||||
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
|
||||
const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
|
||||
const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
|
||||
const svfloat32_t n = svsub_f32_x(pg, z, r);
|
||||
const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
|
||||
const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
|
||||
const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
|
||||
const svbool_t c = svacgt_n_f32(pg, n, 126);
|
||||
const svfloat32_t u = svmul_f32_x(pg, b, b);
|
||||
const svfloat32_t j = svmla_f32_x(pg,
|
||||
svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
|
||||
svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
|
||||
svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
|
||||
const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
|
||||
const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
|
||||
const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
|
||||
return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
|
||||
svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
|
||||
}
|
||||
|
||||
// computes silu x/(1+exp(-x)) in single precision vector
|
||||
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
|
||||
const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
|
||||
const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
|
||||
const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
|
||||
const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
|
||||
const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
|
||||
return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
|
||||
}
|
||||
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
|
||||
// adapted from arm limited optimized routine
|
||||
// the maximum error is 1.45358 plus 0.5 ulps
|
||||
@@ -1030,6 +1269,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
|
||||
vl);
|
||||
}
|
||||
|
||||
// computes silu x/(1+exp(-x)) in single precision vector
|
||||
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
|
||||
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
|
||||
const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
|
||||
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
|
||||
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
|
||||
}
|
||||
|
||||
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
|
||||
|
||||
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "conv2d.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
struct conv_params {
|
||||
const int64_t IW, IH;
|
||||
@@ -94,8 +95,8 @@ static __global__ void conv2d_kernel(const float * __restrict__ input,
|
||||
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
|
||||
|
||||
const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
|
||||
const float kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
|
||||
acc += (input_val * kernel_val);
|
||||
const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
|
||||
acc += (input_val * ggml_cuda_cast<float>(kernel_val));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -523,13 +523,6 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H40,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
|
||||
@@ -1562,13 +1555,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H40, flash_attn_ext_vec_f16_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H40, flash_attn_ext_vec_bf16_h40, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H40, flash_attn_ext_vec_q4_0_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H40, flash_attn_ext_vec_q4_1_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H40, flash_attn_ext_vec_q5_0_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H40, flash_attn_ext_vec_q5_1_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H40, flash_attn_ext_vec_q8_0_h40, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, flash_attn_ext_vec_f16_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, flash_attn_ext_vec_bf16_h64, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, flash_attn_ext_vec_q4_0_h64, has_simdgroup_reduction);
|
||||
@@ -1909,9 +1895,15 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_ARANGE:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[0]->ne[0] == 32) {
|
||||
// head size == 32 (e.g. bert-bge-small)
|
||||
// TODO: not sure if it is worth adding kernels for this size
|
||||
// for new head sizes, add checks here
|
||||
if (op->src[0]->ne[0] != 40 &&
|
||||
op->src[0]->ne[0] != 64 &&
|
||||
op->src[0]->ne[0] != 80 &&
|
||||
op->src[0]->ne[0] != 96 &&
|
||||
op->src[0]->ne[0] != 112 &&
|
||||
op->src[0]->ne[0] != 128 &&
|
||||
op->src[0]->ne[0] != 192 &&
|
||||
op->src[0]->ne[0] != 256) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
@@ -5138,10 +5130,8 @@ static int ggml_metal_encode_node(
|
||||
|
||||
bool use_vec_kernel = false;
|
||||
|
||||
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
|
||||
// for now avoiding mainly to keep the number of templates/kernels a bit lower
|
||||
// these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
|
||||
if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
|
||||
// use non-vec kernel if the batch size is large or if the vec-kernel is not supported for this head size
|
||||
if (ne01 >= 20 || (ne00 == 40 || ne00 == 80 || ne00 == 112)) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
@@ -5329,24 +5319,6 @@ static int ggml_metal_encode_node(
|
||||
use_vec_kernel = true;
|
||||
|
||||
switch (ne00) {
|
||||
case 40:
|
||||
{
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H40].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H40].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H40].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H40].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H40].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H40].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H40].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
|
||||
GGML_LOG_ERROR("add template specialization for this type\n");
|
||||
GGML_ABORT("add template specialization for this type");
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case 64:
|
||||
{
|
||||
switch (src1->type) {
|
||||
|
||||
@@ -4803,6 +4803,9 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
ushort3 ntg[[threads_per_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
static_assert(DK % 32 == 0, "DK must be divisible by 32");
|
||||
static_assert(DV % 32 == 0, "DV must be divisible by 32");
|
||||
|
||||
const short nsg = ntg.y; // number of simdgroups
|
||||
const short iwg = tgpig[2]%nwg;
|
||||
|
||||
@@ -5160,16 +5163,6 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 40, 40, 8>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 40, 40, 8>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 40, 40, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 40, 40, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 40, 40, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 40, 40, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h40")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 40, 40, 8>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 64, 64, 8>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 64, 64, 8>;
|
||||
|
||||
@@ -2776,10 +2776,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
if (op->src[4]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ggml_tensor * q = op->src[0];
|
||||
const ggml_tensor * k = op->src[1];
|
||||
const ggml_tensor * v = op->src[2];
|
||||
@@ -5765,6 +5761,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
||||
static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
|
||||
const ggml_tensor * v = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
const ggml_tensor * sinks = dst->src[4];
|
||||
GGML_ASSERT(q->extra);
|
||||
GGML_ASSERT(k->extra);
|
||||
GGML_ASSERT(v->extra);
|
||||
@@ -5772,6 +5769,9 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
||||
if (mask) {
|
||||
GGML_ASSERT(mask->extra);
|
||||
}
|
||||
if (sinks) {
|
||||
GGML_ASSERT(sinks->extra);
|
||||
}
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
@@ -5813,6 +5813,7 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
||||
ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
|
||||
ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
|
||||
ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;
|
||||
|
||||
cl_ulong offset_q = extra_q->offset + q->view_offs;
|
||||
cl_ulong offset_k = extra_k->offset + k->view_offs;
|
||||
@@ -5820,6 +5821,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
||||
cl_ulong offset_o = extra_o->offset + dst->view_offs;
|
||||
cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL;
|
||||
cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
|
||||
cl_mem sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
|
||||
cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;
|
||||
|
||||
const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
|
||||
const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
|
||||
@@ -5874,6 +5877,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
||||
CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem), &sinks_buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));
|
||||
|
||||
if (n_q == 1) {
|
||||
const size_t wg_size = 64;
|
||||
|
||||
@@ -49,7 +49,9 @@ __kernel void flash_attn_f16(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int block_q_idx = get_group_id(0);
|
||||
@@ -171,6 +173,20 @@ __kernel void flash_attn_f16(
|
||||
}
|
||||
|
||||
if (my_query_row < n_q) {
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
|
||||
l_i = l_i * exp(m_i - m_final) + exp(m_sink - m_final);
|
||||
}
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
@@ -214,7 +230,9 @@ __kernel void flash_attn_f16_q1(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
@@ -247,7 +265,12 @@ __kernel void flash_attn_f16_q1(
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
const global ACC_TYPE* sinks_ptr = NULL;
|
||||
if (sinks_void != NULL) {
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
@@ -320,7 +343,11 @@ __kernel void flash_attn_f16_q1(
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
const ACC_TYPE l_final = local_l[0];
|
||||
ACC_TYPE l_final = local_l[0];
|
||||
|
||||
if (sinks_ptr != NULL) {
|
||||
l_final += exp(sinks_ptr[head_idx] - m_final);
|
||||
}
|
||||
|
||||
if (l_final > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_final;
|
||||
|
||||
@@ -49,7 +49,9 @@ __kernel void flash_attn_f32(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int block_q_idx = get_group_id(0);
|
||||
@@ -171,6 +173,20 @@ __kernel void flash_attn_f32(
|
||||
}
|
||||
|
||||
if (my_query_row < n_q) {
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
|
||||
l_i = l_i * exp(m_i - m_final) + exp(m_sink - m_final);
|
||||
}
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
@@ -214,7 +230,9 @@ __kernel void flash_attn_f32_q1(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
@@ -247,7 +265,12 @@ __kernel void flash_attn_f32_q1(
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
const global ACC_TYPE* sinks_ptr = NULL;
|
||||
if (sinks_void != NULL) {
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
@@ -320,7 +343,11 @@ __kernel void flash_attn_f32_q1(
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
const ACC_TYPE l_final = local_l[0];
|
||||
ACC_TYPE l_final = local_l[0];
|
||||
|
||||
if (sinks_ptr != NULL) {
|
||||
l_final += exp(sinks_ptr[head_idx] - m_final);
|
||||
}
|
||||
|
||||
if (l_final > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_final;
|
||||
|
||||
@@ -52,7 +52,9 @@ __kernel void flash_attn_f32_f16(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int block_q_idx = get_group_id(0);
|
||||
@@ -174,6 +176,20 @@ __kernel void flash_attn_f32_f16(
|
||||
}
|
||||
|
||||
if (my_query_row < n_q) {
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
|
||||
l_i = l_i * exp(m_i - m_final) + exp(m_sink - m_final);
|
||||
}
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
|
||||
global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
@@ -217,7 +233,9 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
@@ -250,7 +268,12 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
const global ACC_TYPE* sinks_ptr = NULL;
|
||||
if (sinks_void != NULL) {
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
|
||||
@@ -323,7 +346,11 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
|
||||
global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
const ACC_TYPE l_final = local_l[0];
|
||||
ACC_TYPE l_final = local_l[0];
|
||||
|
||||
if (sinks_ptr != NULL) {
|
||||
l_final += exp(sinks_ptr[head_idx] - m_final);
|
||||
}
|
||||
|
||||
if (l_final > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_final;
|
||||
|
||||
@@ -360,6 +360,13 @@ struct vk_fa_pipeline_state {
|
||||
}
|
||||
};
|
||||
|
||||
enum shader_reduction_mode {
|
||||
SHADER_REDUCTION_MODE_SHMEM,
|
||||
SHADER_REDUCTION_MODE_HYBRID,
|
||||
SHADER_REDUCTION_MODE_SUBGROUP,
|
||||
SHADER_REDUCTION_MODE_COUNT,
|
||||
};
|
||||
|
||||
static constexpr uint32_t num_argsort_pipelines = 11;
|
||||
static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
|
||||
|
||||
@@ -386,15 +393,18 @@ struct vk_device_struct {
|
||||
bool uma;
|
||||
bool prefer_host_memory;
|
||||
bool float_controls_rte_fp16;
|
||||
bool subgroup_add;
|
||||
bool subgroup_arithmetic;
|
||||
bool subgroup_shuffle;
|
||||
bool subgroup_ballot;
|
||||
bool subgroup_clustered;
|
||||
bool multi_add;
|
||||
|
||||
bool add_rms_fusion;
|
||||
uint32_t partials_binding_alignment;
|
||||
|
||||
bool integer_dot_product;
|
||||
// 0: default, 1: force mmvq, -1: disable mmvq
|
||||
int32_t mmvq_mode;
|
||||
|
||||
bool subgroup_size_control;
|
||||
uint32_t subgroup_min_size;
|
||||
@@ -452,12 +462,15 @@ struct vk_device_struct {
|
||||
|
||||
vk_pipeline pipeline_matmul_split_k_reduce;
|
||||
vk_pipeline pipeline_quantize_q8_1;
|
||||
vk_pipeline pipeline_quantize_q8_1_x4;
|
||||
|
||||
vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
|
||||
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols];
|
||||
|
||||
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio];
|
||||
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
||||
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
||||
@@ -1357,6 +1370,7 @@ struct vk_instance_t {
|
||||
PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {};
|
||||
|
||||
std::vector<size_t> device_indices;
|
||||
std::vector<bool> device_supports_membudget;
|
||||
vk_device devices[GGML_VK_MAX_DEVICES];
|
||||
};
|
||||
|
||||
@@ -2764,11 +2778,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, REQSUBGROUPSIZE > 0, false, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, REQSUBGROUPSIZE > 0, false, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, REQSUBGROUPSIZE > 0, false, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
@@ -2911,60 +2925,91 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
rm_stdq = 2;
|
||||
uint32_t rm_iq = 2 * rm_kq;
|
||||
|
||||
for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) {
|
||||
uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_16 : (subgroup_size_16 * 4);
|
||||
uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? device->subgroup_size : (device->subgroup_size * 4);
|
||||
const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;
|
||||
// Ensure a subgroup size >= 16 is available
|
||||
const bool use_subgroups16 = use_subgroups &&
|
||||
(!device->subgroup_size_control && device->subgroup_size >= 16 ||
|
||||
device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16);
|
||||
|
||||
const bool s = device->subgroup_add && device->architecture != vk_device_architecture::AMD_GCN;
|
||||
const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : device->subgroup_size;
|
||||
const uint32_t subgroup_size16 = std::max(subgroup_size, 16u);
|
||||
|
||||
const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0;
|
||||
const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0;
|
||||
|
||||
for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) {
|
||||
const uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size : (subgroup_size * 4);
|
||||
const uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size16 : (subgroup_size16 * 4);
|
||||
|
||||
const shader_reduction_mode reduc = (use_subgroups && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP :
|
||||
(use_subgroups && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID :
|
||||
SHADER_REDUCTION_MODE_SHMEM;
|
||||
|
||||
const shader_reduction_mode reduc16 = (use_subgroups16 && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP :
|
||||
(use_subgroups16 && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID :
|
||||
SHADER_REDUCTION_MODE_SHMEM;
|
||||
|
||||
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[s], arr_dmmv_f32_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[s], arr_dmmv_f16_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[s], arr_dmmv_bf16_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[s], arr_dmmv_q4_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[s], arr_dmmv_q4_1_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[s], arr_dmmv_q5_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[s], arr_dmmv_q5_1_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[s], arr_dmmv_q8_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[s], arr_dmmv_q2_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[s], arr_dmmv_q3_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[s], arr_dmmv_q4_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[s], arr_dmmv_q5_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[s], arr_dmmv_q6_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[s], arr_dmmv_iq1_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[s], arr_dmmv_iq1_m_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[s], arr_dmmv_iq2_xxs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[s], arr_dmmv_iq2_xs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[s], arr_dmmv_iq2_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[s], arr_dmmv_iq3_xxs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[s], arr_dmmv_iq3_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[s], arr_dmmv_iq4_xs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[s], arr_dmmv_iq4_nl_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[s], arr_dmmv_mxfp4_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[s], arr_dmmv_f32_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[s], arr_dmmv_f16_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[s], arr_dmmv_bf16_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[s], arr_dmmv_q4_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[s], arr_dmmv_q4_1_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[s], arr_dmmv_q5_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[s], arr_dmmv_q5_1_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[s], arr_dmmv_q8_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[s], arr_dmmv_q2_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[s], arr_dmmv_q3_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[s], arr_dmmv_q4_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[s], arr_dmmv_q5_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[s], arr_dmmv_q6_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[s], arr_dmmv_iq1_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[s], arr_dmmv_iq1_m_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[s], arr_dmmv_iq2_xxs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[s], arr_dmmv_iq2_xs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[s], arr_dmmv_iq2_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[s], arr_dmmv_iq3_xxs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[s], arr_dmmv_iq3_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[s], arr_dmmv_iq4_xs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[s], arr_dmmv_iq4_nl_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[s], arr_dmmv_mxfp4_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
if (device->integer_dot_product) {
|
||||
const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
|
||||
const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int);
|
||||
}
|
||||
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3056,10 +3101,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_subgroup_len, quantize_q8_1_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||
if (device->subgroup_add && device->subgroup_require_full_support) {
|
||||
if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
|
||||
@@ -3578,11 +3630,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
}
|
||||
device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
|
||||
|
||||
device->subgroup_add = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
|
||||
|
||||
device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
|
||||
device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle);
|
||||
device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered);
|
||||
|
||||
device->subgroup_ballot = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBallot);
|
||||
@@ -4038,11 +4091,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr;
|
||||
|
||||
device->add_rms_fusion = !device->disable_fusion &&
|
||||
device->subgroup_add &&
|
||||
device->subgroup_arithmetic &&
|
||||
device->vendor_id != VK_VENDOR_ID_INTEL;
|
||||
device->partials_binding_alignment =
|
||||
std::max(4u, (uint32_t)device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
|
||||
device->mmvq_mode = 0;
|
||||
if (getenv("GGML_VK_DISABLE_MMVQ")) {
|
||||
device->mmvq_mode = -1;
|
||||
} else if (getenv("GGML_VK_FORCE_MMVQ")) {
|
||||
device->mmvq_mode = 1;
|
||||
}
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
@@ -4281,15 +4341,16 @@ static void ggml_vk_instance_init() {
|
||||
vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT");
|
||||
vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT");
|
||||
vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT");
|
||||
|
||||
}
|
||||
|
||||
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||
|
||||
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
|
||||
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
||||
char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
|
||||
if (devices_env != nullptr) {
|
||||
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
||||
size_t num_available_devices = devices.size();
|
||||
|
||||
std::string devices(devices_env);
|
||||
std::replace(devices.begin(), devices.end(), ',', ' ');
|
||||
@@ -4304,8 +4365,6 @@ static void ggml_vk_instance_init() {
|
||||
vk_instance.device_indices.push_back(tmp);
|
||||
}
|
||||
} else {
|
||||
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
|
||||
// If no vulkan devices are found, return early
|
||||
if (devices.empty()) {
|
||||
GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
|
||||
@@ -4410,6 +4469,19 @@ static void ggml_vk_instance_init() {
|
||||
GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
|
||||
|
||||
for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
|
||||
vk::PhysicalDevice vkdev = devices[vk_instance.device_indices[i]];
|
||||
std::vector<vk::ExtensionProperties> extensionprops = vkdev.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool membudget_supported = false;
|
||||
for (const auto & ext : extensionprops) {
|
||||
if (strcmp(VK_EXT_MEMORY_BUDGET_EXTENSION_NAME, ext.extensionName) == 0) {
|
||||
membudget_supported = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
vk_instance.device_supports_membudget.push_back(membudget_supported);
|
||||
|
||||
ggml_vk_print_gpu_info(i);
|
||||
}
|
||||
}
|
||||
@@ -4556,9 +4628,22 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
||||
|
||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols, uint32_t m, uint32_t k) {
|
||||
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
|
||||
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16 || b_type == GGML_TYPE_Q8_1);
|
||||
GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols);
|
||||
|
||||
if (b_type == GGML_TYPE_Q8_1) {
|
||||
switch (a_type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
switch (a_type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
@@ -4590,7 +4675,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
|
||||
// heuristic to choose workgroup size
|
||||
uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
|
||||
if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
|
||||
if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
|
||||
// Prefer larger workgroups when M is small, to spread the work out more
|
||||
// and keep more SMs busy.
|
||||
// q6_k seems to prefer small workgroup size even for "medium" values of M.
|
||||
@@ -4605,6 +4690,13 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
}
|
||||
}
|
||||
|
||||
if (b_type == GGML_TYPE_Q8_1) {
|
||||
if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) {
|
||||
dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
|
||||
}
|
||||
return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1];
|
||||
}
|
||||
|
||||
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1];
|
||||
}
|
||||
|
||||
@@ -4674,7 +4766,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
||||
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
|
||||
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()");
|
||||
GGML_ASSERT(b_type == GGML_TYPE_F32);
|
||||
|
||||
switch (a_type) {
|
||||
@@ -5587,20 +5679,20 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
|
||||
static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type, bool use_x4_blocks) {
|
||||
switch(type) {
|
||||
case GGML_TYPE_Q8_1:
|
||||
return ctx->device->pipeline_quantize_q8_1;
|
||||
return use_x4_blocks ? ctx->device->pipeline_quantize_q8_1_x4 : ctx->device->pipeline_quantize_q8_1;
|
||||
default:
|
||||
std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer&& in, vk_subbuffer&& out, uint32_t ne) {
|
||||
static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer&& in, vk_subbuffer&& out, uint32_t ne, bool use_x4_blocks = false) {
|
||||
VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")");
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
|
||||
vk_pipeline pipeline = use_x4_blocks ? ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true) : ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, false);
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
@@ -5720,12 +5812,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||
|
||||
if (quantize_y) {
|
||||
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
|
||||
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
|
||||
}
|
||||
|
||||
if (dryrun) {
|
||||
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
|
||||
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
if (quantize_y) {
|
||||
y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
|
||||
}
|
||||
const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
@@ -5791,7 +5886,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
GGML_ASSERT(d_Y->size >= y_sz * ne12 * ne13);
|
||||
} else if (quantize_y) {
|
||||
d_Y = ctx->prealloc_y;
|
||||
GGML_ASSERT(d_Y->size >= y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1));
|
||||
GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 144) * 144);
|
||||
} else {
|
||||
d_Y = d_Qy;
|
||||
y_buf_offset = qy_buf_offset;
|
||||
@@ -5828,7 +5923,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
@@ -5845,10 +5940,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
|
||||
}
|
||||
|
||||
uint32_t y_sz_total = y_sz * ne12 * ne13;
|
||||
if (quantize_y) {
|
||||
y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
|
||||
}
|
||||
|
||||
// compute
|
||||
ggml_vk_matmul(
|
||||
ctx, subctx, pipeline,
|
||||
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
|
||||
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
|
||||
{ d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
|
||||
ne01, ne11, ne10,
|
||||
ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
|
||||
@@ -5863,6 +5963,51 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
}
|
||||
}
|
||||
|
||||
// Device tuning
|
||||
static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_t n, uint32_t k, ggml_type src0_type) {
|
||||
if (device->mmvq_mode == 1) {
|
||||
return true;
|
||||
} else if (device->mmvq_mode == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// MMVQ is generally good for batches
|
||||
if (n > 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
switch (device->vendor_id) {
|
||||
case VK_VENDOR_ID_NVIDIA:
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
case VK_VENDOR_ID_AMD:
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return device->architecture == vk_device_architecture::AMD_GCN;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
switch (src0_type) {
|
||||
// From tests on A770 Linux, may need more tuning
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_UNUSED(m);
|
||||
GGML_UNUSED(k);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
||||
@@ -5917,22 +6062,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
|
||||
|
||||
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
|
||||
|
||||
const bool qx_needs_dequant = x_non_contig;
|
||||
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
|
||||
|
||||
// Not implemented
|
||||
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
||||
|
||||
const uint64_t x_ne = ne01 * ne00;
|
||||
const uint64_t y_ne = ne11 * ne10;
|
||||
const uint64_t d_ne = ne11 * ne01;
|
||||
|
||||
const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
|
||||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type);
|
||||
|
||||
vk_pipeline to_fp16_vk_0 = nullptr;
|
||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
@@ -5944,14 +6074,47 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00);
|
||||
|
||||
// Check for mmq first
|
||||
vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, GGML_TYPE_Q8_1, ne11, ne20, ne00) : nullptr;
|
||||
vk_pipeline to_q8_1 = nullptr;
|
||||
|
||||
if (dmmv == nullptr) {
|
||||
// Fall back to f16 dequant mul mat
|
||||
dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00);
|
||||
quantize_y = false;
|
||||
}
|
||||
|
||||
if (quantize_y) {
|
||||
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
|
||||
}
|
||||
|
||||
const bool qx_needs_dequant = x_non_contig;
|
||||
const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig);
|
||||
|
||||
// Not implemented
|
||||
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
||||
|
||||
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||
GGML_ASSERT(dmmv != nullptr);
|
||||
|
||||
const uint64_t x_ne = ne01 * ne00;
|
||||
const uint64_t y_ne = ne11 * ne10;
|
||||
const uint64_t d_ne = ne11 * ne01;
|
||||
|
||||
const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
|
||||
const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
if (dryrun) {
|
||||
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
|
||||
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
if (quantize_y) {
|
||||
y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
|
||||
}
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
|
||||
@@ -5960,7 +6123,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
|
||||
ctx->prealloc_size_x = x_sz_upd;
|
||||
}
|
||||
if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
|
||||
if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
|
||||
ctx->prealloc_size_y = y_sz_upd;
|
||||
}
|
||||
|
||||
@@ -5971,6 +6134,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
if (qy_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
||||
}
|
||||
if (quantize_y) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
||||
return;
|
||||
}
|
||||
@@ -6001,6 +6167,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
}
|
||||
if (qy_needs_dequant) {
|
||||
d_Y = ctx->prealloc_y;
|
||||
} else if (quantize_y) {
|
||||
d_Y = ctx->prealloc_y;
|
||||
GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 144) * 144);
|
||||
} else {
|
||||
d_Y = d_Qy;
|
||||
y_buf_offset = qy_buf_offset;
|
||||
@@ -6011,9 +6180,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
if (ctx->prealloc_x_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (x_non_contig) {
|
||||
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
||||
}
|
||||
@@ -6029,6 +6196,17 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
}
|
||||
if (quantize_y) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
}
|
||||
|
||||
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
|
||||
uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01;
|
||||
@@ -6053,6 +6231,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
groups_x = CEIL_DIV(groups_x, groups_z);
|
||||
}
|
||||
|
||||
// TODO: Clean up this whole sz * ne_2 * ne_3 thing, it hasn't been necessary for a long time
|
||||
uint32_t y_sz_total = y_sz * ne12 * ne13;
|
||||
if (quantize_y) {
|
||||
y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
|
||||
}
|
||||
|
||||
// compute
|
||||
const vk_mat_vec_push_constants pc = {
|
||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||
@@ -6060,13 +6244,13 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||
};
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
||||
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
|
||||
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz_total }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
|
||||
pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
|
||||
|
||||
if (x_non_contig) {
|
||||
ctx->prealloc_x_need_sync = true;
|
||||
}
|
||||
if (y_non_contig) {
|
||||
if (y_non_contig || quantize_y) {
|
||||
ctx->prealloc_y_need_sync = true;
|
||||
}
|
||||
}
|
||||
@@ -11483,15 +11667,29 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
|
||||
|
||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
|
||||
vk::PhysicalDeviceMemoryProperties2 memprops = {};
|
||||
bool membudget_supported = vk_instance.device_supports_membudget[device];
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
}
|
||||
vkdev.getMemoryProperties2(&memprops);
|
||||
|
||||
for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
|
||||
const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
|
||||
|
||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
*free = heap.size;
|
||||
|
||||
if (membudget_supported && i < budgetprops.heapUsage.size()) {
|
||||
*free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
|
||||
} else {
|
||||
*free = heap.size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#if USE_SUBGROUP_ADD
|
||||
|
||||
#if USE_SUBGROUP_ADD || USE_SUBGROUP_ADD_NO_SHMEM
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#endif
|
||||
@@ -12,10 +13,19 @@
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#ifndef MMQ
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
#else
|
||||
layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
|
||||
#endif
|
||||
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
#ifdef B_TYPE_VEC2
|
||||
layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
|
||||
#endif
|
||||
#ifdef B_TYPE_VEC4
|
||||
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
|
||||
#endif
|
||||
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
#ifdef MUL_MAT_ID
|
||||
@@ -92,6 +102,23 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||
layout (constant_id = 2) const uint NUM_COLS = 1;
|
||||
|
||||
#ifdef USE_SUBGROUP_ADD_NO_SHMEM
|
||||
void reduce_result(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
temp[j][n] = subgroupAdd(temp[j][n]);
|
||||
}
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
|
||||
|
||||
void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
|
||||
@@ -152,3 +179,4 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
140
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp
Normal file
140
ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp
Normal file
@@ -0,0 +1,140 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
#extension GL_EXT_integer_dot_product : require
|
||||
|
||||
#define MMQ
|
||||
#define B_TYPE block_q8_1_x4
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
#define K_PER_ITER 8
|
||||
|
||||
#include "mul_mmq_funcs.comp"
|
||||
|
||||
uint a_offset, b_offset, d_offset;
|
||||
|
||||
int32_t cache_b_qs[2];
|
||||
vec2 cache_b_ds;
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + tid*K_PER_ITER;
|
||||
|
||||
// Preload data_b block
|
||||
const uint b_block_idx = (j*p.batch_stride_b + col) / QUANT_K_Q8_1 + b_offset;
|
||||
const uint b_qs_idx = tid % 4;
|
||||
const uint b_block_idx_outer = b_block_idx / 4;
|
||||
const uint b_block_idx_inner = b_block_idx % 4;
|
||||
cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]);
|
||||
|
||||
#if QUANT_R == 2
|
||||
cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx];
|
||||
cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx + 4];
|
||||
#else
|
||||
cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 2];
|
||||
cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 2 + 1];
|
||||
#endif
|
||||
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint a_block_idx = (ibi + col)/QUANT_K + a_offset;
|
||||
ibi += p.ncols;
|
||||
|
||||
int32_t q_sum = 0;
|
||||
#if QUANT_R == 2
|
||||
const i32vec2 data_a_qs = repack(a_block_idx, b_qs_idx);
|
||||
q_sum += dotPacked4x8EXT(data_a_qs.x,
|
||||
cache_b_qs[0]);
|
||||
q_sum += dotPacked4x8EXT(data_a_qs.y,
|
||||
cache_b_qs[1]);
|
||||
#else
|
||||
int32_t data_a_qs = repack(a_block_idx, b_qs_idx * 2);
|
||||
q_sum += dotPacked4x8EXT(data_a_qs,
|
||||
cache_b_qs[0]);
|
||||
data_a_qs = repack(a_block_idx, b_qs_idx * 2 + 1);
|
||||
q_sum += dotPacked4x8EXT(data_a_qs,
|
||||
cache_b_qs[1]);
|
||||
#endif
|
||||
|
||||
#if QUANT_AUXF == 1
|
||||
temp[j][n] += mul_q8_1(q_sum, get_d(a_block_idx), cache_b_ds, 4);
|
||||
#else
|
||||
temp[j][n] += mul_q8_1(q_sum, get_dm(a_block_idx), cache_b_ds, 4);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
a_offset /= QUANT_K;
|
||||
b_offset /= QUANT_K_Q8_1;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
temp[j][n] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||
if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
|
||||
num_iters++;
|
||||
}
|
||||
int unroll_count = 4;
|
||||
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
uint i = 0;
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
unroll_count = 2;
|
||||
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
#if K_PER_ITER == 2
|
||||
if ((p.ncols & 1) != 0 &&
|
||||
unrolled_iters == num_iters &&
|
||||
unrolled_iters > 0) {
|
||||
unrolled_iters -= unroll_count;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
while (i < num_iters) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
@@ -494,6 +494,9 @@ void main() {
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
}
|
||||
#if defined(ACC_TYPE_MAX)
|
||||
[[unroll]] for (uint i = 0; i < sum.length(); ++i) { sum[i] = clamp(sum[i], -ACC_TYPE_MAX, ACC_TYPE_MAX); }
|
||||
#endif
|
||||
|
||||
// Convert from ACC_TYPE to D_TYPE
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> mat_d;
|
||||
@@ -535,6 +538,9 @@ void main() {
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
}
|
||||
#if defined(ACC_TYPE_MAX)
|
||||
[[unroll]] for (uint i = 0; i < sum.length(); ++i) { sum[i] = clamp(sum[i], -ACC_TYPE_MAX, ACC_TYPE_MAX); }
|
||||
#endif
|
||||
|
||||
// Convert from ACC_TYPE to D_TYPE
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> mat_d;
|
||||
|
||||
@@ -28,7 +28,7 @@ layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
|
||||
#if defined(A_TYPE_PACKED32)
|
||||
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
|
||||
#endif
|
||||
layout (binding = 1) readonly buffer B {block_q8_1_packed32 data_b[];};
|
||||
layout (binding = 1) readonly buffer B {block_q8_1_x4_packed128 data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
@@ -98,7 +98,7 @@ shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
|
||||
#endif
|
||||
|
||||
#define LOAD_VEC_A (4 * QUANT_R)
|
||||
#define LOAD_VEC_B 4
|
||||
#define LOAD_VEC_B 16
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
shared u16vec2 row_ids[4096];
|
||||
@@ -270,15 +270,22 @@ void main() {
|
||||
const uint iqs = idx & 0x7;
|
||||
#else
|
||||
const uint ib = pos_b_ib + (loadc_b + l) * p.stride_b / BK;
|
||||
const uint ib_outer = ib / 4;
|
||||
const uint ib_inner = ib % 4;
|
||||
|
||||
const uint iqs = loadr_b;
|
||||
#endif
|
||||
|
||||
const uint buf_ib = loadc_b + l;
|
||||
|
||||
if (iqs == 0) {
|
||||
buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds);
|
||||
buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib_outer].ds[ib_inner]);
|
||||
}
|
||||
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs] = data_b[ib].qs[iqs];
|
||||
const ivec4 values = data_b[ib_outer].qs[ib_inner * 2 + iqs];
|
||||
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 ] = values.x;
|
||||
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 1] = values.y;
|
||||
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 2] = values.z;
|
||||
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 3] = values.w;
|
||||
}
|
||||
|
||||
barrier();
|
||||
@@ -349,7 +356,7 @@ void main() {
|
||||
cache_b_qs[cc * (BK / 4) + idx_k]);
|
||||
}
|
||||
|
||||
sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc]);
|
||||
sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
(vui >> 4) & 0x0F0F0F0F);
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0f * dsb.y));
|
||||
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - (8 / sum_divisor) * dsb.y));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -29,8 +29,8 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
(vui >> 4) & 0x0F0F0F0F);
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, vec2 dma, vec2 dsb) {
|
||||
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y);
|
||||
ACC_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
|
||||
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y / sum_divisor);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -50,8 +50,8 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
return i32vec2(v0, v1);
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0f * dsb.y));
|
||||
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
|
||||
return ACC_TYPE(da * (float(q_sum) * dsb.x - (16 / sum_divisor) * dsb.y));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -69,8 +69,8 @@ i32vec2 repack(uint ib, uint iqs) {
|
||||
return i32vec2(v0, v1);
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, vec2 dma, vec2 dsb) {
|
||||
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y);
|
||||
ACC_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
|
||||
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y / sum_divisor);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -81,7 +81,7 @@ int32_t repack(uint ib, uint iqs) {
|
||||
data_a[ib].qs[iqs * 2 + 1]));
|
||||
}
|
||||
|
||||
ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) {
|
||||
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
|
||||
return ACC_TYPE(float(q_sum) * da * dsb.x);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3,6 +3,15 @@
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#ifdef USE_SUBGROUPS
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#extension GL_KHR_shader_subgroup_clustered : require
|
||||
|
||||
#define INVOCATION_ID gl_SubgroupInvocationID.x
|
||||
#else
|
||||
#define INVOCATION_ID gl_LocalInvocationID.x
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne;
|
||||
@@ -14,13 +23,19 @@ layout(constant_id = 0) const uint GROUP_SIZE = 32;
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {vec4 data_a[];};
|
||||
#ifndef QBLOCK_X4
|
||||
layout (binding = 1) writeonly buffer D {block_q8_1_packed32 data_b[];};
|
||||
#else
|
||||
layout (binding = 1) writeonly buffer D {block_q8_1_x4 data_b[];};
|
||||
#endif
|
||||
|
||||
#ifndef USE_SUBGROUPS
|
||||
shared float shmem[GROUP_SIZE];
|
||||
#endif
|
||||
|
||||
void quantize() {
|
||||
const uint wgid = gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint tid = INVOCATION_ID;
|
||||
|
||||
// Each thread handles a vec4, so 8 threads handle a block
|
||||
const uint blocks_per_group = GROUP_SIZE / 8;
|
||||
@@ -30,9 +45,19 @@ void quantize() {
|
||||
const uint ib = wgid * blocks_per_group + block_in_wg;
|
||||
const uint iqs = tid % 8;
|
||||
|
||||
#ifndef QBLOCK_X4
|
||||
if (ib >= gl_NumWorkGroups.x * blocks_per_group) {
|
||||
return;
|
||||
}
|
||||
#else
|
||||
const uint ibx4_outer = ib / 4;
|
||||
const uint ibx4_inner = ib % 4;
|
||||
|
||||
const uint required_x4_blocks = (p.ne + 127) / 128;
|
||||
if (ibx4_outer >= required_x4_blocks) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint a_idx = ib * 8 + iqs;
|
||||
|
||||
@@ -40,7 +65,9 @@ void quantize() {
|
||||
const vec4 abs_vals = abs(vals);
|
||||
|
||||
// Find absolute max for each block
|
||||
shmem[tid] = max(max(abs_vals.x, abs_vals.y), max(abs_vals.z, abs_vals.w));
|
||||
const float thread_max = max(max(abs_vals.x, abs_vals.y), max(abs_vals.z, abs_vals.w));
|
||||
#ifndef USE_SUBGROUPS
|
||||
shmem[tid] = thread_max;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = 4; s > 0; s >>= 1) {
|
||||
if (iqs < s) {
|
||||
@@ -50,14 +77,28 @@ void quantize() {
|
||||
}
|
||||
|
||||
const float amax = shmem[block_in_wg * 8];
|
||||
#else
|
||||
const float amax = subgroupClusteredMax(thread_max, 8);
|
||||
#endif
|
||||
|
||||
const float d = amax / 127.0;
|
||||
const float d_inv = d != 0.0 ? 1.0 / d : 0.0;
|
||||
vals = round(vals * d_inv);
|
||||
|
||||
#ifndef QBLOCK_X4
|
||||
data_b[ib].qs[iqs] = pack32(i8vec4(round(vals)));
|
||||
#else
|
||||
data_b[ibx4_outer].qs[ibx4_inner * 8 + iqs] = pack32(i8vec4(round(vals)));
|
||||
#endif
|
||||
|
||||
#ifndef USE_SUBGROUPS
|
||||
barrier();
|
||||
#endif
|
||||
|
||||
// Calculate the sum for each block
|
||||
shmem[tid] = vals.x + vals.y + vals.z + vals.w;
|
||||
const float thread_sum = vals.x + vals.y + vals.z + vals.w;
|
||||
#ifndef USE_SUBGROUPS
|
||||
shmem[tid] = thread_sum;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = 4; s > 0; s >>= 1) {
|
||||
if (iqs < s) {
|
||||
@@ -65,10 +106,19 @@ void quantize() {
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
#else
|
||||
const float sum = subgroupClusteredAdd(thread_sum, 8);
|
||||
#endif
|
||||
if (iqs == 0) {
|
||||
#ifndef USE_SUBGROUPS
|
||||
const float sum = shmem[tid];
|
||||
#endif
|
||||
|
||||
#ifndef QBLOCK_X4
|
||||
data_b[ib].ds = f16vec2(vec2(d, sum * d));
|
||||
#else
|
||||
data_b[ibx4_outer].ds[ibx4_inner] = f16vec2(vec2(d, sum * d));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -207,6 +207,18 @@ struct block_q8_1_packed32
|
||||
int32_t qs[8];
|
||||
};
|
||||
|
||||
// 4 blocks in one to allow 16-byte/128-bit alignment and loads
|
||||
struct block_q8_1_x4
|
||||
{
|
||||
f16vec2 ds[4];
|
||||
int32_t qs[32];
|
||||
};
|
||||
struct block_q8_1_x4_packed128
|
||||
{
|
||||
f16vec2 ds[4];
|
||||
ivec4 qs[8];
|
||||
};
|
||||
|
||||
// K-quants
|
||||
#define QUANT_K_Q2_K 256
|
||||
|
||||
|
||||
@@ -206,6 +206,22 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
|
||||
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
||||
}
|
||||
|
||||
bool is_quantized_type(const std::string& type_name) {
|
||||
return type_name != "f32" && type_name != "f16" && type_name != "bf16";
|
||||
}
|
||||
|
||||
bool is_legacy_quant(const std::string& type_name) {
|
||||
return type_name == "q4_0" || type_name == "q4_1" || type_name == "q5_0" || type_name == "q5_1" || type_name == "q8_0";
|
||||
}
|
||||
|
||||
bool is_k_quant(const std::string& type_name) {
|
||||
return string_ends_with(type_name, "_k");
|
||||
}
|
||||
|
||||
bool is_iq_quant(const std::string& type_name) {
|
||||
return string_starts_with(type_name, "iq");
|
||||
}
|
||||
|
||||
static const char path_separator = '/';
|
||||
|
||||
std::string join_paths(const std::string& path1, const std::string& path2) {
|
||||
@@ -402,7 +418,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "q8_0")) {
|
||||
if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && is_legacy_quant(tname)) {
|
||||
string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
#endif
|
||||
@@ -495,8 +511,20 @@ void process_shaders() {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
|
||||
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
|
||||
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
// mul mat vec with integer dot product
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
if (is_legacy_quant(tname)) {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Dequant shaders
|
||||
if (tname != "f16" && tname != "bf16") {
|
||||
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
||||
@@ -579,7 +607,12 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {});
|
||||
|
||||
string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});
|
||||
string_to_spv("quantize_q8_1_subgroup", "quantize_q8_1.comp", {{"USE_SUBGROUPS", "1"}});
|
||||
|
||||
string_to_spv("quantize_q8_1_x4", "quantize_q8_1.comp", {{"QBLOCK_X4", "1"}});
|
||||
string_to_spv("quantize_q8_1_x4_subgroup", "quantize_q8_1.comp", {{"QBLOCK_X4", "1"}, {"USE_SUBGROUPS", "1"}});
|
||||
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
@@ -821,12 +854,21 @@ void write_output_files() {
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
|
||||
for (const std::string& btype : {"f16", "f32"}) {
|
||||
std::vector<std::string> btypes = {"f16", "f32"};
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
btypes.push_back("q8_1");
|
||||
#endif
|
||||
|
||||
for (const std::string& btype : btypes) {
|
||||
for (const auto& tname : type_names) {
|
||||
fprintf(hdr, "extern unsigned char *arr_dmmv_%s_%s_f32_data[2];\n", tname.c_str(), btype.c_str());
|
||||
fprintf(hdr, "extern uint64_t arr_dmmv_%s_%s_f32_len[2];\n", tname.c_str(), btype.c_str());
|
||||
std::string data = "unsigned char *arr_dmmv_" + tname + "_" + btype + "_f32_data[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_data, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_data};\n";
|
||||
std::string len = "uint64_t arr_dmmv_" + tname + "_" + btype + "_f32_len[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_len, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_len};\n";
|
||||
if (btype == "q8_1" && !is_legacy_quant(tname)) {
|
||||
continue;
|
||||
}
|
||||
fprintf(hdr, "extern unsigned char *arr_dmmv_%s_%s_f32_data[3];\n", tname.c_str(), btype.c_str());
|
||||
fprintf(hdr, "extern uint64_t arr_dmmv_%s_%s_f32_len[3];\n", tname.c_str(), btype.c_str());
|
||||
std::string data = "unsigned char *arr_dmmv_" + tname + "_" + btype + "_f32_data[3] = {mul_mat_vec_" + tname + "_" + btype + "_f32_data, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_data, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_no_shmem_data};\n";
|
||||
std::string len = "uint64_t arr_dmmv_" + tname + "_" + btype + "_f32_len[3] = {mul_mat_vec_" + tname + "_" + btype + "_f32_len, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_len, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_no_shmem_len};\n";
|
||||
fputs(data.c_str(), src);
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
|
||||
@@ -611,6 +611,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_RESHAPE:
|
||||
return false;
|
||||
case GGML_OP_CPY:
|
||||
{
|
||||
@@ -1062,6 +1064,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_RESHAPE:
|
||||
return true;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
|
||||
@@ -273,7 +273,7 @@ struct gguf_reader {
|
||||
}
|
||||
|
||||
bool read(std::string & dst) const {
|
||||
uint64_t size = -1;
|
||||
uint64_t size = 0;
|
||||
if (!read(size)) {
|
||||
return false;
|
||||
}
|
||||
@@ -523,7 +523,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
|
||||
|
||||
// tensor shape
|
||||
{
|
||||
uint32_t n_dims = -1;
|
||||
uint32_t n_dims = 0;
|
||||
ok = ok && gr.read(n_dims);
|
||||
if (n_dims > GGML_MAX_DIMS) {
|
||||
GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
|
||||
|
||||
@@ -206,7 +206,7 @@ extern "C" {
|
||||
llama_token_data * data;
|
||||
size_t size;
|
||||
int64_t selected; // this is the index in the data array (i.e. not the token id)
|
||||
bool sorted;
|
||||
bool sorted; // note: do not assume the data is sorted - always check this flag
|
||||
} llama_token_data_array;
|
||||
|
||||
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
||||
@@ -1156,11 +1156,6 @@ extern "C" {
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
|
||||
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
|
||||
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
/// Setting k <= 0 makes this a noop
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
|
||||
|
||||
@@ -270,19 +270,7 @@ llama_context::llama_context(
|
||||
}
|
||||
}
|
||||
|
||||
// resolve automatic Flash Attention use and reserve worst-case graph
|
||||
if (!hparams.vocab_only) {
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
int n_splits_pp = -1;
|
||||
int n_nodes_pp = -1;
|
||||
|
||||
int n_splits_tg = -1;
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
||||
@@ -294,6 +282,60 @@ llama_context::llama_context(
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
// resolve automatic Flash Attention use
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
|
||||
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to split graph for Flash Attention check");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
||||
bool fa_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
||||
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_fa != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
||||
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
||||
fa_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (fa_device_mismatch) {
|
||||
cparams.flash_attn = false;
|
||||
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
||||
if (ggml_is_quantized(params.type_v)) {
|
||||
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
||||
}
|
||||
} else {
|
||||
cparams.flash_attn = true;
|
||||
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// reserve worst-case graph
|
||||
int n_splits_pp = -1;
|
||||
int n_nodes_pp = -1;
|
||||
|
||||
int n_splits_tg = -1;
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
@@ -301,48 +343,6 @@ llama_context::llama_context(
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
||||
bool fa_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
||||
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_fa != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
||||
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
||||
fa_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (fa_device_mismatch) {
|
||||
cparams.flash_attn = false;
|
||||
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
||||
if (ggml_is_quantized(params.type_v)) {
|
||||
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
||||
}
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
} else {
|
||||
cparams.flash_attn = true;
|
||||
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
||||
n_nodes_pp = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
@@ -1366,7 +1366,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
if (n_tokens % n_seqs != 0) {
|
||||
@@ -1401,7 +1401,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
this->n_outputs = save_n_outputs;
|
||||
|
||||
// initialize scheduler with the specified graph
|
||||
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
if (split_only) {
|
||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -196,7 +196,7 @@ public:
|
||||
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
||||
|
||||
// reserve a graph with a dummy ubatch of the specified size
|
||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
|
||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
|
||||
|
||||
private:
|
||||
llm_graph_params graph_params(
|
||||
|
||||
@@ -128,6 +128,89 @@ struct ring_buffer {
|
||||
std::vector<T> data;
|
||||
};
|
||||
|
||||
// writes result in res, does not mutate cur
|
||||
static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
|
||||
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
};
|
||||
|
||||
constexpr int nbuckets = 128;
|
||||
constexpr float bucket_low = -10.0f;
|
||||
constexpr float bucket_high = 10.0f;
|
||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||
constexpr float bucket_inter = -bucket_low * bucket_scale;
|
||||
|
||||
std::vector<int> bucket_idx;
|
||||
std::vector<int> histo(nbuckets, 0);
|
||||
|
||||
std::vector<llama_token_data*> bucket_ptrs;
|
||||
|
||||
bucket_idx.reserve(cur.size);
|
||||
|
||||
for (int i = 0; i < (int)cur.size; ++i) {
|
||||
const float val = cur.data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets - 1, ib));
|
||||
bucket_idx.push_back(ib);
|
||||
++histo[ib];
|
||||
}
|
||||
int nhave = 0;
|
||||
int ib = nbuckets - 1;
|
||||
for ( ; ib >= 0; --ib) {
|
||||
nhave += histo[ib];
|
||||
if (nhave >= npartial) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
res.resize(nhave);
|
||||
auto * ptr = res.data();
|
||||
bucket_ptrs.reserve(nbuckets - ib);
|
||||
for (int j = nbuckets - 1; j >= ib; --j) {
|
||||
bucket_ptrs.push_back(ptr);
|
||||
ptr += histo[j];
|
||||
}
|
||||
for (int i = 0; i < (int)cur.size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = res.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets - 1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
}
|
||||
std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
|
||||
}
|
||||
|
||||
// reduces the size of cur_p to npartial, keeping only the top npartial elements
|
||||
static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
|
||||
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
};
|
||||
|
||||
if (npartial <= 128) {
|
||||
std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
|
||||
|
||||
cur_p->size = npartial;
|
||||
cur_p->sorted = true;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> tmp;
|
||||
|
||||
llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
|
||||
|
||||
std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
|
||||
|
||||
cur_p->size = npartial;
|
||||
cur_p->sorted = true;
|
||||
}
|
||||
|
||||
static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
|
||||
// iterator for the probabilities
|
||||
#ifdef __GNUC__
|
||||
@@ -200,18 +283,21 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
|
||||
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
|
||||
GGML_ASSERT(cur_p->size > 0);
|
||||
|
||||
// Sort the logits in descending order
|
||||
if (!cur_p->sorted) {
|
||||
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
cur_p->sorted = true;
|
||||
// Sort the logits in descending order if requested
|
||||
if (do_sort && !cur_p->sorted) {
|
||||
llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
|
||||
}
|
||||
|
||||
float max_l = cur_p->data[0].logit;
|
||||
if (!cur_p->sorted) {
|
||||
for (size_t i = 1; i < cur_p->size; ++i) {
|
||||
max_l = std::max(max_l, cur_p->data[i].logit);
|
||||
}
|
||||
}
|
||||
|
||||
float cum_sum = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
@@ -226,7 +312,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
|
||||
}
|
||||
|
||||
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
|
||||
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
|
||||
// if (k >= (int32_t)cur_p->size) {
|
||||
// return;
|
||||
// }
|
||||
@@ -239,64 +324,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
|
||||
// Sort scores in descending order
|
||||
if (!cur_p->sorted) {
|
||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
};
|
||||
if (k <= 128) {
|
||||
std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
|
||||
} else {
|
||||
constexpr int nbuckets = 128;
|
||||
constexpr float bucket_low = -10.0f;
|
||||
constexpr float bucket_high = 10.0f;
|
||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||
constexpr float bucket_inter = -bucket_low * bucket_scale;
|
||||
|
||||
std::vector<int> bucket_idx(cur_p->size);
|
||||
std::vector<int> histo(nbuckets, 0);
|
||||
|
||||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
const float val = cur_p->data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets - 1, ib));
|
||||
bucket_idx[i] = ib;
|
||||
++histo[ib];
|
||||
}
|
||||
int nhave = 0;
|
||||
int ib = nbuckets - 1;
|
||||
for ( ; ib >= 0; --ib) {
|
||||
nhave += histo[ib];
|
||||
if (nhave >= k) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
std::vector<llama_token_data> tmp_tokens(nhave);
|
||||
auto * ptr = tmp_tokens.data();
|
||||
std::vector<llama_token_data*> bucket_ptrs;
|
||||
bucket_ptrs.reserve(nbuckets - ib);
|
||||
for (int j = nbuckets - 1; j >= ib; --j) {
|
||||
bucket_ptrs.push_back(ptr);
|
||||
ptr += histo[j];
|
||||
}
|
||||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = tmp_tokens.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets - 1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
}
|
||||
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
|
||||
|
||||
std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
|
||||
|
||||
}
|
||||
cur_p->sorted = true;
|
||||
llama_token_data_array_partial_sort_inplace(cur_p, k);
|
||||
}
|
||||
|
||||
cur_p->size = k;
|
||||
@@ -576,7 +604,8 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
|
||||
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
// sorting is not necessary here
|
||||
llama_sampler_softmax_impl(cur_p, false);
|
||||
|
||||
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
|
||||
}
|
||||
@@ -626,32 +655,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
||||
);
|
||||
}
|
||||
|
||||
// softmax
|
||||
|
||||
static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
|
||||
return "softmax";
|
||||
}
|
||||
|
||||
static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
}
|
||||
|
||||
static struct llama_sampler_i llama_sampler_softmax_i = {
|
||||
/* .name = */ llama_sampler_softmax_name,
|
||||
/* .accept = */ nullptr,
|
||||
/* .apply = */ llama_sampler_softmax_apply,
|
||||
/* .reset = */ nullptr,
|
||||
/* .clone = */ nullptr,
|
||||
/* .free = */ nullptr,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_softmax() {
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_softmax_i,
|
||||
/* .ctx = */ nullptr
|
||||
);
|
||||
}
|
||||
|
||||
// top-k
|
||||
|
||||
struct llama_sampler_top_k {
|
||||
@@ -663,7 +666,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
|
||||
}
|
||||
|
||||
static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_top_k *) smpl->ctx;
|
||||
llama_sampler_top_k_impl(cur_p, ctx->k);
|
||||
}
|
||||
|
||||
@@ -699,6 +702,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
||||
struct llama_sampler_top_p {
|
||||
const float p;
|
||||
const size_t min_keep;
|
||||
|
||||
std::vector<llama_token_data> buf_sort;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
|
||||
@@ -706,20 +711,35 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl
|
||||
}
|
||||
|
||||
static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_top_p *) smpl->ctx;
|
||||
|
||||
if (ctx->p >= 1.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, false);
|
||||
|
||||
size_t k = cur_p->size;
|
||||
auto * pdata = cur_p->data;
|
||||
|
||||
auto & buf_sort = ctx->buf_sort;
|
||||
|
||||
// if not sorted, try adaptive top-k sorting
|
||||
if (!cur_p->sorted && cur_p->size > 1024) {
|
||||
k = std::min<size_t>(256, cur_p->size);
|
||||
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
|
||||
pdata = buf_sort.data();
|
||||
} else if (!cur_p->sorted) {
|
||||
// small candidates -> sort inplace
|
||||
llama_token_data_array_partial_sort_inplace(cur_p, k);
|
||||
}
|
||||
|
||||
// Compute the cumulative probabilities
|
||||
float cum_sum = 0.0f;
|
||||
size_t last_idx = cur_p->size;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
cum_sum += cur_p->data[i].p;
|
||||
cum_sum += pdata[i].p;
|
||||
|
||||
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
||||
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
||||
@@ -727,9 +747,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
|
||||
last_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
|
||||
// we exceeded the current top-k heuristic -> increase k and continue
|
||||
if (!cur_p->sorted && i == k - 1) {
|
||||
k = cur_p->size;
|
||||
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
|
||||
pdata = buf_sort.data();
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the top-p tokens
|
||||
if (!cur_p->sorted) {
|
||||
std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
|
||||
cur_p->sorted = true;
|
||||
}
|
||||
|
||||
cur_p->size = last_idx;
|
||||
}
|
||||
|
||||
@@ -757,6 +789,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
||||
/* .ctx = */ new llama_sampler_top_p {
|
||||
/* .p = */ p,
|
||||
/* .min_keep = */ min_keep,
|
||||
/* .buf_sort = */ {},
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -773,7 +806,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl
|
||||
}
|
||||
|
||||
static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_min_p *) smpl->ctx;
|
||||
|
||||
if (ctx->p <= 0.0f || !cur_p->size) {
|
||||
return;
|
||||
@@ -799,7 +832,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
|
||||
|
||||
// if we have enough values the operation was a success
|
||||
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
|
||||
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
||||
std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
|
||||
cur_p->size = filtered_tokens.size();
|
||||
min_p_applied = true;
|
||||
}
|
||||
@@ -809,10 +842,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
|
||||
if (!min_p_applied) {
|
||||
// Sort the logits in descending order
|
||||
if (!cur_p->sorted) {
|
||||
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
cur_p->sorted = true;
|
||||
llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
|
||||
}
|
||||
|
||||
const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
|
||||
@@ -869,7 +899,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*sm
|
||||
}
|
||||
|
||||
static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_typical *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_typical *) smpl->ctx;
|
||||
|
||||
// Reference implementation:
|
||||
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
|
||||
@@ -878,7 +908,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
|
||||
}
|
||||
|
||||
// Compute the softmax of logits and calculate entropy
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
@@ -1012,7 +1042,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
|
||||
}
|
||||
|
||||
static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
|
||||
if (ctx->delta > 0) {
|
||||
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
|
||||
const float max_temp = ctx->temp + ctx->delta;
|
||||
@@ -1027,7 +1057,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
|
||||
// Calculate maximum possible entropy
|
||||
float max_entropy = -logf(1.0f / cur_p->size);
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
// Calculate entropy of the softmax probabilities
|
||||
float entropy = 0.0f;
|
||||
@@ -1121,7 +1151,7 @@ struct llama_sampler_xtc {
|
||||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
std::mt19937 rng;
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
|
||||
@@ -1139,17 +1169,20 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
|
||||
|
||||
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
|
||||
float chance = distribution(ctx->rng);
|
||||
if (chance > ctx->probability) return;
|
||||
if (chance > ctx->probability) {
|
||||
return;
|
||||
}
|
||||
|
||||
// in case it's not sorted/recalculated yet
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
int pos_last = 0;
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].p >= ctx->threshold) {
|
||||
pos_last = i;
|
||||
} else break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
|
||||
@@ -1221,7 +1254,7 @@ struct llama_sampler_mirostat {
|
||||
|
||||
float mu;
|
||||
|
||||
std::mt19937 rng;
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
|
||||
@@ -1231,7 +1264,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
|
||||
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
// Estimate s_hat using the most probable m tokens
|
||||
float s_hat = 0.0;
|
||||
@@ -1250,7 +1283,8 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
|
||||
float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
|
||||
|
||||
llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
||||
|
||||
@@ -1336,7 +1370,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
|
||||
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
// Truncate the words with surprise values greater than mu
|
||||
cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
|
||||
@@ -1348,7 +1382,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
|
||||
}
|
||||
|
||||
// Normalize the probabilities of the remaining words
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
||||
|
||||
@@ -1540,7 +1574,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
|
||||
}
|
||||
trigger_pattern += ")[\\s\\S]*";
|
||||
auto trigger_pattern_c = trigger_pattern.c_str();
|
||||
const auto * trigger_pattern_c = trigger_pattern.c_str();
|
||||
trigger_patterns = &trigger_pattern_c;
|
||||
num_trigger_patterns = 1;
|
||||
}
|
||||
@@ -1748,7 +1782,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
|
||||
}
|
||||
|
||||
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
|
||||
if (ctx->n <= 0.0f || cur_p->size <= 1) {
|
||||
return;
|
||||
@@ -1780,13 +1814,14 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
|
||||
}
|
||||
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
|
||||
|
||||
//apply mask
|
||||
// apply mask
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].logit < max - (ctx->n * std)) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
|
||||
@@ -1991,7 +2026,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
|
||||
|
||||
{
|
||||
const int last = last_n_repeat - 1;
|
||||
int rt = 0, lt = 0;
|
||||
|
||||
int rt = 0;
|
||||
int lt = 0;
|
||||
|
||||
for (int k = 1; k < last_n_repeat; ++k) {
|
||||
if (k > rt) {
|
||||
@@ -2135,8 +2172,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
|
||||
/* .free = */ llama_sampler_dry_free,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
|
||||
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
|
||||
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
|
||||
const int MAX_CHAR_LEN = 40;
|
||||
const int MAX_SEQ_LEN = 20;
|
||||
@@ -2169,7 +2206,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_dry_i,
|
||||
/* .ctx = */ new llama_sampler_dry {
|
||||
/* .total_context_size = */ context_size,
|
||||
/* .total_context_size = */ n_ctx_train,
|
||||
/* .dry_multiplier = */ dry_multiplier,
|
||||
/* .dry_base = */ dry_base,
|
||||
/* .dry_allowed_length = */ dry_allowed_length,
|
||||
@@ -2308,7 +2345,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
|
||||
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_infill *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p);
|
||||
llama_sampler_softmax_impl(cur_p, true);
|
||||
|
||||
#if defined(GGML_DEBUG_SAMPLER_INFILL)
|
||||
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
|
||||
|
||||
@@ -197,10 +197,10 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
|
||||
sampler_tester tester(n_vocab);
|
||||
|
||||
llama_token min_token_id = 0;
|
||||
const llama_token max_token_id = n_vocab-1;
|
||||
const llama_token max_token_id = n_vocab - 1;
|
||||
|
||||
for (auto s : samplers_sequence) {
|
||||
switch (s){
|
||||
switch (s) {
|
||||
case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
|
||||
case 'y': GGML_ABORT("typical test not implemented");
|
||||
case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
|
||||
@@ -243,10 +243,10 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
|
||||
}
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(cur_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
|
||||
GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
|
||||
} else if (s == 'm') {
|
||||
int expected_size = ceilf((1.0f-min_p) * n_vocab);
|
||||
int expected_size = ceilf((1.0f - min_p) * n_vocab);
|
||||
expected_size = std::max(expected_size, 1);
|
||||
expected_size = std::min(expected_size, size);
|
||||
|
||||
@@ -256,14 +256,14 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
|
||||
min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(cur_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
|
||||
GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
|
||||
printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%5d top_p=%f min_p=%f\n",
|
||||
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
|
||||
}
|
||||
|
||||
@@ -308,28 +308,28 @@ static void test_perf() {
|
||||
int main(void) {
|
||||
ggml_time_init();
|
||||
|
||||
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
|
||||
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
|
||||
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
|
||||
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f);
|
||||
|
||||
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
|
||||
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
|
||||
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f, 0.0f, 1.0f);
|
||||
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f, 0.0f, 1.0f);
|
||||
|
||||
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
|
||||
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
|
||||
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
|
||||
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
|
||||
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0);
|
||||
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
|
||||
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.24f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f}, 0.26f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f}, 0.49f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f}, 0.51f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f}, 0.74f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.05f);
|
||||
@@ -345,23 +345,23 @@ int main(void) {
|
||||
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
|
||||
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
|
||||
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0, 0.25f, 0.25f, 0.25f, 0.25f}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f);
|
||||
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.000011f, 0.249997f, 0.249997f, 0.249997f, 0.249997f}, 1.0f, 5.0f, 5.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.000023f, 0.000023f, 0.000023f, 0.499966f, 0.499966f}, 1.0f, 5.0f, 5.0f);
|
||||
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.000000f, 0.000023f, 0.000023f, 0.499977f, 0.499977f}, 1.0f, 5.0f, 5.0f);
|
||||
|
||||
|
||||
test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
|
||||
test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
|
||||
test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.109232f, 0.296923f}, 1.0f, 1.1f, 2, 5, {});
|
||||
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
|
||||
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
|
||||
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.032727f, 0.241818f, 0.241818f}, 2.0f, 1.1f, 2, 5, {});
|
||||
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
|
||||
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f);
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f);
|
||||
|
||||
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
|
||||
@@ -372,7 +372,7 @@ int main(void) {
|
||||
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
|
||||
|
||||
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0003f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
|
||||
|
||||
@@ -37,7 +37,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0)<br/> |
|
||||
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||
@@ -49,6 +49,8 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
||||
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_SPLIT) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
@@ -63,6 +65,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
@@ -73,12 +76,15 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
|
||||
| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
|
||||
| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
||||
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||
| `--no-op-offload` | disable offloading host tensor operations to device (default: false) |
|
||||
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
@@ -96,9 +102,12 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--log-file FNAME` | Log to file |
|
||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
|
||||
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
|
||||
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
|
||||
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
|
||||
|
||||
|
||||
**Sampling params**
|
||||
@@ -113,6 +122,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||
@@ -141,7 +151,10 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `--swa-checkpoints N` | max number of SWA checkpoints per slot to create (default: 3)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_SWA_CHECKPOINTS) |
|
||||
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
| `--no-warmup` | skip warming up the model with an empty run |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
@@ -152,10 +165,14 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
|
||||
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
|
||||
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
|
||||
| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||
@@ -163,23 +180,25 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||
| `--chat-template-kwargs STRING` | JSON object containing additional params for the json template parser. Example: `--chat_template_kwargs "{\"enable_thinking\":false}`"<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
|
||||
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
||||
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
|
||||
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
|
||||
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
|
||||
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
|
||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
|
||||
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
|
||||
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
|
||||
@@ -187,8 +206,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
|
||||
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for speculative decoding model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
|
||||
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for speculative decoding model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
|
||||
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
|
||||
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
|
||||
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
|
||||
| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) |
|
||||
@@ -199,6 +217,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
| `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
| `--fim-qwen-30b-default` | use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet) |
|
||||
|
||||
|
||||
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
|
||||
@@ -865,25 +884,23 @@ Same as the `/v1/embeddings` endpoint.
|
||||
|
||||
### GET `/slots`: Returns the current slots processing state
|
||||
|
||||
> [!WARNING]
|
||||
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
|
||||
|
||||
This endpoint is disabled by default and can be enabled with `--slots`
|
||||
This endpoint is enabled by default and can be disabled with `--no-slots`. It can be used to query various per-slot metrics, such as speed, processed tokens, sampling parameters, etc.
|
||||
|
||||
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
|
||||
|
||||
**Response format**
|
||||
|
||||
Example:
|
||||
<details>
|
||||
<summary>Example with 2 slots</summary>
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"id": 0,
|
||||
"id_task": -1,
|
||||
"n_ctx": 1024,
|
||||
"id_task": 135,
|
||||
"n_ctx": 65536,
|
||||
"speculative": false,
|
||||
"is_processing": false,
|
||||
"is_processing": true,
|
||||
"params": {
|
||||
"n_predict": -1,
|
||||
"seed": 4294967295,
|
||||
@@ -893,6 +910,7 @@ Example:
|
||||
"top_k": 40,
|
||||
"top_p": 0.949999988079071,
|
||||
"min_p": 0.05000000074505806,
|
||||
"top_n_sigma": -1.0,
|
||||
"xtc_probability": 0.0,
|
||||
"xtc_threshold": 0.10000000149011612,
|
||||
"typical_p": 1.0,
|
||||
@@ -903,17 +921,10 @@ Example:
|
||||
"dry_multiplier": 0.0,
|
||||
"dry_base": 1.75,
|
||||
"dry_allowed_length": 2,
|
||||
"dry_penalty_last_n": -1,
|
||||
"dry_sequence_breakers": [
|
||||
"\n",
|
||||
":",
|
||||
"\"",
|
||||
"*"
|
||||
],
|
||||
"dry_penalty_last_n": 131072,
|
||||
"mirostat": 0,
|
||||
"mirostat_tau": 5.0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"stop": [],
|
||||
"max_tokens": -1,
|
||||
"n_keep": 0,
|
||||
"n_discard": 0,
|
||||
@@ -921,8 +932,12 @@ Example:
|
||||
"stream": true,
|
||||
"n_probs": 0,
|
||||
"min_keep": 0,
|
||||
"grammar": "",
|
||||
"chat_format": "GPT-OSS",
|
||||
"reasoning_format": "none",
|
||||
"reasoning_in_content": false,
|
||||
"thinking_forced_open": false,
|
||||
"samplers": [
|
||||
"penalties",
|
||||
"dry",
|
||||
"top_k",
|
||||
"typ_p",
|
||||
@@ -932,22 +947,89 @@ Example:
|
||||
"temperature"
|
||||
],
|
||||
"speculative.n_max": 16,
|
||||
"speculative.n_min": 5,
|
||||
"speculative.p_min": 0.8999999761581421,
|
||||
"timings_per_token": false
|
||||
"speculative.n_min": 0,
|
||||
"speculative.p_min": 0.75,
|
||||
"timings_per_token": false,
|
||||
"post_sampling_probs": false,
|
||||
"lora": []
|
||||
},
|
||||
"prompt": "",
|
||||
"next_token": {
|
||||
"has_next_token": true,
|
||||
"has_new_line": false,
|
||||
"n_remain": -1,
|
||||
"n_decoded": 0,
|
||||
"stopping_word": ""
|
||||
"n_decoded": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"id_task": 0,
|
||||
"n_ctx": 65536,
|
||||
"speculative": false,
|
||||
"is_processing": true,
|
||||
"params": {
|
||||
"n_predict": -1,
|
||||
"seed": 4294967295,
|
||||
"temperature": 0.800000011920929,
|
||||
"dynatemp_range": 0.0,
|
||||
"dynatemp_exponent": 1.0,
|
||||
"top_k": 40,
|
||||
"top_p": 0.949999988079071,
|
||||
"min_p": 0.05000000074505806,
|
||||
"top_n_sigma": -1.0,
|
||||
"xtc_probability": 0.0,
|
||||
"xtc_threshold": 0.10000000149011612,
|
||||
"typical_p": 1.0,
|
||||
"repeat_last_n": 64,
|
||||
"repeat_penalty": 1.0,
|
||||
"presence_penalty": 0.0,
|
||||
"frequency_penalty": 0.0,
|
||||
"dry_multiplier": 0.0,
|
||||
"dry_base": 1.75,
|
||||
"dry_allowed_length": 2,
|
||||
"dry_penalty_last_n": 131072,
|
||||
"mirostat": 0,
|
||||
"mirostat_tau": 5.0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"max_tokens": -1,
|
||||
"n_keep": 0,
|
||||
"n_discard": 0,
|
||||
"ignore_eos": false,
|
||||
"stream": true,
|
||||
"n_probs": 0,
|
||||
"min_keep": 0,
|
||||
"chat_format": "GPT-OSS",
|
||||
"reasoning_format": "none",
|
||||
"reasoning_in_content": false,
|
||||
"thinking_forced_open": false,
|
||||
"samplers": [
|
||||
"penalties",
|
||||
"dry",
|
||||
"top_k",
|
||||
"typ_p",
|
||||
"top_p",
|
||||
"min_p",
|
||||
"xtc",
|
||||
"temperature"
|
||||
],
|
||||
"speculative.n_max": 16,
|
||||
"speculative.n_min": 0,
|
||||
"speculative.p_min": 0.75,
|
||||
"timings_per_token": false,
|
||||
"post_sampling_probs": false,
|
||||
"lora": []
|
||||
},
|
||||
"next_token": {
|
||||
"has_next_token": true,
|
||||
"has_new_line": true,
|
||||
"n_remain": -1,
|
||||
"n_decoded": 136
|
||||
}
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### GET `/metrics`: Prometheus compatible metrics exporter
|
||||
|
||||
This endpoint is only accessible if `--metrics` is set.
|
||||
|
||||
@@ -141,7 +141,7 @@ struct slot_params {
|
||||
// Embeddings
|
||||
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
|
||||
|
||||
json to_json() const {
|
||||
json to_json(bool only_metrics = false) const {
|
||||
std::vector<std::string> samplers;
|
||||
samplers.reserve(sampling.samplers.size());
|
||||
for (const auto & sampler : sampling.samplers) {
|
||||
@@ -153,9 +153,55 @@ struct slot_params {
|
||||
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
|
||||
}
|
||||
|
||||
if (only_metrics) {
|
||||
return json {
|
||||
{"n_predict", n_predict}, // Server configured n_predict
|
||||
{"seed", sampling.seed},
|
||||
{"temperature", sampling.temp},
|
||||
{"dynatemp_range", sampling.dynatemp_range},
|
||||
{"dynatemp_exponent", sampling.dynatemp_exponent},
|
||||
{"top_k", sampling.top_k},
|
||||
{"top_p", sampling.top_p},
|
||||
{"min_p", sampling.min_p},
|
||||
{"top_n_sigma", sampling.top_n_sigma},
|
||||
{"xtc_probability", sampling.xtc_probability},
|
||||
{"xtc_threshold", sampling.xtc_threshold},
|
||||
{"typical_p", sampling.typ_p},
|
||||
{"repeat_last_n", sampling.penalty_last_n},
|
||||
{"repeat_penalty", sampling.penalty_repeat},
|
||||
{"presence_penalty", sampling.penalty_present},
|
||||
{"frequency_penalty", sampling.penalty_freq},
|
||||
{"dry_multiplier", sampling.dry_multiplier},
|
||||
{"dry_base", sampling.dry_base},
|
||||
{"dry_allowed_length", sampling.dry_allowed_length},
|
||||
{"dry_penalty_last_n", sampling.dry_penalty_last_n},
|
||||
{"mirostat", sampling.mirostat},
|
||||
{"mirostat_tau", sampling.mirostat_tau},
|
||||
{"mirostat_eta", sampling.mirostat_eta},
|
||||
{"max_tokens", n_predict}, // User configured n_predict
|
||||
{"n_keep", n_keep},
|
||||
{"n_discard", n_discard},
|
||||
{"ignore_eos", sampling.ignore_eos},
|
||||
{"stream", stream},
|
||||
{"n_probs", sampling.n_probs},
|
||||
{"min_keep", sampling.min_keep},
|
||||
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
|
||||
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
|
||||
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
|
||||
{"samplers", samplers},
|
||||
{"speculative.n_max", speculative.n_max},
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
{"speculative.p_min", speculative.p_min},
|
||||
{"timings_per_token", timings_per_token},
|
||||
{"post_sampling_probs", post_sampling_probs},
|
||||
{"lora", lora},
|
||||
};
|
||||
}
|
||||
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
server_grammar_trigger ct(std::move(trigger));
|
||||
server_grammar_trigger ct(trigger);
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
|
||||
@@ -1572,7 +1618,26 @@ struct server_slot {
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
json to_json(bool only_metrics = false) const {
|
||||
if (only_metrics) {
|
||||
return json {
|
||||
{"id", id},
|
||||
{"id_task", id_task},
|
||||
{"n_ctx", n_ctx},
|
||||
{"speculative", can_speculate()},
|
||||
{"is_processing", is_processing()},
|
||||
{"params", params.to_json(true)},
|
||||
{"next_token",
|
||||
{
|
||||
{"has_next_token", has_next_token},
|
||||
{"has_new_line", has_new_line},
|
||||
{"n_remain", n_remaining},
|
||||
{"n_decoded", n_decoded},
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
return json {
|
||||
{"id", id},
|
||||
{"id_task", id_task},
|
||||
@@ -2485,11 +2550,12 @@ struct server_context {
|
||||
return slot.has_next_token; // continue
|
||||
}
|
||||
|
||||
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
|
||||
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
|
||||
size_t n_probs = slot.params.sampling.n_probs;
|
||||
size_t n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
if (post_sampling) {
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl, true);
|
||||
const size_t max_probs = cur_p->size;
|
||||
|
||||
// set probability for sampled token
|
||||
@@ -2874,7 +2940,7 @@ struct server_context {
|
||||
int n_processing_slots = 0;
|
||||
|
||||
for (server_slot & slot : slots) {
|
||||
json slot_data = slot.to_json();
|
||||
json slot_data = slot.to_json(true);
|
||||
|
||||
if (slot.is_processing()) {
|
||||
n_processing_slots++;
|
||||
@@ -4271,16 +4337,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
};
|
||||
|
||||
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
||||
const auto handle_props = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
||||
// this endpoint is publicly available, please only return what is safe to be exposed
|
||||
json data = {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model.path },
|
||||
{ "modalities", json{
|
||||
{ "modalities", json {
|
||||
{"vision", ctx_server.oai_parser_opt.allow_image},
|
||||
{"audio", ctx_server.oai_parser_opt.allow_audio},
|
||||
} },
|
||||
{ "endpoint_slots", params.endpoint_slots },
|
||||
{ "endpoint_props", params.endpoint_props },
|
||||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
{ "webui", params.webui },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
|
||||
@@ -148,6 +148,8 @@ class ServerProcess:
|
||||
server_args.append("--metrics")
|
||||
if self.server_slots:
|
||||
server_args.append("--slots")
|
||||
else:
|
||||
server_args.append("--no-slots")
|
||||
if self.pooling:
|
||||
server_args.extend(["--pooling", self.pooling])
|
||||
if self.model_alias:
|
||||
|
||||
@@ -895,7 +895,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
|
||||
codes.push_back(new_token_id);
|
||||
|
||||
const auto * cands = common_sampler_get_candidates(smpl[i]);
|
||||
const auto * cands = common_sampler_get_candidates(smpl[i], false);
|
||||
|
||||
// is it an end of generation? -> mark the stream as finished
|
||||
if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {
|
||||
|
||||
Reference in New Issue
Block a user