mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-07 01:24:24 +00:00
Compare commits
17 Commits
gg/hparams
...
b4900
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
891c63956d | ||
|
|
2f21123c1d | ||
|
|
374101fd74 | ||
|
|
b3c9a65673 | ||
|
|
8ba95dca20 | ||
|
|
dc079cfdff | ||
|
|
7b61bcc87c | ||
|
|
f4c3dd5daa | ||
|
|
3d35d87b41 | ||
|
|
b19bd064c0 | ||
|
|
92a391327e | ||
|
|
9f2250ba72 | ||
|
|
774973b8f3 | ||
|
|
8fcb563613 | ||
|
|
add2a3aa5a | ||
|
|
c522ce4143 | ||
|
|
081bee8c64 |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -1379,7 +1379,7 @@ jobs:
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -29,6 +29,8 @@ else()
|
||||
set(LLAMA_STANDALONE OFF)
|
||||
endif()
|
||||
|
||||
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
@@ -145,7 +147,13 @@ endif()
|
||||
# 3rd-party
|
||||
#
|
||||
|
||||
if (NOT TARGET ggml)
|
||||
if (LLAMA_USE_SYSTEM_GGML)
|
||||
message(STATUS "Using system-provided libggml, skipping ggml build")
|
||||
find_package(ggml REQUIRED)
|
||||
add_library(ggml ALIAS ggml::ggml)
|
||||
endif()
|
||||
|
||||
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
include("ggml/cmake/common.cmake")
|
||||
|
||||
function(llama_add_compile_flags)
|
||||
if (LLAMA_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
|
||||
@@ -853,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_excludes({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-sysf", "--system-prompt-file"}, "FNAME",
|
||||
"a file containing the system prompt (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
|
||||
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
|
||||
params.system_prompt.pop_back();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--in-file"}, "FNAME",
|
||||
"an input file (repeat to specify multiple files)",
|
||||
@@ -1875,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
|
||||
@@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
llama_set_warmup(lctx, true);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_vocab_bos(vocab);
|
||||
llama_token eos = llama_vocab_eos(vocab);
|
||||
@@ -1063,6 +1065,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
llama_kv_self_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
llama_set_warmup(lctx, false);
|
||||
}
|
||||
|
||||
iparams.model.reset(model);
|
||||
|
||||
@@ -79,6 +79,7 @@ class Opt {
|
||||
ctx_params = llama_context_default_params();
|
||||
model_params = llama_model_default_params();
|
||||
context_size_default = ctx_params.n_batch;
|
||||
n_threads_default = ctx_params.n_threads;
|
||||
ngl_default = model_params.n_gpu_layers;
|
||||
common_params_sampling sampling;
|
||||
temperature_default = sampling.temp;
|
||||
@@ -104,6 +105,7 @@ class Opt {
|
||||
|
||||
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
||||
ctx_params.n_ctx = ctx_params.n_batch;
|
||||
ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
|
||||
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
||||
temperature = temperature >= 0 ? temperature : temperature_default;
|
||||
|
||||
@@ -116,12 +118,12 @@ class Opt {
|
||||
std::string chat_template_file;
|
||||
std::string user;
|
||||
bool use_jinja = false;
|
||||
int context_size = -1, ngl = -1;
|
||||
int context_size = -1, ngl = -1, n_threads = -1;
|
||||
float temperature = -1;
|
||||
bool verbose = false;
|
||||
|
||||
private:
|
||||
int context_size_default = -1, ngl_default = -1;
|
||||
int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
|
||||
float temperature_default = -1;
|
||||
bool help = false;
|
||||
|
||||
@@ -159,53 +161,94 @@ class Opt {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
|
||||
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
|
||||
return 1;
|
||||
}
|
||||
use_jinja = true;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_options(const char ** argv, int & i, bool & options_parsing) {
|
||||
if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||
verbose = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||
help = true;
|
||||
return 0;
|
||||
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
||||
options_parsing = false;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
|
||||
if (positional_args_i == 0) {
|
||||
if (!argv[i][0] || argv[i][0] == '-') {
|
||||
return 1;
|
||||
}
|
||||
|
||||
++positional_args_i;
|
||||
model_ = argv[i];
|
||||
} else if (positional_args_i == 1) {
|
||||
++positional_args_i;
|
||||
user = argv[i];
|
||||
} else {
|
||||
user += " " + std::string(argv[i]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int parse(int argc, const char ** argv) {
|
||||
bool options_parsing = true;
|
||||
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
|
||||
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
|
||||
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||
verbose = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
|
||||
if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
|
||||
return 1;
|
||||
}
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||
help = true;
|
||||
return 0;
|
||||
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
||||
options_parsing = false;
|
||||
} else if (positional_args_i == 0) {
|
||||
if (!argv[i][0] || argv[i][0] == '-') {
|
||||
return 1;
|
||||
}
|
||||
int ret = parse_options_with_value(argc, argv, i, options_parsing);
|
||||
if (ret == 0) {
|
||||
continue;
|
||||
} else if (ret == 1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
++positional_args_i;
|
||||
model_ = argv[i];
|
||||
} else if (positional_args_i == 1) {
|
||||
++positional_args_i;
|
||||
user = argv[i];
|
||||
} else {
|
||||
user += " " + std::string(argv[i]);
|
||||
ret = parse_options(argv, i, options_parsing);
|
||||
if (ret == 0) {
|
||||
continue;
|
||||
} else if (ret == 1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (parse_positional_args(argv, i, positional_args_i)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (model_.empty()){
|
||||
if (model_.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -232,6 +275,8 @@ class Opt {
|
||||
" Number of GPU layers (default: %d)\n"
|
||||
" --temp <value>\n"
|
||||
" Temperature (default: %.1f)\n"
|
||||
" -t, --threads <value>\n"
|
||||
" Number of threads to use during generation (default: %d)\n"
|
||||
" -v, --verbose, --log-verbose\n"
|
||||
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
|
||||
" -h, --help\n"
|
||||
@@ -260,7 +305,7 @@ class Opt {
|
||||
" llama-run file://some-file3.gguf\n"
|
||||
" llama-run --ngl 999 some-file4.gguf\n"
|
||||
" llama-run --ngl 999 some-file5.gguf Hello World\n",
|
||||
context_size_default, ngl_default, temperature_default);
|
||||
context_size_default, ngl_default, temperature_default, n_threads_default);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
if (!chat_params.grammar.empty()) {
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
}
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
|
||||
@@ -87,11 +87,11 @@ struct wav_header {
|
||||
uint32_t data_size;
|
||||
};
|
||||
|
||||
static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
|
||||
static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
|
||||
std::ofstream file(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
|
||||
return;
|
||||
LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
wav_header header;
|
||||
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
|
||||
file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
|
||||
}
|
||||
|
||||
file.close();
|
||||
return file.good();
|
||||
}
|
||||
|
||||
static void fill_hann_window(int length, bool periodic, float * output) {
|
||||
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.out_file = "output.wav";
|
||||
params.prompt = "";
|
||||
|
||||
params.n_predict = 4096;
|
||||
@@ -1060,8 +1061,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
}
|
||||
#endif
|
||||
|
||||
const std::string fname = "output.wav";
|
||||
|
||||
const int n_sr = 24000; // sampling rate
|
||||
|
||||
// zero out first 0.25 seconds
|
||||
@@ -1072,11 +1071,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
|
||||
LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
|
||||
|
||||
save_wav16(fname, audio, n_sr);
|
||||
int retval = 0;
|
||||
|
||||
LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
|
||||
if (save_wav16(params.out_file, audio, n_sr)) {
|
||||
LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
|
||||
} else {
|
||||
retval = ENOENT;
|
||||
}
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
return retval;
|
||||
}
|
||||
|
||||
26
ggml/cmake/common.cmake
Normal file
26
ggml/cmake/common.cmake
Normal file
@@ -0,0 +1,26 @@
|
||||
function(ggml_get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
|
||||
if (CCID MATCHES "Clang")
|
||||
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
|
||||
|
||||
if (
|
||||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
list(APPEND C_FLAGS -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CCID STREQUAL "GNU")
|
||||
set(C_FLAGS -Wdouble-promotion)
|
||||
set(CXX_FLAGS -Wno-array-bounds)
|
||||
|
||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
|
||||
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
@@ -1,4 +1,5 @@
|
||||
include(CheckCXXCompilerFlag)
|
||||
include("../cmake/common.cmake")
|
||||
|
||||
add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
|
||||
|
||||
@@ -24,33 +25,6 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
function(ggml_get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
|
||||
if (CCID MATCHES "Clang")
|
||||
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
|
||||
|
||||
if (
|
||||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
list(APPEND C_FLAGS -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CCID STREQUAL "GNU")
|
||||
set(C_FLAGS -Wdouble-promotion)
|
||||
set(CXX_FLAGS -Wno-array-bounds)
|
||||
|
||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
|
||||
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND C_FLAGS -Werror)
|
||||
|
||||
@@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
||||
output_ne_offset);
|
||||
int64_t antiquantGroupSize = 0;
|
||||
if (src0->ne[0] > QK8_0) {
|
||||
antiquantGroupSize = QK8_0;
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
@@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
|
||||
if (op->src[0]->ne[0] <= QK8_0) {
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
||||
@@ -474,6 +474,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
@@ -495,9 +496,9 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i_src0;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
@@ -515,6 +516,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
@@ -534,9 +536,9 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i_src0;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
@@ -566,9 +568,11 @@ struct bin_bcast_sycl {
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne0[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne0[] = {ne00, ne01, ne02, ne03};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb0[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb0[] = {nb00, nb01, nb02, nb03};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
@@ -583,32 +587,41 @@ struct bin_bcast_sycl {
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
int64_t ne0 = cne0[0];
|
||||
int64_t ne1 = cne0[1];
|
||||
int64_t ne2 = cne0[2];
|
||||
int64_t ne3 = cne0[3];
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
size_t nb0 = cnb0[0];
|
||||
size_t nb1 = cnb0[1];
|
||||
size_t nb2 = cnb0[2];
|
||||
size_t nb3 = cnb0[3];
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
@@ -625,6 +638,28 @@ struct bin_bcast_sycl {
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_UNUSED(s00);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
@@ -661,8 +696,8 @@ struct bin_bcast_sycl {
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
|
||||
s13, item_ct1);
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
@@ -680,7 +715,7 @@ struct bin_bcast_sycl {
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s11, s12, s13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -333,10 +333,11 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
tensor->extra = extra;
|
||||
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
tensor->extra = extra;
|
||||
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
@@ -486,6 +487,22 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
||||
if (buffer == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||
|
||||
if (ctx != nullptr) {
|
||||
for (ggml_tensor_extra_gpu * extra : ctx->tensor_extras) {
|
||||
release_extra_gpu(extra);
|
||||
}
|
||||
ctx->tensor_extras.clear(); // reset the tensor_extras vector
|
||||
}
|
||||
}
|
||||
|
||||
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
||||
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
|
||||
@@ -495,7 +512,7 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
||||
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
|
||||
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
|
||||
/* .clear = */ ggml_backend_sycl_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
/* .reset = */ ggml_backend_sycl_buffer_reset,
|
||||
};
|
||||
|
||||
// sycl buffer type
|
||||
@@ -576,7 +593,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
||||
|
||||
auto dev_count = ggml_backend_sycl_get_device_count();
|
||||
|
||||
@@ -3113,8 +3129,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
const int64_t i2 = i12;
|
||||
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
src1_row.data = src1_original + + i11*nb11 + i12*nb12;
|
||||
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
||||
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
||||
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
||||
}
|
||||
@@ -3761,7 +3777,6 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
int ggml_backend_sycl_get_device_count() {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
||||
return ggml_sycl_info().device_count;
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
||||
#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
|
||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||
|
||||
#define VK_VENDOR_ID_AMD 0x1002
|
||||
@@ -368,6 +369,7 @@ struct vk_mat_mat_push_constants {
|
||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||
uint32_t k_split;
|
||||
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
||||
uint32_t padded_N;
|
||||
};
|
||||
struct vk_mat_vec_push_constants {
|
||||
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
@@ -380,6 +382,7 @@ struct vk_mat_mat_id_push_constants {
|
||||
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||
uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
|
||||
uint32_t padded_N;
|
||||
};
|
||||
struct vk_mat_vec_id_push_constants {
|
||||
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
@@ -1476,26 +1479,26 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
// spec constants and tile sizes for quant matmul (non-Qi_K)
|
||||
l_warptile_mmq = { 256, 128, 256, 64 };
|
||||
m_warptile_mmq = { 256, 128, 128, 64 };
|
||||
s_warptile_mmq = { 256, 128, 128, 64 };
|
||||
s_warptile_mmq = { 256, 32, 64, 128 };
|
||||
l_mmq_wg_denoms = { 128, 256, 1 };
|
||||
m_mmq_wg_denoms = { 128, 128, 1 };
|
||||
s_mmq_wg_denoms = { 128, 128, 1 };
|
||||
s_mmq_wg_denoms = { 32, 64, 1 };
|
||||
|
||||
// spec constants and tile sizes for quant matmul (Qi_K)
|
||||
l_warptile_mmq_k = { 256, 128, 512, 16 };
|
||||
m_warptile_mmq_k = { 256, 128, 256, 16 };
|
||||
s_warptile_mmq_k = { 256, 32, 128, 64 };
|
||||
l_mmq_wg_denoms_k = { 128, 512, 1 };
|
||||
m_mmq_wg_denoms_k = { 128, 256, 1 };
|
||||
s_mmq_wg_denoms_k = { 32, 128, 1 };
|
||||
l_warptile_mmq_k = { 256, 64, 128, 64 };
|
||||
m_warptile_mmq_k = { 256, 32, 64, 64 };
|
||||
s_warptile_mmq_k = { 256, 32, 32, 128 };
|
||||
l_mmq_wg_denoms_k = { 64, 128, 1 };
|
||||
m_mmq_wg_denoms_k = { 32, 64, 1 };
|
||||
s_mmq_wg_denoms_k = { 32, 32, 1 };
|
||||
|
||||
// spec constants and tile sizes for quant matmul_id
|
||||
l_warptile_mmqid = { 256, 128, 128, 16 };
|
||||
l_warptile_mmqid = { 256, 128, 64, 16 };
|
||||
m_warptile_mmqid = { 256, 128, 64, 16 };
|
||||
s_warptile_mmqid = { 256, 64, 64, 16 };
|
||||
l_mmqid_wg_denoms = { 128, 128, 1 };
|
||||
s_warptile_mmqid = { 256, 128, 64, 16 };
|
||||
l_mmqid_wg_denoms = { 128, 64, 1 };
|
||||
m_mmqid_wg_denoms = { 128, 64, 1 };
|
||||
s_mmqid_wg_denoms = { 64, 64, 1 };
|
||||
s_mmqid_wg_denoms = { 128, 64, 1 };
|
||||
|
||||
l_align = 128;
|
||||
m_align = 64;
|
||||
@@ -3850,10 +3853,14 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
|
||||
|
||||
if (ctx->device->coopmat2) {
|
||||
if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
|
||||
// Use large shader when the N dimension is greater than the medium shader's tile size
|
||||
uint32_t crossover_large = mmp->m->wg_denoms[1];
|
||||
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
|
||||
return aligned ? mmp->a_l : mmp->l;
|
||||
}
|
||||
if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) {
|
||||
// Use medium shader when the N dimension is greater than the small shader's tile size
|
||||
uint32_t crossover_medium = mmp->s->wg_denoms[1];
|
||||
if ((ctx->device->mul_mat_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_s[src0_type]) {
|
||||
return aligned ? mmp->a_m : mmp->m;
|
||||
}
|
||||
return aligned ? mmp->a_s : mmp->s;
|
||||
@@ -3878,18 +3885,19 @@ static void ggml_vk_matmul(
|
||||
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
|
||||
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
|
||||
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
|
||||
uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
|
||||
uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
|
||||
uint32_t padded_n) {
|
||||
VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
if (split_k == 1) {
|
||||
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
|
||||
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(batch_stride_d == m * n);
|
||||
|
||||
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
|
||||
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
// Make sure enough workgroups get assigned for split k to work
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -3898,13 +3906,17 @@ static void ggml_vk_matmul(
|
||||
}
|
||||
|
||||
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
|
||||
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
|
||||
VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
|
||||
|
||||
if (ctx->device->coopmat2) {
|
||||
if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
|
||||
// Use large shader when the N dimension is greater than the medium shader's tile size
|
||||
uint32_t crossover_large = mmp->m->wg_denoms[1];
|
||||
if ((ctx->device->mul_mat_id_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
|
||||
return aligned ? mmp->a_l : mmp->l;
|
||||
}
|
||||
if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) {
|
||||
// Use medium shader when the N dimension is greater than the small shader's tile size
|
||||
uint32_t crossover_medium = mmp->s->wg_denoms[1];
|
||||
if ((ctx->device->mul_mat_id_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_id_s[src0_type]) {
|
||||
return aligned ? mmp->a_m : mmp->m;
|
||||
}
|
||||
return aligned ? mmp->a_s : mmp->s;
|
||||
@@ -3929,14 +3941,15 @@ static void ggml_vk_matmul_id(
|
||||
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
|
||||
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
|
||||
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
|
||||
uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
|
||||
uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11,
|
||||
uint32_t padded_n) {
|
||||
VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
|
||||
"m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
|
||||
"batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
|
||||
"n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
|
||||
nei0, nei1, nbi1, ne11 };
|
||||
nei0, nei1, nbi1, ne11, padded_n };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
|
||||
}
|
||||
|
||||
@@ -4098,15 +4111,17 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
// Not implemented
|
||||
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
||||
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
|
||||
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
|
||||
|
||||
// Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
|
||||
uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = padded_n * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
|
||||
|
||||
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
|
||||
@@ -4229,7 +4244,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
{ d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
|
||||
ne01, ne11, ne10,
|
||||
ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
|
||||
split_k, ne12*ne13, ne02, ne12, r2, r3
|
||||
split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
|
||||
); // NOLINT
|
||||
}
|
||||
|
||||
@@ -4680,15 +4695,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
// Not implemented
|
||||
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
||||
|
||||
const uint64_t x_ne = ne01 * ne00;
|
||||
const uint64_t y_ne = ne11 * ne10;
|
||||
const uint64_t d_ne = ne21 * ne20;
|
||||
|
||||
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
|
||||
const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
|
||||
|
||||
// Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
|
||||
uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
|
||||
const uint64_t x_ne = ne01 * ne00;
|
||||
const uint64_t y_ne = padded_n * ne10;
|
||||
const uint64_t d_ne = ne21 * ne20;
|
||||
|
||||
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
|
||||
@@ -4807,7 +4824,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
{ d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
|
||||
ne01, ne21, ne10, ne10, ne10, ne01,
|
||||
stride_batch_x, stride_batch_y, ne20*ne21,
|
||||
n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
|
||||
n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n
|
||||
); // NOLINT
|
||||
}
|
||||
|
||||
@@ -6767,7 +6784,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
||||
m, n, k,
|
||||
k, k, m, k*m, k*n, m*n,
|
||||
split_k, batch, batch, batch, 1, 1
|
||||
split_k, batch, batch, batch, 1, 1, n
|
||||
);
|
||||
}
|
||||
ggml_vk_ctx_end(subctx);
|
||||
@@ -7112,7 +7129,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
||||
m, n, k,
|
||||
k, k, m, k*m, k*n, m*n,
|
||||
split_k, batch, batch, batch, 1, 1
|
||||
split_k, batch, batch, batch, 1, 1, n
|
||||
);
|
||||
}
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
@@ -48,6 +48,8 @@ layout (push_constant) uniform parameter
|
||||
uint broadcast2;
|
||||
uint broadcast3;
|
||||
#endif
|
||||
// N dimension for the B matrix can be >= p.N
|
||||
uint padded_N;
|
||||
} p;
|
||||
|
||||
|
||||
@@ -202,18 +204,19 @@ void main() {
|
||||
#endif
|
||||
|
||||
// Use end_k rather than p.K as the dimension because that's what
|
||||
// we need to bound check against when using split_k
|
||||
// we need to bound check against when using split_k.
|
||||
// Bounds check B against padded_N, but bounds check D against N.
|
||||
tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
|
||||
tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k);
|
||||
tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.padded_N, end_k);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
|
||||
tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
|
||||
tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k);
|
||||
tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.padded_N, end_k);
|
||||
|
||||
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
|
||||
|
||||
#if !defined(MUL_MAT_ID)
|
||||
// Detect a fast path where all loads are entirely in bounds and no clamping is required
|
||||
if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
|
||||
if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.padded_N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
|
||||
#if QUANT_K == 1
|
||||
(stride_a % 8) == 0 &&
|
||||
#endif
|
||||
@@ -263,7 +266,7 @@ void main() {
|
||||
#ifdef MUL_MAT_ID
|
||||
bool unclampedB = true;
|
||||
#else
|
||||
bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0;
|
||||
bool unclampedB = (ic + 1) * BN <= p.padded_N && block_k + BK <= end_k && (block_k % 8) == 0;
|
||||
#endif
|
||||
if (unclampedA && unclampedB) {
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
|
||||
|
||||
@@ -945,6 +945,10 @@ extern "C" {
|
||||
// If set to true, the model will only attend to the past tokens
|
||||
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
||||
|
||||
// Set whether the model is in warmup mode or not
|
||||
// If true, all model tensors are activated during llama_decode() to load and cache their weights.
|
||||
LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
|
||||
|
||||
// Set abort callback
|
||||
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ llama_context::llama_context(
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
cparams.warmup = false;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||
@@ -284,11 +285,15 @@ llama_context::llama_context(
|
||||
|
||||
// reserve worst-case graph
|
||||
if (!hparams.vocab_only) {
|
||||
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
||||
uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
|
||||
// restore later
|
||||
// TODO: something cleaner
|
||||
const auto n_outputs_save = n_outputs;
|
||||
|
||||
// max number of outputs
|
||||
n_outputs = n_tokens;
|
||||
|
||||
@@ -340,6 +345,8 @@ llama_context::llama_context(
|
||||
}
|
||||
}
|
||||
|
||||
n_outputs = n_outputs_save;
|
||||
|
||||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
@@ -537,16 +544,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
float freq_base_l = cparams.rope_freq_base;
|
||||
float freq_scale_l = cparams.rope_freq_scale;
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
// TODO: improve
|
||||
if (model.arch == LLM_ARCH_GEMMA3) {
|
||||
const bool is_sliding = hparams.is_sliding(il);
|
||||
|
||||
freq_base_l = is_sliding ? 10000.0f : cparams.rope_freq_base;
|
||||
freq_scale_l = is_sliding ? 1.0f : cparams.rope_freq_scale;
|
||||
}
|
||||
// note: the swa rope params could become part of the cparams in the future
|
||||
// if we decide to make them configurable, like the non-sliding ones
|
||||
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
||||
|
||||
ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
|
||||
|
||||
@@ -952,6 +955,12 @@ void llama_context::set_causal_attn(bool value) {
|
||||
cparams.causal_attn = value;
|
||||
}
|
||||
|
||||
void llama_context::set_warmup(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
cparams.warmup = value;
|
||||
}
|
||||
|
||||
void llama_context::set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
float scale) {
|
||||
@@ -1598,7 +1607,7 @@ void llama_context::output_reorder() {
|
||||
//
|
||||
|
||||
int32_t llama_context::graph_max_nodes() const {
|
||||
return std::max<int32_t>(8192, 5*model.n_tensors());
|
||||
return std::max<int32_t>(65536, 5*model.n_tensors());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_init() {
|
||||
@@ -2376,6 +2385,10 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
|
||||
ctx->set_causal_attn(causal_attn);
|
||||
}
|
||||
|
||||
void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
ctx->set_warmup(warmup);
|
||||
}
|
||||
|
||||
void llama_synchronize(llama_context * ctx) {
|
||||
ctx->synchronize();
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@ struct llama_context {
|
||||
|
||||
void set_embeddings (bool value);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
void set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
|
||||
@@ -29,6 +29,7 @@ struct llama_cparams {
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
||||
@@ -577,7 +577,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (hparams.n_expert_used),
|
||||
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
@@ -1311,29 +1311,23 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
return cur;
|
||||
}
|
||||
|
||||
llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified(
|
||||
bool causal,
|
||||
bool swa) const {
|
||||
llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
|
||||
|
||||
const auto n_kv = kv_self->n;
|
||||
|
||||
inp->self_kq_mask = causal
|
||||
? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
|
||||
: ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
//cb(inp->self_kq_mask, "KQ_mask", -1);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
||||
if (swa) {
|
||||
if (hparams.n_swa_pattern > 1) {
|
||||
GGML_ASSERT(hparams.n_swa > 0);
|
||||
|
||||
inp->self_kq_mask_swa = causal
|
||||
? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
|
||||
: ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
//cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
@@ -1403,9 +1397,9 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
|
||||
}
|
||||
|
||||
const bool is_sliding = hparams.is_sliding(il);
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
const auto & kq_mask = is_sliding ? inp->get_kq_mask_swa() : inp->get_kq_mask();
|
||||
const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
|
||||
|
||||
const auto n_kv = kv_self->n;
|
||||
|
||||
|
||||
@@ -509,9 +509,7 @@ struct llm_graph_context {
|
||||
float kq_scale,
|
||||
int il) const;
|
||||
|
||||
llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified(
|
||||
bool causal,
|
||||
bool swa) const;
|
||||
llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
llm_graph_input_attn_kv_unified * inp,
|
||||
|
||||
@@ -70,7 +70,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_sliding(uint32_t il) const {
|
||||
bool llama_hparams::is_swa(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
|
||||
}
|
||||
|
||||
@@ -79,7 +79,9 @@ struct llama_hparams {
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_base_train_swa;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
|
||||
@@ -135,7 +137,7 @@ struct llama_hparams {
|
||||
// dimension of the recurrent state embeddings
|
||||
uint32_t n_embd_v_s() const;
|
||||
|
||||
bool is_sliding(uint32_t il) const;
|
||||
bool is_swa(uint32_t il) const;
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
@@ -475,6 +475,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
||||
|
||||
// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
|
||||
|
||||
// non-transformer models do not have attention heads
|
||||
@@ -780,9 +784,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.n_swa = 2047;
|
||||
} else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
|
||||
// default value for Phi-3-mini-128k-instruct
|
||||
// note: this seems incorrect because the window is bigger than the train context?
|
||||
hparams.n_swa = 262144;
|
||||
} else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
|
||||
// default value for Phi-3-medium-128k-instruct
|
||||
// note: this seems incorrect because the window is equal to the train context?
|
||||
hparams.n_swa = 131072;
|
||||
}
|
||||
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
@@ -877,6 +883,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
{
|
||||
hparams.n_swa_pattern = 6;
|
||||
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
@@ -996,6 +1005,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case 16: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 40: type = LLM_TYPE_13B; break;
|
||||
case 64: type = LLM_TYPE_32B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -1346,13 +1356,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||
const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
|
||||
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
||||
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
|
||||
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
|
||||
return {cpu_dev, &pimpl->cpu_buft_list};
|
||||
}
|
||||
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||
auto * dev = devices.at(layer_gpu);
|
||||
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
|
||||
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
|
||||
return {dev, &pimpl->gpu_buft_list.at(dev)};
|
||||
};
|
||||
|
||||
@@ -2716,6 +2727,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_OLMO2:
|
||||
{
|
||||
const int64_t n_embd_head = n_embd / n_head;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
@@ -2730,7 +2743,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -3702,6 +3715,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
||||
LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
||||
@@ -3863,7 +3877,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4026,7 +4040,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4184,7 +4198,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -4302,7 +4316,7 @@ struct llm_build_xverse : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -4410,7 +4424,7 @@ struct llm_build_falcon : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * attn_norm;
|
||||
@@ -4535,7 +4549,7 @@ struct llm_build_grok : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -4689,7 +4703,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -4813,7 +4827,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
cb(pos, "pos_embd", -1);
|
||||
@@ -4916,7 +4930,7 @@ struct llm_build_refact : public llm_graph_context {
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -5179,7 +5193,7 @@ struct llm_build_bloom : public llm_graph_context {
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
inpL = build_norm(inpL,
|
||||
model.tok_norm,
|
||||
@@ -5284,7 +5298,7 @@ struct llm_build_mpt : public llm_graph_context {
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
if (model.pos_embd) {
|
||||
// inp_pos - contains the positions
|
||||
@@ -5428,7 +5442,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// norm
|
||||
@@ -5579,7 +5593,7 @@ struct llm_build_qwen : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -5695,7 +5709,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -5810,7 +5824,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
@@ -5930,7 +5944,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -6079,7 +6093,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
attn_norm_output = build_norm(inpL,
|
||||
@@ -6203,7 +6217,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, true);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto * residual = inpL;
|
||||
@@ -6349,7 +6363,7 @@ struct llm_build_plamo : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
@@ -6457,7 +6471,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
cb(pos, "pos_embd", -1);
|
||||
@@ -6565,7 +6579,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = build_norm(inpL,
|
||||
@@ -6678,7 +6692,7 @@ struct llm_build_orion : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -6799,7 +6813,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -6929,7 +6943,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -7133,7 +7147,7 @@ struct llm_build_gemma : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// norm
|
||||
@@ -7243,7 +7257,7 @@ struct llm_build_gemma2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, true);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// norm
|
||||
@@ -7378,13 +7392,13 @@ struct llm_build_gemma3 : public llm_graph_context {
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: is causal == true correct? might need some changes
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, true);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const bool is_sliding = hparams.is_sliding(il);
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
const float freq_base_l = is_sliding ? 10000.0f : freq_base;
|
||||
const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
|
||||
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
@@ -7507,7 +7521,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -7820,7 +7834,7 @@ struct llm_build_command_r : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
@@ -7970,10 +7984,10 @@ struct llm_build_cohere2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, true);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const bool is_sliding = hparams.is_sliding(il);
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
|
||||
@@ -8007,7 +8021,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
if (is_sliding) {
|
||||
if (is_swa) {
|
||||
Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
|
||||
beta_fast, beta_slow);
|
||||
@@ -8102,7 +8116,7 @@ struct llm_build_olmo : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -8224,7 +8238,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -8350,7 +8364,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -8473,7 +8487,7 @@ struct llm_build_openelm : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head = hparams.n_head(il);
|
||||
@@ -8603,7 +8617,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = build_norm(inpL,
|
||||
@@ -8749,7 +8763,7 @@ struct llm_build_arctic : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -8881,7 +8895,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
@@ -9046,7 +9060,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -9266,7 +9280,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -9524,7 +9538,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
||||
|
||||
const int64_t n_outputs_enc = embd_enc->ne[1];
|
||||
|
||||
auto * inp_attn_self = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn_self = build_attn_inp_kv_unified();
|
||||
auto * inp_attn_cross = build_attn_inp_cross();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -9690,7 +9704,7 @@ struct llm_build_jais : public llm_graph_context {
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = build_norm(inpL,
|
||||
@@ -9786,7 +9800,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -9918,7 +9932,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -10041,7 +10055,7 @@ struct llm_build_exaone : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
@@ -10557,7 +10571,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified(true, false);
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
Reference in New Issue
Block a user