mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-03 07:34:07 +00:00
Compare commits
86 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1366056f6 | ||
|
|
2a85f720b8 | ||
|
|
7cbec34a63 | ||
|
|
0c8986403b | ||
|
|
daa242dfc8 | ||
|
|
e70e640db3 | ||
|
|
5fa66c6e67 | ||
|
|
382808c14b | ||
|
|
4ffc47cb20 | ||
|
|
9c675c7140 | ||
|
|
07a0c4ba92 | ||
|
|
60f17f56da | ||
|
|
f8d561eb87 | ||
|
|
e59efe6a78 | ||
|
|
cffa5c46ea | ||
|
|
94de74e7b1 | ||
|
|
4fd59e8427 | ||
|
|
08566977a7 | ||
|
|
a4bf35889e | ||
|
|
026d2ad472 | ||
|
|
06705fdcb3 | ||
|
|
a52dc60ba3 | ||
|
|
9045c9afe5 | ||
|
|
c9ced4910b | ||
|
|
7ac8902133 | ||
|
|
9bf20d8ac3 | ||
|
|
cb999704fb | ||
|
|
b96b82fc85 | ||
|
|
10dc500bdb | ||
|
|
4893cc07bb | ||
|
|
af3be131c0 | ||
|
|
b07cda687c | ||
|
|
85c40c9b02 | ||
|
|
83b3b1c271 | ||
|
|
b0fb0f0aee | ||
|
|
e68c19b0fd | ||
|
|
c54bba869d | ||
|
|
f5acfb2ffa | ||
|
|
4cbafad4f0 | ||
|
|
c184284230 | ||
|
|
c8a2417d7b | ||
|
|
54132f1b1f | ||
|
|
2a9ea2020c | ||
|
|
ce7a6dc0fc | ||
|
|
1ce0126b18 | ||
|
|
7f459c98e7 | ||
|
|
cf2ffc02bc | ||
|
|
10355dc7d0 | ||
|
|
5ee4e43f26 | ||
|
|
5b6c9bc0f3 | ||
|
|
849d021104 | ||
|
|
8e3ead6e4d | ||
|
|
12ee1763a6 | ||
|
|
ed75977717 | ||
|
|
847c35f7d5 | ||
|
|
a6a552e4ec | ||
|
|
96e33a814e | ||
|
|
dfc959b886 | ||
|
|
8f48807380 | ||
|
|
bf6bc3c155 | ||
|
|
179fd82a72 | ||
|
|
d34d5ca1e9 | ||
|
|
eb492bf43f | ||
|
|
e3b35ddf1c | ||
|
|
6ce863c803 | ||
|
|
3997c78e33 | ||
|
|
ee74642982 | ||
|
|
a28310488c | ||
|
|
86af848153 | ||
|
|
147a521636 | ||
|
|
e1f15b454f | ||
|
|
0e1ccf15c7 | ||
|
|
5e25ddebff | ||
|
|
fd05c51cec | ||
|
|
b365c3ff01 | ||
|
|
cb64222b0c | ||
|
|
6eb7081860 | ||
|
|
4117ae5557 | ||
|
|
65e96a2464 | ||
|
|
9496bbb808 | ||
|
|
ddcb75dd8a | ||
|
|
52ab19df63 | ||
|
|
5182dd64cd | ||
|
|
10b4f82d44 | ||
|
|
408616adbd | ||
|
|
9e39a1e6a9 |
@@ -8,7 +8,8 @@ body:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the compilation of llama.cpp fails.
|
||||
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
|
||||
Before opening an issue, please confirm that the compilation still fails
|
||||
after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
|
||||
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
|
||||
by clearing `~/.cache/ccache` (on Linux).
|
||||
- type: textarea
|
||||
|
||||
15
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
15
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -98,7 +98,18 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
For very long logs (thousands of lines), preferably upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
<!-- Copy-pasted short logs go into the "console" area here -->
|
||||
|
||||
```console
|
||||
|
||||
```
|
||||
</details>
|
||||
|
||||
<!-- Long logs that you upload as files go here, outside the "console" area -->
|
||||
validations:
|
||||
required: true
|
||||
|
||||
15
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
15
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -85,8 +85,19 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
|
||||
render: shell
|
||||
For very long logs (thousands of lines), please upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
<!-- Copy-pasted short logs go into the "console" area here -->
|
||||
|
||||
```console
|
||||
|
||||
```
|
||||
</details>
|
||||
|
||||
<!-- Long logs that you upload as files go here, outside the "console" area -->
|
||||
validations:
|
||||
required: false
|
||||
|
||||
3
.github/workflows/docker.yml
vendored
3
.github/workflows/docker.yml
vendored
@@ -45,8 +45,7 @@ jobs:
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
10
.github/workflows/release.yml
vendored
10
.github/workflows/release.yml
vendored
@@ -688,13 +688,15 @@ jobs:
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
|
||||
# Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
|
||||
# For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
|
||||
zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
|
||||
|
||||
openEuler-cann:
|
||||
@@ -863,7 +865,7 @@ jobs:
|
||||
**macOS/iOS:**
|
||||
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
|
||||
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
|
||||
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
|
||||
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
|
||||
|
||||
**Linux:**
|
||||
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Copilot Instructions for llama.cpp
|
||||
# Instructions for llama.cpp
|
||||
|
||||
## Repository Overview
|
||||
|
||||
@@ -12,6 +12,24 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model
|
||||
- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
|
||||
- **License**: MIT
|
||||
|
||||
## Disclose AI Usage
|
||||
|
||||
It is crucial to remind contributors that the project mandates disclosure of any AI usage in pull requests. This requirement stems from the potential for AI-generated code to include suboptimal optimizations and hidden bugs, owing to the inherent overconfidence in AI outputs.
|
||||
|
||||
When generating significant portions of code, address this by:
|
||||
- Informing the user that AI-generated content may be rejected by maintainers.
|
||||
- Clearly marking AI-generated code in commit messages and comments.
|
||||
- Example of commit message: `[AI] Fix a race condition in ...`
|
||||
- Example of code comment: `// [AI] spawn a new thread ...`
|
||||
|
||||
These measures apply to:
|
||||
- Changes resulting in large portions of code or complex logic.
|
||||
- Modifications or additions to public APIs in `llama.h`, `ggml.h`, or `mtmd.h`.
|
||||
- Backend-related changes, such as those involving CPU, CUDA, Metal, Vulkan, etc.
|
||||
- Modifications to `tools/server`.
|
||||
|
||||
Note: These measures can be omitted for small fixes or trivial changes.
|
||||
|
||||
## Build Instructions
|
||||
|
||||
### Prerequisites
|
||||
@@ -251,6 +269,7 @@ Primary tools:
|
||||
- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
|
||||
- **Performance focus**: This is a performance-critical inference library
|
||||
- **API stability**: Changes to `include/llama.h` require careful consideration
|
||||
- **Disclose AI Usage**: Refer to the "Disclose AI Usage" earlier in this document
|
||||
|
||||
### Git Workflow
|
||||
- Always create feature branches from `master`
|
||||
@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
|
||||
unicode.h
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
@@ -151,9 +154,7 @@ if (LLAMA_LLGUIDANCE)
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
|
||||
endif ()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
|
||||
#
|
||||
|
||||
@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
|
||||
return *this;
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_preset_only() {
|
||||
is_preset_only = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool common_arg::in_example(enum llama_example ex) {
|
||||
return examples.find(ex) != examples.end();
|
||||
}
|
||||
@@ -2012,7 +2017,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
if (llama_supports_rpc()) {
|
||||
add_opt(common_arg(
|
||||
{"--rpc"}, "SERVERS",
|
||||
"comma separated list of RPC servers",
|
||||
"comma separated list of RPC servers (host:port)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
add_rpc_devices(value);
|
||||
GGML_UNUSED(params);
|
||||
@@ -2082,7 +2087,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
|
||||
add_opt(common_arg(
|
||||
{"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
|
||||
@@ -2132,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
||||
GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
||||
[](common_params & params, int value) {
|
||||
params.n_gpu_layers = value;
|
||||
string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "auto") {
|
||||
params.n_gpu_layers = -1;
|
||||
} else if (value == "all") {
|
||||
params.n_gpu_layers = -2;
|
||||
} else {
|
||||
params.n_gpu_layers = std::stoi(value);
|
||||
}
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
@@ -2882,6 +2894,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.lora_init_without_apply = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--sleep-idle-seconds"}, "SECONDS",
|
||||
string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
|
||||
[](common_params & params, int value) {
|
||||
if (value == 0 || value < -1) {
|
||||
throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
|
||||
}
|
||||
params.sleep_idle_seconds = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--simple-io"},
|
||||
"use basic IO for better compatibility in subprocesses and limited consoles",
|
||||
@@ -3160,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.devices = parse_device_list(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
|
||||
add_opt(common_arg(
|
||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||
"number of layers to store in VRAM for the draft model",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_gpu_layers = value;
|
||||
string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
|
||||
params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "auto") {
|
||||
params.speculative.n_gpu_layers = -1;
|
||||
} else if (value == "all") {
|
||||
params.speculative.n_gpu_layers = -2;
|
||||
} else {
|
||||
params.speculative.n_gpu_layers = std::stoi(value);
|
||||
}
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
@@ -3494,3 +3524,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args) {
|
||||
// arguments below won't be treated as CLI args, only preset options
|
||||
args.push_back(common_arg(
|
||||
{"load-on-startup"}, "NAME",
|
||||
"in server router mode, autoload this model on startup",
|
||||
[](common_params &, const std::string &) { /* unused */ }
|
||||
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
|
||||
|
||||
args.push_back(common_arg(
|
||||
{"stop-timeout"}, "SECONDS",
|
||||
"in server router mode, force-kill model instance after this many seconds of graceful shutdown",
|
||||
[](common_params &, int) { /* unused */ }
|
||||
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
|
||||
|
||||
// args.push_back(common_arg(
|
||||
// {"pin"},
|
||||
// "in server router mode, do not unload this model if models_max is exceeded",
|
||||
// [](common_params &) { /* unused */ }
|
||||
// ).set_preset_only());
|
||||
}
|
||||
|
||||
12
common/arg.h
12
common/arg.h
@@ -8,6 +8,10 @@
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
// pseudo-env variable to identify preset-only arguments
|
||||
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
|
||||
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
@@ -22,6 +26,7 @@ struct common_arg {
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
|
||||
void (*handler_void) (common_params & params) = nullptr;
|
||||
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
||||
@@ -70,6 +75,7 @@ struct common_arg {
|
||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||
common_arg & set_env(const char * env);
|
||||
common_arg & set_sparam();
|
||||
common_arg & set_preset_only();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool is_exclude(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output) const;
|
||||
@@ -114,9 +120,13 @@ struct common_params_context {
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// parse input arguments from CLI into a map
|
||||
// TODO: support repeated args in the future
|
||||
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
|
||||
|
||||
// populate preset-only arguments
|
||||
// these arguments are not treated as command line arguments
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
|
||||
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
||||
}
|
||||
|
||||
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
||||
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
|
||||
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
||||
return false;
|
||||
}
|
||||
@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
|
||||
impl() = default;
|
||||
~impl() = default;
|
||||
|
||||
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
|
||||
|
||||
llama_model_ptr model;
|
||||
llama_context_ptr context;
|
||||
|
||||
@@ -1339,10 +1341,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
mparams.devices = params.devices.data();
|
||||
}
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
|
||||
@@ -329,7 +329,7 @@ struct common_params {
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
@@ -475,7 +475,8 @@ struct common_params {
|
||||
bool enable_chat_template = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
int reasoning_budget = -1;
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
|
||||
@@ -24,7 +24,14 @@ std::vector<std::string> common_preset::to_args(const std::string & bin_path) co
|
||||
}
|
||||
|
||||
for (const auto & [opt, value] : options) {
|
||||
args.push_back(opt.args.back()); // use the last arg as the main arg
|
||||
if (opt.is_preset_only) {
|
||||
continue; // skip preset-only options (they are not CLI args)
|
||||
}
|
||||
|
||||
// use the last arg as the main arg (i.e. --long-form)
|
||||
args.push_back(opt.args.back());
|
||||
|
||||
// handle value(s)
|
||||
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
||||
// flag option, no value
|
||||
if (common_arg_utils::is_falsey(value)) {
|
||||
@@ -224,8 +231,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
|
||||
}
|
||||
|
||||
common_preset_context::common_preset_context(llama_example ex)
|
||||
: ctx_params(common_params_parser_init(default_params, ex)),
|
||||
key_to_opt(get_map_key_opt(ctx_params)) {}
|
||||
: ctx_params(common_params_parser_init(default_params, ex)) {
|
||||
common_params_add_preset_options(ctx_params.options);
|
||||
key_to_opt = get_map_key_opt(ctx_params);
|
||||
}
|
||||
|
||||
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
||||
common_presets out;
|
||||
|
||||
@@ -141,16 +141,24 @@ class ModelBase:
|
||||
self.model_name = model_name
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
_, first_tensor = next(self.get_tensors())
|
||||
if first_tensor.dtype == torch.float16:
|
||||
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
for _, tensor in self.get_tensors():
|
||||
if tensor.dim() < 2:
|
||||
continue
|
||||
|
||||
if tensor.dtype == torch.bfloat16:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
|
||||
break
|
||||
elif tensor.dtype == torch.float16:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
|
||||
break
|
||||
else:
|
||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
|
||||
|
||||
self.dequant_model()
|
||||
|
||||
@@ -1204,6 +1212,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
|
||||
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
|
||||
res = "mellum"
|
||||
if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
|
||||
# ref: https://huggingface.co/answerdotai/ModernBERT-base
|
||||
res = "modern-bert"
|
||||
if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
|
||||
# ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
|
||||
res = "afmoe"
|
||||
@@ -1685,6 +1696,84 @@ class TextModel(ModelBase):
|
||||
if template is not None:
|
||||
self.gguf_writer.add_chat_template(template)
|
||||
|
||||
def _set_vocab_plamo(self):
|
||||
# PLaMo models use a custom tokenizer with a .jsonl file
|
||||
tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
|
||||
tokenizer_config_path = self.dir_model / "tokenizer_config.json"
|
||||
|
||||
if not tokenizer_jsonl_path.is_file():
|
||||
raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")
|
||||
|
||||
# Load tokenizer config
|
||||
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# Load tokens from JSONL file (actually a list format)
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
|
||||
for line_num, line in enumerate(f):
|
||||
if line.strip():
|
||||
token_data = json.loads(line)
|
||||
# Format: [token, score, type, ?, ?, ?, ?]
|
||||
token = token_data[0].encode("utf-8")
|
||||
score = float(token_data[1])
|
||||
token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
|
||||
|
||||
tokens.append(token)
|
||||
scores.append(score)
|
||||
|
||||
if token_type_str == "UNKNOWN":
|
||||
toktypes.append(gguf.TokenType.UNKNOWN)
|
||||
elif token_type_str == "CONTROL":
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
elif token_type_str == "BYTE":
|
||||
toktypes.append(gguf.TokenType.BYTE)
|
||||
else:
|
||||
token_str = token_data[0]
|
||||
if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||
for i in range(1, pad_count + 1):
|
||||
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("plamo2")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_bos_token_id(token_id)
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_eos_token_id(token_id)
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_pad_token_id(token_id)
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_sep_token_id(token_id)
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_unk_token_id(token_id)
|
||||
|
||||
# Add <|plamo:op|> as EOT to ensure appropriate end of generation
|
||||
self.gguf_writer.add_eot_token_id(4)
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
|
||||
class MmprojModel(ModelBase):
|
||||
model_type = ModelType.MMPROJ
|
||||
@@ -4787,87 +4876,7 @@ class Plamo2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PLAMO2
|
||||
|
||||
def set_vocab(self):
|
||||
# PLaMo 2 uses a custom tokenizer with a .jsonl file
|
||||
# We need to handle this specially
|
||||
tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
|
||||
tokenizer_config_path = self.dir_model / "tokenizer_config.json"
|
||||
|
||||
if not tokenizer_jsonl_path.is_file():
|
||||
raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
|
||||
|
||||
# Load tokenizer config
|
||||
with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# Load tokens from JSONL file (actually a list format)
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
|
||||
for line_num, line in enumerate(f):
|
||||
if line.strip():
|
||||
token_data = json.loads(line)
|
||||
# Format: [token, score, type, ?, ?, ?, ?]
|
||||
token = token_data[0].encode("utf-8")
|
||||
score = float(token_data[1])
|
||||
token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
|
||||
|
||||
tokens.append(token)
|
||||
scores.append(score)
|
||||
|
||||
# Map token type strings to GGUF token types
|
||||
if token_type_str == "UNKNOWN":
|
||||
toktypes.append(gguf.TokenType.UNKNOWN)
|
||||
elif token_type_str == "CONTROL":
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
elif token_type_str == "BYTE":
|
||||
toktypes.append(gguf.TokenType.BYTE)
|
||||
else:
|
||||
# Check for PLaMo-2 special tokens
|
||||
token_str = token_data[0]
|
||||
if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||
for i in range(1, pad_count + 1):
|
||||
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
|
||||
# Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
|
||||
self.gguf_writer.add_tokenizer_model("plamo2")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# Add special tokens from config
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_bos_token_id(token_id)
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_eos_token_id(token_id)
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_pad_token_id(token_id)
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_sep_token_id(token_id)
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
|
||||
token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
|
||||
self.gguf_writer.add_unk_token_id(token_id)
|
||||
|
||||
# Add <|plamo:op|> as EOT to ensure appropriate end of generation
|
||||
self.gguf_writer.add_eot_token_id(4)
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
self._set_vocab_plamo()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
@@ -4955,6 +4964,56 @@ class Plamo2Model(TextModel):
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
|
||||
class Plamo3Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PLAMO3
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_plamo()
|
||||
|
||||
tokenizer_config_path = self.dir_model / "tokenizer_config.json"
|
||||
tokenizer_config = {}
|
||||
|
||||
if tokenizer_config_path.is_file():
|
||||
with open(tokenizer_config_path, encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
chat_template = tokenizer_config.get("chat_template")
|
||||
chat_template_jinja = self.dir_model / "chat_template.jinja"
|
||||
|
||||
if chat_template_jinja.is_file():
|
||||
with open(chat_template_jinja, encoding="utf-8") as f:
|
||||
chat_template = f.read()
|
||||
|
||||
if chat_template:
|
||||
self.gguf_writer.add_chat_template(chat_template)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
|
||||
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("rope_local_theta")})["rope_theta"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
if name.endswith(".pre_mixer_norm.weight"):
|
||||
data_torch = data_torch + 1.0
|
||||
elif name.endswith(".post_mixer_norm.weight"):
|
||||
data_torch = data_torch + 1.0 / 5
|
||||
elif name.endswith(".pre_mlp_norm.weight"):
|
||||
data_torch = data_torch + 1.0
|
||||
elif name.endswith(".post_mlp_norm.weight"):
|
||||
data_torch = data_torch + 1.0 / (5**1.5)
|
||||
elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")):
|
||||
data_torch = data_torch + 1.0
|
||||
elif name.endswith(".norm.weight"):
|
||||
data_torch = data_torch + 1.0
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("CodeShellForCausalLM")
|
||||
class CodeShellModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.CODESHELL
|
||||
@@ -7351,6 +7410,90 @@ class MiniMaxM2Model(TextModel):
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("MiMoV2FlashForCausalLM")
|
||||
class MimoV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MIMO2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
|
||||
assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
|
||||
assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
|
||||
assert self.hparams["topk_method"] == "noaux_tc"
|
||||
|
||||
n_head_kv = self.hparams["num_key_value_heads"]
|
||||
n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
|
||||
n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv_arr)
|
||||
|
||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||
self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
|
||||
self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"])
|
||||
self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
|
||||
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
|
||||
|
||||
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.endswith("e_score_correction_bias"):
|
||||
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||
|
||||
if "attention_sink" in name and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
# TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE
|
||||
if "model.mtp." in name:
|
||||
return []
|
||||
|
||||
# process the experts separately
|
||||
if name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["n_routed_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["gate_proj", "up_proj", "down_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename_to_retrieve])
|
||||
del self._experts[bid][ename_to_retrieve]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("PanguEmbeddedForCausalLM")
|
||||
class PanguEmbeddedModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PANGU_EMBED
|
||||
@@ -8684,6 +8827,11 @@ class NemotronHModel(GraniteHybridModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("LlamaBidirectionalModel")
|
||||
class LlamaEmbedNemotronModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
|
||||
|
||||
|
||||
@ModelBase.register("BailingMoeForCausalLM")
|
||||
class BailingMoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.BAILINGMOE
|
||||
@@ -9991,6 +10139,36 @@ class SmallThinkerModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
|
||||
class ModernBertModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.MODERN_BERT
|
||||
|
||||
def set_vocab(self):
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
self.gguf_writer.add_add_sep_token(True)
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
|
||||
if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"])
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# these layers act as MLM head, so we don't need them
|
||||
if name.startswith("decoder."):
|
||||
return []
|
||||
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("ApertusForCausalLM")
|
||||
class ApertusModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.APERTUS
|
||||
@@ -10557,8 +10735,8 @@ def parse_args() -> argparse.Namespace:
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
|
||||
@@ -139,6 +139,7 @@ models = [
|
||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
||||
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
|
||||
{"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
|
||||
{"name": "afmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
|
||||
{"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
|
||||
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
|
||||
|
||||
@@ -17,7 +17,7 @@ OpenCL (Open Computing Language) is an open, royalty-free standard for cross-pla
|
||||
|
||||
### Llama.cpp + OpenCL
|
||||
|
||||
The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adreno GPU** firstly via OpenCL. Thanks to the portabilty of OpenCL, the OpenCL backend can also run on certain Intel GPUs although the performance is not optimal.
|
||||
The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adreno GPU** firstly via OpenCL. Thanks to the portabilty of OpenCL, the OpenCL backend can also run on certain Intel GPUs such as those that do not have [SYCL](/docs/backend/SYCL.md) support although the performance is not optimal.
|
||||
|
||||
## OS
|
||||
|
||||
|
||||
@@ -829,7 +829,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
|
||||
No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
|
||||
|
||||
Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
|
||||
Suggest reproducing on llama.cpp and report similar issue to llama.cpp. We will support it.
|
||||
|
||||
It's same for other projects including llama.cpp SYCL backend.
|
||||
|
||||
|
||||
@@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB.
|
||||
Simple question for Llama-3.2-1B
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
|
||||
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
|
||||
...
|
||||
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
|
||||
ggml-hex: Hexagon Arch version v79
|
||||
@@ -136,7 +136,7 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 504 =
|
||||
Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
|
||||
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
|
||||
...
|
||||
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
|
||||
ggml-hex: Hexagon Arch version v81
|
||||
@@ -234,6 +234,6 @@ build: 6a8cf8914 (6733)
|
||||
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
|
||||
`GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
|
||||
`GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
|
||||
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
|
||||
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
|
||||
`GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)
|
||||
|
||||
@@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers
|
||||
Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
|
||||
|
||||
```
|
||||
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
|
||||
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
|
||||
...
|
||||
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
|
||||
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
|
||||
|
||||
@@ -55,7 +55,7 @@ auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder &
|
||||
```
|
||||
|
||||
For a more complete example, see `test_example_native()` in
|
||||
[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
|
||||
[tests/test-chat-peg-parser.cpp](/tests/test-chat-peg-parser.cpp).
|
||||
|
||||
## Parsers/Combinators
|
||||
|
||||
@@ -175,7 +175,7 @@ Most model output can be placed in one of the following categories:
|
||||
(Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
|
||||
|
||||
To provide broad coverage,
|
||||
[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
|
||||
[`common/chat-peg-parser.h`](/common/chat-peg-parser.h) contains builders and
|
||||
mappers that help create parsers and visitors/extractors for these types. They
|
||||
require parsers to tag nodes to conform to an AST "shape". This normalization
|
||||
makes it easy to extract information and generalize parsing.
|
||||
|
||||
@@ -2,57 +2,74 @@
|
||||
#include "common.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
// Export usage message (-h) to markdown format
|
||||
// Automatically update the markdown docs
|
||||
|
||||
static void write_table_header(std::ofstream & file) {
|
||||
file << "| Argument | Explanation |\n";
|
||||
file << "| -------- | ----------- |\n";
|
||||
#define HELP_START_MARKER "<!-- HELP_START -->"
|
||||
#define HELP_END_MARKER "<!-- HELP_END -->"
|
||||
#define NOTE_MESSAGE "<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->"
|
||||
|
||||
struct md_file {
|
||||
llama_example ex;
|
||||
std::string fname;
|
||||
std::string specific_section_header;
|
||||
};
|
||||
|
||||
std::vector<md_file> md_files = {
|
||||
{LLAMA_EXAMPLE_CLI, "tools/cli/README.md", "CLI-specific params"},
|
||||
{LLAMA_EXAMPLE_COMPLETION, "tools/completion/README.md", "Completion-specific params"},
|
||||
{LLAMA_EXAMPLE_SERVER, "tools/server/README.md", "Server-specific params"},
|
||||
};
|
||||
|
||||
static void write_table_header(std::ostringstream & ss) {
|
||||
ss << "| Argument | Explanation |\n";
|
||||
ss << "| -------- | ----------- |\n";
|
||||
}
|
||||
|
||||
static void write_table_entry(std::ofstream & file, const common_arg & opt) {
|
||||
file << "| `";
|
||||
static void write_table_entry(std::ostringstream & ss, const common_arg & opt) {
|
||||
ss << "| `";
|
||||
// args
|
||||
auto all_args = opt.get_args();
|
||||
for (const auto & arg : all_args) {
|
||||
if (arg == all_args.front()) {
|
||||
file << arg;
|
||||
if (all_args.size() > 1) file << ", ";
|
||||
ss << arg;
|
||||
if (all_args.size() > 1) ss << ", ";
|
||||
} else {
|
||||
file << arg << (arg != all_args.back() ? ", " : "");
|
||||
ss << arg << (arg != all_args.back() ? ", " : "");
|
||||
}
|
||||
}
|
||||
// value hint
|
||||
if (opt.value_hint) {
|
||||
std::string md_value_hint(opt.value_hint);
|
||||
string_replace_all(md_value_hint, "|", "\\|");
|
||||
file << " " << md_value_hint;
|
||||
ss << " " << md_value_hint;
|
||||
}
|
||||
if (opt.value_hint_2) {
|
||||
std::string md_value_hint_2(opt.value_hint_2);
|
||||
string_replace_all(md_value_hint_2, "|", "\\|");
|
||||
file << " " << md_value_hint_2;
|
||||
ss << " " << md_value_hint_2;
|
||||
}
|
||||
// help text
|
||||
std::string md_help(opt.help);
|
||||
md_help = string_strip(md_help);
|
||||
string_replace_all(md_help, "\n", "<br/>");
|
||||
string_replace_all(md_help, "|", "\\|");
|
||||
file << "` | " << md_help << " |\n";
|
||||
ss << "` | " << md_help << " |\n";
|
||||
}
|
||||
|
||||
static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
|
||||
write_table_header(file);
|
||||
static void write_table(std::ostringstream & ss, std::vector<common_arg *> & opts) {
|
||||
write_table_header(ss);
|
||||
for (const auto & opt : opts) {
|
||||
write_table_entry(file, *opt);
|
||||
write_table_entry(ss, *opt);
|
||||
}
|
||||
}
|
||||
|
||||
static void export_md(std::string fname, llama_example ex, std::string name) {
|
||||
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||
|
||||
static void write_help(std::ostringstream & ss, const md_file & md) {
|
||||
common_params params;
|
||||
auto ctx_arg = common_params_parser_init(params, ex);
|
||||
auto ctx_arg = common_params_parser_init(params, md.ex);
|
||||
|
||||
std::vector<common_arg *> common_options;
|
||||
std::vector<common_arg *> sparam_options;
|
||||
@@ -68,18 +85,58 @@ static void export_md(std::string fname, llama_example ex, std::string name) {
|
||||
}
|
||||
}
|
||||
|
||||
file << "**Common params**\n\n";
|
||||
write_table(file, common_options);
|
||||
file << "\n\n**Sampling params**\n\n";
|
||||
write_table(file, sparam_options);
|
||||
file << "\n\n**" << name << "-specific params**\n\n";
|
||||
write_table(file, specific_options);
|
||||
ss << HELP_START_MARKER << "\n\n";
|
||||
|
||||
ss << NOTE_MESSAGE << "\n\n";
|
||||
|
||||
ss << "### Common params\n\n";
|
||||
write_table(ss, common_options);
|
||||
ss << "\n\n### Sampling params\n\n";
|
||||
write_table(ss, sparam_options);
|
||||
ss << "\n\n### " << md.specific_section_header << "\n\n";
|
||||
write_table(ss, specific_options);
|
||||
|
||||
ss << "\n" << HELP_END_MARKER;
|
||||
}
|
||||
|
||||
int main(int, char **) {
|
||||
// TODO: add CLI
|
||||
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
|
||||
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
|
||||
for (const auto & md : md_files) {
|
||||
std::ifstream infile(md.fname);
|
||||
if (!infile.is_open()) {
|
||||
fprintf(stderr, "failed to open file '%s' for reading\n", md.fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::ostringstream ss;
|
||||
ss << infile.rdbuf();
|
||||
infile.close();
|
||||
|
||||
std::string content = ss.str();
|
||||
|
||||
size_t help_start = content.find(HELP_START_MARKER);
|
||||
size_t help_end = content.find(HELP_END_MARKER);
|
||||
|
||||
if (help_start == std::string::npos || help_end == std::string::npos || help_end <= help_start) {
|
||||
fprintf(stderr, "failed to find help markers in file '%s'\n", md.fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::ostringstream new_help_ss;
|
||||
write_help(new_help_ss, md);
|
||||
std::string new_help = new_help_ss.str();
|
||||
|
||||
content = content.substr(0, help_start) + new_help + content.substr(help_end + strlen(HELP_END_MARKER));
|
||||
|
||||
std::ofstream outfile(md.fname);
|
||||
if (!outfile.is_open()) {
|
||||
fprintf(stderr, "failed to open file '%s' for writing\n", md.fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
outfile << content;
|
||||
outfile.close();
|
||||
|
||||
printf("Updated help in '%s'\n", md.fname.c_str());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -41,11 +41,8 @@ android {
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
sourceCompatibility = JavaVersion.VERSION_17
|
||||
targetCompatibility = JavaVersion.VERSION_17
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import android.util.Log
|
||||
import android.widget.EditText
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.activity.addCallback
|
||||
import androidx.activity.enableEdgeToEdge
|
||||
import androidx.activity.result.contract.ActivityResultContracts
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
@@ -18,6 +19,7 @@ import com.arm.aichat.gguf.GgufMetadata
|
||||
import com.arm.aichat.gguf.GgufMetadataReader
|
||||
import com.google.android.material.floatingactionbutton.FloatingActionButton
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.flow.onCompletion
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
@@ -36,6 +38,7 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
// Arm AI Chat inference engine
|
||||
private lateinit var engine: InferenceEngine
|
||||
private var generationJob: Job? = null
|
||||
|
||||
// Conversation states
|
||||
private var isModelReady = false
|
||||
@@ -47,11 +50,13 @@ class MainActivity : AppCompatActivity() {
|
||||
super.onCreate(savedInstanceState)
|
||||
enableEdgeToEdge()
|
||||
setContentView(R.layout.activity_main)
|
||||
// View model boilerplate and state management is out of this basic sample's scope
|
||||
onBackPressedDispatcher.addCallback { Log.w(TAG, "Ignore back press for simplicity") }
|
||||
|
||||
// Find views
|
||||
ggufTv = findViewById(R.id.gguf)
|
||||
messagesRv = findViewById(R.id.messages)
|
||||
messagesRv.layoutManager = LinearLayoutManager(this)
|
||||
messagesRv.layoutManager = LinearLayoutManager(this).apply { stackFromEnd = true }
|
||||
messagesRv.adapter = messageAdapter
|
||||
userInputEt = findViewById(R.id.user_input)
|
||||
userActionFab = findViewById(R.id.fab)
|
||||
@@ -157,33 +162,35 @@ class MainActivity : AppCompatActivity() {
|
||||
* Validate and send the user message into [InferenceEngine]
|
||||
*/
|
||||
private fun handleUserInput() {
|
||||
userInputEt.text.toString().also { userSsg ->
|
||||
if (userSsg.isEmpty()) {
|
||||
userInputEt.text.toString().also { userMsg ->
|
||||
if (userMsg.isEmpty()) {
|
||||
Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show()
|
||||
} else {
|
||||
userInputEt.text = null
|
||||
userInputEt.isEnabled = false
|
||||
userActionFab.isEnabled = false
|
||||
|
||||
// Update message states
|
||||
messages.add(Message(UUID.randomUUID().toString(), userSsg, true))
|
||||
messages.add(Message(UUID.randomUUID().toString(), userMsg, true))
|
||||
lastAssistantMsg.clear()
|
||||
messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false))
|
||||
|
||||
lifecycleScope.launch(Dispatchers.Default) {
|
||||
engine.sendUserPrompt(userSsg)
|
||||
generationJob = lifecycleScope.launch(Dispatchers.Default) {
|
||||
engine.sendUserPrompt(userMsg)
|
||||
.onCompletion {
|
||||
withContext(Dispatchers.Main) {
|
||||
userInputEt.isEnabled = true
|
||||
userActionFab.isEnabled = true
|
||||
}
|
||||
}.collect { token ->
|
||||
val messageCount = messages.size
|
||||
check(messageCount > 0 && !messages[messageCount - 1].isUser)
|
||||
|
||||
messages.removeAt(messageCount - 1).copy(
|
||||
content = lastAssistantMsg.append(token).toString()
|
||||
).let { messages.add(it) }
|
||||
|
||||
withContext(Dispatchers.Main) {
|
||||
val messageCount = messages.size
|
||||
check(messageCount > 0 && !messages[messageCount - 1].isUser)
|
||||
|
||||
messages.removeAt(messageCount - 1).copy(
|
||||
content = lastAssistantMsg.append(token).toString()
|
||||
).let { messages.add(it) }
|
||||
|
||||
messageAdapter.notifyItemChanged(messages.size - 1)
|
||||
}
|
||||
}
|
||||
@@ -195,6 +202,7 @@ class MainActivity : AppCompatActivity() {
|
||||
/**
|
||||
* Run a benchmark with the model file
|
||||
*/
|
||||
@Deprecated("This benchmark doesn't accurately indicate GUI performance expected by app developers")
|
||||
private suspend fun runBenchmark(modelName: String, modelFile: File) =
|
||||
withContext(Dispatchers.Default) {
|
||||
Log.i(TAG, "Starts benchmarking $modelName")
|
||||
@@ -223,6 +231,16 @@ class MainActivity : AppCompatActivity() {
|
||||
if (!it.exists()) { it.mkdir() }
|
||||
}
|
||||
|
||||
override fun onStop() {
|
||||
generationJob?.cancel()
|
||||
super.onStop()
|
||||
}
|
||||
|
||||
override fun onDestroy() {
|
||||
engine.destroy()
|
||||
super.onDestroy()
|
||||
}
|
||||
|
||||
companion object {
|
||||
private val TAG = MainActivity::class.java.simpleName
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
android:id="@+id/gguf"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="wrap_content"
|
||||
android:layout_margin="16dp"
|
||||
android:padding="16dp"
|
||||
android:text="Selected GGUF model's metadata will show here."
|
||||
style="@style/TextAppearance.MaterialComponents.Body2" />
|
||||
|
||||
@@ -33,8 +33,7 @@
|
||||
<com.google.android.material.divider.MaterialDivider
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="2dp"
|
||||
android:layout_marginHorizontal="16dp"
|
||||
android:layout_marginVertical="8dp" />
|
||||
android:layout_marginHorizontal="16dp" />
|
||||
|
||||
<androidx.recyclerview.widget.RecyclerView
|
||||
android:id="@+id/messages"
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
[versions]
|
||||
|
||||
# Plugins
|
||||
agp = "8.13.0"
|
||||
kotlin = "2.2.20"
|
||||
agp = "8.13.2"
|
||||
kotlin = "2.3.0"
|
||||
|
||||
# AndroidX
|
||||
activity = "1.11.0"
|
||||
activity = "1.12.2"
|
||||
appcompat = "1.7.1"
|
||||
core-ktx = "1.17.0"
|
||||
constraint-layout = "2.2.1"
|
||||
datastore-preferences = "1.1.7"
|
||||
datastore-preferences = "1.2.0"
|
||||
|
||||
# Material
|
||||
material = "1.13.0"
|
||||
|
||||
@@ -560,6 +560,6 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_unload(JNIEnv * /*unused*/, job
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv *env, jobject /*unused*/) {
|
||||
Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv *, jobject /*unused*/) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ interface InferenceEngine {
|
||||
/**
|
||||
* Unloads the currently loaded model.
|
||||
*/
|
||||
suspend fun cleanUp()
|
||||
fun cleanUp()
|
||||
|
||||
/**
|
||||
* Cleans up resources when the engine is no longer needed.
|
||||
|
||||
@@ -15,9 +15,11 @@ import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.MutableStateFlow
|
||||
import kotlinx.coroutines.flow.StateFlow
|
||||
import kotlinx.coroutines.flow.asStateFlow
|
||||
import kotlinx.coroutines.flow.flow
|
||||
import kotlinx.coroutines.flow.flowOn
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.io.IOException
|
||||
@@ -109,9 +111,11 @@ internal class InferenceEngineImpl private constructor(
|
||||
|
||||
private val _state =
|
||||
MutableStateFlow<InferenceEngine.State>(InferenceEngine.State.Uninitialized)
|
||||
override val state: StateFlow<InferenceEngine.State> = _state
|
||||
override val state: StateFlow<InferenceEngine.State> = _state.asStateFlow()
|
||||
|
||||
private var _readyForSystemPrompt = false
|
||||
@Volatile
|
||||
private var _cancelGeneration = false
|
||||
|
||||
/**
|
||||
* Single-threaded coroutine dispatcher & scope for LLama asynchronous operations
|
||||
@@ -169,6 +173,8 @@ internal class InferenceEngineImpl private constructor(
|
||||
}
|
||||
Log.i(TAG, "Model loaded!")
|
||||
_readyForSystemPrompt = true
|
||||
|
||||
_cancelGeneration = false
|
||||
_state.value = InferenceEngine.State.ModelReady
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, (e.message ?: "Error loading model") + "\n" + pathToModel, e)
|
||||
@@ -231,15 +237,19 @@ internal class InferenceEngineImpl private constructor(
|
||||
|
||||
Log.i(TAG, "User prompt processed. Generating assistant prompt...")
|
||||
_state.value = InferenceEngine.State.Generating
|
||||
while (true) {
|
||||
while (!_cancelGeneration) {
|
||||
generateNextToken()?.let { utf8token ->
|
||||
if (utf8token.isNotEmpty()) emit(utf8token)
|
||||
} ?: break
|
||||
}
|
||||
Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
|
||||
if (_cancelGeneration) {
|
||||
Log.i(TAG, "Assistant generation aborted per requested.")
|
||||
} else {
|
||||
Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
|
||||
}
|
||||
_state.value = InferenceEngine.State.ModelReady
|
||||
} catch (e: CancellationException) {
|
||||
Log.i(TAG, "Generation cancelled by user.")
|
||||
Log.i(TAG, "Assistant generation's flow collection cancelled.")
|
||||
_state.value = InferenceEngine.State.ModelReady
|
||||
throw e
|
||||
} catch (e: Exception) {
|
||||
@@ -268,8 +278,9 @@ internal class InferenceEngineImpl private constructor(
|
||||
/**
|
||||
* Unloads the model and frees resources, or reset error states
|
||||
*/
|
||||
override suspend fun cleanUp() =
|
||||
withContext(llamaDispatcher) {
|
||||
override fun cleanUp() {
|
||||
_cancelGeneration = true
|
||||
runBlocking(llamaDispatcher) {
|
||||
when (val state = _state.value) {
|
||||
is InferenceEngine.State.ModelReady -> {
|
||||
Log.i(TAG, "Unloading model and free resources...")
|
||||
@@ -293,17 +304,21 @@ internal class InferenceEngineImpl private constructor(
|
||||
else -> throw IllegalStateException("Cannot unload model in ${state.javaClass.simpleName}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel all ongoing coroutines and free GGML backends
|
||||
*/
|
||||
override fun destroy() {
|
||||
_readyForSystemPrompt = false
|
||||
llamaScope.cancel()
|
||||
when(_state.value) {
|
||||
is InferenceEngine.State.Uninitialized -> {}
|
||||
is InferenceEngine.State.Initialized -> shutdown()
|
||||
else -> { unload(); shutdown() }
|
||||
_cancelGeneration = true
|
||||
runBlocking(llamaDispatcher) {
|
||||
_readyForSystemPrompt = false
|
||||
when(_state.value) {
|
||||
is InferenceEngine.State.Uninitialized -> {}
|
||||
is InferenceEngine.State.Initialized -> shutdown()
|
||||
else -> { unload(); shutdown() }
|
||||
}
|
||||
}
|
||||
llamaScope.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,8 @@ define quantize_model
|
||||
@echo "Export the quantized model path to $(2) variable in your environment"
|
||||
endef
|
||||
|
||||
DEVICE ?= auto
|
||||
|
||||
###
|
||||
### Casual Model targets/recipes
|
||||
###
|
||||
@@ -53,7 +55,7 @@ causal-convert-mm-model:
|
||||
|
||||
causal-run-original-model:
|
||||
$(call validate_model_path,causal-run-original-model)
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)"
|
||||
|
||||
causal-run-converted-model:
|
||||
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
|
||||
|
||||
@@ -4,149 +4,179 @@ import argparse
|
||||
import os
|
||||
import sys
|
||||
import importlib
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
|
||||
import torch
|
||||
import numpy as np
|
||||
from utils.common import debug_hook
|
||||
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
parser.add_argument("--model-path", "-m", help="Path to the model")
|
||||
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
|
||||
args = parser.parse_args()
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
parser.add_argument("--model-path", "-m", help="Path to the model")
|
||||
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
|
||||
parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
|
||||
return parser.parse_args()
|
||||
|
||||
model_path = os.environ.get("MODEL_PATH", args.model_path)
|
||||
if model_path is None:
|
||||
parser.error(
|
||||
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
|
||||
)
|
||||
def load_model_and_tokenizer(model_path, device="auto"):
|
||||
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
multimodal = False
|
||||
full_config = config
|
||||
|
||||
### If you want to dump RoPE activations, uncomment the following lines:
|
||||
### === START ROPE DEBUG ===
|
||||
# from utils.common import setup_rope_debug
|
||||
# setup_rope_debug("transformers.models.apertus.modeling_apertus")
|
||||
### == END ROPE DEBUG ===
|
||||
|
||||
|
||||
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
multimodal = False
|
||||
full_config = config
|
||||
|
||||
print("Model type: ", config.model_type)
|
||||
if "vocab_size" not in config and "text_config" in config:
|
||||
config = config.text_config
|
||||
multimodal = True
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
print("BOS token id: ", config.bos_token_id)
|
||||
print("EOS token id: ", config.eos_token_id)
|
||||
|
||||
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = (
|
||||
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
)
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(
|
||||
importlib.import_module(unreleased_module_path), class_name
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
model_path
|
||||
) # Note: from_pretrained, not fromPretrained
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
if multimodal:
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config
|
||||
)
|
||||
# Determine device_map based on device argument
|
||||
if device == "cpu":
|
||||
device_map = {"": "cpu"}
|
||||
print("Forcing CPU usage")
|
||||
elif device == "auto":
|
||||
device_map = "auto"
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
|
||||
device_map = {"": device}
|
||||
|
||||
print("Model type: ", config.model_type)
|
||||
if "vocab_size" not in config and "text_config" in config:
|
||||
config = config.text_config
|
||||
multimodal = True
|
||||
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
print("BOS token id: ", config.bos_token_id)
|
||||
print("EOS token id: ", config.eos_token_id)
|
||||
|
||||
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = (
|
||||
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
)
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
if args.verbose:
|
||||
for name, module in model.named_modules():
|
||||
if len(list(module.children())) == 0: # only leaf modules
|
||||
module.register_forward_hook(debug_hook(name))
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
if multimodal:
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=full_config
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
# Printing the Model class to allow for easier debugging. This can be useful
|
||||
# when working with models that have not been publicly released yet and this
|
||||
# migth require that the concrete class is imported and used directly instead
|
||||
# of using AutoModelForCausalLM.
|
||||
print(f"Model class: {model.__class__.__name__}")
|
||||
print(f"Model class: {model.__class__.__name__}")
|
||||
|
||||
device = next(model.parameters()).device
|
||||
if args.prompt_file:
|
||||
with open(args.prompt_file, encoding='utf-8') as f:
|
||||
prompt = f.read()
|
||||
elif os.getenv("MODEL_TESTING_PROMPT"):
|
||||
prompt = os.getenv("MODEL_TESTING_PROMPT")
|
||||
else:
|
||||
prompt = "Hello, my name is"
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
||||
return model, tokenizer, config
|
||||
|
||||
print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
def enable_torch_debugging(model):
|
||||
for name, module in model.named_modules():
|
||||
if len(list(module.children())) == 0: # only leaf modules
|
||||
module.register_forward_hook(debug_hook(name))
|
||||
|
||||
batch_size = 512
|
||||
def get_prompt(args):
|
||||
if args.prompt_file:
|
||||
with open(args.prompt_file, encoding='utf-8') as f:
|
||||
return f.read()
|
||||
elif os.getenv("MODEL_TESTING_PROMPT"):
|
||||
return os.getenv("MODEL_TESTING_PROMPT")
|
||||
else:
|
||||
return "Hello, my name is"
|
||||
|
||||
with torch.no_grad():
|
||||
past = None
|
||||
outputs = None
|
||||
for i in range(0, input_ids.size(1), batch_size):
|
||||
print(f"Processing chunk with tokens {i} to {i + batch_size}")
|
||||
chunk = input_ids[:, i:i + batch_size]
|
||||
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
|
||||
past = outputs.past_key_values
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
model_path = os.environ.get("MODEL_PATH", args.model_path)
|
||||
if model_path is None:
|
||||
print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||
sys.exit(1)
|
||||
|
||||
logits = outputs.logits # type: ignore
|
||||
|
||||
# Extract logits for the last token (next token prediction)
|
||||
last_logits = logits[0, -1, :].float().cpu().numpy()
|
||||
model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
|
||||
|
||||
print(f"Logits shape: {logits.shape}")
|
||||
print(f"Last token logits shape: {last_logits.shape}")
|
||||
print(f"Vocab size: {len(last_logits)}")
|
||||
if args.verbose:
|
||||
enable_torch_debugging(model)
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}.txt"
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
# Save to file for comparison
|
||||
last_logits.astype(np.float32).tofile(bin_filename)
|
||||
# Iterate over the model parameters (the tensors) and get the first one
|
||||
# and use it to get the device the model is on.
|
||||
device = next(model.parameters()).device
|
||||
prompt = get_prompt(args)
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
||||
|
||||
# Also save as text file for easy inspection
|
||||
with open(txt_filename, "w") as f:
|
||||
for i, logit in enumerate(last_logits):
|
||||
f.write(f"{i}: {logit:.6f}\n")
|
||||
print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
|
||||
# Print some sample logits for quick verification
|
||||
print(f"First 10 logits: {last_logits[:10]}")
|
||||
print(f"Last 10 logits: {last_logits[-10:]}")
|
||||
batch_size = 512
|
||||
|
||||
# Show top 5 predicted tokens
|
||||
top_indices = np.argsort(last_logits)[-5:][::-1]
|
||||
print("Top 5 predictions:")
|
||||
for idx in top_indices:
|
||||
token = tokenizer.decode([idx])
|
||||
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
|
||||
with torch.no_grad():
|
||||
past = None
|
||||
outputs = None
|
||||
for i in range(0, input_ids.size(1), batch_size):
|
||||
print(f"Processing chunk with tokens {i} to {i + batch_size}")
|
||||
chunk = input_ids[:, i:i + batch_size]
|
||||
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
|
||||
past = outputs.past_key_values
|
||||
|
||||
print(f"Saved bin logits to: {bin_filename}")
|
||||
print(f"Saved txt logist to: {txt_filename}")
|
||||
logits = outputs.logits # type: ignore
|
||||
|
||||
# Extract logits for the last token (next token prediction)
|
||||
last_logits = logits[0, -1, :].float().cpu().numpy()
|
||||
|
||||
print(f"Logits shape: {logits.shape}")
|
||||
print(f"Last token logits shape: {last_logits.shape}")
|
||||
print(f"Vocab size: {len(last_logits)}")
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}.txt"
|
||||
|
||||
# Save to file for comparison
|
||||
last_logits.astype(np.float32).tofile(bin_filename)
|
||||
|
||||
# Also save as text file for easy inspection
|
||||
with open(txt_filename, "w") as f:
|
||||
for i, logit in enumerate(last_logits):
|
||||
f.write(f"{i}: {logit:.6f}\n")
|
||||
|
||||
# Print some sample logits for quick verification
|
||||
print(f"First 10 logits: {last_logits[:10]}")
|
||||
print(f"Last 10 logits: {last_logits[-10:]}")
|
||||
|
||||
# Show top 5 predicted tokens
|
||||
top_indices = np.argsort(last_logits)[-5:][::-1]
|
||||
print("Top 5 predictions:")
|
||||
for idx in top_indices:
|
||||
token = tokenizer.decode([idx])
|
||||
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
|
||||
|
||||
print(f"Saved bin logits to: {bin_filename}")
|
||||
print(f"Saved txt logist to: {txt_filename}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
@@ -9,169 +10,243 @@ from pathlib import Path
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModel
|
||||
import torch
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
|
||||
parser.add_argument('--use-sentence-transformers', action='store_true',
|
||||
help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
|
||||
args = parser.parse_args()
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description='Run original embedding model')
|
||||
parser.add_argument(
|
||||
'--model-path',
|
||||
'-m',
|
||||
help='Path to the model'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--prompts-file',
|
||||
'-p',
|
||||
help='Path to file containing prompts (one per line)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--use-sentence-transformers',
|
||||
action='store_true',
|
||||
help=('Use SentenceTransformer to apply all numbered layers '
|
||||
'(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
|
||||
)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
'-d',
|
||||
help='Device to use (cpu, cuda, mps, auto)',
|
||||
default='auto'
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
def read_prompt_from_file(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{file_path}' not found")
|
||||
exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
exit(1)
|
||||
|
||||
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
|
||||
|
||||
# Determine if we should use SentenceTransformer
|
||||
use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
|
||||
|
||||
if use_sentence_transformers:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config # type: ignore
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
# This can be used to override the sliding window size for manual testing. This
|
||||
# can be useful to verify the sliding window attention mask in the original model
|
||||
# and compare it with the converted .gguf model.
|
||||
if hasattr(config, 'sliding_window'):
|
||||
original_sliding_window = config.sliding_window
|
||||
#original_sliding_window = 6
|
||||
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
|
||||
|
||||
print(f"Using unreleased model: {unreleased_model_name}")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
class_name = f"{unreleased_model_name}Model"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(model_path, config=config)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
|
||||
if device == "cpu":
|
||||
device_map = {"": "cpu"}
|
||||
print("Forcing CPU usage")
|
||||
elif device == "auto":
|
||||
# On Mac, "auto" device_map can cause issues with accelerate
|
||||
# So we detect the best device manually
|
||||
if torch.cuda.is_available():
|
||||
device_map = {"": "cuda"}
|
||||
print("Using CUDA")
|
||||
elif torch.backends.mps.is_available():
|
||||
device_map = {"": "mps"}
|
||||
print("Using MPS (Apple Metal)")
|
||||
else:
|
||||
device_map = {"": "cpu"}
|
||||
print("Using CPU")
|
||||
else:
|
||||
model = AutoModel.from_pretrained(model_path, config=config)
|
||||
print(f"Model class: {type(model)}")
|
||||
print(f"Model file: {type(model).__module__}")
|
||||
device_map = {"": device}
|
||||
|
||||
# Verify the model is using the correct sliding window
|
||||
if not use_sentence_transformers:
|
||||
if hasattr(model.config, 'sliding_window'): # type: ignore
|
||||
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
|
||||
else:
|
||||
print("Model config does not have sliding_window attribute")
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
if args.prompts_file:
|
||||
prompt_text = read_prompt_from_file(args.prompts_file)
|
||||
texts = [prompt_text]
|
||||
else:
|
||||
texts = ["Hello world today"]
|
||||
|
||||
with torch.no_grad():
|
||||
if use_sentence_transformers:
|
||||
embeddings = model.encode(texts, convert_to_numpy=True)
|
||||
all_embeddings = embeddings # Shape: [batch_size, hidden_size]
|
||||
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
|
||||
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
|
||||
from sentence_transformers import SentenceTransformer
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config # type: ignore
|
||||
else:
|
||||
# Standard approach: use base model output only
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
# This can be used to override the sliding window size for manual testing. This
|
||||
# can be useful to verify the sliding window attention mask in the original model
|
||||
# and compare it with the converted .gguf model.
|
||||
if hasattr(config, 'sliding_window'):
|
||||
original_sliding_window = config.sliding_window
|
||||
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
|
||||
|
||||
outputs = model(**encoded)
|
||||
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
print(f"Using unreleased model: {unreleased_model_name}")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
class_name = f"{unreleased_model_name}Model"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
print(f"Model class: {type(model)}")
|
||||
print(f"Model file: {type(model).__module__}")
|
||||
|
||||
print(f"Hidden states shape: {hidden_states.shape}")
|
||||
print(f"All embeddings shape: {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1]}")
|
||||
# Verify the model is using the correct sliding window
|
||||
if hasattr(model.config, 'sliding_window'): # type: ignore
|
||||
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
|
||||
else:
|
||||
print("Model config does not have sliding_window attribute")
|
||||
|
||||
if len(all_embeddings.shape) == 1:
|
||||
n_embd = all_embeddings.shape[0] # type: ignore
|
||||
n_embd_count = 1
|
||||
all_embeddings = all_embeddings.reshape(1, -1)
|
||||
return model, tokenizer, config
|
||||
|
||||
|
||||
def get_prompt(args):
|
||||
if args.prompts_file:
|
||||
try:
|
||||
with open(args.prompts_file, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{args.prompts_file}' not found")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
n_embd = all_embeddings.shape[1] # type: ignore
|
||||
n_embd_count = all_embeddings.shape[0] # type: ignore
|
||||
return "Hello world today"
|
||||
|
||||
print()
|
||||
|
||||
for j in range(n_embd_count):
|
||||
embedding = all_embeddings[j]
|
||||
print(f"embedding {j}: ", end="")
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
|
||||
# Print first 3 values
|
||||
for i in range(min(3, n_embd)):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
print("Error: Model path must be specified either via --model-path argument "
|
||||
"or EMBEDDING_MODEL_PATH environment variable")
|
||||
sys.exit(1)
|
||||
|
||||
print(" ... ", end="")
|
||||
# Determine if we should use SentenceTransformer
|
||||
use_st = (
|
||||
args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
|
||||
)
|
||||
|
||||
# Print last 3 values
|
||||
for i in range(n_embd - 3, n_embd):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)
|
||||
|
||||
print() # New line
|
||||
# Get the device the model is on
|
||||
if not use_st:
|
||||
device = next(model.parameters()).device
|
||||
else:
|
||||
# For SentenceTransformer, get device from the underlying model
|
||||
device = next(model[0].auto_model.parameters()).device # type: ignore
|
||||
|
||||
print()
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
|
||||
prompt_text = get_prompt(args)
|
||||
texts = [prompt_text]
|
||||
|
||||
flattened_embeddings = all_embeddings.flatten()
|
||||
flattened_embeddings.astype(np.float32).tofile(bin_filename)
|
||||
with torch.no_grad():
|
||||
if use_st:
|
||||
embeddings = model.encode(texts, convert_to_numpy=True)
|
||||
all_embeddings = embeddings # Shape: [batch_size, hidden_size]
|
||||
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
|
||||
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
|
||||
else:
|
||||
# Standard approach: use base model output only
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
|
||||
# Move inputs to the same device as the model
|
||||
encoded = {k: v.to(device) for k, v in encoded.items()}
|
||||
outputs = model(**encoded)
|
||||
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
|
||||
|
||||
all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size]
|
||||
|
||||
print(f"Hidden states shape: {hidden_states.shape}")
|
||||
print(f"All embeddings shape: {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1]}")
|
||||
|
||||
if len(all_embeddings.shape) == 1:
|
||||
n_embd = all_embeddings.shape[0] # type: ignore
|
||||
n_embd_count = 1
|
||||
all_embeddings = all_embeddings.reshape(1, -1)
|
||||
else:
|
||||
n_embd = all_embeddings.shape[1] # type: ignore
|
||||
n_embd_count = all_embeddings.shape[0] # type: ignore
|
||||
|
||||
print()
|
||||
|
||||
with open(txt_filename, "w") as f:
|
||||
idx = 0
|
||||
for j in range(n_embd_count):
|
||||
for value in all_embeddings[j]:
|
||||
f.write(f"{idx}: {value:.6f}\n")
|
||||
idx += 1
|
||||
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
|
||||
print("")
|
||||
print(f"Saved bin embeddings to: {bin_filename}")
|
||||
print(f"Saved txt embeddings to: {txt_filename}")
|
||||
embedding = all_embeddings[j]
|
||||
print(f"embedding {j}: ", end="")
|
||||
|
||||
# Print first 3 values
|
||||
for i in range(min(3, n_embd)):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
|
||||
print(" ... ", end="")
|
||||
|
||||
# Print last 3 values
|
||||
for i in range(n_embd - 3, n_embd):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
|
||||
print() # New line
|
||||
|
||||
print()
|
||||
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
|
||||
|
||||
flattened_embeddings = all_embeddings.flatten()
|
||||
flattened_embeddings.astype(np.float32).tofile(bin_filename)
|
||||
|
||||
with open(txt_filename, "w") as f:
|
||||
idx = 0
|
||||
for j in range(n_embd_count):
|
||||
for value in all_embeddings[j]:
|
||||
f.write(f"{idx}: {value:.6f}\n")
|
||||
idx += 1
|
||||
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
|
||||
print("")
|
||||
print(f"Saved bin embeddings to: {bin_filename}")
|
||||
print(f"Saved txt embeddings to: {txt_filename}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -166,7 +166,7 @@ def main():
|
||||
# Load the python model to get configuration information and also to load the tokenizer.
|
||||
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
|
||||
config = AutoConfig.from_pretrained(args.model_path)
|
||||
config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
@@ -186,9 +186,9 @@ def main():
|
||||
exit(1)
|
||||
else:
|
||||
if args.causal:
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_path)
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(args.model_path)
|
||||
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
|
||||
encoded = tokenizer(prompt, return_tensors="pt")
|
||||
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
|
||||
|
||||
@@ -222,8 +222,8 @@ int main(int argc, char ** argv) {
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// break into batches
|
||||
int p = 0; // number of prompts processed already
|
||||
int s = 0; // number of prompts in current batch
|
||||
unsigned int p = 0; // number of prompts processed already
|
||||
unsigned int s = 0; // number of prompts in current batch
|
||||
for (int k = 0; k < n_chunks; k++) {
|
||||
// clamp to n_batch tokens
|
||||
auto & inp = chunks[k].tokens;
|
||||
@@ -231,7 +231,7 @@ int main(int argc, char ** argv) {
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
common_batch_clear(batch);
|
||||
|
||||
@@ -22,9 +22,9 @@ if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
fi
|
||||
|
||||
@@ -24,8 +24,8 @@ export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "Using $GGML_SYCL_DEVICE as the main GPU"
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
fi
|
||||
|
||||
@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
||||
|
||||
@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99
|
||||
.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
|
||||
|
||||
@@ -430,10 +430,22 @@ if (MSVC)
|
||||
configure_msvc_target(ggml-cpu-x64)
|
||||
configure_msvc_target(ggml-cpu-sse42)
|
||||
configure_msvc_target(ggml-cpu-sandybridge)
|
||||
# __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
# skipping ggml-cpu-ivybridge
|
||||
# skipping ggml-cpu-piledriver
|
||||
configure_msvc_target(ggml-cpu-haswell)
|
||||
configure_msvc_target(ggml-cpu-skylakex)
|
||||
configure_msvc_target(ggml-cpu-cannonlake)
|
||||
configure_msvc_target(ggml-cpu-cascadelake)
|
||||
configure_msvc_target(ggml-cpu-icelake)
|
||||
# MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
|
||||
# https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
|
||||
# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
|
||||
# skipping ggml-cpu-cooperlake
|
||||
# skipping ggml-cpu-zen4
|
||||
configure_msvc_target(ggml-cpu-alderlake)
|
||||
# MSVC doesn't support AMX
|
||||
# skipping ggml-cpu-sapphirerapids
|
||||
|
||||
if (GGML_BUILD_EXAMPLES)
|
||||
configure_msvc_target(common-ggml)
|
||||
|
||||
@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
endif()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
if (NOT MSVC)
|
||||
# __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
|
||||
ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
|
||||
ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
|
||||
ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
|
||||
# https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
|
||||
# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
|
||||
ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
|
||||
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
|
||||
@@ -2338,19 +2338,19 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
|
||||
// Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
|
||||
// TODO: acl_yarn_ramp_tensor use rope cache.
|
||||
bool yarn_ramp_tensor_updated = false;
|
||||
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
|
||||
acl_tensor_ptr acl_yarn_ramp_tensor;
|
||||
if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
|
||||
ctx.rope_cache.freq_scale != freq_scale)) {
|
||||
yarn_ramp_tensor_updated = true;
|
||||
|
||||
if (ctx.rope_cache.yarn_ramp_cache != nullptr) {
|
||||
ACL_CHECK(aclrtFree(ctx.rope_cache.yarn_ramp_cache));
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.yarn_ramp_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
// -rope_yarn_ramp
|
||||
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
||||
// return MIN(1, MAX(0, y)) - 1;
|
||||
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
|
||||
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
|
||||
acl_yarn_ramp_tensor =
|
||||
ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
|
||||
ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
|
||||
float zero_value = 0, one_value = 1;
|
||||
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
|
||||
acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
|
||||
@@ -2380,8 +2380,10 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
|
||||
acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
|
||||
} else {
|
||||
acl_yarn_ramp_tensor =
|
||||
ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
|
||||
}
|
||||
|
||||
// Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
|
||||
if (ext_factor != 0) {
|
||||
if (theta_scale_updated || yarn_ramp_tensor_updated) {
|
||||
@@ -2988,32 +2990,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
|
||||
}
|
||||
|
||||
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
// stride
|
||||
int64_t s0 = ((const int32_t *) (dst->op_params))[0];
|
||||
int64_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
|
||||
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
|
||||
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
|
||||
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
// get base information of input and kernel
|
||||
int64_t input_len = *(src1->ne);
|
||||
int64_t dst_len = *(dst->ne);
|
||||
int64_t kernel_size = *(src0->ne);
|
||||
|
||||
// set the max kernel size for each conv
|
||||
int64_t max_kernel_size = 255;
|
||||
|
||||
// compute the partition of kernel
|
||||
int64_t part_num = 1;
|
||||
part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
|
||||
|
||||
int64_t strideVal[1];
|
||||
strideVal[0] = s0;
|
||||
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
|
||||
int64_t paddingVal[] = { 0 };
|
||||
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
|
||||
int64_t dilationVal[] = { 1 };
|
||||
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
|
||||
int8_t cubeMathType = 0;
|
||||
strideVal[0] = s0;
|
||||
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
|
||||
int64_t paddingVal[] = {0};
|
||||
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
|
||||
int64_t dilationVal[] = {1};
|
||||
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
|
||||
bool transposed = true;
|
||||
int64_t groups = 1;
|
||||
int8_t cubeMathType = 0;
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
cubeMathType = 1;
|
||||
#endif
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
|
||||
dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
|
||||
auto weight_type = ggml_cann_type_mapping(src0->type);
|
||||
auto dst_type = ggml_cann_type_mapping(dst->type);
|
||||
|
||||
// slice the kernel to make each conv available
|
||||
int64_t slice_dim = -1;
|
||||
int64_t slice_start = 0;
|
||||
int64_t slice_end = max_kernel_size;
|
||||
int64_t slice_step = 1;
|
||||
int64_t interval = max_kernel_size;
|
||||
|
||||
int64_t left_pad_len = dilationVal[0] * (max_kernel_size - 1) + 1 - 2 * paddingVal[0];
|
||||
int64_t right_pad_len = 0;
|
||||
|
||||
acl_scalar_ptr alpha = nullptr;
|
||||
float alphaValue = 1.0;
|
||||
alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
||||
|
||||
// set zero to destination
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
|
||||
|
||||
for(int k = 0; k < part_num; k++){
|
||||
|
||||
// create part kernel tensor and slice from big kernel
|
||||
slice_start = max_kernel_size * k;
|
||||
if(k == part_num - 1){
|
||||
slice_end = kernel_size;
|
||||
interval = kernel_size - max_kernel_size * k;
|
||||
}else{
|
||||
slice_end = max_kernel_size * (k+1);
|
||||
}
|
||||
|
||||
int64_t part_ne[4];
|
||||
for(int i = 0; i < 4; i++) {
|
||||
part_ne[i] = *(src0->ne + i);
|
||||
}
|
||||
part_ne[0] = interval;
|
||||
|
||||
size_t part_nb[4];
|
||||
part_nb[0] = sizeof(weight_type);
|
||||
for (int i = 1; i < 4; i++) {
|
||||
part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
|
||||
}
|
||||
|
||||
ggml_cann_pool_alloc part_kernel_allocator;
|
||||
part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
|
||||
void* part_kernel_buf = part_kernel_allocator.get();
|
||||
|
||||
acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type,
|
||||
ggml_element_size(src0), part_ne, part_nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step, part_kernel.get());
|
||||
|
||||
// create the part conv result tensor
|
||||
int64_t part_dst_ne[4];
|
||||
for(int i = 0; i < 4; i++){
|
||||
part_dst_ne[i] = *(dst->ne + i);
|
||||
}
|
||||
part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
|
||||
|
||||
size_t part_dst_nb[4];
|
||||
part_dst_nb[0] = sizeof(weight_type);
|
||||
for (int i = 1; i < 4; i++) {
|
||||
part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc part_dst_allocator;
|
||||
part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
|
||||
void* part_dst_buf = part_dst_allocator.get();
|
||||
|
||||
acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
|
||||
part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
|
||||
|
||||
// compute part conv transpose 1d
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
|
||||
padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(), cubeMathType);
|
||||
|
||||
// compute the position of part result in final result
|
||||
int64_t global_start = slice_start;
|
||||
int64_t global_end = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
|
||||
|
||||
left_pad_len = global_start;
|
||||
right_pad_len = dst_len - global_end;
|
||||
|
||||
std::vector<int64_t> padDataVal = {left_pad_len,right_pad_len};
|
||||
acl_int_array_ptr padData = ggml_cann_create_int_array(padDataVal.data(), 2);
|
||||
|
||||
acl_scalar_ptr pad_value = nullptr;
|
||||
float pad_valueVal = 0.0;
|
||||
pad_value = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
|
||||
|
||||
int64_t conv_result_ne[4];
|
||||
for(int i = 0; i < 4; i++){
|
||||
conv_result_ne[i] = *(dst->ne + i);
|
||||
}
|
||||
|
||||
size_t conv_result_nb[4];
|
||||
conv_result_nb[0] = sizeof(weight_type);
|
||||
for (int i = 1; i < 4; i++) {
|
||||
conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
|
||||
}
|
||||
|
||||
ggml_cann_pool_alloc conv_result_allocator;
|
||||
conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
|
||||
void* conv_result_buf = conv_result_allocator.get();
|
||||
|
||||
acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
|
||||
conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(), conv_result.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
@@ -3576,3 +3702,106 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
|
||||
// This op is currently defined only for F32 in ggml_cpu
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
// Shapes follow ggml_compute_forward_ssm_conv_f32
|
||||
const int64_t nc = src1->ne[0]; // d_conv
|
||||
const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_s = src0->ne[2]; // n_seqs
|
||||
|
||||
const int64_t n_t = dst->ne[1]; // tokens per sequence
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s}
|
||||
GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner}
|
||||
GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
|
||||
// --- Build CANN tensors ---
|
||||
|
||||
// 1) Input: conv_x as NCL
|
||||
//
|
||||
// src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N}
|
||||
// Passing ACL_FORMAT_NCL here means:
|
||||
// reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
|
||||
acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
// 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
|
||||
//
|
||||
// src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1]
|
||||
// we want a view: ne_w = { nc, 1, nr } // [K, 1, C]
|
||||
// so that reversed dims -> [C, 1, K] which matches
|
||||
// [out_channels, in_channels/groups, kernel_size]
|
||||
int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 }; // [K, 1 input ch. per group, C groups]
|
||||
// Layout: src1 data is [K, C] with
|
||||
// offset(k, c) = k*nb0 + c*nb1
|
||||
// We want offset_w(k, 0, c) = k*nb0 + c*nb1,
|
||||
// so we can reuse nb0 and nb1, and set nb2 = nb1.
|
||||
size_t w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] }; // same as src1
|
||||
|
||||
acl_tensor_ptr acl_w = ggml_cann_create_tensor(
|
||||
src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
// 3) Output: dst is { d_inner, n_t, n_s } (CLN)
|
||||
//
|
||||
// We need an NCL view of the same buffer:
|
||||
// desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
|
||||
//
|
||||
// Original CLN layout:
|
||||
// dst->ne = { nr, n_t, n_s }
|
||||
// dst->nb[0] = sizeof(float)
|
||||
// dst->nb[1] = nr * sizeof(float)
|
||||
// dst->nb[2] = nr * n_t * sizeof(float)
|
||||
//
|
||||
// We want offset_new(L, C, N) = offset_orig(C, L, N).
|
||||
// Choose:
|
||||
// nb_y[0] = nr * sizeof(float); // step in L
|
||||
// nb_y[1] = sizeof(float); // step in C
|
||||
// nb_y[2] = nr * n_t * sizeof(float); // step in N
|
||||
int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 }; // [L_out, C, N]
|
||||
size_t y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float), dst->nb[3] }; // [nr, 1, nr * n_t]
|
||||
|
||||
acl_tensor_ptr acl_y = ggml_cann_create_tensor(
|
||||
dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
|
||||
|
||||
// --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
|
||||
int64_t strideVal[1] = { 1 };
|
||||
int64_t paddingVal[1] = { 0 };
|
||||
int64_t dilationVal[1] = { 1 };
|
||||
|
||||
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
|
||||
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
|
||||
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
|
||||
|
||||
const bool transposed = false;
|
||||
const int64_t groups = nr; // depthwise: one group per inner dim
|
||||
int8_t cubeMathType = 0;
|
||||
|
||||
#ifdef ASCEND_310P
|
||||
cubeMathType = 1;
|
||||
#endif
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx,
|
||||
Convolution,
|
||||
acl_x.get(), // input: N, C, L_in = ncs
|
||||
acl_w.get(), // weight: [C, 1, K] with groups=nr
|
||||
nullptr, // bias
|
||||
stride.get(),
|
||||
padding.get(),
|
||||
dilation.get(),
|
||||
transposed,
|
||||
padding.get(), // output padding (unused for non-transposed)
|
||||
groups,
|
||||
acl_y.get(),
|
||||
cubeMathType);
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include <aclnnop/aclnn_sign.h>
|
||||
#include <aclnnop/aclnn_silu.h>
|
||||
#include <aclnnop/aclnn_sin.h>
|
||||
#include <aclnnop/aclnn_slice.h>
|
||||
#include <aclnnop/aclnn_sqrt.h>
|
||||
#include <aclnnop/aclnn_tanh.h>
|
||||
|
||||
@@ -1032,6 +1033,8 @@ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTenso
|
||||
ggml_backend_cann_context & ctx,
|
||||
ggml_tensor * dst);
|
||||
|
||||
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Applies a gated (GLU-style) unary operation using the CANN backend.
|
||||
*
|
||||
|
||||
@@ -229,6 +229,60 @@ struct ggml_graph_node_properties {
|
||||
// op
|
||||
ggml_op node_op;
|
||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||
|
||||
/**
|
||||
* @brief Check if a ggml tensor node matches this property set.
|
||||
*
|
||||
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
|
||||
* to determine whether the current node matches these previously recorded properties.
|
||||
*
|
||||
* @param node The current ggml tensor node.
|
||||
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
||||
*/
|
||||
bool has_matching_properties(ggml_tensor * node) {
|
||||
if (node->data != this->node_address && node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node->op != this->node_op) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != this->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (node->nb[i] != this->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i]) {
|
||||
if (node->src[i]->data != this->src_address[i] && node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int d = 0; d < GGML_MAX_DIMS; d++) {
|
||||
if (node->src[i]->ne[d] != this->src_ne[i][d]) {
|
||||
return false;
|
||||
}
|
||||
if (node->src[i]->nb[d] != this->src_nb[i][d]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (this->src_address[i] != nullptr) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
|
||||
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_cann_graph {
|
||||
@@ -241,6 +295,79 @@ struct ggml_cann_graph {
|
||||
aclmdlRI graph = nullptr;
|
||||
|
||||
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||
|
||||
/**
|
||||
* @brief Create a new CANN graph from a ggml computation graph.
|
||||
*
|
||||
* This function creates a new ggml_cann_graph object and fills its node properties
|
||||
* (operation type, dimensions, strides, input sources, and operation parameters)
|
||||
* based on the current ggml computation graph.
|
||||
*
|
||||
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
|
||||
* - node address
|
||||
* - operation type
|
||||
* - shape (ne) and strides (nb)
|
||||
* - source tensor addresses
|
||||
* - operation parameters
|
||||
*
|
||||
* @param cgraph The current ggml computation graph.
|
||||
* @return Pointer to the newly created ggml_cann_graph object.
|
||||
*/
|
||||
static ggml_cann_graph * create_from_cgraph(ggml_cgraph * cgraph) {
|
||||
ggml_cann_graph * new_graph = new ggml_cann_graph();
|
||||
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||
|
||||
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
|
||||
ggml_tensor * node = cgraph->nodes[node_idx];
|
||||
auto & prop = new_graph->ggml_graph_properties[node_idx];
|
||||
|
||||
prop.node_address = node->data;
|
||||
prop.node_op = node->op;
|
||||
|
||||
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
|
||||
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
|
||||
|
||||
for (int src = 0; src < GGML_MAX_SRC; ++src) {
|
||||
if (node->src[src]) {
|
||||
prop.src_address[src] = node->src[src]->data;
|
||||
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
|
||||
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
|
||||
} else {
|
||||
prop.src_address[src] = nullptr;
|
||||
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
|
||||
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
|
||||
return new_graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check whether this CANN graph matches the given ggml computation graph.
|
||||
*
|
||||
* This function compares the number of nodes and each node's properties
|
||||
* (operation type, dimensions, strides, inputs, and operation parameters)
|
||||
* to determine whether this CANN graph matches the given ggml graph.
|
||||
*
|
||||
* @param cgraph The current ggml computation graph.
|
||||
* @return true if this CANN graph matches the ggml graph; false otherwise.
|
||||
*/
|
||||
bool matches_cgraph(ggml_cgraph * cgraph) {
|
||||
if (this->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
if (!this->ggml_graph_properties[i].has_matching_properties(cgraph->nodes[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -272,15 +399,6 @@ struct ggml_cann_graph_lru_cache {
|
||||
cache_list.push_front(new_node);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Move an existing graph to the front of the cache.
|
||||
* @param node Pointer to the ggml_cann_graph to move.
|
||||
*/
|
||||
void move_to_front(ggml_cann_graph * node) {
|
||||
cache_list.remove(node);
|
||||
cache_list.push_front(node);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clear all graphs from the cache (also frees memory).
|
||||
*/
|
||||
@@ -295,6 +413,28 @@ struct ggml_cann_graph_lru_cache {
|
||||
* @brief Destructor that clears the cache and frees all cached graphs.
|
||||
*/
|
||||
~ggml_cann_graph_lru_cache() { clear(); }
|
||||
|
||||
/**
|
||||
* @brief Find a cached CANN graph that matches the given ggml graph and move it to front.
|
||||
*
|
||||
* This function iterates through the cached CANN graphs stored in the LRU cache and
|
||||
* compares them against the given ggml computation graph. If a matching graph is found,
|
||||
* it is promoted to the front of the LRU cache and returned. Otherwise, the function
|
||||
* returns nullptr.
|
||||
*
|
||||
* @param cgraph The current ggml computation graph.
|
||||
* @return true if found; false otherwise.
|
||||
*/
|
||||
bool find_and_move_to_front(ggml_cgraph * cgraph) {
|
||||
for (auto & graph_ptr : this->cache_list) {
|
||||
if (graph_ptr->matches_cgraph(cgraph)) {
|
||||
cache_list.remove(graph_ptr);
|
||||
cache_list.push_front(graph_ptr);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
@@ -318,6 +458,9 @@ struct ggml_cann_rope_cache {
|
||||
if (position_select_index_host) {
|
||||
free(position_select_index_host);
|
||||
}
|
||||
if (yarn_ramp_cache) {
|
||||
ACL_CHECK(aclrtFree(yarn_ramp_cache));
|
||||
}
|
||||
}
|
||||
|
||||
bool equal(int64_t theta_scale_length,
|
||||
@@ -370,6 +513,7 @@ struct ggml_cann_rope_cache {
|
||||
float * theta_scale_exp_host = nullptr;
|
||||
int * position_select_index_host = nullptr;
|
||||
void * position_select_index = nullptr;
|
||||
void * yarn_ramp_cache = nullptr;
|
||||
// sin/cos cache, used only to accelerate first layer on each device
|
||||
void * sin_cache = nullptr;
|
||||
void * cos_cache = nullptr;
|
||||
|
||||
@@ -1888,6 +1888,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
ggml_cann_out_prod(ctx, dst);
|
||||
case GGML_OP_SSM_CONV:
|
||||
ggml_cann_ssm_conv(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -2075,162 +2077,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
}
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
/**
|
||||
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
|
||||
*
|
||||
* This function creates a new ggml_cann_graph object and fills its node properties
|
||||
* (operation type, dimensions, strides, input sources, and operation parameters)
|
||||
* based on the current ggml computation graph.
|
||||
*
|
||||
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
|
||||
* - node address
|
||||
* - operation type
|
||||
* - shape (ne) and strides (nb)
|
||||
* - source tensor addresses
|
||||
* - operation parameters
|
||||
*
|
||||
* After initialization, the new graph is pushed into the LRU cache owned by the
|
||||
* CANN backend context. The cache takes ownership of the graph and manages its
|
||||
* lifetime (including deletion upon eviction).
|
||||
*
|
||||
* @param cann_ctx The CANN backend context containing the graph cache.
|
||||
* @param cgraph The current ggml computation graph.
|
||||
*/
|
||||
static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
|
||||
ggml_cann_graph * new_graph = new ggml_cann_graph();
|
||||
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||
|
||||
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
|
||||
ggml_tensor * node = cgraph->nodes[node_idx];
|
||||
auto & prop = new_graph->ggml_graph_properties[node_idx];
|
||||
|
||||
prop.node_address = node->data;
|
||||
prop.node_op = node->op;
|
||||
|
||||
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
|
||||
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
|
||||
|
||||
for (int src = 0; src < GGML_MAX_SRC; ++src) {
|
||||
if (node->src[src]) {
|
||||
prop.src_address[src] = node->src[src]->data;
|
||||
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
|
||||
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
|
||||
} else {
|
||||
prop.src_address[src] = nullptr;
|
||||
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
|
||||
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
|
||||
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
|
||||
cann_ctx->graph_lru_cache.push(new_graph);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check if a ggml tensor node matches a previously captured CANN graph node.
|
||||
*
|
||||
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
|
||||
* to determine whether the current node matches a previously recorded version.
|
||||
*
|
||||
* @param node The current ggml tensor node.
|
||||
* @param graph_node_properties The stored properties of a CANN graph node.
|
||||
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
||||
*/
|
||||
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node,
|
||||
ggml_graph_node_properties * graph_node_properties) {
|
||||
if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node->op != graph_node_properties->node_op) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i]) {
|
||||
if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int d = 0; d < GGML_MAX_DIMS; d++) {
|
||||
if (node->src[i]->ne[d] != graph_node_properties->src_ne[i][d]) {
|
||||
return false;
|
||||
}
|
||||
if (node->src[i]->nb[d] != graph_node_properties->src_nb[i][d]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (graph_node_properties->src_address[i] != nullptr) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
|
||||
return memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
|
||||
*
|
||||
* This function iterates through the cached CANN graphs stored in the LRU cache and
|
||||
* compares them against the given ggml computation graph. A match requires that the
|
||||
* number of nodes is the same and that each node’s properties (operation type,
|
||||
* dimensions, strides, inputs, and operation parameters) are identical.
|
||||
*
|
||||
* If a matching graph is found, it is promoted to the front of the LRU cache and the
|
||||
* function returns true. Otherwise, the function returns false, indicating that a new
|
||||
* CANN graph needs to be captured.
|
||||
*
|
||||
* @param cann_ctx The CANN backend context containing the graph cache.
|
||||
* @param cgraph The current ggml computation graph.
|
||||
* @return true if a matching cached graph exists; false otherwise.
|
||||
*/
|
||||
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||
ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache;
|
||||
for (auto & graph_ptr : lru_cache.cache_list) {
|
||||
// Skip graphs with a different number of nodes.
|
||||
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if all nodes match.
|
||||
bool all_match = true;
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
|
||||
all_match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (all_match) {
|
||||
// update cache_list && renturn graph_ptr
|
||||
lru_cache.move_to_front(graph_ptr);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
/**
|
||||
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
|
||||
*
|
||||
@@ -2239,23 +2085,23 @@ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph *
|
||||
*
|
||||
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
|
||||
*
|
||||
* @param cann_ctx The CANN backend context.
|
||||
* @param cgraph The ggml computation graph.
|
||||
* @param use_cann_graph Whether to use CANN graph execution.
|
||||
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
|
||||
* @param cann_ctx The CANN backend context.
|
||||
* @param cgraph The ggml computation graph.
|
||||
* @param use_cann_graph Whether to use CANN graph execution.
|
||||
* @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
|
||||
*/
|
||||
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
|
||||
ggml_cgraph * cgraph,
|
||||
bool & use_cann_graph,
|
||||
bool & cann_graph_update_required) {
|
||||
bool use_cann_graph,
|
||||
bool cann_graph_capture_required) {
|
||||
#ifdef USE_ACL_GRAPH
|
||||
if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
|
||||
if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
|
||||
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
||||
}
|
||||
#endif // USE_ACL_GRAPH
|
||||
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
||||
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
||||
if (!use_cann_graph || cann_graph_update_required) {
|
||||
if (!use_cann_graph || cann_graph_capture_required) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2274,9 +2120,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
if (use_cann_graph) {
|
||||
GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
|
||||
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
||||
|
||||
if (cann_graph_update_required) { // End CANN graph capture
|
||||
if (cann_graph_capture_required) { // End CANN graph capture
|
||||
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
||||
}
|
||||
|
||||
@@ -2306,7 +2153,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
|
||||
// calculate rope cache for fist layer in current device.
|
||||
cann_ctx->rope_cache.cached = false;
|
||||
|
||||
bool cann_graph_update_required = false;
|
||||
bool graph_capture_required = false;
|
||||
#ifdef USE_ACL_GRAPH
|
||||
bool use_cann_graph = true;
|
||||
|
||||
@@ -2331,16 +2178,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
|
||||
|
||||
if (use_cann_graph) {
|
||||
// If no matching graph is found, the graph needs to be recaptured.
|
||||
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
|
||||
if (cann_graph_update_required) {
|
||||
graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
|
||||
if (graph_capture_required) {
|
||||
// If no matching graph is found, add a new ACL graph.
|
||||
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
|
||||
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
|
||||
cann_ctx->graph_lru_cache.push(new_graph);
|
||||
}
|
||||
}
|
||||
#else
|
||||
bool use_cann_graph = false;
|
||||
#endif // USE_ACL_GRAPH
|
||||
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
|
||||
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -2578,8 +2426,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
}
|
||||
}
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
|
||||
return (op->src[0]->ne[0] - 1) <= 255;
|
||||
return true;
|
||||
case GGML_OP_SCALE:
|
||||
float bias;
|
||||
memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
|
||||
@@ -2626,6 +2473,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -328,7 +328,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
@@ -69,6 +69,10 @@
|
||||
#define VECTOR_REGISTERS 16
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
#define LMUL 4
|
||||
#endif
|
||||
|
||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||
|
||||
namespace {
|
||||
@@ -175,6 +179,46 @@ inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_zvfh)
|
||||
template <>
|
||||
inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
|
||||
return __riscv_vfwmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
|
||||
return __riscv_vfwmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
|
||||
}
|
||||
inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
|
||||
return __riscv_vfwmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
|
||||
}
|
||||
inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
|
||||
return __riscv_vfwmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
|
||||
}
|
||||
inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
|
||||
return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
|
||||
return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
|
||||
}
|
||||
inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
|
||||
return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
|
||||
}
|
||||
inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
|
||||
return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_zvfbfwma)
|
||||
inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
|
||||
return __riscv_vfwmaccbf16_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
|
||||
return __riscv_vfwmaccbf16_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
|
||||
}
|
||||
inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
|
||||
return __riscv_vfwmaccbf16_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// VECTORIZED HORIZONTAL SUM
|
||||
|
||||
@@ -227,6 +271,25 @@ inline float hsum(__m512 x) {
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
#if defined(__riscv_zvfh)
|
||||
inline float hsum(vfloat32m1_t x) {
|
||||
return __riscv_vfmv_f_s_f32m1_f32(
|
||||
__riscv_vfredusum_vs_f32m1_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
inline float hsum(vfloat32m2_t x) {
|
||||
return __riscv_vfmv_f_s_f32m1_f32(
|
||||
__riscv_vfredusum_vs_f32m2_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m2()));
|
||||
}
|
||||
inline float hsum(vfloat32m4_t x) {
|
||||
return __riscv_vfmv_f_s_f32m1_f32(
|
||||
__riscv_vfredusum_vs_f32m4_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m4()));
|
||||
}
|
||||
inline float hsum(vfloat32m8_t x) {
|
||||
return __riscv_vfmv_f_s_f32m1_f32(
|
||||
__riscv_vfredusum_vs_f32m8_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m8()));
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// VECTORIZED MEMORY LOADING
|
||||
|
||||
@@ -315,6 +378,88 @@ template <> inline __m256bh load(const float *p) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_zvfh)
|
||||
template <> inline vfloat16mf2_t load(const ggml_fp16_t *p) {
|
||||
return __riscv_vle16_v_f16mf2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16mf2());
|
||||
}
|
||||
template <> inline vfloat16m1_t load(const ggml_fp16_t *p) {
|
||||
return __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m1());
|
||||
}
|
||||
template <> inline vfloat16m2_t load(const ggml_fp16_t *p) {
|
||||
return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m2());
|
||||
}
|
||||
template <> inline vfloat16m4_t load(const ggml_fp16_t *p) {
|
||||
return __riscv_vle16_v_f16m4(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m4());
|
||||
}
|
||||
template <> inline vfloat32m1_t load(const float *p) {
|
||||
return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
template <> inline vfloat32m2_t load(const float *p) {
|
||||
return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
|
||||
}
|
||||
template <> inline vfloat32m4_t load(const float *p) {
|
||||
return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
|
||||
}
|
||||
template <> inline vfloat32m8_t load(const float *p) {
|
||||
return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_zvfbfwma)
|
||||
template <> inline vbfloat16mf2_t load(const ggml_bf16_t *p) {
|
||||
return __riscv_vle16_v_bf16mf2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16mf2());
|
||||
}
|
||||
template <> inline vbfloat16m1_t load(const ggml_bf16_t *p) {
|
||||
return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m1());
|
||||
}
|
||||
template <> inline vbfloat16m2_t load(const ggml_bf16_t *p) {
|
||||
return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m2());
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_zvfh)
|
||||
template <typename T> T set_zero();
|
||||
|
||||
template <> inline vfloat16mf2_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f16mf2(0, __riscv_vsetvlmax_e16mf2());
|
||||
}
|
||||
template <> inline vfloat16m1_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f16m1(0, __riscv_vsetvlmax_e16m1());
|
||||
}
|
||||
template <> inline vfloat16m2_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f16m2(0, __riscv_vsetvlmax_e16m2());
|
||||
}
|
||||
template <> inline vfloat16m4_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f16m4(0, __riscv_vsetvlmax_e16m4());
|
||||
}
|
||||
template <> inline vfloat32m1_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
template <> inline vfloat32m2_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
|
||||
}
|
||||
template <> inline vfloat32m4_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
|
||||
}
|
||||
template <> inline vfloat32m8_t set_zero() {
|
||||
return __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
template <typename T> size_t vlmax() {
|
||||
if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat32m2_t>) { return __riscv_vsetvlmax_e32m2(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat32m4_t>) { return __riscv_vsetvlmax_e32m4(); }
|
||||
else if constexpr (std::is_same_v<T, vfloat32m8_t>) { return __riscv_vsetvlmax_e32m8(); }
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// FLOATING POINT MATRIX MULTIPLICATION
|
||||
|
||||
@@ -488,6 +633,573 @@ class tinyBLAS {
|
||||
const int64_t ldc;
|
||||
};
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
template <typename D, typename V, typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_RVV {
|
||||
public:
|
||||
tinyBLAS_RVV(const ggml_compute_params * params, int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc)
|
||||
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
|
||||
}
|
||||
|
||||
bool matmul(int64_t m, int64_t n) {
|
||||
if (k % vlmax<V>() != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if LMUL == 1
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
#elif LMUL == 2
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
#else // LMUL = 4
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
|
||||
mnpack<2, 2, 8>(m, n, SIZE_N, 36);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
|
||||
mnpack<2, 2, 4>(m, n, SIZE_N, 36);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
|
||||
mnpack<2, 2, 2>(m, n, SIZE_N, 36);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
template<int RM, int RN, int BM>
|
||||
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
|
||||
if (SIZE_N == RN) {
|
||||
return gemm<RM, RN, BM>(m, n, BN);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||
} else {
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_ASSERT(false); // we have miss something.
|
||||
}
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x6(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
D Cv12 = set_zero<D>();
|
||||
D Cv13 = set_zero<D>();
|
||||
D Cv20 = set_zero<D>();
|
||||
D Cv21 = set_zero<D>();
|
||||
D Cv22 = set_zero<D>();
|
||||
D Cv23 = set_zero<D>();
|
||||
D Cv30 = set_zero<D>();
|
||||
D Cv31 = set_zero<D>();
|
||||
D Cv32 = set_zero<D>();
|
||||
D Cv33 = set_zero<D>();
|
||||
D Cv40 = set_zero<D>();
|
||||
D Cv41 = set_zero<D>();
|
||||
D Cv42 = set_zero<D>();
|
||||
D Cv43 = set_zero<D>();
|
||||
D Cv50 = set_zero<D>();
|
||||
D Cv51 = set_zero<D>();
|
||||
D Cv52 = set_zero<D>();
|
||||
D Cv53 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
|
||||
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
|
||||
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
|
||||
V Bv5 = load<V>(B + ldb * (jj + 5) + l);
|
||||
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv20 = madd(Av0, Bv2, Cv20);
|
||||
Cv30 = madd(Av0, Bv3, Cv30);
|
||||
Cv40 = madd(Av0, Bv4, Cv40);
|
||||
Cv50 = madd(Av0, Bv5, Cv50);
|
||||
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
Cv21 = madd(Av1, Bv2, Cv21);
|
||||
Cv31 = madd(Av1, Bv3, Cv31);
|
||||
Cv41 = madd(Av1, Bv4, Cv41);
|
||||
Cv51 = madd(Av1, Bv5, Cv51);
|
||||
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv12 = madd(Av2, Bv1, Cv12);
|
||||
Cv22 = madd(Av2, Bv2, Cv22);
|
||||
Cv32 = madd(Av2, Bv3, Cv32);
|
||||
Cv42 = madd(Av2, Bv4, Cv42);
|
||||
Cv52 = madd(Av2, Bv5, Cv52);
|
||||
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
Cv13 = madd(Av3, Bv1, Cv13);
|
||||
Cv23 = madd(Av3, Bv2, Cv23);
|
||||
Cv33 = madd(Av3, Bv3, Cv33);
|
||||
Cv43 = madd(Av3, Bv4, Cv43);
|
||||
Cv53 = madd(Av3, Bv5, Cv53);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
|
||||
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
|
||||
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
|
||||
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
|
||||
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
|
||||
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
|
||||
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
|
||||
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
|
||||
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
|
||||
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
|
||||
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
|
||||
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
|
||||
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
|
||||
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
|
||||
C[ldc * (jj + 5) + (ii + 0)] = hsum(Cv50);
|
||||
C[ldc * (jj + 5) + (ii + 1)] = hsum(Cv51);
|
||||
C[ldc * (jj + 5) + (ii + 2)] = hsum(Cv52);
|
||||
C[ldc * (jj + 5) + (ii + 3)] = hsum(Cv53);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x5(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
D Cv12 = set_zero<D>();
|
||||
D Cv13 = set_zero<D>();
|
||||
D Cv20 = set_zero<D>();
|
||||
D Cv21 = set_zero<D>();
|
||||
D Cv22 = set_zero<D>();
|
||||
D Cv23 = set_zero<D>();
|
||||
D Cv30 = set_zero<D>();
|
||||
D Cv31 = set_zero<D>();
|
||||
D Cv32 = set_zero<D>();
|
||||
D Cv33 = set_zero<D>();
|
||||
D Cv40 = set_zero<D>();
|
||||
D Cv41 = set_zero<D>();
|
||||
D Cv42 = set_zero<D>();
|
||||
D Cv43 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
|
||||
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
|
||||
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
|
||||
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv20 = madd(Av0, Bv2, Cv20);
|
||||
Cv30 = madd(Av0, Bv3, Cv30);
|
||||
Cv40 = madd(Av0, Bv4, Cv40);
|
||||
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
Cv21 = madd(Av1, Bv2, Cv21);
|
||||
Cv31 = madd(Av1, Bv3, Cv31);
|
||||
Cv41 = madd(Av1, Bv4, Cv41);
|
||||
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv12 = madd(Av2, Bv1, Cv12);
|
||||
Cv22 = madd(Av2, Bv2, Cv22);
|
||||
Cv32 = madd(Av2, Bv3, Cv32);
|
||||
Cv42 = madd(Av2, Bv4, Cv42);
|
||||
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
Cv13 = madd(Av3, Bv1, Cv13);
|
||||
Cv23 = madd(Av3, Bv2, Cv23);
|
||||
Cv33 = madd(Av3, Bv3, Cv33);
|
||||
Cv43 = madd(Av3, Bv4, Cv43);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
|
||||
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
|
||||
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
|
||||
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
|
||||
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
|
||||
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
|
||||
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
|
||||
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
|
||||
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
|
||||
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
|
||||
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
|
||||
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
|
||||
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
|
||||
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x4(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
D Cv12 = set_zero<D>();
|
||||
D Cv13 = set_zero<D>();
|
||||
D Cv20 = set_zero<D>();
|
||||
D Cv21 = set_zero<D>();
|
||||
D Cv22 = set_zero<D>();
|
||||
D Cv23 = set_zero<D>();
|
||||
D Cv30 = set_zero<D>();
|
||||
D Cv31 = set_zero<D>();
|
||||
D Cv32 = set_zero<D>();
|
||||
D Cv33 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
Cv12 = madd(Av2, Bv1, Cv12);
|
||||
Cv13 = madd(Av3, Bv1, Cv13);
|
||||
|
||||
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
|
||||
Cv20 = madd(Av0, Bv2, Cv20);
|
||||
Cv21 = madd(Av1, Bv2, Cv21);
|
||||
Cv22 = madd(Av2, Bv2, Cv22);
|
||||
Cv23 = madd(Av3, Bv2, Cv23);
|
||||
|
||||
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
|
||||
Cv30 = madd(Av0, Bv3, Cv30);
|
||||
Cv31 = madd(Av1, Bv3, Cv31);
|
||||
Cv32 = madd(Av2, Bv3, Cv32);
|
||||
Cv33 = madd(Av3, Bv3, Cv33);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
|
||||
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
|
||||
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
|
||||
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
|
||||
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
|
||||
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
|
||||
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
|
||||
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
|
||||
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
|
||||
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x3(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
D Cv12 = set_zero<D>();
|
||||
D Cv13 = set_zero<D>();
|
||||
D Cv20 = set_zero<D>();
|
||||
D Cv21 = set_zero<D>();
|
||||
D Cv22 = set_zero<D>();
|
||||
D Cv23 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
Cv12 = madd(Av2, Bv1, Cv12);
|
||||
Cv13 = madd(Av3, Bv1, Cv13);
|
||||
|
||||
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
|
||||
Cv20 = madd(Av0, Bv2, Cv20);
|
||||
Cv21 = madd(Av1, Bv2, Cv21);
|
||||
Cv22 = madd(Av2, Bv2, Cv22);
|
||||
Cv23 = madd(Av3, Bv2, Cv23);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
|
||||
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
|
||||
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
|
||||
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
|
||||
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
|
||||
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x2(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
D Cv12 = set_zero<D>();
|
||||
D Cv13 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
Cv12 = madd(Av2, Bv1, Cv12);
|
||||
Cv13 = madd(Av3, Bv1, Cv13);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
|
||||
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_4x1(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv02 = set_zero<D>();
|
||||
D Cv03 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
V Av2 = load<V>(A + lda * (ii + 2) + l);
|
||||
V Av3 = load<V>(A + lda * (ii + 3) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
Cv02 = madd(Av2, Bv0, Cv02);
|
||||
Cv03 = madd(Av3, Bv0, Cv03);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
|
||||
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_2x2(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
D Cv10 = set_zero<D>();
|
||||
D Cv11 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
|
||||
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
|
||||
Cv10 = madd(Av0, Bv1, Cv10);
|
||||
Cv11 = madd(Av1, Bv1, Cv11);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
|
||||
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
|
||||
}
|
||||
|
||||
inline void gemm_bloc_2x1(int64_t ii, int64_t jj) {
|
||||
size_t vl = vlmax<V>();
|
||||
D Cv00 = set_zero<D>();
|
||||
D Cv01 = set_zero<D>();
|
||||
|
||||
for (int64_t l = 0; l < k; l += vl) {
|
||||
V Av0 = load<V>(A + lda * (ii + 0) + l);
|
||||
V Av1 = load<V>(A + lda * (ii + 1) + l);
|
||||
|
||||
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
|
||||
Cv00 = madd(Av0, Bv0, Cv00);
|
||||
Cv01 = madd(Av1, Bv0, Cv01);
|
||||
}
|
||||
|
||||
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
|
||||
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
inline void gemm_bloc(int64_t ii, int64_t jj) {
|
||||
if constexpr (RM == 4) {
|
||||
if constexpr (RN == 6) { return gemm_bloc_4x6(ii, jj); }
|
||||
if constexpr (RN == 5) { return gemm_bloc_4x5(ii, jj); }
|
||||
if constexpr (RN == 4) { return gemm_bloc_4x4(ii, jj); }
|
||||
if constexpr (RN == 3) { return gemm_bloc_4x3(ii, jj); }
|
||||
if constexpr (RN == 2) { return gemm_bloc_4x2(ii, jj); }
|
||||
if constexpr (RN == 1) { return gemm_bloc_4x1(ii, jj); }
|
||||
} else if constexpr (RM == 2) {
|
||||
if constexpr (RN == 2) { return gemm_bloc_2x2(ii, jj); }
|
||||
if constexpr (RN == 1) { return gemm_bloc_2x1(ii, jj); }
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN, int BM>
|
||||
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
|
||||
GGML_ASSERT(m % (RM * BM) == 0);
|
||||
const int64_t ytiles = m / (RM * BM);
|
||||
const int64_t xtiles = (n + RN -1) / RN;
|
||||
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
|
||||
|
||||
// "round" bloc_size to "nearest" BN
|
||||
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
|
||||
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
|
||||
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
|
||||
const int64_t nb_job = ytiles * NB_BN;
|
||||
|
||||
if (params->ith == 0) {
|
||||
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||
ggml_threadpool_chunk_set(params->threadpool, params->nth);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
int64_t job = params->ith;
|
||||
while (job < nb_job) {
|
||||
const int64_t ii = (job % ytiles) * RM * BM;
|
||||
const int64_t jb = job / ytiles;
|
||||
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
|
||||
|
||||
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
|
||||
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
|
||||
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
|
||||
|
||||
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
|
||||
int64_t jj = jj0;
|
||||
for (; jj < jj1; jj += RN) {
|
||||
gemm_bloc<RM, RN>(ii + bi, jj);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
for (; jj < jj2; jj += RN - 1) {
|
||||
gemm_bloc<RM, RN-1>(ii + bi, jj);
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(jj == jj2);
|
||||
}
|
||||
|
||||
job = ggml_threadpool_chunk_add(params->threadpool, 1);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
return;
|
||||
}
|
||||
|
||||
const ggml_compute_params * params;
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *const C;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
};
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// QUANT ZERO MATRIX MULTIPLICATION
|
||||
|
||||
@@ -2657,6 +3369,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__riscv_zvfh)
|
||||
#if LMUL == 1
|
||||
tinyBLAS_RVV<vfloat32m1_t, vfloat32m1_t, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#elif LMUL == 2
|
||||
tinyBLAS_RVV<vfloat32m2_t, vfloat32m2_t, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#else // LMUL = 4
|
||||
tinyBLAS_RVV<vfloat32m4_t, vfloat32m4_t, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#endif
|
||||
return tb.matmul(m, n);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@@ -2699,6 +3429,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
}
|
||||
#elif defined(__riscv_zvfbfwma)
|
||||
#if LMUL == 1
|
||||
tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
|
||||
k, (const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#elif LMUL == 2
|
||||
tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
|
||||
k, (const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#else // LMUL = 4
|
||||
tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
|
||||
k, (const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#endif
|
||||
return tb.matmul(m, n);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
@@ -2748,6 +3496,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__riscv_zvfh)
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
#if LMUL == 1
|
||||
tinyBLAS_RVV<vfloat32m1_t, vfloat16mf2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#elif LMUL == 2
|
||||
tinyBLAS_RVV<vfloat32m2_t, vfloat16m1_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#else // LMUL = 4
|
||||
tinyBLAS_RVV<vfloat32m4_t, vfloat16m2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
#endif
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -14,10 +14,6 @@
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
@@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND)
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
# 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
@@ -34,12 +35,52 @@ if (CUDAToolkit_FOUND)
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# The CUDA architecture 120f-virtual would in principle work for Blackwell support
|
||||
# but the newly added "f" suffix conflicted with a preexising regex for validating CUDA architectures in CMake.
|
||||
# So either a recent CMake version or one with the backported fix is needed.
|
||||
# The following versions should work:
|
||||
# - CMake >= v3.31.8 && CMake < v4.0.0
|
||||
# - CMake >= v4.0.2
|
||||
# This is NOT documented in the CMake release notes,
|
||||
# check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
|
||||
# However, the architectures 120a-real and 121a-real should work with basically any CMake version and
|
||||
# until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real 121a-real)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
# Replace any 12x-real architectures with 12x{a}-real. FP4 ptx instructions are not available in just 12x
|
||||
if (GGML_NATIVE)
|
||||
set(PROCESSED_ARCHITECTURES "")
|
||||
if (CMAKE_CUDA_ARCHITECTURES_NATIVE)
|
||||
set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
|
||||
else()
|
||||
set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
|
||||
endif()
|
||||
foreach(ARCH ${ARCH_LIST})
|
||||
if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
|
||||
string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
|
||||
message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
|
||||
list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
|
||||
else()
|
||||
list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
|
||||
endif()
|
||||
endforeach()
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
|
||||
else()
|
||||
foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
|
||||
if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
|
||||
message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell-specific optimizations")
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
file(GLOB GGML_HEADERS_CUDA "*.cuh")
|
||||
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
|
||||
|
||||
|
||||
@@ -50,6 +50,10 @@
|
||||
#define GGML_CUDA_CC_TURING 750
|
||||
#define GGML_CUDA_CC_AMPERE 800
|
||||
#define GGML_CUDA_CC_ADA_LOVELACE 890
|
||||
// While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see
|
||||
// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
|
||||
#define GGML_CUDA_CC_BLACKWELL 1200
|
||||
#define GGML_CUDA_CC_RUBIN 1300
|
||||
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
|
||||
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
|
||||
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
|
||||
@@ -246,6 +250,10 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define AMPERE_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL && __CUDA_ARCH__ < GGML_CUDA_CC_RUBIN
|
||||
# define BLACKWELL_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#define CP_ASYNC_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
@@ -316,6 +324,11 @@ static bool cp_async_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
}
|
||||
|
||||
static bool blackwell_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL &&
|
||||
ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_RUBIN;
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
return 64;
|
||||
@@ -701,6 +714,28 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
#endif // CUDART_VERSION >= 12050
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
|
||||
const uint8_t sign_bit = (x < 0.0f) << 3;
|
||||
float ax = fabsf(x) * e;
|
||||
|
||||
// Positive LUT
|
||||
static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
|
||||
|
||||
int best_i = 0;
|
||||
float best_err = fabsf(ax - pos_lut[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < 8; ++i) {
|
||||
const float err = fabsf(ax - pos_lut[i]);
|
||||
if (err < best_err) {
|
||||
best_err = err;
|
||||
best_i = i;
|
||||
}
|
||||
}
|
||||
|
||||
return static_cast<uint8_t>(best_i | sign_bit);
|
||||
}
|
||||
|
||||
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
|
||||
// Precompute mp (m' in the paper) and L such that division
|
||||
// can be computed using a multiply (high 32b of 64b result)
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/device/device_scan.cuh>
|
||||
# include <cub/block/block_scan.cuh>
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
template<typename T, int BLOCK_SIZE>
|
||||
@@ -16,12 +16,14 @@ static __global__ void cumsum_cub_kernel(
|
||||
const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3) {
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;
|
||||
using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;
|
||||
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
__shared__ T block_carry; // carry from previous tile
|
||||
__shared__ typename BlockScanT::TempStorage temp_storage;
|
||||
__shared__ T block_carry;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
constexpr int UNROLL_FACTOR = 4;
|
||||
constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
@@ -39,37 +41,47 @@ static __global__ void cumsum_cub_kernel(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
|
||||
int64_t idx = start + tid;
|
||||
T x = (idx < ne00) ? src_row[idx] : T(0);
|
||||
for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
|
||||
T items[UNROLL_FACTOR];
|
||||
T thread_sum = T(0);
|
||||
|
||||
T inclusive;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_FACTOR; i++) {
|
||||
int64_t idx = start + tid * UNROLL_FACTOR + i;
|
||||
T val = (idx < ne00) ? src_row[idx] : T(0);
|
||||
thread_sum += val;
|
||||
items[i] = thread_sum;
|
||||
}
|
||||
|
||||
// Block-wide scan on thread sums
|
||||
T thread_prefix;
|
||||
T block_total;
|
||||
BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);
|
||||
|
||||
BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
|
||||
__syncthreads();
|
||||
|
||||
T final_val = inclusive + block_carry;
|
||||
|
||||
// store result
|
||||
if (idx < ne00) {
|
||||
dst_row[idx] = final_val;
|
||||
// Add offset to each item and store
|
||||
T thread_offset = thread_prefix - thread_sum + block_carry;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_FACTOR; i++) {
|
||||
int64_t idx = start + tid * UNROLL_FACTOR + i;
|
||||
if (idx < ne00) {
|
||||
dst_row[idx] = items[i] + thread_offset;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update carry for next tile
|
||||
if (tid == 0) {
|
||||
block_carry += block_total;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
}
|
||||
|
||||
// Fallback kernel implementation (original)
|
||||
// Fallback kernel implementation
|
||||
template<typename T>
|
||||
static __global__ void cumsum_kernel(
|
||||
const T * src, T * dst,
|
||||
@@ -86,10 +98,10 @@ static __global__ void cumsum_kernel(
|
||||
const int warps_per_block = blockDim.x / warp_size;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float * s_vals = smem;
|
||||
float * s_warp_sums = smem + blockDim.x;
|
||||
float * s_carry = smem + blockDim.x + warps_per_block;
|
||||
float * s_chunk_total = s_carry + 1;
|
||||
float * s_vals = smem;
|
||||
float * s_warp_sums = smem + blockDim.x;
|
||||
float * s_carry = smem + blockDim.x + warps_per_block;
|
||||
float * s_chunk_total = s_carry + 1;
|
||||
|
||||
// Initialize carry
|
||||
if (tid == 0) {
|
||||
@@ -107,21 +119,39 @@ static __global__ void cumsum_kernel(
|
||||
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
|
||||
for (int64_t start = 0; start < ne00; start += blockDim.x) {
|
||||
int64_t idx = start + tid;
|
||||
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;
|
||||
// register blocking: process 4 elements per thread to hide latency
|
||||
// and reduce synchronization overhead
|
||||
constexpr int num_unroll = 4;
|
||||
T temp[num_unroll];
|
||||
|
||||
// 1. Warp inclusive scan
|
||||
for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
|
||||
int64_t idx = i + tid * num_unroll;
|
||||
|
||||
// thread local sequential scan
|
||||
temp[0] = (idx < ne00 ? src_row[idx] : T(0));
|
||||
#pragma unroll
|
||||
for (int64_t j = 1; j < num_unroll; j++) {
|
||||
temp[j] = temp[j - 1];
|
||||
if (idx + j < ne00) {
|
||||
temp[j] += src_row[idx + j];
|
||||
} else {
|
||||
temp[j] += 0;
|
||||
}
|
||||
}
|
||||
|
||||
// last emenent is sum of all values assigned to thread
|
||||
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
|
||||
|
||||
// Warp inclusive scan
|
||||
val = warp_prefix_inclusive_sum<T, warp_size>(val);
|
||||
s_vals[tid] = val;
|
||||
|
||||
// Store warp total
|
||||
if (lane == warp_size - 1) {
|
||||
s_warp_sums[warp] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// 2. Exclusive scan of warp sums (warp 0 only)
|
||||
// Exclusive scan of warp sums (warp 0 only)
|
||||
if (warp == 0) {
|
||||
float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
|
||||
float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
|
||||
@@ -134,18 +164,24 @@ static __global__ void cumsum_kernel(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// write back results
|
||||
float carry = *s_carry;
|
||||
float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
|
||||
if (idx < ne00) {
|
||||
dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
|
||||
// calculate sum offset for this thread
|
||||
float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
|
||||
|
||||
#pragma unroll
|
||||
for (int32_t j = 0; j < num_unroll; j++) {
|
||||
if (idx + j < ne00) {
|
||||
dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update carry for next chunk
|
||||
if (tid == 0) {
|
||||
*s_carry += *s_chunk_total;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -177,7 +213,7 @@ static void cumsum_cuda(
|
||||
const int warps_per_block = block_size / warp_size;
|
||||
const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
|
||||
|
||||
if (use_cub) {
|
||||
if (use_cub && ne00 >= 1024) {
|
||||
cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
|
||||
src, dst,
|
||||
ne00, ne01, ne02, ne03,
|
||||
|
||||
@@ -2211,7 +2211,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
@@ -2219,7 +2219,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
|
||||
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
|
||||
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
@@ -3076,8 +3076,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -3085,7 +3088,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -3094,8 +3101,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
|
||||
// Heuristic for block size selection to optimize occupancy.
|
||||
// See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
|
||||
@@ -900,6 +900,27 @@ namespace ggml_cuda_mma {
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
|
||||
const tile<16, 8, int> & A,
|
||||
const tile<8, 8, int> & B,
|
||||
uint32_t a_scale,
|
||||
uint32_t b_scale) {
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
float * Dxi = (float *) D.x;
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
|
||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
|
||||
"%10, {0, 0}, %11, {0, 0};"
|
||||
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "common.cuh"
|
||||
#include "mmq.cuh"
|
||||
#include "quantize.cuh"
|
||||
#include "mmid.cuh"
|
||||
@@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q(
|
||||
const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| GGML_CUDA_CC_IS_CDNA(cc);
|
||||
|
||||
// TODO: tighter pool buffer size vs q8 path
|
||||
const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
|
||||
|
||||
if (!ids) {
|
||||
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
|
||||
@@ -123,12 +127,24 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
|
||||
if (use_native_mxfp4) {
|
||||
static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
|
||||
quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
|
||||
ne11, ne12, ne13, stream);
|
||||
|
||||
} else {
|
||||
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
|
||||
ne11, ne12, ne13, stream);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
// Stride depends on quantization format
|
||||
const int64_t s12 = use_native_mxfp4 ?
|
||||
ne11 * ne10_padded * sizeof(block_fp4_mmq) /
|
||||
(8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
|
||||
:
|
||||
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
const mmq_args args = {
|
||||
@@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[2] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
|
||||
if (use_native_mxfp4) {
|
||||
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
} else {
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
|
||||
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
@@ -236,7 +259,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
return false;
|
||||
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||
@@ -297,7 +320,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
if (GGML_CUDA_CC_IS_CDNA3(cc)) {
|
||||
return true;
|
||||
}
|
||||
if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
|
||||
if (n_experts > 64 || ne11 <= 128) {
|
||||
return true;
|
||||
}
|
||||
if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
|
||||
return true;
|
||||
}
|
||||
if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
|
||||
|
||||
@@ -11,6 +11,7 @@ using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
|
||||
#define MMQ_ITER_K 256
|
||||
#define MMQ_ITER_K_MXFP4_FP4 512
|
||||
#define MMQ_NWARPS 8
|
||||
|
||||
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
|
||||
@@ -44,8 +45,15 @@ struct block_q8_1_mmq {
|
||||
};
|
||||
int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
|
||||
};
|
||||
|
||||
struct block_fp4_mmq {
|
||||
uint32_t d4[4]; // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
|
||||
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
|
||||
static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
|
||||
static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size");
|
||||
|
||||
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
|
||||
switch (type_x) {
|
||||
@@ -129,6 +137,14 @@ static int get_mmq_y_host(const int cc) {
|
||||
((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
|
||||
#else
|
||||
return MMQ_ITER_K;
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA1)
|
||||
@@ -191,6 +207,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
|
||||
}
|
||||
|
||||
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
|
||||
@@ -201,6 +218,8 @@ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4");
|
||||
|
||||
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
switch (type) {
|
||||
@@ -209,6 +228,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
// tile sizes are the same for Q8_1 and FP4 for blackwell
|
||||
case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
@@ -228,7 +248,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
}
|
||||
|
||||
// block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales)
|
||||
#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1)
|
||||
#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1)
|
||||
#define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K
|
||||
|
||||
static int mmq_get_granularity_host(const int mmq_x, const int cc) {
|
||||
if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
|
||||
@@ -761,6 +782,50 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check>
|
||||
static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x,
|
||||
int * __restrict__ x_tile,
|
||||
const int kbx0,
|
||||
const int i_max,
|
||||
const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
int * x_qs = (int *) x_tile;
|
||||
uint32_t * x_sc = (uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
|
||||
|
||||
const int txi = threadIdx.x;
|
||||
|
||||
constexpr int iter_k = get_iter_k(GGML_TYPE_MXFP4);
|
||||
|
||||
constexpr int threads_per_row = iter_k / QK_MXFP4; // each thread processes 1 block
|
||||
constexpr int rows_per_warp = warp_size / threads_per_row;
|
||||
const int kbx = txi % threads_per_row;
|
||||
const int row_in_warp = txi / threads_per_row;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
|
||||
int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
|
||||
|
||||
if constexpr (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx;
|
||||
|
||||
// quantize_mxfp4_mmq permutes nibbles to match the quantized format
|
||||
const int k0 = kbx * 4;
|
||||
memcpy(x_qs + i * MMQ_MMA_TILE_X_K_FP4 + k0, bxi->qs, 16);
|
||||
|
||||
// Load E8M0 scales: pack 2 consecutive scales into one uint32
|
||||
if (kbx % 2 == 0) {
|
||||
uint32_t e = bxi->e;
|
||||
e |= ((bxi + 1)->e << 8);
|
||||
x_sc[i * MMQ_MMA_TILE_X_K_FP4 + kbx / 2] = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
@@ -931,6 +996,78 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
|
||||
const int * __restrict__ y,
|
||||
float * __restrict__ sum,
|
||||
const int k00) {
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<8, 8, int> tile_B;
|
||||
typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = 2 * granularity;
|
||||
constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
|
||||
|
||||
// Match layout from load_tiles_mxfp4_fp4
|
||||
const int * x_qs = (const int *) x;
|
||||
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const uint32_t * y_sc = (const uint32_t *) y;
|
||||
|
||||
// tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
|
||||
tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
|
||||
uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
|
||||
|
||||
// Block scale
|
||||
// Each thread has to point to a 4 byte scale value
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
#pragma unroll
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
|
||||
MMQ_MMA_TILE_X_K_FP4);
|
||||
|
||||
// based on block-scaling document, 2 threads in each quad need to supply to the scale value
|
||||
const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
|
||||
scaleA[n][k01 / (2 * QI_MXFP4)] =
|
||||
*(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
|
||||
#pragma unroll
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
|
||||
tile_B B;
|
||||
uint32_t scaleB; // 2xN scales
|
||||
|
||||
load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
|
||||
|
||||
scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
|
||||
mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
@@ -3109,8 +3246,13 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q8_0> {
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
|
||||
static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
|
||||
#else
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
@@ -3243,17 +3385,26 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, need_check>;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
// FP4 tile stores 8 blocks
|
||||
constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
|
||||
#else
|
||||
constexpr int ne_block = 4 * QK8_1;
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
|
||||
|
||||
constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int);
|
||||
|
||||
for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
|
||||
load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
|
||||
|
||||
{
|
||||
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz;
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
|
||||
for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
|
||||
int l = l0 + threadIdx.y*warp_size + threadIdx.x;
|
||||
|
||||
tile_y[l] = by0[l];
|
||||
@@ -3267,9 +3418,9 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
__syncthreads();
|
||||
|
||||
{
|
||||
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz);
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
|
||||
for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
|
||||
int l = l0 + threadIdx.y*warp_size + threadIdx.x;
|
||||
|
||||
tile_y[l] = by0[l];
|
||||
@@ -3401,8 +3552,10 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
@@ -3463,7 +3616,7 @@ static __global__ void mul_mat_q(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
|
||||
offset_dst += it*mmq_y;
|
||||
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
@@ -3530,7 +3683,7 @@ static __global__ void mul_mat_q(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
|
||||
offset_dst += it*mmq_y;
|
||||
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
@@ -3553,7 +3706,9 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int ncols_max) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
@@ -3711,7 +3866,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
|
||||
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
|
||||
const size_t nbs_ids = mmq_x*sizeof(int);
|
||||
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
|
||||
const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq));
|
||||
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
|
||||
}
|
||||
|
||||
@@ -3927,4 +4082,4 @@ void ggml_cuda_op_mul_mat_q(
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream);
|
||||
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
|
||||
|
||||
@@ -47,6 +47,131 @@ static __global__ void quantize_q8_1(
|
||||
y[ib].ds = make_half2(d, sum);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
|
||||
if (!(amax > 0.0f)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// FP4 E2M1: max exponent (unbiased) is 2.
|
||||
constexpr int FP4_E2M1_EMAX = 2;
|
||||
|
||||
const float e = log2f(amax);
|
||||
|
||||
// "even" -> round-to-nearest integer, ties-to-even
|
||||
const int e_int = __float2int_rn(e);
|
||||
|
||||
const int shared_exp = e_int - FP4_E2M1_EMAX;
|
||||
|
||||
int biased = shared_exp + 127;
|
||||
|
||||
biased = max(biased, 0);
|
||||
biased = min(biased, 254);
|
||||
|
||||
return static_cast<uint8_t>(biased);
|
||||
}
|
||||
|
||||
// quantize values in the format mxfp4 is stored which is interleaved nibbles
|
||||
// i.e. a block a0-a31 is represented as a0a16,a1a17 ...a15a31
|
||||
static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
|
||||
const int32_t * __restrict__ ids,
|
||||
void * __restrict__ vy,
|
||||
const int64_t ne00,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t ne0,
|
||||
const int ne1,
|
||||
const int ne2) {
|
||||
constexpr int vals_per_scale = 32;
|
||||
constexpr int vals_per_warp = 2 * vals_per_scale; // Each warp processes 2 blocks of 32 = 64 values
|
||||
|
||||
const int warp_id = threadIdx.y;
|
||||
const int lane_id_32 = threadIdx.x;
|
||||
|
||||
const int nwarps = blockDim.y;
|
||||
|
||||
const int64_t warp_start_offset = (blockIdx.y * nwarps + warp_id) * vals_per_warp;
|
||||
|
||||
if (warp_start_offset >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
const int64_t i01 = ids ? ids[i1] : i1;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i03 = i3;
|
||||
|
||||
block_fp4_mmq * y = (block_fp4_mmq *) vy;
|
||||
|
||||
const int64_t block_fp4_mmq_size = 8 * QK_MXFP4; // 256 values
|
||||
const int64_t ib0 = blockIdx.z * ((int64_t) ne1 * (ne0 / block_fp4_mmq_size));
|
||||
const int64_t ib = ib0 + (warp_start_offset / block_fp4_mmq_size) * ne1 + blockIdx.x;
|
||||
const int64_t quad_idx_in_block = (warp_start_offset % block_fp4_mmq_size) / vals_per_warp;
|
||||
|
||||
const int group_id = lane_id_32 / 4;
|
||||
const int lane_in_group = lane_id_32 % 4;
|
||||
const int base = group_id * 2;
|
||||
char2 * yqs2 = (char2 *) y[ib].qs;
|
||||
|
||||
const int64_t base_pos = i03 * s03 + i02 * s02 + i01 * s01;
|
||||
|
||||
uint8_t scales[2];
|
||||
|
||||
#pragma unroll
|
||||
for (int b = 0; b < 2; ++b) {
|
||||
const int64_t i0 = warp_start_offset + b * vals_per_scale + lane_id_32;
|
||||
const float xi = (i0 < ne00) ? x[base_pos + i0] : 0.0f;
|
||||
|
||||
float amax = fabsf(xi);
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
|
||||
}
|
||||
|
||||
const uint8_t e = compute_e8m0_scale(amax);
|
||||
scales[b] = e;
|
||||
const float inv_s = (amax == 0.0f) ? 0.0f : __frcp_rn(ggml_cuda_e8m0_to_fp32(e));
|
||||
|
||||
#if CUDART_VERSION >= 12080
|
||||
const float scaled_val = xi * inv_s;
|
||||
|
||||
const float val0 = __shfl_sync(0xFFFFFFFF, scaled_val, base, WARP_SIZE);
|
||||
const float val1 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 16, WARP_SIZE);
|
||||
const float val2 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 1, WARP_SIZE);
|
||||
const float val3 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 17, WARP_SIZE);
|
||||
|
||||
if (lane_in_group == 0) {
|
||||
__nv_fp4x4_e2m1 fp4_packed(make_float4(val0, val1, val2, val3));
|
||||
|
||||
yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = *(char2 *) &fp4_packed;
|
||||
}
|
||||
#else
|
||||
// Fallback: manual FP4 conversion using LUT
|
||||
const uint8_t q_val = ggml_cuda_float_to_fp4_e2m1(xi, inv_s);
|
||||
|
||||
const uint8_t q_lo_0 = __shfl_sync(0xFFFFFFFF, q_val, base, WARP_SIZE);
|
||||
const uint8_t q_lo_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 1, WARP_SIZE);
|
||||
const uint8_t q_hi_0 = __shfl_sync(0xFFFFFFFF, q_val, base + 16, WARP_SIZE);
|
||||
const uint8_t q_hi_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 17, WARP_SIZE);
|
||||
|
||||
if (lane_in_group == 0) {
|
||||
char2 q;
|
||||
q.x = (q_hi_0 << 4) | q_lo_0;
|
||||
q.y = (q_hi_1 << 4) | q_lo_1;
|
||||
yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = q;
|
||||
}
|
||||
#endif // CUDART_VERSION >= 12080
|
||||
}
|
||||
|
||||
if (lane_id_32 == 0) {
|
||||
// Store 2 scales packed into 1 uint32
|
||||
y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0];
|
||||
}
|
||||
}
|
||||
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
|
||||
@@ -190,3 +315,29 @@ void quantize_mmq_q8_1_cuda(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_mmq_mxfp4_cuda(const float * x,
|
||||
const int32_t * ids,
|
||||
void * vy,
|
||||
[[maybe_unused]] const ggml_type type_src0,
|
||||
const int64_t ne00,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t ne0,
|
||||
const int64_t ne1,
|
||||
const int64_t ne2,
|
||||
const int64_t ne3,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
|
||||
|
||||
constexpr int nwarps = 8;
|
||||
constexpr int vals_per_warp = 2 * QK_MXFP4;
|
||||
constexpr int vals_per_block = nwarps * vals_per_warp;
|
||||
|
||||
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
|
||||
const dim3 block_size(WARP_SIZE, nwarps, 1);
|
||||
|
||||
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
}
|
||||
|
||||
@@ -25,3 +25,17 @@ void quantize_mmq_q8_1_cuda(
|
||||
const float * x, const int32_t * ids, void * vy,
|
||||
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
||||
|
||||
void quantize_mmq_mxfp4_cuda(const float * x,
|
||||
const int32_t * ids,
|
||||
void * vy,
|
||||
ggml_type type_src0,
|
||||
int64_t ne00,
|
||||
int64_t s01,
|
||||
int64_t s02,
|
||||
int64_t s03,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3,
|
||||
cudaStream_t stream);
|
||||
|
||||
@@ -268,7 +268,23 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
|
||||
const ggml_tensor * weights,
|
||||
const ggml_tensor * get_rows,
|
||||
const ggml_tensor * argsort,
|
||||
const ggml_tensor * clamp,
|
||||
int n_expert) {
|
||||
ggml_tensor * probs = get_rows->src[0];
|
||||
if (probs->op != GGML_OP_RESHAPE) {
|
||||
return false;
|
||||
}
|
||||
probs = probs->src[0];
|
||||
ggml_tensor * selection_probs = argsort->src[0];
|
||||
|
||||
if (probs != selection_probs) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
@@ -288,7 +304,6 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
|
||||
return false;
|
||||
}
|
||||
|
||||
const int n_expert = softmax->ne[0];
|
||||
// n_expert must be a power of 2
|
||||
if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
|
||||
return false;
|
||||
|
||||
@@ -11,6 +11,11 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const bool delayed_softmax = false,
|
||||
ggml_tensor * weight_clamp = nullptr);
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
|
||||
const ggml_tensor * weights,
|
||||
const ggml_tensor * get_rows,
|
||||
const ggml_tensor * argsort,
|
||||
const ggml_tensor * clamp,
|
||||
int n_expert);
|
||||
|
||||
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
|
||||
|
||||
4
ggml/src/ggml-cuda/vendors/cuda.h
vendored
4
ggml/src/ggml-cuda/vendors/cuda.h
vendored
@@ -10,6 +10,10 @@
|
||||
#include <cuda_fp8.h>
|
||||
#endif // CUDART_VERSION >= 12050
|
||||
|
||||
#if CUDART_VERSION >= 12080
|
||||
#include <cuda_fp4.h>
|
||||
#endif // CUDART_VERSION >= 12080
|
||||
|
||||
#if CUDART_VERSION < 11020
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,7 @@ extern "C" {
|
||||
#include <AEEStdErr.h>
|
||||
#include <inttypes.h>
|
||||
#include <remote.h>
|
||||
#include <rpcmem.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/* Offset to differentiate HLOS and Hexagon error codes.
|
||||
|
||||
@@ -263,7 +263,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
|
||||
struct htp_spad * dst_spad,
|
||||
uint32_t nth,
|
||||
uint32_t ith,
|
||||
uint32_t src0_nrows_per_thread) {
|
||||
uint32_t src0_nrows_per_thread,
|
||||
dma_queue * dma_queue) {
|
||||
htp_act_preamble2;
|
||||
|
||||
uint64_t t1, t2;
|
||||
@@ -271,6 +272,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
|
||||
|
||||
const size_t src0_row_size = nb01;
|
||||
const size_t dst_row_size = nb1;
|
||||
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
|
||||
|
||||
const uint32_t src0_nrows = ne01 * ne02 * ne03;
|
||||
|
||||
@@ -282,60 +285,81 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
|
||||
return;
|
||||
}
|
||||
|
||||
int is_aligned = 1;
|
||||
int opt_path = 0;
|
||||
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
|
||||
is_aligned = 0;
|
||||
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
|
||||
}
|
||||
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
|
||||
opt_path = 1;
|
||||
const uint8_t * data_src0 = (const uint8_t *) src0->data;
|
||||
uint8_t * data_dst = (uint8_t *) dst->data;
|
||||
|
||||
uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
|
||||
uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
|
||||
|
||||
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
|
||||
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
|
||||
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
|
||||
|
||||
// In gelu = x*sigmoid(x*1.702)
|
||||
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
|
||||
|
||||
if (BLOCK == 0) {
|
||||
FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
|
||||
src0_spad->size_per_thread, src0_row_size_aligned);
|
||||
return;
|
||||
}
|
||||
|
||||
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
|
||||
uint8_t * restrict data_dst = (uint8_t *) dst->data;
|
||||
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
|
||||
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
|
||||
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
|
||||
|
||||
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
|
||||
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
|
||||
// Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
|
||||
src0_row_size_aligned, src0_row_size, block_size);
|
||||
}
|
||||
|
||||
const int BLOCK = 8;
|
||||
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
|
||||
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
|
||||
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
|
||||
|
||||
// Prefetch next block
|
||||
if (block_end < src0_end_row) {
|
||||
const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
|
||||
htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
|
||||
}
|
||||
float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
|
||||
float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
|
||||
|
||||
// Process rows in current block
|
||||
for (uint32_t ib = ir; ib < block_end; ib++) {
|
||||
const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
|
||||
float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
|
||||
for (uint32_t ib = 0; ib < block_size; ib++) {
|
||||
const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
|
||||
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
|
||||
|
||||
// gelu = x * sigmoid(1.702 * x) // current implementation
|
||||
if (1 == opt_path) {
|
||||
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
|
||||
} else {
|
||||
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
|
||||
}
|
||||
hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
|
||||
hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
|
||||
hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
|
||||
}
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
|
||||
dst_row_size, dst_row_size_aligned, block_size);
|
||||
|
||||
// prefetch N+2 loop iteration if any
|
||||
const uint32_t pref_block = (ir + BLOCK * 2);
|
||||
if (pref_block < src0_end_row) {
|
||||
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
|
||||
src0_row_size_aligned, src0_row_size, pref_block_size);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
|
||||
FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
|
||||
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
|
||||
struct htp_ops_context * octx = (struct htp_ops_context *) data;
|
||||
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
|
||||
octx->src0_nrows_per_thread);
|
||||
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -468,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
|
||||
const uint32_t n_threads = octx->n_threads;
|
||||
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
||||
|
||||
const size_t src0_row_size = src0->nb[1];
|
||||
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
|
||||
const size_t dst_row_size = dst->nb[1];
|
||||
size_t src0_row_size = src0->nb[1];
|
||||
size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
|
||||
size_t dst_row_size = dst->nb[1];
|
||||
|
||||
const bool src1_valid = src1->ne[0];
|
||||
if (!src1_valid) {
|
||||
src1_row_size = src0_row_size;
|
||||
}
|
||||
|
||||
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
|
||||
const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
|
||||
// VTCM scratchpads for all tensors
|
||||
// N rows per thread, padded to HVX vector size
|
||||
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
|
||||
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
|
||||
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
|
||||
|
||||
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
|
||||
size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
|
||||
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
|
||||
|
||||
// Make sure the reserved vtcm size is sufficient
|
||||
if(vtcm_row_per_thread ==0){
|
||||
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
|
||||
spad_size_per_row * n_threads);
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
|
||||
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
|
||||
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
|
||||
|
||||
octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
|
||||
octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
||||
|
||||
if (src1->ne[0]) {
|
||||
FARF(HIGH,
|
||||
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
|
||||
FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
|
||||
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
|
||||
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
|
||||
octx->dst_spad.size);
|
||||
@@ -492,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
|
||||
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
|
||||
}
|
||||
|
||||
// Make sure the reserved vtcm size is sufficient
|
||||
if (octx->ctx->vtcm_size < spad_size) {
|
||||
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
|
||||
spad_size);
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
uint32_t n_jobs = MIN(n_threads, src0_nrows);
|
||||
|
||||
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
|
||||
}
|
||||
|
||||
@@ -34,12 +34,12 @@ dma_queue * dma_queue_create(size_t capacity) {
|
||||
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
||||
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
||||
|
||||
q->dst = (void **) memalign(4, capacity * sizeof(void *));
|
||||
memset(q->dst, 0, capacity * sizeof(void *));
|
||||
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
|
||||
memset(q->dptr, 0, capacity * sizeof(dma_ptr));
|
||||
|
||||
q->tail = &q->desc[capacity - 1];
|
||||
|
||||
if (!q->desc && !q->dst) {
|
||||
if (!q->desc && !q->dptr) {
|
||||
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
|
||||
return NULL;
|
||||
}
|
||||
@@ -54,16 +54,10 @@ void dma_queue_delete(dma_queue * q) {
|
||||
return;
|
||||
}
|
||||
free(q->desc);
|
||||
free(q->dst);
|
||||
free(q->dptr);
|
||||
free(q);
|
||||
}
|
||||
|
||||
void dma_queue_flush(dma_queue * q) {
|
||||
while (1) {
|
||||
uint32_t s = dmwait() & 0x3;
|
||||
if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
q->tail = NULL;
|
||||
while (dma_queue_pop(q).dst != NULL) ;
|
||||
}
|
||||
|
||||
@@ -11,10 +11,15 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
void *dst;
|
||||
const void *src;
|
||||
} dma_ptr;
|
||||
|
||||
typedef struct {
|
||||
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
|
||||
hexagon_udma_descriptor_type1_t * tail; // tail pointer
|
||||
void ** dst; // dst pointers
|
||||
dma_ptr * dptr; // dst/src pointers
|
||||
uint32_t push_idx;
|
||||
uint32_t pop_idx;
|
||||
uint32_t capacity;
|
||||
@@ -49,13 +54,20 @@ static inline unsigned int dmwait(void) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool dma_queue_push(dma_queue * q,
|
||||
void * dst,
|
||||
const void * src,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t nrows) {
|
||||
static inline dma_ptr dma_make_ptr(void *dst, const void *src)
|
||||
{
|
||||
dma_ptr p = { dst, src };
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline bool dma_queue_push(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t width, // width in bytes. number of bytes to transfer per row
|
||||
size_t nrows) {
|
||||
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
||||
FARF(ERROR, "dma-push: queue full\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -75,18 +87,18 @@ static inline bool dma_queue_push(dma_queue * q,
|
||||
#endif
|
||||
desc->order = 0;
|
||||
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
|
||||
desc->src = (void *) src;
|
||||
desc->dst = (void *) dst;
|
||||
desc->src = (void *) dptr.src;
|
||||
desc->dst = (void *) dptr.dst;
|
||||
desc->allocation = 0;
|
||||
desc->padding = 0;
|
||||
desc->roiwidth = src_row_size;
|
||||
desc->roiwidth = width;
|
||||
desc->roiheight = nrows;
|
||||
desc->srcstride = src_row_size;
|
||||
desc->dststride = dst_row_size;
|
||||
desc->srcwidthoffset = 0;
|
||||
desc->dstwidthoffset = 0;
|
||||
|
||||
q->dst[q->push_idx] = dst;
|
||||
q->dptr[q->push_idx] = dptr;
|
||||
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = desc;
|
||||
@@ -96,9 +108,28 @@ static inline bool dma_queue_push(dma_queue * q,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline uint8_t * dma_queue_pop(dma_queue * q) {
|
||||
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t nrows) {
|
||||
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
|
||||
}
|
||||
|
||||
|
||||
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
|
||||
dma_ptr dptr,
|
||||
size_t dst_row_size,
|
||||
size_t src_row_size,
|
||||
size_t nrows) {
|
||||
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
|
||||
}
|
||||
|
||||
static inline dma_ptr dma_queue_pop(dma_queue * q) {
|
||||
dma_ptr dptr = { NULL };
|
||||
|
||||
if (q->push_idx == q->pop_idx) {
|
||||
return NULL;
|
||||
return dptr;
|
||||
}
|
||||
|
||||
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
|
||||
@@ -112,11 +143,11 @@ static inline uint8_t * dma_queue_pop(dma_queue * q) {
|
||||
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
|
||||
}
|
||||
|
||||
uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
|
||||
dptr = q->dptr[q->pop_idx];
|
||||
|
||||
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
|
||||
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
|
||||
return dst;
|
||||
return dptr;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -980,8 +980,6 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
||||
int step_of_1 = num_elems >> 5;
|
||||
int remaining = num_elems - step_of_1 * VLEN_FP32;
|
||||
|
||||
assert(remaining == 0);
|
||||
|
||||
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
@@ -996,8 +994,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
||||
for (int i = 0; i < step_of_1; i++) {
|
||||
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
|
||||
}
|
||||
}
|
||||
|
||||
if (remaining > 0) {
|
||||
const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32;
|
||||
float * dstf = (float *) dst + step_of_1*VLEN_FP32;
|
||||
|
||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||
HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
|
||||
hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
|
||||
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
||||
|
||||
@@ -299,7 +299,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
|
||||
ctx->n_threads = n_hvx;
|
||||
for (int i = 0; i < ctx->n_threads; i++) {
|
||||
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
|
||||
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
|
||||
ctx->dma[i] = dma_queue_create(64);
|
||||
}
|
||||
|
||||
// init worker pool
|
||||
|
||||
@@ -1127,13 +1127,13 @@ static void matmul(struct htp_matmul_type * mt,
|
||||
if (is0 >= HTP_SPAD_SRC0_NROWS) {
|
||||
break;
|
||||
}
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
|
||||
// Process src0 rows
|
||||
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
|
||||
@@ -1146,7 +1146,7 @@ static void matmul(struct htp_matmul_type * mt,
|
||||
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
|
||||
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
|
||||
if (pr0 < src0_end_row_x2) {
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
}
|
||||
@@ -1155,9 +1155,9 @@ static void matmul(struct htp_matmul_type * mt,
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const int is0 = (ir0 - src0_start_row);
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
|
||||
@@ -1229,20 +1229,20 @@ static void matvec(struct htp_matmul_type * mt,
|
||||
if (is0 >= HTP_SPAD_SRC0_NROWS) {
|
||||
break;
|
||||
}
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
|
||||
// Process src0 rows
|
||||
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
|
||||
|
||||
// Prefetch next (n + spad_nrows) row
|
||||
const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
|
||||
const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
|
||||
if (pr0 < src0_end_row_x2) {
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
}
|
||||
@@ -1251,9 +1251,9 @@ static void matvec(struct htp_matmul_type * mt,
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
const uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
|
||||
}
|
||||
|
||||
@@ -1343,13 +1343,13 @@ static void matmul_id(struct htp_matmul_type * mt,
|
||||
if (is0 >= HTP_SPAD_SRC0_NROWS) {
|
||||
break;
|
||||
}
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
|
||||
// Process src0 rows
|
||||
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
|
||||
for (uint32_t cid = 0; cid < cne1; ++cid) {
|
||||
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
|
||||
@@ -1368,7 +1368,7 @@ static void matmul_id(struct htp_matmul_type * mt,
|
||||
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
|
||||
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
|
||||
if (pr0 < src0_end_row_x2) {
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
}
|
||||
@@ -1377,9 +1377,9 @@ static void matmul_id(struct htp_matmul_type * mt,
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
|
||||
for (uint32_t cid = 0; cid < cne1; ++cid) {
|
||||
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
|
||||
@@ -1467,20 +1467,20 @@ static void matvec_id(struct htp_matmul_type * mt,
|
||||
if (is0 >= HTP_SPAD_SRC0_NROWS) {
|
||||
break;
|
||||
}
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
|
||||
// Process src0 rows
|
||||
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
|
||||
|
||||
// Prefetch next (n + spad_nrows) row
|
||||
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
|
||||
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
|
||||
if (pr0 < src0_end_row_x2) {
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 2);
|
||||
}
|
||||
}
|
||||
@@ -1489,9 +1489,9 @@ static void matvec_id(struct htp_matmul_type * mt,
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
|
||||
}
|
||||
}
|
||||
|
||||
153
ggml/src/ggml-hexagon/op-desc.h
Normal file
153
ggml/src/ggml-hexagon/op-desc.h
Normal file
@@ -0,0 +1,153 @@
|
||||
#ifndef OP_DESC_H
|
||||
#define OP_DESC_H
|
||||
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
struct op_desc {
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_dims(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += format_tensor_dims(p, t->src[0]);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_dims(p, t->src[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
// format self dims separately for better visual alignment
|
||||
char self[64];
|
||||
format_tensor_dims(self, t);
|
||||
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
} else {
|
||||
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_strides(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += format_tensor_strides(p, t->src[0]);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_strides(p, t->src[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
// format self dims separately for better visual alignment
|
||||
char self[64];
|
||||
format_tensor_strides(self, t);
|
||||
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
void format_op_types(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", ggml_type_name(t->type));
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
if (t->buffer) {
|
||||
return ggml_backend_buffer_name(t->buffer);
|
||||
}
|
||||
return "NONE";
|
||||
}
|
||||
|
||||
void format_op_buffs(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", tensor_buff_name(t));
|
||||
}
|
||||
|
||||
void format_op_names(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", t->src[0]->name);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", t->src[i]->name);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", t->name);
|
||||
}
|
||||
|
||||
void format(const ggml_tensor * op) {
|
||||
format_op_dims(dims, op);
|
||||
format_op_strides(strides, op);
|
||||
format_op_types(types, op);
|
||||
format_op_buffs(buffs, op);
|
||||
format_op_names(names, op);
|
||||
}
|
||||
|
||||
op_desc() {}
|
||||
op_desc(const ggml_tensor * op) { format(op); }
|
||||
};
|
||||
|
||||
#endif // OP_DESC_H
|
||||
@@ -24,10 +24,6 @@
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
@@ -263,6 +263,32 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
||||
return { type, major, minor, patch };
|
||||
}
|
||||
|
||||
// cl buffer wrapper
|
||||
struct ggml_cl_buffer {
|
||||
cl_mem buffer;
|
||||
size_t size;
|
||||
|
||||
ggml_cl_buffer()
|
||||
: buffer(nullptr), size(0) {}
|
||||
|
||||
~ggml_cl_buffer() {
|
||||
if (buffer) {
|
||||
CL_CHECK(clReleaseMemObject(buffer));
|
||||
}
|
||||
}
|
||||
|
||||
void allocate(cl_context context, size_t new_size) {
|
||||
if (new_size > size) {
|
||||
size = new_size;
|
||||
if (buffer) {
|
||||
CL_CHECK(clReleaseMemObject(buffer));
|
||||
}
|
||||
cl_int err;
|
||||
CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Profiling
|
||||
struct ProfilingInfo {
|
||||
std::string op_name;
|
||||
@@ -376,6 +402,11 @@ struct ggml_backend_opencl_context {
|
||||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
|
||||
// prealloc buffers for transposing weights and activations
|
||||
ggml_cl_buffer prealloc_quant_trans;
|
||||
ggml_cl_buffer prealloc_scales_trans;
|
||||
ggml_cl_buffer prealloc_act_trans;
|
||||
|
||||
cl_program program_add;
|
||||
cl_program program_add_id;
|
||||
cl_program program_clamp;
|
||||
@@ -494,6 +525,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
||||
@@ -634,12 +666,9 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_transpose_32;
|
||||
cl_kernel kernel_transpose_32_16;
|
||||
cl_kernel kernel_transpose_16;
|
||||
cl_kernel kernel_transpose_16_buf;
|
||||
cl_kernel kernel_transpose_16_4x1;
|
||||
|
||||
cl_mem A_s_d_max; // max scale buffer size for transpose
|
||||
cl_mem A_q_d_max; // max weight buffer size for transpose
|
||||
cl_mem B_d_max; // max activation buffer size for transpose
|
||||
|
||||
// Gemm and Gemv related programs, kernels, etc
|
||||
cl_program program_CL_gemm;
|
||||
cl_program program_CL_gemv_general;
|
||||
@@ -806,6 +835,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
|
||||
@@ -2004,7 +2034,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -2596,9 +2627,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
required_B_d_bytes, max_B_d_bytes);
|
||||
}
|
||||
|
||||
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
|
||||
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
|
||||
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
||||
backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
|
||||
backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
|
||||
backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
|
||||
@@ -3603,32 +3634,35 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
// use sub_buffer of max buffer size instead
|
||||
|
||||
size_t q_size_bytes = K * M / 8 * sizeof(float);
|
||||
backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
|
||||
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->A_q_d_max,
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
// cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
bool K_tile_trans = true;
|
||||
if ((K / 32) % 4 != 0){
|
||||
K_tile_trans =false;
|
||||
}
|
||||
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
|
||||
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->A_s_d_max,
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
// cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
// <----------------------------------------------------------------------------------> //
|
||||
@@ -3933,6 +3967,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_int err;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
GGML_ASSERT(M % 4 == 0);
|
||||
|
||||
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
cl_mem buf_trans_q;
|
||||
cl_mem buf_trans_d;
|
||||
|
||||
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
size_q, NULL, &err), err));
|
||||
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
size_d, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_buf;
|
||||
|
||||
// transpose q back
|
||||
cl_int stride_k_q = K/4;
|
||||
size_t local_size_q[3] = {64, 1, 1};
|
||||
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_size_q, local_size_q, 0, NULL, NULL));
|
||||
|
||||
// transpose scales back
|
||||
cl_int stride_k_d = K/32;
|
||||
size_t local_size_d[3] = {64, 1, 1};
|
||||
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_size_d, local_size_d, 0, NULL, NULL));
|
||||
|
||||
// unpack
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, NULL));
|
||||
|
||||
// read back to host
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
CL_CHECK(clReleaseMemObject(buf_trans_q));
|
||||
CL_CHECK(clReleaseMemObject(buf_trans_d));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
@@ -7306,8 +7425,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->B_d_max,
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
|
||||
@@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q4_0_noshuffle(
|
||||
global uchar * src_q,
|
||||
global half * src_d,
|
||||
global struct block_q4_0 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
for (int i = 0; i < QK4_0/4; ++i) {
|
||||
uchar x0 = q[i + 0 ] ;
|
||||
uchar x1 = q[i + QK4_0/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_mxfp4
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
|
||||
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
|
||||
}
|
||||
|
||||
// Transpose treating each element as 16-bit using buffer
|
||||
kernel void kernel_transpose_16_buf(
|
||||
global const ushort * input,
|
||||
global ushort * output,
|
||||
const int ldi,
|
||||
const int ldo
|
||||
) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
output[x*ldo + y] = input[y*ldi + x];
|
||||
}
|
||||
|
||||
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||
kernel void kernel_transpose_32(
|
||||
__read_only image1d_buffer_t input,
|
||||
|
||||
@@ -524,6 +524,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
GGML_LOG_ERROR("Failed to parse endpoint: %s\n", endpoint.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
#ifdef _WIN32
|
||||
@@ -571,6 +572,10 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return ctx->base_ptr;
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
|
||||
}
|
||||
|
||||
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
rpc_tensor result;
|
||||
if (!tensor) {
|
||||
@@ -580,10 +585,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
|
||||
result.id = reinterpret_cast<uint64_t>(tensor);
|
||||
result.type = tensor->type;
|
||||
if (tensor->buffer) {
|
||||
if (tensor->buffer && ggml_backend_buffer_is_rpc(tensor->buffer)) {
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
result.buffer = ctx->remote_ptr;
|
||||
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
|
||||
} else {
|
||||
result.buffer = 0;
|
||||
}
|
||||
@@ -664,10 +669,6 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
if (ggml_backend_buffer_is_rpc(src->buffer)) {
|
||||
// check if src and dst are on the same server
|
||||
@@ -2053,6 +2054,10 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
|
||||
|
||||
static uint32_t ggml_backend_rpc_get_device_count(const char * endpoint) {
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
||||
return 0;
|
||||
}
|
||||
rpc_msg_device_count_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_DEVICE_COUNT, nullptr, 0, &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
51
ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp
Normal file
51
ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp
Normal file
@@ -0,0 +1,51 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
uint32_t nb00;
|
||||
uint32_t nb01;
|
||||
uint32_t a_offset;
|
||||
} p;
|
||||
|
||||
#define BLOCK_SIZE 256
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {uint data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {uint data_d[];};
|
||||
|
||||
shared uint vals[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint expert_id = gl_WorkGroupID.x;
|
||||
const uint num_elements = p.ne00 * p.ne01;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
uint count = 0;
|
||||
for (uint idx = tid; idx < num_elements; idx += BLOCK_SIZE) {
|
||||
const uint i01 = idx / p.ne00;
|
||||
const uint i00 = idx % p.ne00;
|
||||
const uint a = data_a[p.a_offset + i01 * p.nb01 + i00 * p.nb00];
|
||||
|
||||
count += uint(a == expert_id);
|
||||
}
|
||||
|
||||
vals[tid] = count;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] += vals[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
data_d[expert_id] = vals[0];
|
||||
}
|
||||
}
|
||||
@@ -401,13 +401,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
|
||||
const uint qshift = (iqs & 16) >> 2;
|
||||
u8vec4 qs = u8vec4(
|
||||
data_a[a_offset + ib].qs[iq + 0],
|
||||
data_a[a_offset + ib].qs[iq + 1],
|
||||
data_a[a_offset + ib].qs[iq + 2],
|
||||
data_a[a_offset + ib].qs[iq + 3]
|
||||
);
|
||||
qs = (qs >> qshift) & uint8_t(0xF);
|
||||
const u8vec4 qs = unpack8((data_a_packed32[a_offset + ib].qs[iq/4] >> qshift) & 0x0F0F0F0F);
|
||||
|
||||
const float dl = float(int(sl | (sh << 4)) - 32);
|
||||
return dl * vec4(
|
||||
|
||||
@@ -6,4 +6,6 @@ layout (push_constant) uniform parameter
|
||||
uint KY;
|
||||
float param1;
|
||||
float param2;
|
||||
float param3;
|
||||
float param4;
|
||||
} p;
|
||||
|
||||
@@ -19,6 +19,7 @@ layout (push_constant) uniform parameter
|
||||
int s0; int s1;
|
||||
int p0; int p1;
|
||||
int d0; int d1;
|
||||
uint batch_IC;
|
||||
} p;
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
@@ -34,12 +35,12 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
void im2col(const uint y, const uint z) {
|
||||
const uint gidx = gl_GlobalInvocationID.x;
|
||||
|
||||
const uint oh = gl_GlobalInvocationID.y;
|
||||
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
||||
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
||||
const uint oh = y;
|
||||
const uint batch = z / p.IC;
|
||||
const uint ic = z % p.IC;
|
||||
|
||||
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
|
||||
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
|
||||
@@ -101,3 +102,15 @@ void main() {
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
uint y = gl_GlobalInvocationID.y;
|
||||
while (y < p.OH) {
|
||||
uint z = gl_GlobalInvocationID.z;
|
||||
while (z < p.batch_IC) {
|
||||
im2col(y, z);
|
||||
z += gl_NumWorkGroups.z;
|
||||
}
|
||||
y += gl_NumWorkGroups.y;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,36 +11,54 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||
const uint y_idx = i * QUANT_K + 16 * itid;
|
||||
const uint nibble_shift = 4 * (itid & 1);
|
||||
const uint ib32 = itid / 2; // 0..7
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
// Precompute db multiplication factors
|
||||
float db_vals[NUM_ROWS];
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;
|
||||
const float db = d * (0.5 + scale) * 0.25;
|
||||
|
||||
const uint scale_raw = data_a[ibi].scales[ib32];
|
||||
const uint scale = (scale_raw >> nibble_shift) & 0xF;
|
||||
// Merge constant calculations d * (0.5 + scale) * 0.25 = d*0.125 + d*scale*0.25
|
||||
db_vals[n] = d * (0.125f + float(scale) * 0.25f);
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
// Preload grid and sign data for all l values
|
||||
vec4 grid0_vals[2], grid1_vals[2];
|
||||
uint sign_vals[2], sign7_vals[2];
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint qs = data_a[ibi].qs[2 * itid + l];
|
||||
const uint sign = qs >> 9;
|
||||
const uint sign7 = bitCount(sign);
|
||||
const vec4 grid0 = vec4(unpack8(iq2xs_grid[qs & 511].x));
|
||||
const vec4 grid1 = vec4(unpack8(iq2xs_grid[qs & 511].y));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum =
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
temp[j][n] = fma(db, sum, temp[j][n]);
|
||||
sign_vals[l] = qs >> 9;
|
||||
sign7_vals[l] = bitCount(sign_vals[l]);
|
||||
const uvec2 grid_data = iq2xs_grid[qs & 511];
|
||||
grid0_vals[l] = vec4(unpack8(grid_data.x));
|
||||
grid1_vals[l] = vec4(unpack8(grid_data.y));
|
||||
}
|
||||
// Preload B data for all j columns (reduce repeated index calculations)
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (uint l = 0; l < 2; ++l) {
|
||||
const uint sign = sign_vals[l];
|
||||
const uint sign7 = sign7_vals[l];
|
||||
const vec4 grid0 = grid0_vals[l];
|
||||
const vec4 grid1 = grid1_vals[l];
|
||||
// Precompute indices
|
||||
const uint b_idx = (j * p.batch_stride_b + b_offset + y_idx) / 4 + 2 * l;
|
||||
const vec4 b0 = vec4(data_b_v4[b_idx + 0]);
|
||||
const vec4 b4 = vec4(data_b_v4[b_idx + 1]);
|
||||
sum +=
|
||||
fma(FLOAT_TYPE(b0.x), FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x),
|
||||
fma(FLOAT_TYPE(b0.y), FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y),
|
||||
fma(FLOAT_TYPE(b0.z), FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z),
|
||||
fma(FLOAT_TYPE(b0.w), FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w),
|
||||
fma(FLOAT_TYPE(b4.x), FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x),
|
||||
fma(FLOAT_TYPE(b4.y), FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y),
|
||||
fma(FLOAT_TYPE(b4.z), FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z),
|
||||
fma(FLOAT_TYPE(b4.w), FLOAT_TYPE((sign7 & 1) != 0 ? -grid1.w : grid1.w),
|
||||
FLOAT_TYPE(0.0)))))))));
|
||||
}
|
||||
temp[j][n] = fma(FLOAT_TYPE(db_vals[n]), sum, temp[j][n]);
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
|
||||
@@ -68,6 +68,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
@@ -135,13 +136,19 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
||||
#include "mul_mm_funcs.glsl"
|
||||
|
||||
void main() {
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
#else
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
@@ -156,7 +163,6 @@ void main() {
|
||||
const uint blocks_m = (p.M + BM - 1) / BM;
|
||||
const uint ir = gl_WorkGroupID.x % blocks_m;
|
||||
const uint ik = gl_WorkGroupID.x / blocks_m;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
|
||||
const uint WSUBM = WM / WMITER;
|
||||
|
||||
@@ -92,6 +92,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
|
||||
|
||||
shared u16vec4 row_ids[BN];
|
||||
|
||||
@@ -107,11 +108,7 @@ B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const i
|
||||
{
|
||||
const uint row_i = blockCoords[0];
|
||||
|
||||
if (row_i >= _ne1) {
|
||||
return B_TYPE(0.0);
|
||||
}
|
||||
|
||||
const u16vec4 row_idx = row_ids[row_i & (BN - 1)];
|
||||
const u16vec4 row_idx = row_ids[row_i];
|
||||
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
|
||||
|
||||
return ret;
|
||||
@@ -138,6 +135,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
uint ids[16];
|
||||
uint iter = 0;
|
||||
|
||||
uint expert_count = data_expert_count[expert_idx];
|
||||
|
||||
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
|
||||
// prefetch up to 16 elements
|
||||
if (iter == 0) {
|
||||
@@ -185,7 +184,7 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
}
|
||||
_ne1 += total;
|
||||
iter &= 15;
|
||||
if (_ne1 >= (ic + 1) * BN) {
|
||||
if (_ne1 >= (ic + 1) * BN || _ne1 == expert_count) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -194,15 +193,28 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationIndex;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
// initialize to row 0 so we don't need to bounds check
|
||||
if (tid < BN) {
|
||||
row_ids[tid] = u16vec4(0);
|
||||
}
|
||||
#if !defined(NEEDS_INIT_IQ_SHMEM)
|
||||
barrier();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
#endif
|
||||
|
||||
const uint tid = gl_LocalInvocationIndex;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
#else
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
@@ -217,7 +229,6 @@ void main() {
|
||||
const uint blocks_m = (p.M + BM - 1) / BM;
|
||||
const uint ir = gl_WorkGroupID.x % blocks_m;
|
||||
const uint ik = gl_WorkGroupID.x / blocks_m;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
if (bitCount(p.nei0) == 1) {
|
||||
@@ -482,7 +493,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
} else {
|
||||
@@ -490,7 +501,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
@@ -526,7 +537,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
} else {
|
||||
@@ -534,7 +545,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
@@ -571,7 +582,7 @@ void main() {
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
#endif
|
||||
@@ -583,7 +594,7 @@ void main() {
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
#endif
|
||||
|
||||
@@ -159,14 +159,16 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint is = iqs / 8; // 0..15
|
||||
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
|
||||
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
||||
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
||||
|
||||
const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
|
||||
| (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
|
||||
const float dl = float(data_a[ib].d) * float(us - 32);
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)),
|
||||
dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
|
||||
const vec2 qs = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> qsshift) & 0x0303).xy);
|
||||
const vec2 hm = vec2(unpack8(((uint(data_a_packed16[ib].hmask[hmi / 2]) >> (4 * n + halfsplit)) & 0x0101 ^ 0x0101) << 2).xy);
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * (qs.x - hm.x),
|
||||
dl * (qs.y - hm.y));
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -198,8 +200,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m),
|
||||
fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
||||
const vec2 q = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F).xy);
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
|
||||
fma(d, q.y, m));
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -213,8 +217,6 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
|
||||
|
||||
const uint8_t hm = uint8_t(1 << (iqs / 16));
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].dm);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
@@ -234,8 +236,12 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m),
|
||||
fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
||||
const uint qs = (uint(data_a_packed16[ib].qs[qsi / 2]) >> (b * 4)) & 0x0F0F;
|
||||
const uint qh = ((uint(data_a_packed16[ib].qh[qhi / 2]) >> (iqs / 16)) & 0x0101) << 4;
|
||||
const vec2 q = vec2(unpack8(qs | qh).xy);
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, q.x, m),
|
||||
fma(d, q.y, m));
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
@@ -394,11 +400,9 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint signs = pack32(u8vec4(
|
||||
data_a[ib].qs[is+0],
|
||||
data_a[ib].qs[is+1],
|
||||
data_a[ib].qs[is+2],
|
||||
data_a[ib].qs[is+3]
|
||||
const uint signs = pack32(u16vec2(
|
||||
data_a_packed16[ib].qs[is/2],
|
||||
data_a_packed16[ib].qs[is/2+1]
|
||||
));
|
||||
const float db = d * 0.5 * (0.5 + (signs >> 28));
|
||||
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
|
||||
@@ -443,8 +447,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
||||
const uint qshift = (idx & 8) >> 1;
|
||||
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
|
||||
qs = (qs >> qshift) & uint8_t(0xF);
|
||||
u8vec2 qs = unpack8((uint(data_a_packed16[ib].qs[iq/2]) >> qshift) & 0x0F0F).xy;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
||||
|
||||
@@ -13,6 +13,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
uint ids[16];
|
||||
uint iter = 0;
|
||||
|
||||
uint expert_count = data_expert_count[expert_idx];
|
||||
|
||||
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
|
||||
// prefetch up to 16 elements
|
||||
if (iter == 0) {
|
||||
@@ -60,7 +62,7 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
}
|
||||
_ne1 += total;
|
||||
iter &= 15;
|
||||
if (_ne1 >= (ic + 1) * BN) {
|
||||
if (_ne1 >= (ic + 1) * BN || _ne1 == expert_count) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
@@ -104,13 +105,19 @@ block_b_cache cache_b;
|
||||
#include "mul_mmq_funcs.glsl"
|
||||
|
||||
void main() {
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
#else
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
@@ -125,7 +132,6 @@ void main() {
|
||||
const uint blocks_m = (p.M + BM - 1) / BM;
|
||||
const uint ir = gl_WorkGroupID.x % blocks_m;
|
||||
const uint ik = gl_WorkGroupID.x / blocks_m;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
|
||||
const uint WSUBM = WM / WMITER;
|
||||
|
||||
@@ -49,8 +49,8 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
|
||||
uint idst = i1*ne0 + i0;
|
||||
const uint ix = rope_a_coord(i0, i01, i02, p);
|
||||
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS..
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
|
||||
if (p.set_rows_stride != 0) {
|
||||
idst = i01*ne0 + i0;
|
||||
idst += rope_data_i[i02].x * p.set_rows_stride;
|
||||
@@ -91,7 +91,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
|
||||
uint idst = i1*ne0 + i0/2;
|
||||
const uint ix = rope_a_coord(i0/2, i01, i02, p);
|
||||
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS..
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
|
||||
if (p.set_rows_stride != 0) {
|
||||
idst = i01*ne0 + i0/2;
|
||||
@@ -132,9 +132,16 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
|
||||
const uint i01 = i1 % ne1;
|
||||
const uint i02 = i1 / ne1;
|
||||
|
||||
const uint idst = i1*ne0 + i0/2;
|
||||
uint idst = i1*ne0 + i0/2;
|
||||
const uint ix = rope_a_coord(i0/2, i01, i02, p);
|
||||
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
|
||||
if (p.set_rows_stride != 0) {
|
||||
idst = i01*ne0 + i0/2;
|
||||
idst += rope_data_i[i02].x * p.set_rows_stride;
|
||||
}
|
||||
|
||||
if (i0 >= p.n_dims) {
|
||||
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
|
||||
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
void main() {
|
||||
const uint i0 = 2*gl_GlobalInvocationID.y;
|
||||
// i1 is actually i2*nb2+i1, but the rows are contiguous
|
||||
const uint i1 = gl_GlobalInvocationID.x;
|
||||
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
|
||||
if (i1 >= pc.nrows) {
|
||||
return;
|
||||
}
|
||||
rope_multi(i0, i1, pc);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
void main() {
|
||||
const uint i0 = 2*gl_GlobalInvocationID.y;
|
||||
// i1 is actually i2*nb2+i1, but the rows are contiguous
|
||||
const uint i1 = gl_GlobalInvocationID.x;
|
||||
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
|
||||
if (i1 >= pc.nrows) {
|
||||
return;
|
||||
}
|
||||
rope_neox(i0, i1, pc);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
void main() {
|
||||
const uint i0 = 2*gl_GlobalInvocationID.y;
|
||||
// i1 is actually i2*nb2+i1, but the rows are contiguous
|
||||
const uint i1 = gl_GlobalInvocationID.x;
|
||||
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
|
||||
if (i1 >= pc.nrows) {
|
||||
return;
|
||||
}
|
||||
rope_norm(i0, i1, pc);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
struct rope_params {
|
||||
uint rope_mode;
|
||||
uint ncols;
|
||||
uint nrows;
|
||||
uint n_dims;
|
||||
float freq_scale;
|
||||
uint p_delta_rows;
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
void main() {
|
||||
const uint i0 = 2*gl_GlobalInvocationID.y;
|
||||
// i1 is actually i2*nb2+i1, but the rows are contiguous
|
||||
const uint i1 = gl_GlobalInvocationID.x;
|
||||
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
|
||||
if (i1 >= pc.nrows) {
|
||||
return;
|
||||
}
|
||||
rope_vision(i0, i1, pc);
|
||||
}
|
||||
|
||||
@@ -172,16 +172,12 @@ struct block_q8_0
|
||||
float16_t d;
|
||||
int8_t qs[32];
|
||||
};
|
||||
|
||||
struct block_q8_0_packed16
|
||||
{
|
||||
float16_t d;
|
||||
int16_t qs[32/2];
|
||||
};
|
||||
struct block_q8_0_packed32
|
||||
{
|
||||
float16_t d;
|
||||
int32_t qs[32/4];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
#define QUANT_K QUANT_K_Q8_0
|
||||
@@ -189,7 +185,6 @@ struct block_q8_0_packed32
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_q8_0
|
||||
#define A_TYPE_PACKED16 block_q8_0_packed16
|
||||
#define A_TYPE_PACKED32 block_q8_0_packed32
|
||||
#define DATA_A_QUANT_LEGACY
|
||||
#endif
|
||||
|
||||
@@ -201,11 +196,13 @@ struct block_q8_1
|
||||
f16vec2 ds;
|
||||
int8_t qs[32];
|
||||
};
|
||||
|
||||
struct block_q8_1_packed16
|
||||
{
|
||||
f16vec2 ds;
|
||||
int16_t qs[16];
|
||||
};
|
||||
|
||||
struct block_q8_1_packed32
|
||||
{
|
||||
f16vec2 ds;
|
||||
@@ -218,6 +215,7 @@ struct block_q8_1_x4
|
||||
f16vec2 ds[4];
|
||||
int32_t qs[32];
|
||||
};
|
||||
|
||||
struct block_q8_1_x4_packed128
|
||||
{
|
||||
f16vec2 ds[4];
|
||||
@@ -1346,10 +1344,28 @@ struct block_iq4_xs
|
||||
uint8_t qs[QUANT_K_IQ4_XS/2];
|
||||
};
|
||||
|
||||
struct block_iq4_xs_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t scales_h;
|
||||
uint16_t scales_l[QUANT_K_IQ4_XS/128];
|
||||
uint16_t qs[QUANT_K_IQ4_XS/4];
|
||||
};
|
||||
|
||||
struct block_iq4_xs_packed32
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t scales_h;
|
||||
uint32_t scales_l;
|
||||
uint32_t qs[QUANT_K_IQ4_XS/8];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_IQ4_XS)
|
||||
#define QUANT_K QUANT_K_IQ4_XS
|
||||
#define QUANT_R QUANT_R_IQ4_XS
|
||||
#define A_TYPE block_iq4_xs
|
||||
#define A_TYPE_PACKED16 block_iq4_xs_packed16
|
||||
#define A_TYPE_PACKED32 block_iq4_xs_packed32
|
||||
#endif
|
||||
|
||||
#define QUANT_K_IQ4_NL 32
|
||||
|
||||
@@ -21,6 +21,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
#define NEAREST 0
|
||||
#define BILINEAR 1
|
||||
#define BICUBIC 2
|
||||
#define BILINEAR_ANTIALIAS 513
|
||||
|
||||
layout (constant_id = 0) const uint scale_mode = 0;
|
||||
|
||||
@@ -62,6 +63,56 @@ float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
|
||||
return fetch_bilinear(c0, c1, d, i12, i13);
|
||||
}
|
||||
|
||||
float triangle_filter(float x) {
|
||||
return max(1.0f - abs(x), 0.0f);
|
||||
}
|
||||
|
||||
float interpolate_bilinear_antialias(uint i10, uint i11, uint i12, uint i13) {
|
||||
const float support1 = max(1.0f, 1.0f / p.sf1);
|
||||
const float invscale1 = 1.0f / support1;
|
||||
const float support0 = max(1.0f, 1.0f / p.sf0);
|
||||
const float invscale0 = 1.0f / support0;
|
||||
|
||||
const uint i02 = uint(i12 / p.sf2);
|
||||
const uint i03 = uint(i13 / p.sf3);
|
||||
|
||||
const float y = (float(i11) + p.pixel_offset) / p.sf1;
|
||||
const float x = (float(i10) + p.pixel_offset) / p.sf0;
|
||||
|
||||
// the range of source pixels that contribute
|
||||
const int x_min = max(int(x - support0 + p.pixel_offset), 0);
|
||||
const int x_max = min(int(x + support0 + p.pixel_offset), int(p.ne00));
|
||||
const int y_min = max(int(y - support1 + p.pixel_offset), 0);
|
||||
const int y_max = min(int(y + support1 + p.pixel_offset), int(p.ne01));
|
||||
|
||||
// bilinear filter with antialiasing
|
||||
float val = 0.0f;
|
||||
float total_weight = 0.0f;
|
||||
|
||||
for (int sy = y_min; sy < y_max; sy++) {
|
||||
const float weight_y = triangle_filter((sy - y + p.pixel_offset) * invscale1);
|
||||
|
||||
for (int sx = x_min; sx < x_max; sx++) {
|
||||
const float weight_x = triangle_filter((sx - x + p.pixel_offset) * invscale0);
|
||||
const float weight = weight_x * weight_y;
|
||||
|
||||
if (weight <= 0.0f) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float pixel = data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + sy * p.nb01 + sx * p.nb00];
|
||||
val += pixel * weight;
|
||||
total_weight += weight;
|
||||
}
|
||||
}
|
||||
|
||||
if (total_weight > 0.0f) {
|
||||
val /= total_weight;
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
// Bicubic interpolation with alpha = -0.75
|
||||
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
|
||||
const vec4 bcoeffs1 = vec4( 1.25, -2.25, 0.0, 1.0);
|
||||
@@ -118,6 +169,9 @@ void main() {
|
||||
case BICUBIC:
|
||||
result = interpolate_bicubic(i10, i11, i12, i13);
|
||||
break;
|
||||
case BILINEAR_ANTIALIAS:
|
||||
result = interpolate_bilinear_antialias(i10, i11, i12, i13);
|
||||
break;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(result);
|
||||
|
||||
@@ -853,6 +853,8 @@ void process_shaders() {
|
||||
string_to_spv("hardswish_f32", "hardswish.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("abs_f16", "abs.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("abs_f32", "abs.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
@@ -925,6 +927,8 @@ void process_shaders() {
|
||||
string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
string_to_spv("rope_multi_f32_f16", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_multi_f32_f16_rte", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
@@ -941,6 +945,8 @@ void process_shaders() {
|
||||
string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
|
||||
string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("count_experts", "count_experts.comp", merge_maps(base_dict, {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}}));
|
||||
|
||||
for (std::string dim_str : {"", "_3d"}) {
|
||||
for (bool bda : {false, true}) {
|
||||
std::string bda_str = bda ? "_bda" : "";
|
||||
|
||||
35
ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp
Normal file
35
ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp
Normal file
@@ -0,0 +1,35 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
float x = float(data_a[i]);
|
||||
|
||||
float alpha_n = p.param1;
|
||||
float alpha_p = p.param2;
|
||||
float beta = p.param3;
|
||||
float eps = p.param4;
|
||||
|
||||
if (x > 0.0f) {
|
||||
x = alpha_p * x * x + beta * x;
|
||||
} else {
|
||||
const float min_x_eps = min(x, eps);
|
||||
x = (exp(min_x_eps) - 1 - x) * alpha_n + beta * x;
|
||||
}
|
||||
|
||||
data_d[i] = D_TYPE(x);
|
||||
}
|
||||
@@ -181,6 +181,7 @@ class Keys:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||
FREQ_BASE = "{arch}.rope.freq_base"
|
||||
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
|
||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||
@@ -354,6 +355,7 @@ class MODEL_ARCH(IntEnum):
|
||||
STARCODER = auto()
|
||||
REFACT = auto()
|
||||
BERT = auto()
|
||||
MODERN_BERT = auto()
|
||||
NOMIC_BERT = auto()
|
||||
NOMIC_BERT_MOE = auto()
|
||||
NEO_BERT = auto()
|
||||
@@ -375,6 +377,7 @@ class MODEL_ARCH(IntEnum):
|
||||
PHIMOE = auto()
|
||||
PLAMO = auto()
|
||||
PLAMO2 = auto()
|
||||
PLAMO3 = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
@@ -447,6 +450,8 @@ class MODEL_ARCH(IntEnum):
|
||||
RND1 = auto()
|
||||
PANGU_EMBED = auto()
|
||||
MISTRAL3 = auto()
|
||||
MIMO2 = auto()
|
||||
LLAMA_EMBED = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -747,6 +752,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
MODEL_ARCH.REFACT: "refact",
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
MODEL_ARCH.MODERN_BERT: "modern-bert",
|
||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||
MODEL_ARCH.NEO_BERT: "neo-bert",
|
||||
@@ -768,6 +774,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.PHIMOE: "phimoe",
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.PLAMO2: "plamo2",
|
||||
MODEL_ARCH.PLAMO3: "plamo3",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
@@ -841,6 +848,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.RND1: "rnd1",
|
||||
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
||||
MODEL_ARCH.MISTRAL3: "mistral3",
|
||||
MODEL_ARCH.MIMO2: "mimo2",
|
||||
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -1367,6 +1376,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.CLS,
|
||||
MODEL_TENSOR.CLS_OUT,
|
||||
],
|
||||
MODEL_ARCH.MODERN_BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.CLS,
|
||||
MODEL_TENSOR.CLS_OUT,
|
||||
],
|
||||
MODEL_ARCH.NOMIC_BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
@@ -1743,6 +1765,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.SSM_B_NORM,
|
||||
MODEL_TENSOR.SSM_C_NORM,
|
||||
],
|
||||
MODEL_ARCH.PLAMO3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
@@ -3180,6 +3217,46 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.MIMO2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_SINKS,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
],
|
||||
MODEL_ARCH.LLAMA_EMBED: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -774,8 +774,12 @@ class GGUFWriter:
|
||||
def add_shared_kv_layers(self, value: int) -> None:
|
||||
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
|
||||
|
||||
def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
|
||||
self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
|
||||
def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
|
||||
key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
|
||||
if isinstance(value, int):
|
||||
self.add_uint32(key, value)
|
||||
else:
|
||||
self.add_array(key, value)
|
||||
|
||||
def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None:
|
||||
self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)
|
||||
@@ -886,6 +890,9 @@ class GGUFWriter:
|
||||
def add_value_residual_mix_lora_rank(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length)
|
||||
|
||||
def add_rope_freq_base_swa(self, value: float) -> None:
|
||||
self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value)
|
||||
|
||||
def add_gate_lora_rank(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ class TensorNameMap:
|
||||
"embed_tokens", # embeddinggemma
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert nomic-bert
|
||||
"embeddings.tok_embeddings", # modern-bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
@@ -46,6 +47,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||
"word_embeddings_layernorm", # bloom
|
||||
"embeddings.LayerNorm", # bert
|
||||
"embeddings.norm", # modern-bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
"rwkv.blocks.0.pre_ln", # rwkv
|
||||
@@ -75,6 +77,7 @@ class TensorNameMap:
|
||||
"head.out", # wavtokenizer
|
||||
"lm_head", # llama4
|
||||
"model.transformer.ff_out", # llada
|
||||
"head.decoder", # modern-bert
|
||||
),
|
||||
MODEL_TENSOR.DENSE_2_OUT: (
|
||||
"dense_2_out", # embeddinggemma
|
||||
@@ -104,6 +107,7 @@ class TensorNameMap:
|
||||
"backbone.final_layer_norm", # wavtokenizer
|
||||
"model.norm", # llama4
|
||||
"model.transformer.ln_f", # llada
|
||||
"final_norm", # modern-bert
|
||||
"model.norm", # cogvlm
|
||||
),
|
||||
|
||||
@@ -151,6 +155,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.input_layernorm", # llama4
|
||||
"layers.{bid}.input_layernorm", # embeddinggemma
|
||||
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||
"layers.{bid}.attn_norm", # modern-bert
|
||||
"model.layers.{bid}.operator_norm", # lfm2
|
||||
"model.transformer.blocks.{bid}.attn_norm", # llada
|
||||
"layers.{bid}.input_layernorm", # qwen3-embedding
|
||||
@@ -187,6 +192,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
"transformer_encoder.{bid}.qkv", # neobert
|
||||
"layers.{bid}.attn.Wqkv", # modern-bert
|
||||
"model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm
|
||||
),
|
||||
|
||||
@@ -261,6 +267,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.linear_attn", # deci
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"layers.{bid}.attn.Wo", # modern-bert
|
||||
"transformer.layer.{bid}.attention.out_lin", # distillbert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||
@@ -313,6 +320,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.ATTN_SINKS: (
|
||||
"model.layers.{bid}.self_attn.sinks", # openai-moe
|
||||
"model.layers.{bid}.self_attn.attention_sink_bias", # mimov2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_GATE: (
|
||||
@@ -344,6 +352,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.post_attention_layernorm", # qwen3-embedding
|
||||
"model.layers.{bid}.feedforward_layernorm", # apertus
|
||||
"model.layers.{bid}.pre_mlp_layernorm", # kormo
|
||||
"layers.{bid}.mlp_norm" # modern-bert
|
||||
),
|
||||
|
||||
# Pre feed-forward norm
|
||||
@@ -407,6 +416,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.mlp.up_proj", # embeddinggemma
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"layers.{bid}.mlp.Wi", # modern-bert
|
||||
"transformer.layer.{bid}.ffn.lin1", # distillbert
|
||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"transformer.h.{bid}.mlp.linear_3", # refact
|
||||
@@ -521,6 +531,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.mlp.down_proj", # embeddinggemma
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"layers.{bid}.mlp.Wo", # modern-bert
|
||||
"transformer.layer.{bid}.ffn.lin2", # distillbert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
@@ -584,6 +595,7 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
"model.layers.layers.{bid}.mixer.q", # plamo2
|
||||
"model.layers.layers.{bid}.mixer.q_norm", # plamo3
|
||||
"layers.{bid}.self_attn.q_norm", # qwen3-embedding
|
||||
"model.layers.{bid}.attention.query_layernorm", # apertus
|
||||
),
|
||||
@@ -599,6 +611,7 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
"model.layers.layers.{bid}.mixer.k", # plamo2
|
||||
"model.layers.layers.{bid}.mixer.k_norm", # plamo3
|
||||
"layers.{bid}.self_attn.k_norm", # qwen3-embedding
|
||||
"model.layers.{bid}.attention.key_layernorm", # apertus
|
||||
),
|
||||
@@ -1122,6 +1135,7 @@ class TensorNameMap:
|
||||
"classifier.dense", # roberta
|
||||
"pre_classifier", # distillbert
|
||||
"dense", # neobert
|
||||
"head.dense", # modern-bert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS_OUT: (
|
||||
|
||||
@@ -110,7 +110,6 @@ class SafetensorRemote:
|
||||
"""
|
||||
|
||||
BASE_DOMAIN = "https://huggingface.co"
|
||||
ALIGNMENT = 8 # bytes
|
||||
|
||||
@classmethod
|
||||
def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
|
||||
@@ -204,9 +203,6 @@ class SafetensorRemote:
|
||||
|
||||
# Calculate the data start offset
|
||||
data_start_offset = 8 + metadata_length
|
||||
alignment = SafetensorRemote.ALIGNMENT
|
||||
if data_start_offset % alignment != 0:
|
||||
data_start_offset += alignment - (data_start_offset % alignment)
|
||||
|
||||
# Check if we have enough data to read the metadata
|
||||
if len(raw_data) < 8 + metadata_length:
|
||||
@@ -298,7 +294,6 @@ class SafetensorsLocal:
|
||||
Custom parsing gives a bit more control over the memory usage.
|
||||
The official safetensors library doesn't expose file ranges.
|
||||
"""
|
||||
ALIGNMENT = 8 # bytes
|
||||
|
||||
tensors: dict[str, LocalTensor]
|
||||
|
||||
@@ -316,9 +311,6 @@ class SafetensorsLocal:
|
||||
raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
|
||||
|
||||
data_start_offset = f.tell()
|
||||
alignment = self.ALIGNMENT
|
||||
if data_start_offset % alignment != 0:
|
||||
data_start_offset += alignment - (data_start_offset % alignment)
|
||||
|
||||
tensors: dict[str, LocalTensor] = {}
|
||||
for name, meta in metadata.items():
|
||||
|
||||
@@ -286,7 +286,7 @@ extern "C" {
|
||||
// NULL-terminated list of buffer types to use for tensors that match a pattern
|
||||
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
||||
@@ -467,10 +467,17 @@ extern "C" {
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
enum llama_params_fit_status {
|
||||
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path
|
||||
};
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// returns true if the parameters could be successfully modified to fit device memory
|
||||
// this function is NOT thread safe because it modifies the global llama logger state
|
||||
LLAMA_API bool llama_params_fit(
|
||||
// - returns true if the parameters could be successfully modified to fit device memory
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
|
||||
@@ -18,17 +18,17 @@ model="Llama-3.2-3B-Instruct-Q4_0.gguf"
|
||||
device="HTP0"
|
||||
[ "$D" != "" ] && device="$D"
|
||||
|
||||
verbose=
|
||||
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
|
||||
|
||||
experimental=
|
||||
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
|
||||
|
||||
verbose=
|
||||
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
|
||||
|
||||
sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
@@ -45,9 +45,9 @@ adb $adbserial shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
|
||||
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
|
||||
-ngl 99 --device $device $cli_opts $@ \
|
||||
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
|
||||
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --batch-size 128 -fa on \
|
||||
-ngl 99 --device $device $cli_opts $@ \
|
||||
"
|
||||
|
||||
53
scripts/snapdragon/adb/run-completion.sh
Executable file
53
scripts/snapdragon/adb/run-completion.sh
Executable file
@@ -0,0 +1,53 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
|
||||
# Basedir on device
|
||||
basedir=/data/local/tmp/llama.cpp
|
||||
|
||||
cli_opts=
|
||||
|
||||
branch=.
|
||||
[ "$B" != "" ] && branch=$B
|
||||
|
||||
adbserial=
|
||||
[ "$S" != "" ] && adbserial="-s $S"
|
||||
|
||||
model="Llama-3.2-3B-Instruct-Q4_0.gguf"
|
||||
[ "$M" != "" ] && model="$M"
|
||||
|
||||
device="HTP0"
|
||||
[ "$D" != "" ] && device="$D"
|
||||
|
||||
experimental=
|
||||
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
|
||||
|
||||
verbose=
|
||||
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
|
||||
|
||||
sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
ndev=
|
||||
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
|
||||
|
||||
set -x
|
||||
|
||||
adb $adbserial shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
|
||||
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --batch-size 128 -fa on \
|
||||
-ngl 99 -no-cnv --device $device $cli_opts $@ \
|
||||
"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user