Compare commits

...

9 Commits

Author SHA1 Message Date
Georgi Gerganov
ed58975f51 server : improve infill stop criteria 2025-03-12 15:28:48 +02:00
BB-fat
6ab2e4765a metal : Cache the Metal library at the device context level (#12265) 2025-03-11 13:45:02 +02:00
Xuan-Son Nguyen
96e1280839 clip : bring back GPU support (#12322)
* clip : bring back GPU support

* use n_gpu_layers param

* fix double free

* ggml_backend_init_by_type

* clean up
2025-03-11 09:20:16 +01:00
Eve
2c9f833d17 mat vec double buffer (#12188) 2025-03-10 19:28:11 +00:00
R0CKSTAR
251364549f musa: support new arch mp_31 and update doc (#12296)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-03-10 18:18:25 +01:00
Henry Linjamäki
8acdacb3ea opencl: use OpenCL C standard supported by the device (#12221)
This patch nudges the llama.cpp a bit to be supported on PoCL which
doesn't support OpenCL C CL2.0. The issue is solved by querying the
device for the supported OpenCL C versions and using the highest one
available.
2025-03-10 09:57:00 -07:00
John Bean
89b2b56e86 readme: added Sidekick to available UIs (#12311) 2025-03-10 16:13:09 +02:00
Georgi Gerganov
e128a1bf5b tests : fix test-quantize-fns to init the CPU backend (#12306)
ggml-ci
2025-03-10 14:07:15 +02:00
marcoStocchi
6ef79a67ca common : refactor '-o' option (#12278)
As discussed in PR 'llama-tts : add -o option' (#12042):

* common_params : 'out_file' string is the only output file name parameter left in common_params. It's intended to be used in all example programs implementing an '-o' option.

* cvector-generator, export-lora, imatrix : default output filenames moved from 'common_params' to the 'main()' of each example program.
2025-03-10 13:34:13 +02:00
21 changed files with 503 additions and 348 deletions

View File

@@ -836,7 +836,7 @@ ifdef GGML_MUSA
else
MUSA_PATH ?= /opt/musa
endif
MUSA_ARCHITECTURES ?= 21;22
MUSA_ARCHITECTURES ?= 21;22;31
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib

View File

@@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)

View File

@@ -1867,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-o", "--output", "--output-file"}, "FNAME",
string_format("output file (default: '%s')",
ex == LLAMA_EXAMPLE_EXPORT_LORA
? params.lora_outfile.c_str()
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
? params.cvector_outfile.c_str()
: params.out_file.c_str()),
string_format("output file (default: '%s')", params.out_file.c_str()),
[](common_params & params, const std::string & value) {
params.out_file = value;
params.cvector_outfile = value;
params.lora_outfile = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(

View File

@@ -407,8 +407,6 @@ struct common_params {
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
@@ -420,16 +418,16 @@ struct common_params {
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
// common params
std::string out_file; // output filename for all example programs
};
// call once at the start of a program if it uses libcommon

View File

@@ -197,29 +197,53 @@ The following compilation options are also available to tweak performance:
## MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
- Using `CMake`:
#### Download directly from Moore Threads
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
### Compilation
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
#### Override Compute Capability Specifications
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
```bash
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
```
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
#### Compilation options
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
```
For static build:
```bash
cmake -B build -DGGML_MUSA=ON \
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build build --config Release
```
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
### Runtime MUSA environment variables
You may set the [MUSA environment variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
```bash
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
## HIP
This provides GPU acceleration on HIP-supported AMD GPUs.

View File

@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "control_vector.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
export_gguf(ctx_train.v_final, params.out_file, model_hint);
llama_backend_free();

View File

@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "ggml-lora-merged-f16.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
printf("done, output file is %s\n", params.out_file.c_str());
return 0;
}

View File

@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
if (ncall > 0) {
fname += ".at_";
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "imatrix.dat" ;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;

View File

@@ -4,31 +4,12 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
//#ifdef GGML_USE_CUDA
//#include "ggml-cuda.h"
//#endif
//
//#ifdef GGML_USE_SYCL
//#include "ggml-sycl.h"
//#endif
//
//#ifdef GGML_USE_METAL
//#include "ggml-metal.h"
//#endif
//
//#ifdef GGML_USE_CANN
//#include "ggml-cann.h"
//#endif
//
//#ifdef GGML_USE_VULKAN
//#include "ggml-vulkan.h"
//#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -600,18 +581,54 @@ struct clip_ctx {
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
struct gguf_context * ctx_gguf = nullptr;
struct ggml_context * ctx_data = nullptr;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL;
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
ggml_backend_t backend = NULL;
ggml_gallocr_t compute_alloc = NULL;
ggml_backend_t backend = nullptr;
ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_t buf = nullptr;
ggml_backend_sched_ptr sched;
struct clip_image_size * load_image_size;
clip_ctx(clip_context_params & ctx_params) {
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
backend = ctx_params.use_gpu
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
: nullptr;
if (backend) {
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
backend_ptrs.push_back(backend);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
} else {
backend = backend_cpu;
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
backend_ptrs.push_back(backend_cpu);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
);
}
~clip_ctx() {
ggml_free(ctx_data);
gguf_free(ctx_gguf);
ggml_backend_buffer_free(buf);
ggml_backend_free(backend);
if (backend_cpu != backend) {
ggml_backend_free(backend_cpu);
}
}
};
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return clip_init(fname, clip_context_params{
/* use_gpu */ true,
/* verbosity */ verbosity,
});
}
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
int verbosity = ctx_params.verbosity;
struct ggml_context * meta = NULL;
struct gguf_init_params params = {
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
clip_ctx * new_clip = new clip_ctx{};
clip_ctx * new_clip = new clip_ctx(ctx_params);
// update projector type
{
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
//#ifdef GGML_USE_CUDA
// new_clip->backend = ggml_backend_cuda_init(0);
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_METAL
// new_clip->backend = ggml_backend_metal_init();
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_CANN
// new_clip->backend = ggml_backend_cann_init(0);
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_VULKAN
// new_clip->backend = ggml_backend_vk_init(0);
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
//#endif
//
//#ifdef GGML_USE_SYCL
// new_clip->backend = ggml_backend_sycl_init(0);
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
//#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
{
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
// alloc memory and offload data
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// measure mem requirement and allocate
{
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch;
batch.size = 1;
batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
ggml_backend_t backend = new_clip->backend_ptrs[i];
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
if (size > 1) {
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
}
}
}
return new_clip;
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
}
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
// build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs
const auto & model = ctx->vision_model;
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}
if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
ggml_backend_graph_compute(ctx->backend, gf);
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
if (status != GGML_STATUS_SUCCESS) {
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
return false;
}
// the last node is the embedding tensor
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);

View File

@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
size_t size;
};
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
// Parameters passed by value to clip_init() when creating a clip context.
struct clip_context_params {
// when true, the clip context tries to initialize a GPU backend and uses the CPU backend if none is available;
// when false, only the CPU backend is used
bool use_gpu;
// NOTE(review): presumably the logging verbosity level, same meaning as the `verbosity` argument of clip_model_load() - confirm
int verbosity;
};
// deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
CLIP_API void clip_free(struct clip_ctx * ctx);

View File

@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
struct clip_context_params clip_params = {
/* use_gpu */ params->n_gpu_layers != 0,
/* verbosity */ params->verbosity,
};
auto * ctx_clip = clip_init(clip_path, clip_params);
return ctx_clip;
}

View File

@@ -2162,35 +2162,58 @@ struct server_context {
if (slot.has_new_line) {
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
if (slot.params.n_indent > 0) {
if (slot.params.n_indent >= 0) {
// check the current indentation
// TODO: improve by not doing it more than once for each new line
if (slot.last_nl_pos > 0) {
size_t pos = slot.last_nl_pos;
int n_indent = 0;
int n_indent = 0;
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
n_indent++;
pos++;
size_t pos = slot.last_nl_pos;
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t' || slot.generated_text[pos] == '\n')) {
n_indent++;
if (slot.generated_text[pos] == '\n') {
n_indent = 0;
slot.last_nl_pos = pos + 1;
}
if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
pos++;
}
if (0 < pos && pos < slot.generated_text.size()) {
if (n_indent < slot.params.n_indent) {
slot.stop = STOP_TYPE_LIMIT;
slot.has_next_token = false;
// cut the last line
slot.generated_text.erase(pos, std::string::npos);
//slot.generated_text.erase(pos, std::string::npos);
SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
}
}
//SLT_ERR(slot, "n_indent = %d (%d), generated_text.size() = %d, n_decoded = %d, last_nl_pos = %d\n", n_indent, slot.params.n_indent, slot.generated_text.size(), slot.n_decoded, slot.last_nl_pos);
// find the next new line
{
const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
if (pos != std::string::npos) {
while (pos != std::string::npos) {
slot.last_nl_pos = pos + 1;
// detect end of paragraph at current indent level
if (slot.generated_text[slot.last_nl_pos - 2] == '\n' && n_indent <= slot.params.n_indent) {
slot.stop = STOP_TYPE_LIMIT;
slot.has_next_token = false;
// cut the last line
slot.generated_text.erase(pos, std::string::npos);
SLT_DBG(slot, "stopped by reaching end of paragraph, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
break;
}
pos = slot.generated_text.find('\n', slot.last_nl_pos);
}
}
}

View File

@@ -195,6 +195,8 @@ option(GGML_OPENCL "ggml: use OpenCL"
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# OpenCL API version to target, forwarded to the OpenCL backend as a compile definition
# (e.g. "300" selects OpenCL 3.0)
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
     "ggml: OpenCL API version to target")
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

View File

@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
static struct ggml_backend_metal_device_context {
id<MTLDevice> mtl_device;
int mtl_device_ref_count;
id<MTLLibrary> mtl_library;
bool has_simdgroup_reduction;
bool has_simdgroup_mm;
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
} g_ggml_ctx_dev_main = {
/*.mtl_device =*/ nil,
/*.mtl_device_ref_count =*/ 0,
/*.mtl_library =*/ nil,
/*.has_simdgroup_reduction =*/ false,
/*.has_simdgroup_mm =*/ false,
/*.has_residency_sets =*/ false,
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
ctx->mtl_device_ref_count--;
if (ctx->mtl_device_ref_count == 0) {
if (ctx->mtl_library) {
[ctx->mtl_library release];
ctx->mtl_library = nil;
}
if (ctx->mtl_device) {
[ctx->mtl_device release];
ctx->mtl_device = nil;
@@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
return data;
}
// load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
//
// device:     Metal device used to load or compile the library
// use_bfloat: when true, the source is compiled with GGML_METAL_USE_BF16 defined
//
// The returned library comes from a `new...` call, so the caller is responsible for releasing it.
static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
    id<MTLLibrary> metal_library = nil;

    NSError  * error = nil;
    NSString * src   = nil;

#if GGML_METAL_EMBED_LIBRARY
    GGML_LOG_INFO("%s: using embedded metal library\n", __func__);

    extern const char ggml_metallib_start[];
    extern const char ggml_metallib_end[];

    src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else

#ifdef SWIFT_PACKAGE
    NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
    NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif

    NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
    if (path_lib == nil) {
        // Try to find the resource in the directory where the current binary located.
        NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
        NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
        NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
        if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
            GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
            NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
            if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
                // Optionally, if this is a symlink, try to resolve it.
                default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
                if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
                    // It is a relative path, adding the binary directory as directory prefix.
                    default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
                }
                if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
                    // Link to the resource could not be resolved.
                    default_metallib_path = nil;
                } else {
                    GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
                }
            }
        } else {
            // The resource couldn't be found in the binary's directory.
            default_metallib_path = nil;
        }

        path_lib = default_metallib_path;
    }

    if (path_lib != nil) {
        // pre-compiled library found
        NSURL * libURL = [NSURL fileURLWithPath:path_lib];
        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);

        // the probing above is best-effort: drop any error it left behind so that
        // it is not mistaken for a failure of the load below
        error = nil;

        metal_library = [device newLibraryWithURL:libURL error:&error];
        if (error) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    } else {
        GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

        NSString * path_source;
        NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];

        GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");

        if (path_resource) {
            path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
        } else {
            path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
        }

        if (path_source == nil) {
            GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
            path_source = @"ggml-metal.metal";
        }

        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);

        // same as above: only an error produced by this read counts
        error = nil;

        src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
        if (error) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
#endif

    if (!metal_library) {
        @autoreleasepool {
            // dictionary of preprocessor macros
            NSMutableDictionary * prep = [NSMutableDictionary dictionary];

            if (use_bfloat) {
                [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
            }

#if GGML_METAL_EMBED_LIBRARY
            [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif

            MTLCompileOptions * options = [MTLCompileOptions new];
            options.preprocessorMacros = prep;

            //[options setFastMathEnabled:false];

            error = nil;

            metal_library = [device newLibraryWithSource:src options:options error:&error];

#if !__has_feature(objc_arc)
            // the options are no longer needed, whether or not the compilation succeeded
            [options release];
#endif

            if (error) {
                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);

                // do not return from here: fall through to the common cleanup below so that
                // neither a returned library nor the embedded source string is leaked
#if !__has_feature(objc_arc)
                [metal_library release]; // no-op when nil
#endif
                metal_library = nil;
            }
        }
    }

#if GGML_METAL_EMBED_LIBRARY
    [src release];
#endif // GGML_METAL_EMBED_LIBRARY

    return metal_library;
}
static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("%s: allocating\n", __func__);
@@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library = nil;
// load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
{
NSError * error = nil;
NSString * src = nil;
#if GGML_METAL_EMBED_LIBRARY
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
#ifdef SWIFT_PACKAGE
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}
if (path_lib != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
metal_library = [device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
} else {
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * path_source;
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
if (path_resource) {
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
} else {
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
}
if (path_source == nil) {
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
path_source = @"ggml-metal.metal";
}
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
#endif
if (!metal_library) {
@autoreleasepool {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
if (ctx_dev->use_bfloat) {
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
}
#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif
MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;
//[options setFastMathEnabled:false];
metal_library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
#if !__has_feature(objc_arc)
[options release];
#endif
}
}
#if GGML_METAL_EMBED_LIBRARY
[src release];
#endif // GGML_METAL_EMBED_LIBRARY
if (ctx_dev->mtl_library == nil) {
ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
}
id<MTLLibrary> metal_library = ctx_dev->mtl_library;
if (metal_library == nil) {
GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
return NULL;
}
// print MTL GPU family:
@@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
[metal_function release]; \
if (error) { \
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \
} \
} else { \
@@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
}
[metal_library release];
return ctx;
}

View File

@@ -21,7 +21,7 @@ if (MUSAToolkit_FOUND)
message(STATUS "MUSA Toolkit found")
if (NOT DEFINED MUSA_ARCHITECTURES)
set(MUSA_ARCHITECTURES "21;22")
set(MUSA_ARCHITECTURES "21;22;31")
endif()
message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")

View File

@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
endif ()
add_compile_definitions(GGML_OPENCL_SOA_Q)
add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")

View File

@@ -1,4 +1,4 @@
#define CL_TARGET_OPENCL_VERSION 220
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
// suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
#include <vector>
#include <string>
#include <cmath>
#include <memory>
#include <charconv>
#undef MIN
#undef MAX
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
X1E,
};
// Major/minor pair describing an OpenCL (or OpenCL C) version.
// Both fields start out as zero, so a value-initialized instance reads as 0.0.
struct ggml_cl_version {
    cl_uint major{0};
    cl_uint minor{0};
};
// Parses a version string that begins with "<major>.<minor>", e.g. "3.0 <vendor info>".
//
// The minor number ends at the first space after the dot. When there is no such space
// (the string stops right after the version, possibly followed only by a NUL terminator),
// the remainder of the string is parsed instead of rejecting the input.
//
// Returns the parsed version. On an error (no dot, or std::from_chars cannot parse either
// number) returns ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
    const size_t major_str_begin = 0;
    const size_t major_str_end   = str.find('.', major_str_begin);
    if (major_str_end == std::string_view::npos) {
        return {};
    }

    const size_t minor_str_begin = major_str_end + 1;
    size_t       minor_str_end   = str.find(' ', minor_str_begin);
    if (minor_str_end == std::string_view::npos) {
        // No separator after the minor number: parse up to the end of the view.
        // std::from_chars stops at the first non-digit, so a trailing NUL is harmless.
        minor_str_end = str.size();
    }

    cl_uint version_major = 0;
    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
        return {};
    }

    cl_uint version_minor = 0;
    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
        return {};
    }

    return { version_major, version_minor };
}
// Queries CL_PLATFORM_VERSION for `platform` and extracts the numeric version that follows the
// "OpenCL " prefix. Yields ggml_cl_version with all zeroes when the prefix is missing or the
// version part cannot be parsed.
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
    // First query only asks for the size of the version string, the second one fetches it.
    size_t n_bytes;
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &n_bytes));

    std::unique_ptr<char[]> buf(new char[n_bytes]);
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, n_bytes, buf.get(), nullptr));

    const std::string_view info(buf.get(), n_bytes);

    // Expected layout: "OpenCL XX.YY <platform-specific-info>"
    const std::string_view prefix = "OpenCL ";
    if (info.substr(0, prefix.size()) != prefix) {
        return {};
    }

    return parse_cl_version(info.substr(prefix.size()));
}
// Return a version to use in OpenCL C compilation for `device`.
//
// When compiled with CL_TARGET_OPENCL_VERSION >= 300 and the platform reports major version 3
// or newer, the highest entry of CL_DEVICE_OPENCL_C_ALL_VERSIONS is returned. Otherwise the
// version is parsed from the CL_DEVICE_OPENCL_C_VERSION string, which must start with
// "OpenCL C ".
//
// Returns ggml_cl_version with all zeroes when a query yields no data, the prefix is missing,
// or the version cannot be parsed.
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
    size_t param_size;

#if CL_TARGET_OPENCL_VERSION >= 300
    if (platform_version.major >= 3) {
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
        if (!param_size) {
            return {};
        }

        // param_size is a size in bytes, not an element count: convert before allocating.
        // The allocation is rounded up so the buffer is never smaller than param_size.
        const size_t versions_count = param_size / sizeof(cl_name_version);
        const size_t alloc_count    = (param_size + sizeof(cl_name_version) - 1) / sizeof(cl_name_version);

        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[alloc_count]);
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));

        // pick the highest OpenCL C version the device lists
        cl_version version_max = 0;
        for (size_t i = 0; i < versions_count; i++) {
            version_max = std::max<cl_version>(versions[i].version, version_max);
        }

        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
    }
#else
    GGML_UNUSED(platform_version);
#endif // CL_TARGET_OPENCL_VERSION >= 300

    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
    if (!param_size) {
        return {};
    }

    std::unique_ptr<char[]> param_storage(new char[param_size]);
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
    auto param_value = std::string_view(param_storage.get(), param_size);

    const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
    if (param_value.find(version_prefix) != 0) {
        return {};
    }
    param_value.remove_prefix(version_prefix.length());

    return parse_cl_version(param_value);
}
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
if (strstr(device_name, "730") ||
strstr(device_name, "740") ||
@@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// A local ref of cl_device_id for convenience
cl_device_id device = backend_ctx->device;
// Check device OpenCL version, OpenCL 2.0 or above is required
size_t device_ver_str_size;
clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
device_ver_buffer[device_ver_str_size] = '\0';
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
strstr(device_ver_buffer, "OpenCL 3") == NULL) {
// Check device OpenCL version, OpenCL 2.0 or above is required
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
if (opencl_c_version.major < 2) {
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
return backend_ctx;
}
@@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
if (strstr(device_ver_buffer, "OpenCL 3") &&
strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
@@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
const std::string kernel_src = read_file("ggml-opencl.cl");
#endif
std::string compile_opts =
"-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
"-cl-finite-math-only -cl-fast-relaxed-math ";
auto opencl_c_std =
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
// Non matmul kernels.
@@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
// Gemv general
std::string CL_gemv_compile_opts =
" -cl-std=CL2.0 "
" -cl-mad-enable "
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}
@@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
// Gemv 2048, 16384
CL_gemv_compile_opts =
" -cl-std=CL2.0 "
" -cl-mad-enable "
" -DLINE_STRIDE_A=2048 "
" -DBLOCK_STRIDE_A=16384 "
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DLINE_STRIDE_A=2048 "
" -DBLOCK_STRIDE_A=16384 "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}
@@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
// Gemv 2048, 16384
CL_gemv_compile_opts =
" -cl-std=CL2.0 "
" -cl-mad-enable "
" -DLINE_STRIDE_A=2048 "
" -DBLOCK_STRIDE_A=16384 "
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DLINE_STRIDE_A=2048 "
" -DBLOCK_STRIDE_A=16384 "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}
@@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
// Gemv 5504, 44032
CL_gemv_compile_opts =
" -cl-std=CL2.0 "
" -cl-mad-enable "
" -DLINE_STRIDE_A=5504 "
" -DBLOCK_STRIDE_A=44032 "
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DLINE_STRIDE_A=5504 "
" -DBLOCK_STRIDE_A=44032 "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}
@@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
// Gemv 16000, 128000
CL_gemv_compile_opts =
" -cl-std=CL2.0 "
" -cl-mad-enable "
" -DLINE_STRIDE_A=16000 "
" -DBLOCK_STRIDE_A=128000 "
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DLINE_STRIDE_A=16000 "
" -DBLOCK_STRIDE_A=128000 "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}

View File

@@ -5,23 +5,24 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
barrier();
if (!all_threads) { // when we don't have enough blocks to use all threads
if (i < num_blocks_per_row) {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
}
barrier();
@@ -29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
continue;
} else {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
barrier();
}
@@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ],
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ],
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ],
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ],
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im],
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im],
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im],
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im],
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im],
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im],
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l ],
fma(FLOAT_TYPE(b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2],
fma(FLOAT_TYPE(b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l ],
fma(FLOAT_TYPE(b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2],
fma(FLOAT_TYPE(b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l ],
fma(FLOAT_TYPE(b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2],
fma(FLOAT_TYPE(b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l ],
fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[csel][ix][ 8*v_im],
fma(FLOAT_TYPE(b16[l]), sccache2[csel][ix][1 + 8*v_im],
fma(FLOAT_TYPE(b32[l]), sccache2[csel][ix][2 + 8*v_im],
fma(FLOAT_TYPE(b48[l]), sccache2[csel][ix][3 + 8*v_im],
fma(FLOAT_TYPE(b64[l]), sccache2[csel][ix][4 + 8*v_im],
fma(FLOAT_TYPE(b80[l]), sccache2[csel][ix][5 + 8*v_im],
fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
}
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}

View File

@@ -5,20 +5,21 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
if (i >= num_blocks_per_row)
@@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
if (all_threads) {
barrier();
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
}
@@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
sum = fma(FLOAT_TYPE( b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
}
temp[j][n] = fma(d, sum, temp[j][n]);
}

View File

@@ -6,20 +6,21 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
if (i >= num_blocks_per_row)
@@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
if (all_threads) {
barrier();
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
}
@@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
}
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]);
}
}
}

View File

@@ -120,13 +120,7 @@ int main(int argc, char * argv[]) {
generate_data(0.0, test_data.size(), test_data.data());
generate_data(1.0, test_data2.size(), test_data2.data());
// Initialize GGML, ensures float conversion tables are initialized
struct ggml_init_params ggml_params = {
/* .mem_size = */ 1*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ true,
};
struct ggml_context * ctx = ggml_init(ggml_params);
ggml_cpu_init();
int num_failed = 0;
bool failed = false;
@@ -188,7 +182,5 @@ int main(int argc, char * argv[]) {
printf("%d tests failed\n", num_failed);
}
ggml_free(ctx);
return num_failed > 0;
}